
/*--------------------------------------------------------------------*/
/*--- Instrument IR to perform memory checking operations.         ---*/
/*---                                               mc_translate.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of MemCheck, a heavyweight Valgrind tool for
   detecting memory errors.

   Copyright (C) 2000-2013 Julian Seward
      jseward (at) acm.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "pub_tool_basics.h"
#include "pub_tool_poolalloc.h"     // For mc_include.h
#include "pub_tool_hashtable.h"     // For mc_include.h
#include "pub_tool_libcassert.h"
#include "pub_tool_libcprint.h"
#include "pub_tool_tooliface.h"
#include "pub_tool_machine.h"     // VG_(fnptr_to_fnentry)
#include "pub_tool_xarray.h"
#include "pub_tool_mallocfree.h"
#include "pub_tool_libcbase.h"

#include "mc_include.h"


/* FIXMEs JRS 2011-June-16.

   Check the interpretation for vector narrowing and widening ops,
   particularly the saturating ones.  I suspect they are overly
   pessimistic and/or wrong.

   Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
   saturating shifts): the interpretation is overly pessimistic.
   See comments on the relevant cases below for details.

   Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
   both rounding and non-rounding variants): ditto
*/

/* This file implements the Memcheck instrumentation, and in
   particular contains the core of its undefined value detection
   machinery.  For a comprehensive background of the terminology,
   algorithms and rationale used herein, read:

     Using Valgrind to detect undefined value errors with
     bit-precision

     Julian Seward and Nicholas Nethercote

     2005 USENIX Annual Technical Conference (General Track),
     Anaheim, CA, USA, April 10-15, 2005.

   ----

   Here is as good a place as any to record exactly when V bits are and
   should be checked, why, and what function is responsible.


   Memcheck complains when an undefined value is used:

   1. In the condition of a conditional branch.  Because it could cause
      incorrect control flow, and thus cause incorrect externally-visible
      behaviour.  [mc_translate.c:complainIfUndefined]

   2. As an argument to a system call, or as the value that specifies
      the system call number.  Because it could cause an incorrect
      externally-visible side effect.  [mc_translate.c:mc_pre_reg_read]

   3. As the address in a load or store.  Because it could cause an
      incorrect value to be used later, which could cause externally-visible
      behaviour (eg. via incorrect control flow or an incorrect system call
      argument).  [complainIfUndefined]

   4. As the target address of a branch.  Because it could cause incorrect
      control flow.  [complainIfUndefined]

   5. As an argument to setenv, unsetenv, or putenv.  Because it could put
      an incorrect value into the external environment.
      [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]

   6. As the index in a GETI or PUTI operation.  I'm not sure why... (njn).
      [complainIfUndefined]

   7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
      VALGRIND_CHECK_VALUE_IS_DEFINED client requests.  Because the user
      requested it.  [in memcheck.h]


   Memcheck also complains, but should not, when an undefined value is used:

   8. As the shift value in certain SIMD shift operations (but not in the
      standard integer shift operations).  This inconsistency is due to
      historical reasons.  [complainIfUndefined]


   Memcheck does not complain, but should, when an undefined value is used:

   9. As an input to a client request.  Because the client request may
      affect the visible behaviour -- see bug #144362 for an example
      involving the malloc replacements in vg_replace_malloc.c and
      VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
      isn't identified.  That bug report also has some info on how to solve
      the problem.  [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]


   In practice, 1 and 2 account for the vast majority of cases.
*/

/* Generation of addr-definedness, addr-validity and
   guard-definedness checks pertaining to loads and stores (Iex_Load,
   Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
   loads/stores) was re-checked 11 May 2013. */

/*------------------------------------------------------------*/
/*--- Forward decls                                        ---*/
/*------------------------------------------------------------*/

struct _MCEnv;

static IRType  shadowTypeV ( IRType ty );
static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e );
static IRTemp  findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );

static IRExpr *i128_const_zero(void);

/*------------------------------------------------------------*/
/*--- Memcheck running state, and tmp management.          ---*/
/*------------------------------------------------------------*/

/* Carries info about a particular tmp.  The tmp's number is not
   recorded, as this is implied by (equal to) its index in the tmpMap
   in MCEnv.  The tmp's type is also not recorded, as this is present
   in MCEnv.sb->tyenv.

   When .kind is Orig, .shadowV and .shadowB may give the identities
   of the temps currently holding the associated definedness (shadowV)
   and origin (shadowB) values, or these may be IRTemp_INVALID if code
   to compute such values has not yet been emitted.

   When .kind is VSh or BSh then the tmp holds a V- or B- value,
   and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
   illogical for a shadow tmp itself to be shadowed.
*/
typedef
   enum { Orig=1, VSh=2, BSh=3 }
   TempKind;

typedef
   struct {
      TempKind kind;
      IRTemp   shadowV;
      IRTemp   shadowB;
   }
   TempMapEnt;


/* Carries around state during memcheck instrumentation. */
typedef
   struct _MCEnv {
      /* MODIFIED: the superblock being constructed.  IRStmts are
         added. */
      IRSB* sb;
      Bool  trace;

      /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
         current kind and possibly shadow temps for each temp in the
         IRSB being constructed.  Note that it does not contain the
         type of each tmp.  If you want to know the type, look at the
         relevant entry in sb->tyenv.  It follows that at all times
         during the instrumentation process, the valid indices for
         tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
         the total number of Orig, V- and B- temps allocated so far.

         The reason for this strange split (types in one place, all
         other info in another) is that we need the types to be
         attached to sb so as to make it possible to do
         "typeOfIRExpr(mce->sb->tyenv, ...)" at various places in the
         instrumentation process. */
      XArray* /* of TempMapEnt */ tmpMap;

      /* MODIFIED: indicates whether "bogus" literals have so far been
         found.  Starts off False, and may change to True. */
      Bool bogusLiterals;

      /* READONLY: indicates whether we should use expensive
         interpretations of integer adds, since unfortunately LLVM
         uses them to do ORs in some circumstances.  Defaulted to True
         on MacOS and False everywhere else. */
      Bool useLLVMworkarounds;

      /* READONLY: the guest layout.  This indicates which parts of
         the guest state should be regarded as 'always defined'. */
      const VexGuestLayout* layout;

      /* READONLY: the host word type.  Needed for constructing
         arguments of type 'HWord' to be passed to helper functions.
         Ity_I32 or Ity_I64 only. */
      IRType hWordTy;
   }
   MCEnv;

/* SHADOW TMP MANAGEMENT.  Shadow tmps are allocated lazily (on
   demand), as they are encountered.  This is for two reasons.

   (1) (less important reason): Many original tmps are unused due to
   initial IR optimisation, and we do not want to waste space in the
   tables tracking them.

   Shadow IRTemps are therefore allocated on demand.  mce.tmpMap is a
   table indexed [0 .. n_temps-1], which gives the current shadow for
   each original tmp, or IRTemp_INVALID if none is so far assigned.
   It is necessary to support making multiple assignments to a shadow
   -- specifically, after testing a shadow for definedness, it needs
   to be made defined.  But IR's SSA property disallows this.

   (2) (more important reason): Therefore, when a shadow needs to get
   a new value, a new temporary is created, the value is assigned to
   that, and the tmpMap is updated to reflect the new binding.

   A corollary is that if the tmpMap maps a given tmp to
   IRTemp_INVALID and we are hoping to read that shadow tmp, it means
   there's a read-before-write error in the original tmps.  The IR
   sanity checker should catch all such anomalies, however.
*/
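
/* Illustrative sketch (not part of the original source): the
   rebinding dance for a shadow that must change value.  Suppose orig
   tmp t5 is currently shadowed by t17:

      findShadowTmpV(mce, t5)    -->  t17
      ... emit a definedness test of t17 ...
      newShadowTmpV(mce, t5);         // tmpMap now binds t5 to a
                                      // fresh tmp, say t23
      assign('V', mce, findShadowTmpV(mce, t5), definedOfType(ty));

   t23 is written exactly once, so SSA form is preserved; all later
   reads of t5's shadow see t23, never t17 again. */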

/* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
   both the table in mce->sb and to our auxiliary mapping.  Note that
   newTemp may cause mce->tmpMap to resize, hence previous results
   from VG_(indexXA)(mce->tmpMap) are invalidated. */
static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
{
   Word       newIx;
   TempMapEnt ent;
   IRTemp     tmp = newIRTemp(mce->sb->tyenv, ty);
   ent.kind    = kind;
   ent.shadowV = IRTemp_INVALID;
   ent.shadowB = IRTemp_INVALID;
   newIx = VG_(addToXA)( mce->tmpMap, &ent );
   tl_assert(newIx == (Word)tmp);
   return tmp;
}


/* Find the tmp currently shadowing the given original tmp.  If none
   so far exists, allocate one.  */
static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
{
   TempMapEnt* ent;
   /* VG_(indexXA) range-checks 'orig', hence no need to check
      here. */
   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   tl_assert(ent->kind == Orig);
   if (ent->shadowV == IRTemp_INVALID) {
      IRTemp tmpV
        = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
      /* newTemp may cause mce->tmpMap to resize, hence previous results
         from VG_(indexXA) are invalid. */
      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
      tl_assert(ent->kind == Orig);
      tl_assert(ent->shadowV == IRTemp_INVALID);
      ent->shadowV = tmpV;
   }
   return ent->shadowV;
}

/* Allocate a new shadow for the given original tmp.  This means any
   previous shadow is abandoned.  This is needed because it is
   necessary to give a new value to a shadow once it has been tested
   for undefinedness, but unfortunately IR's SSA property disallows
   this.  Instead we must abandon the old shadow, allocate a new one
   and use that instead.

   This is the same as findShadowTmpV, except we don't bother to see
   if a shadow temp already existed -- we simply allocate a new one
   regardless. */
static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
{
   TempMapEnt* ent;
   /* VG_(indexXA) range-checks 'orig', hence no need to check
      here. */
   ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   tl_assert(ent->kind == Orig);
   if (1) {
      IRTemp tmpV
        = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
      /* newTemp may cause mce->tmpMap to resize, hence previous results
         from VG_(indexXA) are invalid. */
      ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
      tl_assert(ent->kind == Orig);
      ent->shadowV = tmpV;
   }
}


/*------------------------------------------------------------*/
/*--- IRAtoms -- a subset of IRExprs                       ---*/
/*------------------------------------------------------------*/

/* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
   isIRAtom() in libvex_ir.h.  Because this instrumenter expects flat
   input, most of this code deals in atoms.  Usefully, a value atom
   always has a V-value which is also an atom: constants are shadowed
   by constants, and temps are shadowed by the corresponding shadow
   temporary. */

typedef  IRExpr  IRAtom;

/* (used for sanity checks only): is this an atom which looks
   like it's from original code? */
static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
{
   if (a1->tag == Iex_Const)
      return True;
   if (a1->tag == Iex_RdTmp) {
      TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
      return ent->kind == Orig;
   }
   return False;
}

/* (used for sanity checks only): is this an atom which looks
   like it's from shadow code? */
static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
{
   if (a1->tag == Iex_Const)
      return True;
   if (a1->tag == Iex_RdTmp) {
      TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
      return ent->kind == VSh || ent->kind == BSh;
   }
   return False;
}

/* (used for sanity checks only): check that both args are atoms and
   are identically-kinded. */
static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
{
   if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
      return True;
   if (a1->tag == Iex_Const && a2->tag == Iex_Const)
      return True;
   return False;
}


/*------------------------------------------------------------*/
/*--- Type management                                      ---*/
/*------------------------------------------------------------*/

/* Shadow state is always accessed using integer types.  This returns
   an integer type with the same size (as per sizeofIRType) as the
   given type.  The only valid shadow types are Bit, I8, I16, I32,
   I64, I128, V128, V256. */

static IRType shadowTypeV ( IRType ty )
{
   switch (ty) {
      case Ity_I1:
      case Ity_I8:
      case Ity_I16:
      case Ity_I32:
      case Ity_I64:
      case Ity_I128: return ty;
      case Ity_F16:  return Ity_I16;
      case Ity_F32:  return Ity_I32;
      case Ity_D32:  return Ity_I32;
      case Ity_F64:  return Ity_I64;
      case Ity_D64:  return Ity_I64;
      case Ity_F128: return Ity_I128;
      case Ity_D128: return Ity_I128;
      case Ity_V128: return Ity_V128;
      case Ity_V256: return Ity_V256;
      default: ppIRType(ty);
               VG_(tool_panic)("memcheck:shadowTypeV");
   }
}
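
/* For example (illustrative): shadowTypeV(Ity_F64) == Ity_I64 and
   shadowTypeV(Ity_D128) == Ity_I128 -- the definedness of a 64-bit
   float is tracked by 64 integer V bits, and so on.  Integer and
   vector types shadow themselves. */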

/* Produce a 'defined' value of the given shadow type.  Should only be
   supplied shadow types (I1/I8/I16/I32/I64/I128/V128/V256). */
static IRExpr* definedOfType ( IRType ty ) {
   switch (ty) {
      case Ity_I1:   return IRExpr_Const(IRConst_U1(False));
      case Ity_I8:   return IRExpr_Const(IRConst_U8(0));
      case Ity_I16:  return IRExpr_Const(IRConst_U16(0));
      case Ity_I32:  return IRExpr_Const(IRConst_U32(0));
      case Ity_I64:  return IRExpr_Const(IRConst_U64(0));
      case Ity_I128: return i128_const_zero();
      case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
      case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
      default:       VG_(tool_panic)("memcheck:definedOfType");
   }
}


/*------------------------------------------------------------*/
/*--- Constructing IR fragments                            ---*/
/*------------------------------------------------------------*/

/* add stmt to a bb */
static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
   if (mce->trace) {
      VG_(printf)("  %c: ", cat);
      ppIRStmt(st);
      VG_(printf)("\n");
   }
   addStmtToIRSB(mce->sb, st);
}

/* assign value to tmp */
static inline
void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
   stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
}

/* build various kinds of expressions */
#define triop(_op, _arg1, _arg2, _arg3) \
                                 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
#define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
#define unop(_op, _arg)          IRExpr_Unop((_op),(_arg))
#define mkU1(_n)                 IRExpr_Const(IRConst_U1(_n))
#define mkU8(_n)                 IRExpr_Const(IRConst_U8(_n))
#define mkU16(_n)                IRExpr_Const(IRConst_U16(_n))
#define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
#define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
#define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
#define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))

/* Bind the given expression to a new temporary, and return the
   temporary.  This effectively converts an arbitrary expression into
   an atom.

   'ty' is the type of 'e' and hence the type that the new temporary
   needs to be.  But passing it in is redundant, since we can deduce
   the type merely by inspecting 'e'.  So at least use that fact to
   assert that the two types agree. */
static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
{
   TempKind k;
   IRTemp   t;
   IRType   tyE = typeOfIRExpr(mce->sb->tyenv, e);

   tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
   switch (cat) {
      case 'V': k = VSh;  break;
      case 'B': k = BSh;  break;
      case 'C': k = Orig; break;
                /* happens when we are making up new "orig"
                   expressions, for IRCAS handling */
      default: tl_assert(0);
   }
   t = newTemp(mce, ty, k);
   assign(cat, mce, t, e);
   return mkexpr(t);
}
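
/* Illustrative usage (not part of the original source): assignNew
   keeps the instrumented code flat.  Rather than nesting expressions,
   each intermediate result is bound to a fresh shadow tmp:

      IRAtom* vsum = assignNew('V', mce, Ity_I32,
                               binop(Iop_Or32, vx, vy));

   This emits "tNN = Or32(vx,vy)" into mce->sb and returns the atom
   RdTmp(tNN), which may then be used as an operand elsewhere. */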


/*------------------------------------------------------------*/
/*--- Helper functions for 128-bit ops                     ---*/
/*------------------------------------------------------------*/

static IRExpr *i128_const_zero(void)
{
   IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
   return binop(Iop_64HLto128, z64, z64);
}

/* There are no I128 loads and/or stores [as generated by any
   current front ends].  So we do not need to worry about that in
   expr2vbits_Load. */


/*------------------------------------------------------------*/
/*--- Constructing definedness primitive ops               ---*/
/*------------------------------------------------------------*/

/* --------- Defined-if-either-defined --------- */

static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
}

static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
}

static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
}

static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
}

static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
}

static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
}
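
/* Worked example (illustrative): V bits use 0 = defined, 1 =
   undefined, so DifD is bitwise AND.  With a1 = 0xFF (fully
   undefined) and a2 = 0x0F (high nibble defined), mkDifD8 gives
   0x0F: a result bit is defined iff it is defined in either input. */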

/* --------- Undefined-if-either-undefined --------- */

static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
}

static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
}

static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
}

static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
}

static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
   tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
   tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
   tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
   tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
   tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));

   return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
}

static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
}

static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
   tl_assert(isShadowAtom(mce,a1));
   tl_assert(isShadowAtom(mce,a2));
   return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
}

static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
   switch (vty) {
      case Ity_I8:   return mkUifU8(mce, a1, a2);
      case Ity_I16:  return mkUifU16(mce, a1, a2);
      case Ity_I32:  return mkUifU32(mce, a1, a2);
      case Ity_I64:  return mkUifU64(mce, a1, a2);
      case Ity_I128: return mkUifU128(mce, a1, a2);
      case Ity_V128: return mkUifUV128(mce, a1, a2);
      case Ity_V256: return mkUifUV256(mce, a1, a2);
      default:
         VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
         VG_(tool_panic)("memcheck:mkUifU");
   }
}
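
/* Worked example (illustrative): UifU is bitwise OR, the dual of
   DifD.  With a1 = 0x00 (fully defined) and a2 = 0xF0, mkUifU8 gives
   0xF0: a result bit is undefined if it is undefined in either
   input.  This is the usual worst-case rule for binary operations. */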

/* --------- The Left-family of operations. --------- */

static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
   tl_assert(isShadowAtom(mce,a1));
   return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
}

static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
   tl_assert(isShadowAtom(mce,a1));
   return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
}

static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
   tl_assert(isShadowAtom(mce,a1));
   return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
}

static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
   tl_assert(isShadowAtom(mce,a1));
   return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
}
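
/* Background note (illustrative): Iop_LeftN computes x | -x, which
   smears the lowest 1 bit of x leftwards.  Applied to V bits it
   models carry propagation in additions: undefinedness can spread
   only towards more significant bits.  E.g. Left8(0x04) = 0xFC --
   if bit 2 of an addend is undefined, bits 2..7 of the sum are
   deemed undefined while bits 0..1 remain defined. */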

/* --------- 'Improvement' functions for AND/OR. --------- */

/* ImproveAND(data, vbits) = data OR vbits.  A result bit is defined
   (0) only where the data bit is 0 and itself defined; all other
   positions are undefined (1).
*/
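/* Worked example (illustrative): in x & y, any position where one
   operand holds a defined 0 forces the result bit to 0, however
   undefined the other operand is there.  With data = 0x0F and
   vbits = 0x00, ImproveAND8 = 0x0F: bits 7..4 (defined zeros) may be
   claimed defined in the AND result.  Callers DifD this improvement
   term onto the naive UifU result. */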
static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
}

static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
}

static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
}

static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
}

static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
}

static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
}

/* ImproveOR(data, vbits) = ~data OR vbits.  A result bit is defined
   (0) only where the data bit is 1 and itself defined; all other
   positions are undefined (1).
*/
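/* Worked example (illustrative): in x | y, any position where one
   operand holds a defined 1 forces the result bit to 1.  With
   data = 0xF0 and vbits = 0x00, ImproveOR8 = ~0xF0 | 0x00 = 0x0F:
   bits 7..4 (defined ones) may be claimed defined in the OR result,
   whatever the other operand's V bits are at those positions. */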
static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_I8,
             binop(Iop_Or8,
                   assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
                   vbits) );
}

static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_I16,
             binop(Iop_Or16,
                   assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
                   vbits) );
}

static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_I32,
             binop(Iop_Or32,
                   assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
                   vbits) );
}

static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_I64,
             binop(Iop_Or64,
                   assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
                   vbits) );
}

static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_V128,
             binop(Iop_OrV128,
                   assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
                   vbits) );
}

static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
{
   tl_assert(isOriginalAtom(mce, data));
   tl_assert(isShadowAtom(mce, vbits));
   tl_assert(sameKindedAtoms(data, vbits));
   return assignNew(
             'V', mce, Ity_V256,
             binop(Iop_OrV256,
                   assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
                   vbits) );
}

/* --------- Pessimising casts. --------- */

/* This function returns an expression of type DST_TY.  If any bit of
   VBITS is undefined (value == 1), the resulting expression has all
   bits set to 1; otherwise, all bits are 0. */
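
/* For example (illustrative): mkPCastTo(mce, Ity_I64, v), where v is
   an I32 shadow with value 0x00000100 (one undefined bit), yields
   0xFFFFFFFFFFFFFFFF; had v been 0 (fully defined) it would yield 0.
   Bit-level precision is deliberately traded away for simplicity. */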

static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
{
   IRType  src_ty;
   IRAtom* tmp1;

   /* Note, dst_ty is a shadow type, not an original type. */
   tl_assert(isShadowAtom(mce,vbits));
   src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);

   /* Fast-track some common cases */
   if (src_ty == Ity_I32 && dst_ty == Ity_I32)
      return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));

   if (src_ty == Ity_I64 && dst_ty == Ity_I64)
      return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));

   if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
      /* PCast the arg, then clone it. */
      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
      return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
   }

   if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
      /* PCast the arg, then clone it 4 times. */
      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
      tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
      return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
   }

   if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
      /* PCast the arg, then clone it 8 times. */
      IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
      tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
      tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
      return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
   }

   if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
      /* PCast the arg.  This gives all 0s or all 1s.  Then throw away
         the top half. */
      IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
      return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
   }

   if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
      /* Use InterleaveHI64x2 to copy the top half of the vector into
         the bottom half.  Then we can UifU it with the original, throw
         away the upper half of the result, and PCast-I64-to-I64
         the lower half. */
      // Generates vbits[127:64] : vbits[127:64]
      IRAtom* hi64hi64
         = assignNew('V', mce, Ity_V128,
                     binop(Iop_InterleaveHI64x2, vbits, vbits));
      // Generates
      //   UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
      //   == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
      IRAtom* lohi64
         = mkUifUV128(mce, hi64hi64, vbits);
      // Generates UifU(vbits[127:64],vbits[63:0])
      IRAtom* lo64
         = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
      // Generates
      //   PCast-to-I64( UifU(vbits[127:64], vbits[63:0]) )
      //   == PCast-to-I64( vbits[127:0] )
      IRAtom* res
         = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
      return res;
   }

   /* Else do it the slow way .. */
   /* First of all, collapse vbits down to a single bit. */
   tmp1   = NULL;
   switch (src_ty) {
      case Ity_I1:
         tmp1 = vbits;
         break;
      case Ity_I8:
         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
         break;
      case Ity_I16:
         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
         break;
      case Ity_I32:
         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
         break;
      case Ity_I64:
         tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
         break;
      case Ity_I128: {
         /* Gah.  Chop it in half, OR the halves together, and compare
            that with zero. */
         IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
         IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
         IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
         tmp1         = assignNew('V', mce, Ity_I1,
                                       unop(Iop_CmpNEZ64, tmp4));
         break;
      }
      default:
         ppIRType(src_ty);
         VG_(tool_panic)("mkPCastTo(1)");
   }
   tl_assert(tmp1);
   /* Now widen up to the dst type. */
   switch (dst_ty) {
      case Ity_I1:
         return tmp1;
      case Ity_I8:
         return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
      case Ity_I16:
         return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
      case Ity_I32:
         return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
      case Ity_I64:
         return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
      case Ity_V128:
         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
         tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
         return tmp1;
      case Ity_I128:
         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
         tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
         return tmp1;
      case Ity_V256:
         tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
         tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
                                                    tmp1, tmp1));
         tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
                                                    tmp1, tmp1));
         return tmp1;
      default:
         ppIRType(dst_ty);
         VG_(tool_panic)("mkPCastTo(2)");
   }
}

/* This is a minor variant.  It takes an arg of some type and returns
   a value of the same type.  The result consists entirely of Defined
   (zero) bits except its least significant bit, which is a PCast of
   the entire argument down to a single bit. */
static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
{
   if (ty == Ity_V128) {
      /* --- Case for V128 --- */
      IRAtom* varg128 = varg;
      // generates: PCast-to-I64(varg128)
      IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
      // Now introduce zeros (defined bits) in the top 63 places
      // generates: Def--(63)--Def PCast-to-I1(varg128)
      IRAtom* d63pc
         = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
      // generates: Def--(64)--Def
      IRAtom* d64
         = definedOfType(Ity_I64);
      // generates: Def--(127)--Def PCast-to-I1(varg128)
      IRAtom* res
         = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
      return res;
   }
   if (ty == Ity_I64) {
      /* --- Case for I64 --- */
      // PCast to 64
      IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
      // Zero (Def) out the top 63 bits
      IRAtom* res
         = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
      return res;
   }
   /*NOTREACHED*/
   tl_assert(0);
}
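
/* For example (illustrative): given an I64 argument with any
   undefined bit, mkPCastXXtoXXlsb yields the shadow 0x1 -- only the
   lsb is flagged undefined, and the upper 63 bits are defined zeros.
   This suits operations that squeeze their argument down to a 1-bit
   result held in the lsb of an otherwise-zero word. */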

/* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
/*
   Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
   PCasting to Ity_I1.  However, sometimes it is necessary to be more
   accurate.  The insight is that the result is defined if two
   corresponding bits can be found, one from each argument, so that
   both bits are defined but are different -- that makes EQ say "No"
   and NE say "Yes".  Hence, we compute an improvement term and DifD
   it onto the "normal" (UifU) result.

   The result is:

   PCastTo<1> (
      -- naive version
      PCastTo<sz>( UifU<sz>(vxx, vyy) )

      `DifD<sz>`

      -- improvement term
      PCastTo<sz>( PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) ) )
   )

   where
     vec contains 0 (defined) bits where the corresponding arg bits
     are defined but different, and 1 bits otherwise.

     vec = Or<sz>( vxx,   // 0 iff bit defined
                   vyy,   // 0 iff bit defined
                   Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
                 )

     If any bit of vec is 0, the result is defined and so the
     improvement term should produce 0...0, else it should produce
     1...1.

     Hence require for the improvement term:

        if vec == 1...1 then 1...1 else 0...0
     ->
        PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) )

   This was extensively re-analysed and checked on 6 July 05.
*/
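
/* Worked example (illustrative, I32): let xx = 5, yy = 4, and
   vxx = vyy = 0xFFFFFFFE (only bit 0 defined).  Bit 0 is defined in
   both args and differs (1 vs 0), so vec has bit 0 clear; then
   vec != 1...1, the improvement term is 0...0, and the DifD forces
   the final result to 'defined' -- correctly, since x != y holds no
   matter what the undefined bits are.  The naive UifU/PCast rule
   alone would have flagged the comparison as undefined. */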
static IRAtom* expensiveCmpEQorNE ( MCEnv*  mce,
                                    IRType  ty,
                                    IRAtom* vxx, IRAtom* vyy,
                                    IRAtom* xx,  IRAtom* yy )
{
   IRAtom *naive, *vec, *improvement_term;
   IRAtom *improved, *final_cast, *top;
   IROp   opDIFD, opUIFU, opXOR, opNOT, opCMP, opOR;

   tl_assert(isShadowAtom(mce,vxx));
   tl_assert(isShadowAtom(mce,vyy));
   tl_assert(isOriginalAtom(mce,xx));
   tl_assert(isOriginalAtom(mce,yy));
   tl_assert(sameKindedAtoms(vxx,xx));
   tl_assert(sameKindedAtoms(vyy,yy));

   switch (ty) {
      case Ity_I16:
         opOR   = Iop_Or16;
         opDIFD = Iop_And16;
         opUIFU = Iop_Or16;
         opNOT  = Iop_Not16;
         opXOR  = Iop_Xor16;
         opCMP  = Iop_CmpEQ16;
         top    = mkU16(0xFFFF);
         break;
      case Ity_I32:
         opOR   = Iop_Or32;
         opDIFD = Iop_And32;
         opUIFU = Iop_Or32;
         opNOT  = Iop_Not32;
         opXOR  = Iop_Xor32;
         opCMP  = Iop_CmpEQ32;
         top    = mkU32(0xFFFFFFFF);
         break;
      case Ity_I64:
         opOR   = Iop_Or64;
         opDIFD = Iop_And64;
         opUIFU = Iop_Or64;
         opNOT  = Iop_Not64;
         opXOR  = Iop_Xor64;
         opCMP  = Iop_CmpEQ64;
         top    = mkU64(0xFFFFFFFFFFFFFFFFULL);
         break;
      default:
         VG_(tool_panic)("expensiveCmpEQorNE");
   }

   naive
      = mkPCastTo(mce,ty,
                  assignNew('V', mce, ty, binop(opUIFU, vxx, vyy)));

   vec
      = assignNew(
           'V', mce,ty,
           binop( opOR,
                  assignNew('V', mce,ty, binop(opOR, vxx, vyy)),
                  assignNew(
                     'V', mce,ty,
                     unop( opNOT,
                           assignNew('V', mce,ty, binop(opXOR, xx, yy))))));

   improvement_term
      = mkPCastTo( mce,ty,
                   assignNew('V', mce,Ity_I1, binop(opCMP, vec, top)));

   improved
      = assignNew( 'V', mce,ty, binop(opDIFD, naive, improvement_term) );

   final_cast
      = mkPCastTo( mce, Ity_I1, improved );

   return final_cast;
}


/* --------- Semi-accurate interpretation of CmpORD. --------- */

/* CmpORD32{S,U} does PowerPC-style 3-way comparisons:

      CmpORD32S(x,y) = 1<<3   if  x <s y
                     = 1<<2   if  x >s y
                     = 1<<1   if  x == y

   and similarly the unsigned variant.  The default interpretation is:

      CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
                                  & (7<<1)

   The "& (7<<1)" reflects the fact that all result bits except 3,2,1
   are zero and therefore defined (viz, zero).

   Also deal with a special case better:

      CmpORD32S(x,0)

   Here, bit 3 (LT) of the result is a copy of the top bit of x and
   will be defined even if the rest of x isn't.  In which case we do:

      CmpORD32S#(x,x#,0,{impliedly 0}#)
         = PCast(x#) & (3<<1)      -- standard interp for GT#,EQ#
           | (x# >>u 31) << 3      -- LT# = x#[31]

   Analogous handling for CmpORD64{S,U}.
*/
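
/* For example (illustrative): for CmpORD32S(x, 0) where only bit 31
   of x is defined (x# = 0x7FFFFFFF), the special case computes
   (PCast(x#) & (3<<1)) | ((x# >>u 31) << 3).  Bits 2..1 (GT, EQ) are
   pessimised from the whole of x#, but bit 3 (LT) is x#[31] = 0 and
   so stays defined, matching the fact that x < 0 is decided by the
   sign bit alone. */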
static Bool isZeroU32 ( IRAtom* e )
{
   return
      toBool( e->tag == Iex_Const
              && e->Iex.Const.con->tag == Ico_U32
              && e->Iex.Const.con->Ico.U32 == 0 );
}

static Bool isZeroU64 ( IRAtom* e )
{
   return
      toBool( e->tag == Iex_Const
              && e->Iex.Const.con->tag == Ico_U64
              && e->Iex.Const.con->Ico.U64 == 0 );
}

static IRAtom* doCmpORD ( MCEnv*  mce,
                          IROp    cmp_op,
                          IRAtom* xxhash, IRAtom* yyhash,
                          IRAtom* xx,     IRAtom* yy )
{
   Bool   m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
   Bool   syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
   IROp   opOR   = m64 ? Iop_Or64  : Iop_Or32;
   IROp   opAND  = m64 ? Iop_And64 : Iop_And32;
   IROp   opSHL  = m64 ? Iop_Shl64 : Iop_Shl32;
   IROp   opSHR  = m64 ? Iop_Shr64 : Iop_Shr32;
   IRType ty     = m64 ? Ity_I64   : Ity_I32;
   Int    width  = m64 ? 64        : 32;

   Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;

   IRAtom* threeLeft1 = NULL;
   IRAtom* sevenLeft1 = NULL;

   tl_assert(isShadowAtom(mce,xxhash));
   tl_assert(isShadowAtom(mce,yyhash));
   tl_assert(isOriginalAtom(mce,xx));
   tl_assert(isOriginalAtom(mce,yy));
   tl_assert(sameKindedAtoms(xxhash,xx));
   tl_assert(sameKindedAtoms(yyhash,yy));
   tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
             || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);

   if (0) {
      ppIROp(cmp_op); VG_(printf)(" ");
      ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
   }

   if (syned && isZero(yy)) {
      /* fancy interpretation */
      /* if yy is zero, then it must be fully defined (zero#). */
      tl_assert(isZero(yyhash));
      threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
      return
         binop(
            opOR,
            assignNew(
               'V', mce,ty,
               binop(
                  opAND,
                  mkPCastTo(mce,ty, xxhash),
                  threeLeft1
               )),
            assignNew(
               'V', mce,ty,
               binop(
                  opSHL,
                  assignNew(
                     'V', mce,ty,
                     binop(opSHR, xxhash, mkU8(width-1))),
                  mkU8(3)
               ))
         );
   } else {
      /* standard interpretation */
      sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
      return
         binop(
            opAND,
            mkPCastTo( mce,ty,
                       mkUifU(mce,ty, xxhash,yyhash)),
            sevenLeft1
         );
   }
}


/*------------------------------------------------------------*/
/*--- Emit a test and complaint if something is undefined. ---*/
/*------------------------------------------------------------*/

static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */


/* Set the annotations on a dirty helper to indicate that the stack
   pointer and instruction pointer might be read.  This is the
   behaviour of all 'emit-a-complaint' style functions we might
   call. */

static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
   di->nFxState = 2;
   di->fxState[0].fx        = Ifx_Read;
   di->fxState[0].offset    = mce->layout->offset_SP;
   di->fxState[0].size      = mce->layout->sizeof_SP;
   di->fxState[0].nRepeats  = 0;
   di->fxState[0].repeatLen = 0;
   di->fxState[1].fx        = Ifx_Read;
   di->fxState[1].offset    = mce->layout->offset_IP;
   di->fxState[1].size      = mce->layout->sizeof_IP;
   di->fxState[1].nRepeats  = 0;
   di->fxState[1].repeatLen = 0;
}


/* Check the supplied *original* |atom| for undefinedness, and emit a
   complaint if so.  Once that happens, mark it as defined.  This is
   possible because the atom is either a tmp or literal.  If it's a
   tmp, it will be shadowed by a tmp, and so we can set the shadow to
   be defined.  In fact as mentioned above, we will have to allocate a
   new tmp to carry the new 'defined' shadow value, and update the
   original->tmp mapping accordingly; we cannot simply assign a new
   value to an existing shadow tmp as this breaks SSAness.

   The checks are performed, any resulting complaint emitted, and
   |atom|'s shadow temp set to 'defined', ONLY in the case that
   |guard| evaluates to True at run-time.  If it evaluates to False
   then no action is performed.  If |guard| is NULL (the usual case)
   then it is assumed to be always-true, and hence these actions are
   performed unconditionally.

   This routine does not generate code to check the definedness of
   |guard|.  The caller is assumed to have taken care of that already.
*/
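
/* Sketch of the emitted IR (illustrative; the exact shape depends on
   sz, origin tracking and the guard).  For a 4-byte atom 'a' with
   shadow a#, no guard, and no origin tracking, roughly:

      t_cond = PCast-to-I1(a#)
      DIRTY t_cond ::: MC_(helperc_value_check4_fail_no_o)()
      a#' = 0x0:I32        -- fresh shadow tmp, rebound in tmpMap

   The helper, which reports the error, runs only when some bit of a#
   is undefined; afterwards 'a' is treated as fully defined. */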
static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
{
   IRAtom*  vatom;
   IRType   ty;
   Int      sz;
   IRDirty* di;
   IRAtom*  cond;
   IRAtom*  origin;
   void*    fn;
   const HChar* nm;
   IRExpr** args;
   Int      nargs;

   // Don't do V bit tests if we're not reporting undefined value errors.
   if (MC_(clo_mc_level) == 1)
      return;

   if (guard)
      tl_assert(isOriginalAtom(mce, guard));

   /* Since the original expression is atomic, there's no duplicated
      work generated by making multiple V-expressions for it.  So we
      don't really care about the possibility that someone else may
      also create a V-interpretation for it. */
   tl_assert(isOriginalAtom(mce, atom));
   vatom = expr2vbits( mce, atom );
   tl_assert(isShadowAtom(mce, vatom));
   tl_assert(sameKindedAtoms(atom, vatom));

   ty = typeOfIRExpr(mce->sb->tyenv, vatom);

   /* sz is only used for constructing the error message */
   sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);

   cond = mkPCastTo( mce, Ity_I1, vatom );
   /* cond will be 0 if all defined, and 1 if any not defined. */

   /* Get the origin info for the value we are about to check.  At
      least, if we are doing origin tracking.  If not, use a dummy
      zero origin. */
   if (MC_(clo_mc_level) == 3) {
      origin = schemeE( mce, atom );
      if (mce->hWordTy == Ity_I64) {
         origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
      }
   } else {
      origin = NULL;
   }

   fn    = NULL;
   nm    = NULL;
   args  = NULL;
   nargs = -1;

   switch (sz) {
      case 0:
         if (origin) {
            fn    = &MC_(helperc_value_check0_fail_w_o);
            nm    = "MC_(helperc_value_check0_fail_w_o)";
            args  = mkIRExprVec_1(origin);
            nargs = 1;
         } else {
            fn    = &MC_(helperc_value_check0_fail_no_o);
            nm    = "MC_(helperc_value_check0_fail_no_o)";
            args  = mkIRExprVec_0();
            nargs = 0;
         }
         break;
      case 1:
         if (origin) {
            fn    = &MC_(helperc_value_check1_fail_w_o);
            nm    = "MC_(helperc_value_check1_fail_w_o)";
            args  = mkIRExprVec_1(origin);
            nargs = 1;
         } else {
            fn    = &MC_(helperc_value_check1_fail_no_o);
            nm    = "MC_(helperc_value_check1_fail_no_o)";
            args  = mkIRExprVec_0();
            nargs = 0;
         }
         break;
      case 4:
         if (origin) {
            fn    = &MC_(helperc_value_check4_fail_w_o);
            nm    = "MC_(helperc_value_check4_fail_w_o)";
            args  = mkIRExprVec_1(origin);
            nargs = 1;
         } else {
            fn    = &MC_(helperc_value_check4_fail_no_o);
            nm    = "MC_(helperc_value_check4_fail_no_o)";
            args  = mkIRExprVec_0();
            nargs = 0;
         }
         break;
      case 8:
         if (origin) {
            fn    = &MC_(helperc_value_check8_fail_w_o);
            nm    = "MC_(helperc_value_check8_fail_w_o)";
            args  = mkIRExprVec_1(origin);
            nargs = 1;
         } else {
            fn    = &MC_(helperc_value_check8_fail_no_o);
            nm    = "MC_(helperc_value_check8_fail_no_o)";
            args  = mkIRExprVec_0();
            nargs = 0;
         }
         break;
      case 2:
      case 16:
         if (origin) {
            fn    = &MC_(helperc_value_checkN_fail_w_o);
            nm    = "MC_(helperc_value_checkN_fail_w_o)";
            args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
            nargs = 2;
         } else {
            fn    = &MC_(helperc_value_checkN_fail_no_o);
            nm    = "MC_(helperc_value_checkN_fail_no_o)";
            args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
            nargs = 1;
         }
         break;
      default:
         VG_(tool_panic)("unexpected szB");
   }

   tl_assert(fn);
   tl_assert(nm);
   tl_assert(args);
   tl_assert(nargs >= 0 && nargs <= 2);
   tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
              || (MC_(clo_mc_level) == 2 && origin == NULL) );

   di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
                           VG_(fnptr_to_fnentry)( fn ), args );
   di->guard = cond; // and cond is PCast-to-1(atom#)

   /* If the complaint is to be issued under a guard condition, AND
      that into the guard condition for the helper call. */
   if (guard) {
      IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
      IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
      IRAtom *e  = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
      di->guard  = assignNew('V', mce, Ity_I1,  unop(Iop_32to1, e));
   }

   setHelperAnns( mce, di );
   stmt( 'V', mce, IRStmt_Dirty(di));

   /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
      defined -- but only in the case where the guard evaluates to
      True at run-time.  Do the update by setting the orig->shadow
      mapping for tmp to reflect the fact that this shadow is getting
      a new value. */
   tl_assert(isIRAtom(vatom));
   /* sameKindedAtoms ... */
   if (vatom->tag == Iex_RdTmp) {
      tl_assert(atom->tag == Iex_RdTmp);
      if (guard == NULL) {
         // guard is 'always True', hence update unconditionally
         newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
         assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
                          definedOfType(ty));
      } else {
         // update the temp only conditionally.  Do this by copying
         // its old value when the guard is False.
         // The old value ..
         IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
         newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
         IRAtom* new_tmpV
            = assignNew('V', mce, shadowTypeV(ty),
                        IRExpr_ITE(guard, definedOfType(ty),
   1382                                           mkexpr(old_tmpV)));
   1383          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
   1384       }
   1385    }
   1386 }
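
/* To make the above concrete: for a 4-byte value checked with origin
   tracking off (MC_(clo_mc_level) == 2), the emitted code is, in
   sketch form (temp names are made up for this illustration):

      t_cond = PCast-to-Ity_I1(vatom)   -- 1 iff any bit is undefined
      if (t_cond [&& guard])
         DIRTY call MC_(helperc_value_check4_fail_no_o)()

   followed, when the checked value lives in a temp, by the
   shadow-temp rewrite above, so each undefined value is complained
   about at most once. */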
   1387 
   1388 
   1389 /*------------------------------------------------------------*/
   1390 /*--- Shadowing PUTs/GETs, and indexed variants thereof    ---*/
   1391 /*------------------------------------------------------------*/
   1392 
/* Examine the always-defined sections declared in layout to see if
   the (offset,size) section is within one.  Note, it is an error to
   partially fall into such a region: (offset,size) should either be
   completely within such a region or completely outside it.
*/
   1398 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
   1399 {
   1400    Int minoffD, maxoffD, i;
   1401    Int minoff = offset;
   1402    Int maxoff = minoff + size - 1;
   1403    tl_assert((minoff & ~0xFFFF) == 0);
   1404    tl_assert((maxoff & ~0xFFFF) == 0);
   1405 
   1406    for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
   1407       minoffD = mce->layout->alwaysDefd[i].offset;
   1408       maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
   1409       tl_assert((minoffD & ~0xFFFF) == 0);
   1410       tl_assert((maxoffD & ~0xFFFF) == 0);
   1411 
   1412       if (maxoff < minoffD || maxoffD < minoff)
   1413          continue; /* no overlap */
   1414       if (minoff >= minoffD && maxoff <= maxoffD)
   1415          return True; /* completely contained in an always-defd section */
   1416 
   1417       VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
   1418    }
   1419    return False; /* could not find any containing section */
   1420 }
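
/* A worked example (the layout is made up for illustration): suppose
   alwaysDefd[] holds a single section at offset 128 with size 8,
   covering guest offsets 128..135.  Then

      isAlwaysDefd(mce, 128,  4)  -- covers 128..131: True
      isAlwaysDefd(mce,  64,  4)  -- covers  64..67, no overlap: False
      isAlwaysDefd(mce, 120, 16)  -- covers 120..135, straddles the
                                     section boundary: panics
*/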
   1421 
   1422 
   1423 /* Generate into bb suitable actions to shadow this Put.  If the state
   1424    slice is marked 'always defined', do nothing.  Otherwise, write the
   1425    supplied V bits to the shadow state.  We can pass in either an
   1426    original atom or a V-atom, but not both.  In the former case the
   1427    relevant V-bits are then generated from the original.
   We assume here that the definedness of GUARD has already been checked.
   1429 */
   1430 static
   1431 void do_shadow_PUT ( MCEnv* mce,  Int offset,
   1432                      IRAtom* atom, IRAtom* vatom, IRExpr *guard )
   1433 {
   1434    IRType ty;
   1435 
   1436    // Don't do shadow PUTs if we're not doing undefined value checking.
   1437    // Their absence lets Vex's optimiser remove all the shadow computation
   1438    // that they depend on, which includes GETs of the shadow registers.
   1439    if (MC_(clo_mc_level) == 1)
   1440       return;
   1441 
   1442    if (atom) {
   1443       tl_assert(!vatom);
   1444       tl_assert(isOriginalAtom(mce, atom));
   1445       vatom = expr2vbits( mce, atom );
   1446    } else {
   1447       tl_assert(vatom);
   1448       tl_assert(isShadowAtom(mce, vatom));
   1449    }
   1450 
   1451    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
   1452    tl_assert(ty != Ity_I1);
   1453    tl_assert(ty != Ity_I128);
   1454    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
   1455       /* later: no ... */
   1456       /* emit code to emit a complaint if any of the vbits are 1. */
   1457       /* complainIfUndefined(mce, atom); */
   1458    } else {
   1459       /* Do a plain shadow Put. */
   1460       if (guard) {
   1461          /* If the guard expression evaluates to false we simply Put the value
   1462             that is already stored in the guest state slot */
   1463          IRAtom *cond, *iffalse;
   1464 
   1465          cond    = assignNew('V', mce, Ity_I1, guard);
   1466          iffalse = assignNew('V', mce, ty,
   1467                              IRExpr_Get(offset + mce->layout->total_sizeB, ty));
   1468          vatom   = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
   1469       }
   1470       stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
   1471    }
   1472 }
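
/* For illustration: a guarded statement such as

      if (t_guard) PUT(offset) = t_data

   acquires, via the code above, a shadow counterpart of roughly this
   shape (temp names are made up; t_data# is t_data's shadow):

      t_old = GET:ty(offset + layout->total_sizeB)   -- old V bits
      t_new = ITE(t_guard, t_data#, t_old)
      PUT(offset + layout->total_sizeB) = t_new

   so the shadow slot only really changes when the guard holds. */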
   1473 
   1474 
/* Generate into bb suitable actions to shadow this PutI.  The V bits
   for the data are computed from the original data atom and written
   to the corresponding slice of the shadow guest state array.
*/
   1478 static
   1479 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
   1480 {
   1481    IRAtom* vatom;
   1482    IRType  ty, tyS;
   Int     arrSize;
   1484    IRRegArray* descr = puti->descr;
   1485    IRAtom*     ix    = puti->ix;
   1486    Int         bias  = puti->bias;
   1487    IRAtom*     atom  = puti->data;
   1488 
   1489    // Don't do shadow PUTIs if we're not doing undefined value checking.
   1490    // Their absence lets Vex's optimiser remove all the shadow computation
   1491    // that they depend on, which includes GETIs of the shadow registers.
   1492    if (MC_(clo_mc_level) == 1)
   1493       return;
   1494 
   1495    tl_assert(isOriginalAtom(mce,atom));
   1496    vatom = expr2vbits( mce, atom );
   1497    tl_assert(sameKindedAtoms(atom, vatom));
   1498    ty   = descr->elemTy;
   1499    tyS  = shadowTypeV(ty);
   1500    arrSize = descr->nElems * sizeofIRType(ty);
   1501    tl_assert(ty != Ity_I1);
   1502    tl_assert(isOriginalAtom(mce,ix));
   1503    complainIfUndefined(mce, ix, NULL);
   1504    if (isAlwaysDefd(mce, descr->base, arrSize)) {
   1505       /* later: no ... */
   1506       /* emit code to emit a complaint if any of the vbits are 1. */
   1507       /* complainIfUndefined(mce, atom); */
   1508    } else {
   1509       /* Do a cloned version of the Put that refers to the shadow
   1510          area. */
   1511       IRRegArray* new_descr
   1512          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
   1513                          tyS, descr->nElems);
   1514       stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
   1515    }
   1516 }
   1517 
   1518 
   1519 /* Return an expression which contains the V bits corresponding to the
   1520    given GET (passed in in pieces).
   1521 */
   1522 static
   1523 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
   1524 {
   1525    IRType tyS = shadowTypeV(ty);
   1526    tl_assert(ty != Ity_I1);
   1527    tl_assert(ty != Ity_I128);
   1528    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
   1529       /* Always defined, return all zeroes of the relevant type */
   1530       return definedOfType(tyS);
   1531    } else {
   1532       /* return a cloned version of the Get that refers to the shadow
   1533          area. */
   1534       /* FIXME: this isn't an atom! */
   1535       return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
   1536    }
   1537 }
   1538 
   1539 
   1540 /* Return an expression which contains the V bits corresponding to the
   1541    given GETI (passed in in pieces).
   1542 */
   1543 static
   1544 IRExpr* shadow_GETI ( MCEnv* mce,
   1545                       IRRegArray* descr, IRAtom* ix, Int bias )
   1546 {
   1547    IRType ty   = descr->elemTy;
   1548    IRType tyS  = shadowTypeV(ty);
   1549    Int arrSize = descr->nElems * sizeofIRType(ty);
   1550    tl_assert(ty != Ity_I1);
   1551    tl_assert(isOriginalAtom(mce,ix));
   1552    complainIfUndefined(mce, ix, NULL);
   1553    if (isAlwaysDefd(mce, descr->base, arrSize)) {
   1554       /* Always defined, return all zeroes of the relevant type */
   1555       return definedOfType(tyS);
   1556    } else {
   1557       /* return a cloned version of the Get that refers to the shadow
   1558          area. */
   1559       IRRegArray* new_descr
   1560          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
   1561                          tyS, descr->nElems);
   1562       return IRExpr_GetI( new_descr, ix, bias );
   1563    }
   1564 }
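
/* To illustrate the remapping used in shadow_GETI and do_shadow_PUTI:
   for a guest register array described as, say,
   descr = (base=256, elemTy=Ity_I64, nElems=16) -- numbers made up --
   the shadow access uses the cloned descriptor

      new_descr = (base = 256 + layout->total_sizeB, Ity_I64, 16)

   that is, the same array shape, relocated into the shadow half of
   the guest state. */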
   1565 
   1566 
   1567 /*------------------------------------------------------------*/
   1568 /*--- Generating approximations for unknown operations,    ---*/
   1569 /*--- using lazy-propagate semantics                       ---*/
   1570 /*------------------------------------------------------------*/
   1571 
   1572 /* Lazy propagation of undefinedness from two values, resulting in the
   1573    specified shadow type.
   1574 */
   1575 static
   1576 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
   1577 {
   1578    IRAtom* at;
   1579    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1580    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1581    tl_assert(isShadowAtom(mce,va1));
   1582    tl_assert(isShadowAtom(mce,va2));
   1583 
   1584    /* The general case is inefficient because PCast is an expensive
   1585       operation.  Here are some special cases which use PCast only
   1586       once rather than twice. */
   1587 
   1588    /* I64 x I64 -> I64 */
   1589    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
   1590       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
   1591       at = mkUifU(mce, Ity_I64, va1, va2);
   1592       at = mkPCastTo(mce, Ity_I64, at);
   1593       return at;
   1594    }
   1595 
   1596    /* I64 x I64 -> I32 */
   1597    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
   1598       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
   1599       at = mkUifU(mce, Ity_I64, va1, va2);
   1600       at = mkPCastTo(mce, Ity_I32, at);
   1601       return at;
   1602    }
   1603 
   1604    if (0) {
   1605       VG_(printf)("mkLazy2 ");
   1606       ppIRType(t1);
   1607       VG_(printf)("_");
   1608       ppIRType(t2);
   1609       VG_(printf)("_");
   1610       ppIRType(finalVty);
   1611       VG_(printf)("\n");
   1612    }
   1613 
   1614    /* General case: force everything via 32-bit intermediaries. */
   1615    at = mkPCastTo(mce, Ity_I32, va1);
   1616    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
   1617    at = mkPCastTo(mce, finalVty, at);
   1618    return at;
   1619 }
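
/* A worked instance of the I64 x I64 -> I64 case above, with made-up
   bit patterns: if va1 = 0x1 (only bit 0 undefined) and va2 = 0
   (fully defined), then

      UifU64(va1, va2)  = 0x0000000000000001
      PCast64(0x1)      = 0xFFFFFFFFFFFFFFFF

   so a single undefined input bit marks the entire result undefined.
   That worst-case smearing is the essence of lazy propagation. */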
   1620 
   1621 
   1622 /* 3-arg version of the above. */
   1623 static
   1624 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
   1625                   IRAtom* va1, IRAtom* va2, IRAtom* va3 )
   1626 {
   1627    IRAtom* at;
   1628    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1629    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1630    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
   1631    tl_assert(isShadowAtom(mce,va1));
   1632    tl_assert(isShadowAtom(mce,va2));
   1633    tl_assert(isShadowAtom(mce,va3));
   1634 
   1635    /* The general case is inefficient because PCast is an expensive
   1636       operation.  Here are some special cases which use PCast only
   1637       twice rather than three times. */
   1638 
   1639    /* I32 x I64 x I64 -> I64 */
   1640    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1641    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
   1642        && finalVty == Ity_I64) {
   1643       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
   1644       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
   1645          mode indication which is fully defined, this should get
   1646          folded out later. */
   1647       at = mkPCastTo(mce, Ity_I64, va1);
   1648       /* Now fold in 2nd and 3rd args. */
   1649       at = mkUifU(mce, Ity_I64, at, va2);
   1650       at = mkUifU(mce, Ity_I64, at, va3);
   1651       /* and PCast once again. */
   1652       at = mkPCastTo(mce, Ity_I64, at);
   1653       return at;
   1654    }
   1655 
   1656    /* I32 x I8 x I64 -> I64 */
   1657    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
   1658        && finalVty == Ity_I64) {
   1659       if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
   1660       /* Widen 1st and 2nd args to I64.  Since 1st arg is typically a
   1661        * rounding mode indication which is fully defined, this should
   1662        * get folded out later.
   1663       */
   1664       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
   1665       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
   1666       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
   1667       at = mkUifU(mce, Ity_I64, at, va3);
   1668       /* and PCast once again. */
   1669       at = mkPCastTo(mce, Ity_I64, at);
   1670       return at;
   1671    }
   1672 
   1673    /* I32 x I64 x I64 -> I32 */
   1674    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
   1675        && finalVty == Ity_I32) {
   1676       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
   1677       at = mkPCastTo(mce, Ity_I64, va1);
   1678       at = mkUifU(mce, Ity_I64, at, va2);
   1679       at = mkUifU(mce, Ity_I64, at, va3);
   1680       at = mkPCastTo(mce, Ity_I32, at);
   1681       return at;
   1682    }
   1683 
   1684    /* I32 x I32 x I32 -> I32 */
   1685    /* 32-bit FP idiom, as (eg) happens on ARM */
   1686    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
   1687        && finalVty == Ity_I32) {
   1688       if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
   1689       at = va1;
   1690       at = mkUifU(mce, Ity_I32, at, va2);
   1691       at = mkUifU(mce, Ity_I32, at, va3);
   1692       at = mkPCastTo(mce, Ity_I32, at);
   1693       return at;
   1694    }
   1695 
   1696    /* I32 x I128 x I128 -> I128 */
   1697    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1698    if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
   1699        && finalVty == Ity_I128) {
   1700       if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
   1701       /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
   1702          mode indication which is fully defined, this should get
   1703          folded out later. */
   1704       at = mkPCastTo(mce, Ity_I128, va1);
   1705       /* Now fold in 2nd and 3rd args. */
   1706       at = mkUifU(mce, Ity_I128, at, va2);
   1707       at = mkUifU(mce, Ity_I128, at, va3);
   1708       /* and PCast once again. */
   1709       at = mkPCastTo(mce, Ity_I128, at);
   1710       return at;
   1711    }
   1712 
   1713    /* I32 x I8 x I128 -> I128 */
   1714    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1715    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
   1716        && finalVty == Ity_I128) {
   1717       if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
   1718       /* Use I64 as an intermediate type, which means PCasting all 3
   1719          args to I64 to start with. 1st arg is typically a rounding
   1720          mode indication which is fully defined, so we hope that it
   1721          will get folded out later. */
   1722       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
   1723       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
   1724       IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
   1725       /* Now UifU all three together. */
   1726       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
   1727       at = mkUifU(mce, Ity_I64, at, at3);   // ... `UifU` PCast(va3)
   1728       /* and PCast once again. */
   1729       at = mkPCastTo(mce, Ity_I128, at);
   1730       return at;
   1731    }
   1732    if (1) {
   1733       VG_(printf)("mkLazy3: ");
   1734       ppIRType(t1);
   1735       VG_(printf)(" x ");
   1736       ppIRType(t2);
   1737       VG_(printf)(" x ");
   1738       ppIRType(t3);
   1739       VG_(printf)(" -> ");
   1740       ppIRType(finalVty);
   1741       VG_(printf)("\n");
   1742    }
   1743 
   1744    tl_assert(0);
   1745    /* General case: force everything via 32-bit intermediaries. */
   1746    /*
   1747    at = mkPCastTo(mce, Ity_I32, va1);
   1748    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
   1749    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
   1750    at = mkPCastTo(mce, finalVty, at);
   1751    return at;
   1752    */
   1753 }
   1754 
   1755 
   1756 /* 4-arg version of the above. */
   1757 static
   1758 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
   1759                   IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
   1760 {
   1761    IRAtom* at;
   1762    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1763    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1764    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
   1765    IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
   1766    tl_assert(isShadowAtom(mce,va1));
   1767    tl_assert(isShadowAtom(mce,va2));
   1768    tl_assert(isShadowAtom(mce,va3));
   1769    tl_assert(isShadowAtom(mce,va4));
   1770 
   /* The general case is inefficient because PCast is an expensive
      operation.  Here are some special cases which use PCast only
      twice rather than four times. */
   1774 
   1775    /* I32 x I64 x I64 x I64 -> I64 */
   1776    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
   1777    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
   1778        && finalVty == Ity_I64) {
   1779       if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
   1780       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
   1781          mode indication which is fully defined, this should get
   1782          folded out later. */
   1783       at = mkPCastTo(mce, Ity_I64, va1);
   1784       /* Now fold in 2nd, 3rd, 4th args. */
   1785       at = mkUifU(mce, Ity_I64, at, va2);
   1786       at = mkUifU(mce, Ity_I64, at, va3);
   1787       at = mkUifU(mce, Ity_I64, at, va4);
   1788       /* and PCast once again. */
   1789       at = mkPCastTo(mce, Ity_I64, at);
   1790       return at;
   1791    }
   1792    /* I32 x I32 x I32 x I32 -> I32 */
   1793    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
   1794    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
   1795        && finalVty == Ity_I32) {
   1796       if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
   1797       at = va1;
   1798       /* Now fold in 2nd, 3rd, 4th args. */
   1799       at = mkUifU(mce, Ity_I32, at, va2);
   1800       at = mkUifU(mce, Ity_I32, at, va3);
   1801       at = mkUifU(mce, Ity_I32, at, va4);
   1802       at = mkPCastTo(mce, Ity_I32, at);
   1803       return at;
   1804    }
   1805 
   1806    if (1) {
   1807       VG_(printf)("mkLazy4: ");
   1808       ppIRType(t1);
   1809       VG_(printf)(" x ");
   1810       ppIRType(t2);
   1811       VG_(printf)(" x ");
   1812       ppIRType(t3);
   1813       VG_(printf)(" x ");
   1814       ppIRType(t4);
   1815       VG_(printf)(" -> ");
   1816       ppIRType(finalVty);
   1817       VG_(printf)("\n");
   1818    }
   1819 
   1820    tl_assert(0);
   1821 }
   1822 
   1823 
   1824 /* Do the lazy propagation game from a null-terminated vector of
   1825    atoms.  This is presumably the arguments to a helper call, so the
   1826    IRCallee info is also supplied in order that we can know which
   1827    arguments should be ignored (via the .mcx_mask field).
   1828 */
   1829 static
   1830 IRAtom* mkLazyN ( MCEnv* mce,
   1831                   IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
   1832 {
   1833    Int     i;
   1834    IRAtom* here;
   1835    IRAtom* curr;
   1836    IRType  mergeTy;
   1837    Bool    mergeTy64 = True;
   1838 
   1839    /* Decide on the type of the merge intermediary.  If all relevant
   1840       args are I64, then it's I64.  In all other circumstances, use
   1841       I32. */
   1842    for (i = 0; exprvec[i]; i++) {
   1843       tl_assert(i < 32);
   1844       tl_assert(isOriginalAtom(mce, exprvec[i]));
   1845       if (cee->mcx_mask & (1<<i))
   1846          continue;
   1847       if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
   1848          mergeTy64 = False;
   1849    }
   1850 
   1851    mergeTy = mergeTy64  ? Ity_I64  : Ity_I32;
   1852    curr    = definedOfType(mergeTy);
   1853 
   1854    for (i = 0; exprvec[i]; i++) {
   1855       tl_assert(i < 32);
   1856       tl_assert(isOriginalAtom(mce, exprvec[i]));
   1857       /* Only take notice of this arg if the callee's mc-exclusion
   1858          mask does not say it is to be excluded. */
   1859       if (cee->mcx_mask & (1<<i)) {
   1860          /* the arg is to be excluded from definedness checking.  Do
   1861             nothing. */
   1862          if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
   1863       } else {
   1864          /* calculate the arg's definedness, and pessimistically merge
   1865             it in. */
   1866          here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i]) );
   1867          curr = mergeTy64
   1868                    ? mkUifU64(mce, here, curr)
   1869                    : mkUifU32(mce, here, curr);
   1870       }
   1871    }
   1872    return mkPCastTo(mce, finalVtype, curr );
   1873 }
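
/* Example of the exclusion mask: a helper registered with
   cee->mcx_mask == 0x5 has bits 0 and 2 set, so arguments 0 and 2 are
   skipped by the loop above and their definedness cannot influence
   the computed V bits.  This is typically used for arguments (such as
   state pointers) whose definedness is not meaningful to check. */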
   1874 
   1875 
   1876 /*------------------------------------------------------------*/
   1877 /*--- Generating expensive sequences for exact carry-chain ---*/
   1878 /*--- propagation in add/sub and related operations.       ---*/
   1879 /*------------------------------------------------------------*/
   1880 
   1881 static
   1882 IRAtom* expensiveAddSub ( MCEnv*  mce,
   1883                           Bool    add,
   1884                           IRType  ty,
   1885                           IRAtom* qaa, IRAtom* qbb,
   1886                           IRAtom* aa,  IRAtom* bb )
   1887 {
   1888    IRAtom *a_min, *b_min, *a_max, *b_max;
   1889    IROp   opAND, opOR, opXOR, opNOT, opADD, opSUB;
   1890 
   1891    tl_assert(isShadowAtom(mce,qaa));
   1892    tl_assert(isShadowAtom(mce,qbb));
   1893    tl_assert(isOriginalAtom(mce,aa));
   1894    tl_assert(isOriginalAtom(mce,bb));
   1895    tl_assert(sameKindedAtoms(qaa,aa));
   1896    tl_assert(sameKindedAtoms(qbb,bb));
   1897 
   1898    switch (ty) {
   1899       case Ity_I32:
   1900          opAND = Iop_And32;
   1901          opOR  = Iop_Or32;
   1902          opXOR = Iop_Xor32;
   1903          opNOT = Iop_Not32;
   1904          opADD = Iop_Add32;
   1905          opSUB = Iop_Sub32;
   1906          break;
   1907       case Ity_I64:
   1908          opAND = Iop_And64;
   1909          opOR  = Iop_Or64;
   1910          opXOR = Iop_Xor64;
   1911          opNOT = Iop_Not64;
   1912          opADD = Iop_Add64;
   1913          opSUB = Iop_Sub64;
   1914          break;
   1915       default:
   1916          VG_(tool_panic)("expensiveAddSub");
   1917    }
   1918 
   1919    // a_min = aa & ~qaa
   1920    a_min = assignNew('V', mce,ty,
   1921                      binop(opAND, aa,
   1922                                   assignNew('V', mce,ty, unop(opNOT, qaa))));
   1923 
   1924    // b_min = bb & ~qbb
   1925    b_min = assignNew('V', mce,ty,
   1926                      binop(opAND, bb,
   1927                                   assignNew('V', mce,ty, unop(opNOT, qbb))));
   1928 
   1929    // a_max = aa | qaa
   1930    a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
   1931 
   1932    // b_max = bb | qbb
   1933    b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
   1934 
   1935    if (add) {
   1936       // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
   1937       return
   1938       assignNew('V', mce,ty,
   1939          binop( opOR,
   1940                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
   1941                 assignNew('V', mce,ty,
   1942                    binop( opXOR,
   1943                           assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
   1944                           assignNew('V', mce,ty, binop(opADD, a_max, b_max))
   1945                    )
   1946                 )
   1947          )
   1948       );
   1949    } else {
      // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
   1951       return
   1952       assignNew('V', mce,ty,
   1953          binop( opOR,
   1954                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
   1955                 assignNew('V', mce,ty,
   1956                    binop( opXOR,
   1957                           assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
   1958                           assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
   1959                    )
   1960                 )
   1961          )
   1962       );
   1963    }
   1964 
   1965 }
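
/* A worked instance of the addition case, with made-up values:
   ty = Ity_I32, aa = 5 (fully defined, so qaa = 0), bb = 0 with bit 0
   undefined (qbb = 1).  Then

      a_min = 5, a_max = 5, b_min = 0, b_max = 1
      a_min + b_min = 5  (101b)
      a_max + b_max = 6  (110b)
      5 ^ 6         = 3  (011b)
      result        = (0 | 1) | 3 = 3

   Bits 0 and 1 are flagged undefined -- the unknown bit 0 of bb can
   carry into bit 1 -- while bit 2 remains defined, since both
   possible sums have a 1 there. */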
   1966 
   1967 
   1968 static
   1969 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
   1970                                        IRAtom* atom, IRAtom* vatom )
   1971 {
   1972    IRType ty;
   1973    IROp xorOp, subOp, andOp;
   1974    IRExpr *one;
   1975    IRAtom *improver, *improved;
   1976    tl_assert(isShadowAtom(mce,vatom));
   1977    tl_assert(isOriginalAtom(mce,atom));
   1978    tl_assert(sameKindedAtoms(atom,vatom));
   1979 
   1980    switch (czop) {
   1981       case Iop_Ctz32:
   1982          ty = Ity_I32;
   1983          xorOp = Iop_Xor32;
   1984          subOp = Iop_Sub32;
   1985          andOp = Iop_And32;
   1986          one = mkU32(1);
   1987          break;
   1988       case Iop_Ctz64:
   1989          ty = Ity_I64;
   1990          xorOp = Iop_Xor64;
   1991          subOp = Iop_Sub64;
   1992          andOp = Iop_And64;
   1993          one = mkU64(1);
   1994          break;
   1995       default:
   1996          ppIROp(czop);
   1997          VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
   1998    }
   1999 
   2000    // improver = atom ^ (atom - 1)
   2001    //
   // That is, improver has its low ctz(atom)+1 bits equal to one;
   2003    // higher bits (if any) equal to zero.
   2004    improver = assignNew('V', mce,ty,
   2005                         binop(xorOp,
   2006                               atom,
   2007                               assignNew('V', mce, ty,
   2008                                         binop(subOp, atom, one))));
   2009 
   2010    // improved = vatom & improver
   2011    //
   // That is, treat any V bits above the low ctz(atom)+1 bits as
   // "defined".
   2014    improved = assignNew('V', mce, ty,
   2015                         binop(andOp, vatom, improver));
   2016 
   2017    // Return pessimizing cast of improved.
   2018    return mkPCastTo(mce, ty, improved);
   2019 }
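
/* Worked example, with made-up values: suppose atom = 8 (1000b) at
   run time and vatom = 0xF0 (bits 4..7 undefined, bits 0..3 defined).
   Then

      improver = 8 ^ 7      = 0xF   (low ctz(8)+1 = 4 bits set)
      improved = 0xF0 & 0xF = 0     (all the undefinedness masked off)

   and the final PCast reports "fully defined".  That is correct: the
   result of Ctz depends only on the lowest set bit and the zeros
   below it, so the undefined bits 4..7 cannot change the answer. */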
   2020 
   2021 
   2022 /*------------------------------------------------------------*/
   2023 /*--- Scalar shifts.                                       ---*/
   2024 /*------------------------------------------------------------*/
   2025 
   2026 /* Produce an interpretation for (aa << bb) (or >>s, >>u).  The basic
   2027    idea is to shift the definedness bits by the original shift amount.
   2028    This introduces 0s ("defined") in new positions for left shifts and
   2029    unsigned right shifts, and copies the top definedness bit for
   2030    signed right shifts.  So, conveniently, applying the original shift
   2031    operator to the definedness bits for the left arg is exactly the
   2032    right thing to do:
   2033 
   2034       (qaa << bb)
   2035 
   2036    However if the shift amount is undefined then the whole result
   2037    is undefined.  Hence need:
   2038 
   2039       (qaa << bb) `UifU` PCast(qbb)
   2040 
   If the shift amount bb is a literal then qbb will say 'all defined'
   2042    and the UifU and PCast will get folded out by post-instrumentation
   2043    optimisation.
   2044 */
   2045 static IRAtom* scalarShift ( MCEnv*  mce,
   2046                              IRType  ty,
   2047                              IROp    original_op,
   2048                              IRAtom* qaa, IRAtom* qbb,
   2049                              IRAtom* aa,  IRAtom* bb )
   2050 {
   2051    tl_assert(isShadowAtom(mce,qaa));
   2052    tl_assert(isShadowAtom(mce,qbb));
   2053    tl_assert(isOriginalAtom(mce,aa));
   2054    tl_assert(isOriginalAtom(mce,bb));
   2055    tl_assert(sameKindedAtoms(qaa,aa));
   2056    tl_assert(sameKindedAtoms(qbb,bb));
   2057    return
   2058       assignNew(
   2059          'V', mce, ty,
   2060          mkUifU( mce, ty,
   2061                  assignNew('V', mce, ty, binop(original_op, qaa, bb)),
   2062                  mkPCastTo(mce, ty, qbb)
   2063          )
   2064    );
   2065 }
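
/* Worked example (values made up): ty = Ity_I32, original_op =
   Iop_Shl32, qaa = 0x000000F0 (bits 4..7 of aa undefined), bb = 8,
   qbb = 0 (shift amount fully defined).  Then

      qaa << 8     = 0x0000F000
      PCast32(qbb) = 0
      UifU32(...)  = 0x0000F000

   i.e. the undefined bits simply move left by 8 places.  Had any bit
   of the shift amount been undefined, PCast32(qbb) would have been
   0xFFFFFFFF and the entire result would be undefined. */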
   2066 
   2067 
   2068 /*------------------------------------------------------------*/
   2069 /*--- Helpers for dealing with vector primops.             ---*/
   2070 /*------------------------------------------------------------*/
   2071 
   2072 /* Vector pessimisation -- pessimise within each lane individually. */
   2073 
   2074 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
   2075 {
   2076    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
   2077 }
   2078 
   2079 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
   2080 {
   2081    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
   2082 }
   2083 
   2084 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
   2085 {
   2086    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
   2087 }
   2088 
   2089 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
   2090 {
   2091    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
   2092 }
   2093 
   2094 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
   2095 {
   2096    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
   2097 }
   2098 
   2099 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
   2100 {
   2101    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
   2102 }
   2103 
   2104 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
   2105 {
   2106    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
   2107 }
   2108 
   2109 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
   2110 {
   2111    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
   2112 }
   2113 
   2114 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
   2115 {
   2116    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
   2117 }
   2118 
   2119 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
   2120 {
   2121    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
   2122 }
   2123 
   2124 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
   2125 {
   2126    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
   2127 }
   2128 
   2129 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
   2130 {
   2131    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
   2132 }
   2133 
   2134 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
   2135 {
   2136    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
   2137 }
   2138 
   2139 
/* Here's a simple scheme capable of handling ops derived from SSE1
   code while only generating ops that can be efficiently implemented
   in SSE1. */
   2143 
   2144 /* All-lanes versions are straightforward:
   2145 
   2146    binary32Fx4(x,y)   ==> PCast32x4(UifUV128(x#,y#))
   2147 
   unary32Fx4(x)      ==> PCast32x4(x#)
   2149 
   2150    Lowest-lane-only versions are more complex:
   2151 
   2152    binary32F0x4(x,y)  ==> SetV128lo32(
   2153                              x#,
   2154                              PCast32(V128to32(UifUV128(x#,y#)))
   2155                           )
   2156 
   2157    This is perhaps not so obvious.  In particular, it's faster to
   2158    do a V128-bit UifU and then take the bottom 32 bits than the more
   2159    obvious scheme of taking the bottom 32 bits of each operand
   2160    and doing a 32-bit UifU.  Basically since UifU is fast and
   2161    chopping lanes off vector values is slow.
   2162 
   2163    Finally:
   2164 
   2165    unary32F0x4(x)     ==> SetV128lo32(
   2166                              x#,
   2167                              PCast32(V128to32(x#))
   2168                           )
   2169 
   2170    Where:
   2171 
   2172    PCast32(v#)   = 1Sto32(CmpNE32(v#,0))
   2173    PCast32x4(v#) = CmpNEZ32x4(v#)
   2174 */
   2175 
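/* A small worked instance of PCast32x4, with made-up values: if lane
   2 of v# is 0x00010000 (one undefined bit) and the other lanes are
   zero, then CmpNEZ32x4(v#) leaves lanes 0, 1 and 3 as 0x00000000
   (fully defined) and turns lane 2 into 0xFFFFFFFF (wholly
   undefined).  Pessimisation is confined to the affected lane. */
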
   2176 static
   2177 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2178 {
   2179    IRAtom* at;
   2180    tl_assert(isShadowAtom(mce, vatomX));
   2181    tl_assert(isShadowAtom(mce, vatomY));
   2182    at = mkUifUV128(mce, vatomX, vatomY);
   2183    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
   2184    return at;
   2185 }
   2186 
   2187 static
   2188 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
   2189 {
   2190    IRAtom* at;
   2191    tl_assert(isShadowAtom(mce, vatomX));
   2192    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
   2193    return at;
   2194 }
   2195 
   2196 static
   2197 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2198 {
   2199    IRAtom* at;
   2200    tl_assert(isShadowAtom(mce, vatomX));
   2201    tl_assert(isShadowAtom(mce, vatomY));
   2202    at = mkUifUV128(mce, vatomX, vatomY);
   2203    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
   2204    at = mkPCastTo(mce, Ity_I32, at);
   2205    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
   2206    return at;
   2207 }
   2208 
   2209 static
   2210 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
   2211 {
   2212    IRAtom* at;
   2213    tl_assert(isShadowAtom(mce, vatomX));
   2214    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
   2215    at = mkPCastTo(mce, Ity_I32, at);
   2216    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
   2217    return at;
   2218 }
   2219 
   2220 /* --- ... and ... 64Fx2 versions of the same ... --- */
   2221 
   2222 static
   2223 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2224 {
   2225    IRAtom* at;
   2226    tl_assert(isShadowAtom(mce, vatomX));
   2227    tl_assert(isShadowAtom(mce, vatomY));
   2228    at = mkUifUV128(mce, vatomX, vatomY);
   2229    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
   2230    return at;
   2231 }
   2232 
   2233 static
   2234 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
   2235 {
   2236    IRAtom* at;
   2237    tl_assert(isShadowAtom(mce, vatomX));
   2238    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
   2239    return at;
   2240 }
   2241 
   2242 static
   2243 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2244 {
   2245    IRAtom* at;
   2246    tl_assert(isShadowAtom(mce, vatomX));
   2247    tl_assert(isShadowAtom(mce, vatomY));
   2248    at = mkUifUV128(mce, vatomX, vatomY);
   2249    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
   2250    at = mkPCastTo(mce, Ity_I64, at);
   2251    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
   2252    return at;
   2253 }
   2254 
   2255 static
   2256 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
   2257 {
   2258    IRAtom* at;
   2259    tl_assert(isShadowAtom(mce, vatomX));
   2260    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
   2261    at = mkPCastTo(mce, Ity_I64, at);
   2262    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
   2263    return at;
   2264 }
   2265 
   2266 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
   2267 
   2268 static
   2269 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2270 {
   2271    IRAtom* at;
   2272    tl_assert(isShadowAtom(mce, vatomX));
   2273    tl_assert(isShadowAtom(mce, vatomY));
   2274    at = mkUifU64(mce, vatomX, vatomY);
   2275    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
   2276    return at;
   2277 }
   2278 
   2279 static
   2280 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
   2281 {
   2282    IRAtom* at;
   2283    tl_assert(isShadowAtom(mce, vatomX));
   2284    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
   2285    return at;
   2286 }
   2287 
   2288 /* --- ... and ... 64Fx4 versions of the same ... --- */
   2289 
   2290 static
   2291 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2292 {
   2293    IRAtom* at;
   2294    tl_assert(isShadowAtom(mce, vatomX));
   2295    tl_assert(isShadowAtom(mce, vatomY));
   2296    at = mkUifUV256(mce, vatomX, vatomY);
   2297    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
   2298    return at;
   2299 }
   2300 
   2301 static
   2302 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
   2303 {
   2304    IRAtom* at;
   2305    tl_assert(isShadowAtom(mce, vatomX));
   2306    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
   2307    return at;
   2308 }
   2309 
   2310 /* --- ... and ... 32Fx8 versions of the same ... --- */
   2311 
   2312 static
   2313 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2314 {
   2315    IRAtom* at;
   2316    tl_assert(isShadowAtom(mce, vatomX));
   2317    tl_assert(isShadowAtom(mce, vatomY));
   2318    at = mkUifUV256(mce, vatomX, vatomY);
   2319    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
   2320    return at;
   2321 }
   2322 
   2323 static
   2324 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
   2325 {
   2326    IRAtom* at;
   2327    tl_assert(isShadowAtom(mce, vatomX));
   2328    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
   2329    return at;
   2330 }
   2331 
   2332 /* --- 64Fx2 binary FP ops, with rounding mode --- */
   2333 
   2334 static
   2335 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
   2336                                        IRAtom* vatomX, IRAtom* vatomY )
   2337 {
   2338    /* This is the same as binary64Fx2, except that we subsequently
   2339       pessimise vRM (definedness of the rounding mode), widen to 128
   2340       bits and UifU it into the result.  As with the scalar cases, if
   2341       the RM is a constant then it is defined and so this extra bit
   2342       will get constant-folded out later. */
   2343    // "do" the vector args
   2344    IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
   2345    // PCast the RM, and widen it to 128 bits
   2346    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2347    // Roll it into the result
   2348    t1 = mkUifUV128(mce, t1, t2);
   2349    return t1;
   2350 }
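
/* For instance, when the rounding-mode argument is a constant (e.g.
   Irrm_NEAREST), its shadow is fully defined, the PCast above yields
   an all-zeroes V128, and the trailing UifU is a no-op that the IR
   optimiser can fold away.  Only a genuinely undefined rounding mode
   poisons the result. */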
   2351 
   2352 /* --- ... and ... 32Fx4 versions of the same --- */
   2353 
   2354 static
   2355 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
   2356                                        IRAtom* vatomX, IRAtom* vatomY )
   2357 {
   2358    IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
   2359    // PCast the RM, and widen it to 128 bits
   2360    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2361    // Roll it into the result
   2362    t1 = mkUifUV128(mce, t1, t2);
   2363    return t1;
   2364 }
   2365 
   2366 /* --- ... and ... 64Fx4 versions of the same --- */
   2367 
   2368 static
   2369 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
   2370                                        IRAtom* vatomX, IRAtom* vatomY )
   2371 {
   2372    IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
   2373    // PCast the RM, and widen it to 256 bits
   2374    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   2375    // Roll it into the result
   2376    t1 = mkUifUV256(mce, t1, t2);
   2377    return t1;
   2378 }
   2379 
   2380 /* --- ... and ... 32Fx8 versions of the same --- */
   2381 
   2382 static
   2383 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
   2384                                        IRAtom* vatomX, IRAtom* vatomY )
   2385 {
   2386    IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
   2387    // PCast the RM, and widen it to 256 bits
   2388    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   2389    // Roll it into the result
   2390    t1 = mkUifUV256(mce, t1, t2);
   2391    return t1;
   2392 }
   2393 
   2394 /* --- 64Fx2 unary FP ops, with rounding mode --- */
   2395 
   2396 static
   2397 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
   2398 {
   2399    /* Same scheme as binary64Fx2_w_rm. */
   2400    // "do" the vector arg
   2401    IRAtom* t1 = unary64Fx2(mce, vatomX);
   2402    // PCast the RM, and widen it to 128 bits
   2403    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2404    // Roll it into the result
   2405    t1 = mkUifUV128(mce, t1, t2);
   2406    return t1;
   2407 }
   2408 
   2409 /* --- ... and ... 32Fx4 versions of the same --- */
   2410 
   2411 static
   2412 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
   2413 {
   /* Same scheme as unary64Fx2_w_rm. */
   2415    IRAtom* t1 = unary32Fx4(mce, vatomX);
   2416    // PCast the RM, and widen it to 128 bits
   2417    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2418    // Roll it into the result
   2419    t1 = mkUifUV128(mce, t1, t2);
   2420    return t1;
   2421 }
   2422 
   2423 
   2424 /* --- --- Vector saturated narrowing --- --- */
   2425 
   2426 /* We used to do something very clever here, but on closer inspection
   2427    (2011-Jun-15), and in particular bug #279698, it turns out to be
   2428    wrong.  Part of the problem came from the fact that for a long
   2429    time, the IR primops to do with saturated narrowing were
   2430    underspecified and managed to confuse multiple cases which needed
   2431    to be separate: the op names had a signedness qualifier, but in
   2432    fact the source and destination signednesses needed to be specified
   2433    independently, so the op names really need two independent
   2434    signedness specifiers.
   2435 
   2436    As of 2011-Jun-15 (ish) the underspecification was sorted out
   2437    properly.  The incorrect instrumentation remained, though.  That
   2438    has now (2011-Oct-22) been fixed.
   2439 
   2440    What we now do is simple:
   2441 
   2442    Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
   2443    number of lanes, X is the source lane width and signedness, and Y
   2444    is the destination lane width and signedness.  In all cases the
   2445    destination lane width is half the source lane width, so the names
   2446    have a bit of redundancy, but are at least easy to read.
   2447 
   2448    For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
   2449    to unsigned 16s.
   2450 
   2451    Let Vanilla(OP) be a function that takes OP, one of these
   2452    saturating narrowing ops, and produces the same "shaped" narrowing
   2453    op which is not saturating, but merely dumps the most significant
   2454    bits.  "same shape" means that the lane numbers and widths are the
   2455    same as with OP.
   2456 
   2457    For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
   2458                   = Iop_NarrowBin32to16x8,
   2459    that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
   2460    dumping the top half of each lane.
   2461 
   2462    So, with that in place, the scheme is simple, and it is simple to
   2463    pessimise each lane individually and then apply Vanilla(OP) so as
   2464    to get the result in the right "shape".  If the original OP is
   2465    QNarrowBinXtoYxZ then we produce
   2466 
   2467    Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
   2468 
   2469    or for the case when OP is unary (Iop_QNarrowUn*)
   2470 
   2471    Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
   2472 */
   2473 static
   2474 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
   2475 {
   2476    switch (qnarrowOp) {
   2477       /* Binary: (128, 128) -> 128 */
      case Iop_QNarrowBin16Sto8Ux16:
      case Iop_QNarrowBin16Sto8Sx16:
      case Iop_QNarrowBin16Uto8Ux16:
         return Iop_NarrowBin16to8x16;
      case Iop_QNarrowBin32Sto16Ux8:
      case Iop_QNarrowBin32Sto16Sx8:
      case Iop_QNarrowBin32Uto16Ux8:
         return Iop_NarrowBin32to16x8;
      case Iop_QNarrowBin64Sto32Sx4:
      case Iop_QNarrowBin64Uto32Ux4:
         return Iop_NarrowBin64to32x4;
   2488       /* Binary: (64, 64) -> 64 */
   2489       case Iop_QNarrowBin32Sto16Sx4:
   2490          return Iop_NarrowBin32to16x4;
   2491       case Iop_QNarrowBin16Sto8Ux8:
   2492       case Iop_QNarrowBin16Sto8Sx8:
   2493          return Iop_NarrowBin16to8x8;
   2494       /* Unary: 128 -> 64 */
   2495       case Iop_QNarrowUn64Uto32Ux2:
   2496       case Iop_QNarrowUn64Sto32Sx2:
   2497       case Iop_QNarrowUn64Sto32Ux2:
   2498          return Iop_NarrowUn64to32x2;
   2499       case Iop_QNarrowUn32Uto16Ux4:
   2500       case Iop_QNarrowUn32Sto16Sx4:
   2501       case Iop_QNarrowUn32Sto16Ux4:
   2502          return Iop_NarrowUn32to16x4;
   2503       case Iop_QNarrowUn16Uto8Ux8:
   2504       case Iop_QNarrowUn16Sto8Sx8:
   2505       case Iop_QNarrowUn16Sto8Ux8:
   2506          return Iop_NarrowUn16to8x8;
   2507       default:
   2508          ppIROp(qnarrowOp);
         VG_(tool_panic)("vanillaNarrowingOpOfShape");
   2510    }
   2511 }
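
/* Following the scheme above, for (say) Iop_QNarrowBin32Sto16Sx8 the
   generated shadow computation is, in sketch form:

      at1 = PCast32x4(vatom1)
      at2 = PCast32x4(vatom2)
      res = NarrowBin32to16x8(at1, at2)

   Any 32-bit source lane containing at least one undefined bit first
   becomes all-ones, and the vanilla narrow then delivers an all-ones
   (wholly undefined) 16-bit lane in the result. */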
   2512 
   2513 static
   2514 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
   2515                               IRAtom* vatom1, IRAtom* vatom2)
   2516 {
   2517    IRAtom *at1, *at2, *at3;
   2518    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2519    switch (narrow_op) {
      case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast64x2; break;
      case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast64x2; break;
   2522       case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
   2523       case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
   2524       case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
   2525       case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
   2526       case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
   2527       case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
   2528       default: VG_(tool_panic)("vectorNarrowBinV128");
   2529    }
   2530    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2531    tl_assert(isShadowAtom(mce,vatom1));
   2532    tl_assert(isShadowAtom(mce,vatom2));
   2533    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
   2534    at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
   2535    at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
   2536    return at3;
   2537 }
   2538 
   2539 static
   2540 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
   2541                             IRAtom* vatom1, IRAtom* vatom2)
   2542 {
   2543    IRAtom *at1, *at2, *at3;
   2544    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2545    switch (narrow_op) {
   2546       case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
   2547       case Iop_QNarrowBin16Sto8Sx8:  pcast = mkPCast16x4; break;
   2548       case Iop_QNarrowBin16Sto8Ux8:  pcast = mkPCast16x4; break;
   2549       default: VG_(tool_panic)("vectorNarrowBin64");
   2550    }
   2551    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2552    tl_assert(isShadowAtom(mce,vatom1));
   2553    tl_assert(isShadowAtom(mce,vatom2));
   2554    at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
   2555    at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
   2556    at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
   2557    return at3;
   2558 }
   2559 
   2560 static
   2561 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
   2562                              IRAtom* vatom1)
   2563 {
   2564    IRAtom *at1, *at2;
   2565    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2566    tl_assert(isShadowAtom(mce,vatom1));
   2567    /* For vanilla narrowing (non-saturating), we can just apply
   2568       the op directly to the V bits. */
   2569    switch (narrow_op) {
   2570       case Iop_NarrowUn16to8x8:
   2571       case Iop_NarrowUn32to16x4:
   2572       case Iop_NarrowUn64to32x2:
   2573          at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
   2574          return at1;
   2575       default:
   2576          break; /* Do Plan B */
   2577    }
   2578    /* Plan B: for ops that involve a saturation operation on the args,
   2579       we must PCast before the vanilla narrow. */
   2580    switch (narrow_op) {
   2581       case Iop_QNarrowUn16Sto8Sx8:  pcast = mkPCast16x8; break;
   2582       case Iop_QNarrowUn16Sto8Ux8:  pcast = mkPCast16x8; break;
   2583       case Iop_QNarrowUn16Uto8Ux8:  pcast = mkPCast16x8; break;
   2584       case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
   2585       case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
   2586       case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
   2587       case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
   2588       case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
   2589       case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
   2590       default: VG_(tool_panic)("vectorNarrowUnV128");
   2591    }
   2592    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2593    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
   2594    at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
   2595    return at2;
   2596 }
   2597 
   2598 static
   2599 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
   2600                          IRAtom* vatom1)
   2601 {
   2602    IRAtom *at1, *at2;
   2603    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2604    switch (longen_op) {
   2605       case Iop_Widen8Uto16x8:  pcast = mkPCast16x8; break;
   2606       case Iop_Widen8Sto16x8:  pcast = mkPCast16x8; break;
   2607       case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
   2608       case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
   2609       case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
   2610       case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
   2611       default: VG_(tool_panic)("vectorWidenI64");
   2612    }
   2613    tl_assert(isShadowAtom(mce,vatom1));
   2614    at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
   2615    at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
   2616    return at2;
   2617 }
   2618 
   2619 
   2620 /* --- --- Vector integer arithmetic --- --- */
   2621 
   2622 /* Simple ... UifU the args and per-lane pessimise the results. */
   2623 
   2624 /* --- V256-bit versions --- */
   2625 
   2626 static
   2627 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2628 {
   2629    IRAtom* at;
   2630    at = mkUifUV256(mce, vatom1, vatom2);
   2631    at = mkPCast8x32(mce, at);
   2632    return at;
   2633 }
   2634 
   2635 static
   2636 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2637 {
   2638    IRAtom* at;
   2639    at = mkUifUV256(mce, vatom1, vatom2);
   2640    at = mkPCast16x16(mce, at);
   2641    return at;
   2642 }
   2643 
   2644 static
   2645 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2646 {
   2647    IRAtom* at;
   2648    at = mkUifUV256(mce, vatom1, vatom2);
   2649    at = mkPCast32x8(mce, at);
   2650    return at;
   2651 }
   2652 
   2653 static
   2654 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2655 {
   2656    IRAtom* at;
   2657    at = mkUifUV256(mce, vatom1, vatom2);
   2658    at = mkPCast64x4(mce, at);
   2659    return at;
   2660 }
   2661 
   2662 /* --- V128-bit versions --- */
   2663 
   2664 static
   2665 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2666 {
   2667    IRAtom* at;
   2668    at = mkUifUV128(mce, vatom1, vatom2);
   2669    at = mkPCast8x16(mce, at);
   2670    return at;
   2671 }
   2672 
   2673 static
   2674 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2675 {
   2676    IRAtom* at;
   2677    at = mkUifUV128(mce, vatom1, vatom2);
   2678    at = mkPCast16x8(mce, at);
   2679    return at;
   2680 }
   2681 
   2682 static
   2683 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2684 {
   2685    IRAtom* at;
   2686    at = mkUifUV128(mce, vatom1, vatom2);
   2687    at = mkPCast32x4(mce, at);
   2688    return at;
   2689 }
   2690 
   2691 static
   2692 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2693 {
   2694    IRAtom* at;
   2695    at = mkUifUV128(mce, vatom1, vatom2);
   2696    at = mkPCast64x2(mce, at);
   2697    return at;
   2698 }
   2699 
   2700 /* --- 64-bit versions --- */
   2701 
   2702 static
   2703 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2704 {
   2705    IRAtom* at;
   2706    at = mkUifU64(mce, vatom1, vatom2);
   2707    at = mkPCast8x8(mce, at);
   2708    return at;
   2709 }
   2710 
   2711 static
   2712 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2713 {
   2714    IRAtom* at;
   2715    at = mkUifU64(mce, vatom1, vatom2);
   2716    at = mkPCast16x4(mce, at);
   2717    return at;
   2718 }
   2719 
   2720 static
   2721 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2722 {
   2723    IRAtom* at;
   2724    at = mkUifU64(mce, vatom1, vatom2);
   2725    at = mkPCast32x2(mce, at);
   2726    return at;
   2727 }
   2728 
   2729 static
   2730 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2731 {
   2732    IRAtom* at;
   2733    at = mkUifU64(mce, vatom1, vatom2);
   2734    at = mkPCastTo(mce, Ity_I64, at);
   2735    return at;
   2736 }
   2737 
   2738 /* --- 32-bit versions --- */
   2739 
   2740 static
   2741 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2742 {
   2743    IRAtom* at;
   2744    at = mkUifU32(mce, vatom1, vatom2);
   2745    at = mkPCast8x4(mce, at);
   2746    return at;
   2747 }
   2748 
   2749 static
   2750 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2751 {
   2752    IRAtom* at;
   2753    at = mkUifU32(mce, vatom1, vatom2);
   2754    at = mkPCast16x2(mce, at);
   2755    return at;
   2756 }
   2757 
   2758 
   2759 /*------------------------------------------------------------*/
   2760 /*--- Generate shadow values from all kinds of IRExprs.    ---*/
   2761 /*------------------------------------------------------------*/
   2762 
   2763 static
   2764 IRAtom* expr2vbits_Qop ( MCEnv* mce,
   2765                          IROp op,
   2766                          IRAtom* atom1, IRAtom* atom2,
   2767                          IRAtom* atom3, IRAtom* atom4 )
   2768 {
   2769    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2770    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2771    IRAtom* vatom3 = expr2vbits( mce, atom3 );
   2772    IRAtom* vatom4 = expr2vbits( mce, atom4 );
   2773 
   2774    tl_assert(isOriginalAtom(mce,atom1));
   2775    tl_assert(isOriginalAtom(mce,atom2));
   2776    tl_assert(isOriginalAtom(mce,atom3));
   2777    tl_assert(isOriginalAtom(mce,atom4));
   2778    tl_assert(isShadowAtom(mce,vatom1));
   2779    tl_assert(isShadowAtom(mce,vatom2));
   2780    tl_assert(isShadowAtom(mce,vatom3));
   2781    tl_assert(isShadowAtom(mce,vatom4));
   2782    tl_assert(sameKindedAtoms(atom1,vatom1));
   2783    tl_assert(sameKindedAtoms(atom2,vatom2));
   2784    tl_assert(sameKindedAtoms(atom3,vatom3));
   2785    tl_assert(sameKindedAtoms(atom4,vatom4));
   2786    switch (op) {
   2787       case Iop_MAddF64:
   2788       case Iop_MAddF64r32:
   2789       case Iop_MSubF64:
   2790       case Iop_MSubF64r32:
   2791          /* I32(rm) x F64 x F64 x F64 -> F64 */
   2792          return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
   2793 
   2794       case Iop_MAddF32:
   2795       case Iop_MSubF32:
   2796          /* I32(rm) x F32 x F32 x F32 -> F32 */
   2797          return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
   2798 
   2799       /* V256-bit data-steering */
   2800       case Iop_64x4toV256:
   2801          return assignNew('V', mce, Ity_V256,
   2802                           IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
   2803 
   2804       default:
   2805          ppIROp(op);
   2806          VG_(tool_panic)("memcheck:expr2vbits_Qop");
   2807    }
   2808 }
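
         /* Illustrative sketch only: a schematic model of the mkLazy4 scheme
            used for the fused multiply-add/subtract cases above, assuming a
            simplified world in which every shadow is an I64.  The result is
            wholly undefined if any argument carries any undefined bit, and
            wholly defined otherwise. */
         #if 0
         #include <stdint.h>

         static uint64_t model_lazy4 ( uint64_t v1, uint64_t v2,
                                       uint64_t v3, uint64_t v4 )
         {
            return (v1 | v2 | v3 | v4) != 0 ? ~0ULL : 0ULL;
         }
         #endif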
   2809 
   2810 
   2811 static
   2812 IRAtom* expr2vbits_Triop ( MCEnv* mce,
   2813                            IROp op,
   2814                            IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
   2815 {
   2816    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2817    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2818    IRAtom* vatom3 = expr2vbits( mce, atom3 );
   2819 
   2820    tl_assert(isOriginalAtom(mce,atom1));
   2821    tl_assert(isOriginalAtom(mce,atom2));
   2822    tl_assert(isOriginalAtom(mce,atom3));
   2823    tl_assert(isShadowAtom(mce,vatom1));
   2824    tl_assert(isShadowAtom(mce,vatom2));
   2825    tl_assert(isShadowAtom(mce,vatom3));
   2826    tl_assert(sameKindedAtoms(atom1,vatom1));
   2827    tl_assert(sameKindedAtoms(atom2,vatom2));
   2828    tl_assert(sameKindedAtoms(atom3,vatom3));
   2829    switch (op) {
   2830       case Iop_AddF128:
   2831       case Iop_AddD128:
   2832       case Iop_SubF128:
   2833       case Iop_SubD128:
   2834       case Iop_MulF128:
   2835       case Iop_MulD128:
   2836       case Iop_DivF128:
   2837       case Iop_DivD128:
   2838       case Iop_QuantizeD128:
   2839          /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
   2840          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
   2841       case Iop_AddF64:
   2842       case Iop_AddD64:
   2843       case Iop_AddF64r32:
   2844       case Iop_SubF64:
   2845       case Iop_SubD64:
   2846       case Iop_SubF64r32:
   2847       case Iop_MulF64:
   2848       case Iop_MulD64:
   2849       case Iop_MulF64r32:
   2850       case Iop_DivF64:
   2851       case Iop_DivD64:
   2852       case Iop_DivF64r32:
   2853       case Iop_ScaleF64:
   2854       case Iop_Yl2xF64:
   2855       case Iop_Yl2xp1F64:
   2856       case Iop_AtanF64:
   2857       case Iop_PRemF64:
   2858       case Iop_PRem1F64:
   2859       case Iop_QuantizeD64:
   2860          /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
   2861          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
   2862       case Iop_PRemC3210F64:
   2863       case Iop_PRem1C3210F64:
   2864          /* I32(rm) x F64 x F64 -> I32 */
   2865          return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
   2866       case Iop_AddF32:
   2867       case Iop_SubF32:
   2868       case Iop_MulF32:
   2869       case Iop_DivF32:
   2870          /* I32(rm) x F32 x F32 -> I32 */
    2871          /* I32(rm) x F32 x F32 -> F32 */
   2872       case Iop_SignificanceRoundD64:
   2873          /* IRRoundingMode(I32) x I8 x D64 -> D64 */
   2874          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
   2875       case Iop_SignificanceRoundD128:
   2876          /* IRRoundingMode(I32) x I8 x D128 -> D128 */
   2877          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
   2878       case Iop_SliceV128:
   2879          /* (V128, V128, I8) -> V128 */
   2880          complainIfUndefined(mce, atom3, NULL);
   2881          return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
   2882       case Iop_Slice64:
   2883          /* (I64, I64, I8) -> I64 */
   2884          complainIfUndefined(mce, atom3, NULL);
   2885          return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
   2886       case Iop_SetElem8x8:
   2887       case Iop_SetElem16x4:
   2888       case Iop_SetElem32x2:
   2889          complainIfUndefined(mce, atom2, NULL);
   2890          return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
    2891       /* BCD Iops */
   2892       case Iop_BCDAdd:
   2893       case Iop_BCDSub:
   2894          complainIfUndefined(mce, atom3, NULL);
   2895          return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
   2896 
   2897       /* Vector FP with rounding mode as the first arg */
   2898       case Iop_Add64Fx2:
   2899       case Iop_Sub64Fx2:
   2900       case Iop_Mul64Fx2:
   2901       case Iop_Div64Fx2:
   2902          return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
   2903 
   2904       case Iop_Add32Fx4:
   2905       case Iop_Sub32Fx4:
   2906       case Iop_Mul32Fx4:
   2907       case Iop_Div32Fx4:
   2908         return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
    2909          return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
   2910       case Iop_Add64Fx4:
   2911       case Iop_Sub64Fx4:
   2912       case Iop_Mul64Fx4:
   2913       case Iop_Div64Fx4:
   2914          return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
   2915 
   2916       case Iop_Add32Fx8:
   2917       case Iop_Sub32Fx8:
   2918       case Iop_Mul32Fx8:
   2919       case Iop_Div32Fx8:
   2920          return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
   2921 
   2922       default:
   2923          ppIROp(op);
   2924          VG_(tool_panic)("memcheck:expr2vbits_Triop");
   2925    }
   2926 }
   2927 
   2928 
   2929 static
   2930 IRAtom* expr2vbits_Binop ( MCEnv* mce,
   2931                            IROp op,
   2932                            IRAtom* atom1, IRAtom* atom2 )
   2933 {
   2934    IRType  and_or_ty;
   2935    IRAtom* (*uifu)    (MCEnv*, IRAtom*, IRAtom*);
   2936    IRAtom* (*difd)    (MCEnv*, IRAtom*, IRAtom*);
   2937    IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*);
   2938 
   2939    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2940    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2941 
   2942    tl_assert(isOriginalAtom(mce,atom1));
   2943    tl_assert(isOriginalAtom(mce,atom2));
   2944    tl_assert(isShadowAtom(mce,vatom1));
   2945    tl_assert(isShadowAtom(mce,vatom2));
   2946    tl_assert(sameKindedAtoms(atom1,vatom1));
   2947    tl_assert(sameKindedAtoms(atom2,vatom2));
   2948    switch (op) {
   2949 
   2950       /* 32-bit SIMD */
   2951 
   2952       case Iop_Add16x2:
   2953       case Iop_HAdd16Ux2:
   2954       case Iop_HAdd16Sx2:
   2955       case Iop_Sub16x2:
   2956       case Iop_HSub16Ux2:
   2957       case Iop_HSub16Sx2:
   2958       case Iop_QAdd16Sx2:
   2959       case Iop_QSub16Sx2:
   2960       case Iop_QSub16Ux2:
   2961       case Iop_QAdd16Ux2:
   2962          return binary16Ix2(mce, vatom1, vatom2);
   2963 
   2964       case Iop_Add8x4:
   2965       case Iop_HAdd8Ux4:
   2966       case Iop_HAdd8Sx4:
   2967       case Iop_Sub8x4:
   2968       case Iop_HSub8Ux4:
   2969       case Iop_HSub8Sx4:
   2970       case Iop_QSub8Ux4:
   2971       case Iop_QAdd8Ux4:
   2972       case Iop_QSub8Sx4:
   2973       case Iop_QAdd8Sx4:
   2974          return binary8Ix4(mce, vatom1, vatom2);
   2975 
   2976       /* 64-bit SIMD */
   2977 
   2978       case Iop_ShrN8x8:
   2979       case Iop_ShrN16x4:
   2980       case Iop_ShrN32x2:
   2981       case Iop_SarN8x8:
   2982       case Iop_SarN16x4:
   2983       case Iop_SarN32x2:
   2984       case Iop_ShlN16x4:
   2985       case Iop_ShlN32x2:
   2986       case Iop_ShlN8x8:
   2987          /* Same scheme as with all other shifts. */
   2988          complainIfUndefined(mce, atom2, NULL);
   2989          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
   2990 
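         /* Illustrative sketch only: a hypothetical model of the
            shift-by-immediate scheme above, for Iop_ShlN8x8 with
            0 <= amt <= 7.  The amount is checked eagerly
            (complainIfUndefined) and the *original* amount then steers the
            V bits, since shifting moves definedness around exactly as it
            moves data bits; the zero-filled positions are defined. */
         #if 0
         #include <stdint.h>

         static uint64_t model_shlN8x8_vbits ( uint64_t vbits, unsigned amt )
         {
            /* Broadcast the per-lane keep-mask to all eight lanes, then
               shift the packed V bits. */
            uint64_t mask = 0x0101010101010101ULL * (uint8_t)(0xFFu << amt);
            return (vbits << amt) & mask;
         }
         #endif
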
   2991       case Iop_QNarrowBin32Sto16Sx4:
   2992       case Iop_QNarrowBin16Sto8Sx8:
   2993       case Iop_QNarrowBin16Sto8Ux8:
   2994          return vectorNarrowBin64(mce, op, vatom1, vatom2);
   2995 
   2996       case Iop_Min8Ux8:
   2997       case Iop_Min8Sx8:
   2998       case Iop_Max8Ux8:
   2999       case Iop_Max8Sx8:
   3000       case Iop_Avg8Ux8:
   3001       case Iop_QSub8Sx8:
   3002       case Iop_QSub8Ux8:
   3003       case Iop_Sub8x8:
   3004       case Iop_CmpGT8Sx8:
   3005       case Iop_CmpGT8Ux8:
   3006       case Iop_CmpEQ8x8:
   3007       case Iop_QAdd8Sx8:
   3008       case Iop_QAdd8Ux8:
   3009       case Iop_QSal8x8:
   3010       case Iop_QShl8x8:
   3011       case Iop_Add8x8:
   3012       case Iop_Mul8x8:
   3013       case Iop_PolynomialMul8x8:
   3014          return binary8Ix8(mce, vatom1, vatom2);
   3015 
   3016       case Iop_Min16Sx4:
   3017       case Iop_Min16Ux4:
   3018       case Iop_Max16Sx4:
   3019       case Iop_Max16Ux4:
   3020       case Iop_Avg16Ux4:
   3021       case Iop_QSub16Ux4:
   3022       case Iop_QSub16Sx4:
   3023       case Iop_Sub16x4:
   3024       case Iop_Mul16x4:
   3025       case Iop_MulHi16Sx4:
   3026       case Iop_MulHi16Ux4:
   3027       case Iop_CmpGT16Sx4:
   3028       case Iop_CmpGT16Ux4:
   3029       case Iop_CmpEQ16x4:
   3030       case Iop_QAdd16Sx4:
   3031       case Iop_QAdd16Ux4:
   3032       case Iop_QSal16x4:
   3033       case Iop_QShl16x4:
   3034       case Iop_Add16x4:
   3035       case Iop_QDMulHi16Sx4:
   3036       case Iop_QRDMulHi16Sx4:
   3037          return binary16Ix4(mce, vatom1, vatom2);
   3038 
   3039       case Iop_Sub32x2:
   3040       case Iop_Mul32x2:
   3041       case Iop_Max32Sx2:
   3042       case Iop_Max32Ux2:
   3043       case Iop_Min32Sx2:
   3044       case Iop_Min32Ux2:
   3045       case Iop_CmpGT32Sx2:
   3046       case Iop_CmpGT32Ux2:
   3047       case Iop_CmpEQ32x2:
   3048       case Iop_Add32x2:
   3049       case Iop_QAdd32Ux2:
   3050       case Iop_QAdd32Sx2:
   3051       case Iop_QSub32Ux2:
   3052       case Iop_QSub32Sx2:
   3053       case Iop_QSal32x2:
   3054       case Iop_QShl32x2:
   3055       case Iop_QDMulHi32Sx2:
   3056       case Iop_QRDMulHi32Sx2:
   3057          return binary32Ix2(mce, vatom1, vatom2);
   3058 
   3059       case Iop_QSub64Ux1:
   3060       case Iop_QSub64Sx1:
   3061       case Iop_QAdd64Ux1:
   3062       case Iop_QAdd64Sx1:
   3063       case Iop_QSal64x1:
   3064       case Iop_QShl64x1:
   3065       case Iop_Sal64x1:
   3066          return binary64Ix1(mce, vatom1, vatom2);
   3067 
   3068       case Iop_QShlNsatSU8x8:
   3069       case Iop_QShlNsatUU8x8:
   3070       case Iop_QShlNsatSS8x8:
   3071          complainIfUndefined(mce, atom2, NULL);
   3072          return mkPCast8x8(mce, vatom1);
   3073 
   3074       case Iop_QShlNsatSU16x4:
   3075       case Iop_QShlNsatUU16x4:
   3076       case Iop_QShlNsatSS16x4:
   3077          complainIfUndefined(mce, atom2, NULL);
   3078          return mkPCast16x4(mce, vatom1);
   3079 
   3080       case Iop_QShlNsatSU32x2:
   3081       case Iop_QShlNsatUU32x2:
   3082       case Iop_QShlNsatSS32x2:
   3083          complainIfUndefined(mce, atom2, NULL);
   3084          return mkPCast32x2(mce, vatom1);
   3085 
   3086       case Iop_QShlNsatSU64x1:
   3087       case Iop_QShlNsatUU64x1:
   3088       case Iop_QShlNsatSS64x1:
   3089          complainIfUndefined(mce, atom2, NULL);
   3090          return mkPCast32x2(mce, vatom1);
    3091          return mkPCastTo(mce, Ity_I64, vatom1); /* whole 64-bit lane */
   3092       case Iop_PwMax32Sx2:
   3093       case Iop_PwMax32Ux2:
   3094       case Iop_PwMin32Sx2:
   3095       case Iop_PwMin32Ux2:
   3096       case Iop_PwMax32Fx2:
   3097       case Iop_PwMin32Fx2:
   3098          return assignNew('V', mce, Ity_I64,
   3099                           binop(Iop_PwMax32Ux2,
   3100                                 mkPCast32x2(mce, vatom1),
   3101                                 mkPCast32x2(mce, vatom2)));
   3102 
   3103       case Iop_PwMax16Sx4:
   3104       case Iop_PwMax16Ux4:
   3105       case Iop_PwMin16Sx4:
   3106       case Iop_PwMin16Ux4:
   3107          return assignNew('V', mce, Ity_I64,
   3108                           binop(Iop_PwMax16Ux4,
   3109                                 mkPCast16x4(mce, vatom1),
   3110                                 mkPCast16x4(mce, vatom2)));
   3111 
   3112       case Iop_PwMax8Sx8:
   3113       case Iop_PwMax8Ux8:
   3114       case Iop_PwMin8Sx8:
   3115       case Iop_PwMin8Ux8:
   3116          return assignNew('V', mce, Ity_I64,
   3117                           binop(Iop_PwMax8Ux8,
   3118                                 mkPCast8x8(mce, vatom1),
   3119                                 mkPCast8x8(mce, vatom2)));
   3120 
   3121       case Iop_PwAdd32x2:
   3122       case Iop_PwAdd32Fx2:
   3123          return mkPCast32x2(mce,
   3124                assignNew('V', mce, Ity_I64,
   3125                          binop(Iop_PwAdd32x2,
   3126                                mkPCast32x2(mce, vatom1),
   3127                                mkPCast32x2(mce, vatom2))));
   3128 
   3129       case Iop_PwAdd16x4:
   3130          return mkPCast16x4(mce,
   3131                assignNew('V', mce, Ity_I64,
   3132                          binop(op, mkPCast16x4(mce, vatom1),
   3133                                    mkPCast16x4(mce, vatom2))));
   3134 
   3135       case Iop_PwAdd8x8:
   3136          return mkPCast8x8(mce,
   3137                assignNew('V', mce, Ity_I64,
   3138                          binop(op, mkPCast8x8(mce, vatom1),
   3139                                    mkPCast8x8(mce, vatom2))));
   3140 
   3141       case Iop_Shl8x8:
   3142       case Iop_Shr8x8:
   3143       case Iop_Sar8x8:
   3144       case Iop_Sal8x8:
   3145          return mkUifU64(mce,
   3146                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3147                    mkPCast8x8(mce,vatom2)
   3148                 );
   3149 
   3150       case Iop_Shl16x4:
   3151       case Iop_Shr16x4:
   3152       case Iop_Sar16x4:
   3153       case Iop_Sal16x4:
   3154          return mkUifU64(mce,
   3155                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3156                    mkPCast16x4(mce,vatom2)
   3157                 );
   3158 
   3159       case Iop_Shl32x2:
   3160       case Iop_Shr32x2:
   3161       case Iop_Sar32x2:
   3162       case Iop_Sal32x2:
   3163          return mkUifU64(mce,
   3164                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3165                    mkPCast32x2(mce,vatom2)
   3166                 );
   3167 
   3168       /* 64-bit data-steering */
   3169       case Iop_InterleaveLO32x2:
   3170       case Iop_InterleaveLO16x4:
   3171       case Iop_InterleaveLO8x8:
   3172       case Iop_InterleaveHI32x2:
   3173       case Iop_InterleaveHI16x4:
   3174       case Iop_InterleaveHI8x8:
   3175       case Iop_CatOddLanes8x8:
   3176       case Iop_CatEvenLanes8x8:
   3177       case Iop_CatOddLanes16x4:
   3178       case Iop_CatEvenLanes16x4:
   3179       case Iop_InterleaveOddLanes8x8:
   3180       case Iop_InterleaveEvenLanes8x8:
   3181       case Iop_InterleaveOddLanes16x4:
   3182       case Iop_InterleaveEvenLanes16x4:
   3183          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
   3184 
   3185       case Iop_GetElem8x8:
   3186          complainIfUndefined(mce, atom2, NULL);
   3187          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
   3188       case Iop_GetElem16x4:
   3189          complainIfUndefined(mce, atom2, NULL);
   3190          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
   3191       case Iop_GetElem32x2:
   3192          complainIfUndefined(mce, atom2, NULL);
   3193          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
   3194 
   3195       /* Perm8x8: rearrange values in left arg using steering values
   3196         from right arg.  So rearrange the vbits in the same way but
   3197         pessimise wrt steering values. */
   3198       case Iop_Perm8x8:
   3199          return mkUifU64(
   3200                    mce,
   3201                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3202                    mkPCast8x8(mce, vatom2)
   3203                 );
   3204 
   3205       /* V128-bit SIMD */
   3206 
   3207       case Iop_Sqrt32Fx4:
   3208          return unary32Fx4_w_rm(mce, vatom1, vatom2);
   3209       case Iop_Sqrt64Fx2:
   3210          return unary64Fx2_w_rm(mce, vatom1, vatom2);
   3211 
   3212       case Iop_ShrN8x16:
   3213       case Iop_ShrN16x8:
   3214       case Iop_ShrN32x4:
   3215       case Iop_ShrN64x2:
   3216       case Iop_SarN8x16:
   3217       case Iop_SarN16x8:
   3218       case Iop_SarN32x4:
   3219       case Iop_SarN64x2:
   3220       case Iop_ShlN8x16:
   3221       case Iop_ShlN16x8:
   3222       case Iop_ShlN32x4:
   3223       case Iop_ShlN64x2:
   3224          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
   3225             this is wrong now, scalar shifts are done properly lazily.
   3226             Vector shifts should be fixed too. */
   3227          complainIfUndefined(mce, atom2, NULL);
   3228          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3229 
   3230       /* V x V shifts/rotates are done using the standard lazy scheme. */
   3231       /* For the non-rounding variants of bi-di vector x vector
   3232          shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
   3233          But note that this is overly pessimistic, because in fact only
   3234          the bottom 8 bits of each lane of the second argument are taken
   3235          into account when shifting.  So really we ought to ignore
   3236          undefinedness in bits 8 and above of each lane in the
   3237          second argument. */
   3238       case Iop_Shl8x16:
   3239       case Iop_Shr8x16:
   3240       case Iop_Sar8x16:
   3241       case Iop_Sal8x16:
   3242       case Iop_Rol8x16:
   3243       case Iop_Sh8Sx16:
   3244       case Iop_Sh8Ux16:
   3245          return mkUifUV128(mce,
   3246                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3247                    mkPCast8x16(mce,vatom2)
   3248                 );
   3249 
   3250       case Iop_Shl16x8:
   3251       case Iop_Shr16x8:
   3252       case Iop_Sar16x8:
   3253       case Iop_Sal16x8:
   3254       case Iop_Rol16x8:
   3255       case Iop_Sh16Sx8:
   3256       case Iop_Sh16Ux8:
   3257          return mkUifUV128(mce,
   3258                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3259                    mkPCast16x8(mce,vatom2)
   3260                 );
   3261 
   3262       case Iop_Shl32x4:
   3263       case Iop_Shr32x4:
   3264       case Iop_Sar32x4:
   3265       case Iop_Sal32x4:
   3266       case Iop_Rol32x4:
   3267       case Iop_Sh32Sx4:
   3268       case Iop_Sh32Ux4:
   3269          return mkUifUV128(mce,
   3270                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3271                    mkPCast32x4(mce,vatom2)
   3272                 );
   3273 
   3274       case Iop_Shl64x2:
   3275       case Iop_Shr64x2:
   3276       case Iop_Sar64x2:
   3277       case Iop_Sal64x2:
   3278       case Iop_Rol64x2:
   3279       case Iop_Sh64Sx2:
   3280       case Iop_Sh64Ux2:
   3281          return mkUifUV128(mce,
   3282                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3283                    mkPCast64x2(mce,vatom2)
   3284                 );
   3285 
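         /* Illustrative sketch only: a one-lane model of the lazy V x V
            shift scheme above, for an 8-bit lane of Iop_Shl8x16 (the amount
            is masked to the lane width purely for the model).  The data
            lane's V bits are shifted by the original amount, then UifU'd
            with all-ones if the amount lane is not fully defined. */
         #if 0
         #include <stdint.h>

         static uint8_t model_shl8_lane_vbits ( uint8_t data_vbits,
                                                uint8_t amt, uint8_t amt_vbits )
         {
            uint8_t shifted = (uint8_t)(data_vbits << (amt & 7));
            uint8_t pcast   = amt_vbits != 0 ? 0xFF : 0x00;  /* PCast8 */
            return shifted | pcast;          /* UifU == OR of V bits */
         }
         #endif
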
   3286       /* For the rounding variants of bi-di vector x vector shifts, the
   3287          rounding adjustment can cause undefinedness to propagate through
   3288          the entire lane, in the worst case.  Too complex to handle
   3289          properly .. just UifU the arguments and then PCast them.
   3290          Suboptimal but safe. */
   3291       case Iop_Rsh8Sx16:
   3292       case Iop_Rsh8Ux16:
   3293          return binary8Ix16(mce, vatom1, vatom2);
   3294       case Iop_Rsh16Sx8:
   3295       case Iop_Rsh16Ux8:
   3296          return binary16Ix8(mce, vatom1, vatom2);
   3297       case Iop_Rsh32Sx4:
   3298       case Iop_Rsh32Ux4:
   3299          return binary32Ix4(mce, vatom1, vatom2);
   3300       case Iop_Rsh64Sx2:
   3301       case Iop_Rsh64Ux2:
   3302          return binary64Ix2(mce, vatom1, vatom2);
   3303 
   3304       case Iop_F32ToFixed32Ux4_RZ:
   3305       case Iop_F32ToFixed32Sx4_RZ:
   3306       case Iop_Fixed32UToF32x4_RN:
   3307       case Iop_Fixed32SToF32x4_RN:
   3308          complainIfUndefined(mce, atom2, NULL);
   3309          return mkPCast32x4(mce, vatom1);
   3310 
   3311       case Iop_F32ToFixed32Ux2_RZ:
   3312       case Iop_F32ToFixed32Sx2_RZ:
   3313       case Iop_Fixed32UToF32x2_RN:
   3314       case Iop_Fixed32SToF32x2_RN:
   3315          complainIfUndefined(mce, atom2, NULL);
   3316          return mkPCast32x2(mce, vatom1);
   3317 
   3318       case Iop_QSub8Ux16:
   3319       case Iop_QSub8Sx16:
   3320       case Iop_Sub8x16:
   3321       case Iop_Min8Ux16:
   3322       case Iop_Min8Sx16:
   3323       case Iop_Max8Ux16:
   3324       case Iop_Max8Sx16:
   3325       case Iop_CmpGT8Sx16:
   3326       case Iop_CmpGT8Ux16:
   3327       case Iop_CmpEQ8x16:
   3328       case Iop_Avg8Ux16:
   3329       case Iop_Avg8Sx16:
   3330       case Iop_QAdd8Ux16:
   3331       case Iop_QAdd8Sx16:
   3332       case Iop_QAddExtUSsatSS8x16:
   3333       case Iop_QAddExtSUsatUU8x16:
   3334       case Iop_QSal8x16:
   3335       case Iop_QShl8x16:
   3336       case Iop_Add8x16:
   3337       case Iop_Mul8x16:
   3338       case Iop_PolynomialMul8x16:
   3339       case Iop_PolynomialMulAdd8x16:
   3340          return binary8Ix16(mce, vatom1, vatom2);
   3341 
   3342       case Iop_QSub16Ux8:
   3343       case Iop_QSub16Sx8:
   3344       case Iop_Sub16x8:
   3345       case Iop_Mul16x8:
   3346       case Iop_MulHi16Sx8:
   3347       case Iop_MulHi16Ux8:
   3348       case Iop_Min16Sx8:
   3349       case Iop_Min16Ux8:
   3350       case Iop_Max16Sx8:
   3351       case Iop_Max16Ux8:
   3352       case Iop_CmpGT16Sx8:
   3353       case Iop_CmpGT16Ux8:
   3354       case Iop_CmpEQ16x8:
   3355       case Iop_Avg16Ux8:
   3356       case Iop_Avg16Sx8:
   3357       case Iop_QAdd16Ux8:
   3358       case Iop_QAdd16Sx8:
   3359       case Iop_QAddExtUSsatSS16x8:
   3360       case Iop_QAddExtSUsatUU16x8:
   3361       case Iop_QSal16x8:
   3362       case Iop_QShl16x8:
   3363       case Iop_Add16x8:
   3364       case Iop_QDMulHi16Sx8:
   3365       case Iop_QRDMulHi16Sx8:
   3366       case Iop_PolynomialMulAdd16x8:
   3367          return binary16Ix8(mce, vatom1, vatom2);
   3368 
   3369       case Iop_Sub32x4:
   3370       case Iop_CmpGT32Sx4:
   3371       case Iop_CmpGT32Ux4:
   3372       case Iop_CmpEQ32x4:
   3373       case Iop_QAdd32Sx4:
   3374       case Iop_QAdd32Ux4:
   3375       case Iop_QSub32Sx4:
   3376       case Iop_QSub32Ux4:
   3377       case Iop_QAddExtUSsatSS32x4:
   3378       case Iop_QAddExtSUsatUU32x4:
   3379       case Iop_QSal32x4:
   3380       case Iop_QShl32x4:
   3381       case Iop_Avg32Ux4:
   3382       case Iop_Avg32Sx4:
   3383       case Iop_Add32x4:
   3384       case Iop_Max32Ux4:
   3385       case Iop_Max32Sx4:
   3386       case Iop_Min32Ux4:
   3387       case Iop_Min32Sx4:
   3388       case Iop_Mul32x4:
   3389       case Iop_QDMulHi32Sx4:
   3390       case Iop_QRDMulHi32Sx4:
   3391       case Iop_PolynomialMulAdd32x4:
   3392          return binary32Ix4(mce, vatom1, vatom2);
   3393 
   3394       case Iop_Sub64x2:
   3395       case Iop_Add64x2:
   3396       case Iop_Max64Sx2:
   3397       case Iop_Max64Ux2:
   3398       case Iop_Min64Sx2:
   3399       case Iop_Min64Ux2:
   3400       case Iop_CmpEQ64x2:
   3401       case Iop_CmpGT64Sx2:
   3402       case Iop_CmpGT64Ux2:
   3403       case Iop_QSal64x2:
   3404       case Iop_QShl64x2:
   3405       case Iop_QAdd64Ux2:
   3406       case Iop_QAdd64Sx2:
   3407       case Iop_QSub64Ux2:
   3408       case Iop_QSub64Sx2:
   3409       case Iop_QAddExtUSsatSS64x2:
   3410       case Iop_QAddExtSUsatUU64x2:
   3411       case Iop_PolynomialMulAdd64x2:
   3412       case Iop_CipherV128:
   3413       case Iop_CipherLV128:
   3414       case Iop_NCipherV128:
   3415       case Iop_NCipherLV128:
    3416          return binary64Ix2(mce, vatom1, vatom2);
   3417 
   3418       case Iop_QNarrowBin64Sto32Sx4:
   3419       case Iop_QNarrowBin64Uto32Ux4:
   3420       case Iop_QNarrowBin32Sto16Sx8:
   3421       case Iop_QNarrowBin32Uto16Ux8:
   3422       case Iop_QNarrowBin32Sto16Ux8:
   3423       case Iop_QNarrowBin16Sto8Sx16:
   3424       case Iop_QNarrowBin16Uto8Ux16:
   3425       case Iop_QNarrowBin16Sto8Ux16:
   3426          return vectorNarrowBinV128(mce, op, vatom1, vatom2);
   3427 
   3428       case Iop_Min64Fx2:
   3429       case Iop_Max64Fx2:
   3430       case Iop_CmpLT64Fx2:
   3431       case Iop_CmpLE64Fx2:
   3432       case Iop_CmpEQ64Fx2:
   3433       case Iop_CmpUN64Fx2:
   3434       case Iop_RecipStep64Fx2:
   3435       case Iop_RSqrtStep64Fx2:
   3436          return binary64Fx2(mce, vatom1, vatom2);
   3437 
   3438       case Iop_Sub64F0x2:
   3439       case Iop_Mul64F0x2:
   3440       case Iop_Min64F0x2:
   3441       case Iop_Max64F0x2:
   3442       case Iop_Div64F0x2:
   3443       case Iop_CmpLT64F0x2:
   3444       case Iop_CmpLE64F0x2:
   3445       case Iop_CmpEQ64F0x2:
   3446       case Iop_CmpUN64F0x2:
   3447       case Iop_Add64F0x2:
   3448          return binary64F0x2(mce, vatom1, vatom2);
   3449 
   3450       case Iop_Min32Fx4:
   3451       case Iop_Max32Fx4:
   3452       case Iop_CmpLT32Fx4:
   3453       case Iop_CmpLE32Fx4:
   3454       case Iop_CmpEQ32Fx4:
   3455       case Iop_CmpUN32Fx4:
   3456       case Iop_CmpGT32Fx4:
   3457       case Iop_CmpGE32Fx4:
   3458       case Iop_RecipStep32Fx4:
   3459       case Iop_RSqrtStep32Fx4:
   3460          return binary32Fx4(mce, vatom1, vatom2);
   3461 
   3462       case Iop_Sub32Fx2:
   3463       case Iop_Mul32Fx2:
   3464       case Iop_Min32Fx2:
   3465       case Iop_Max32Fx2:
   3466       case Iop_CmpEQ32Fx2:
   3467       case Iop_CmpGT32Fx2:
   3468       case Iop_CmpGE32Fx2:
   3469       case Iop_Add32Fx2:
   3470       case Iop_RecipStep32Fx2:
   3471       case Iop_RSqrtStep32Fx2:
   3472          return binary32Fx2(mce, vatom1, vatom2);
   3473 
   3474       case Iop_Sub32F0x4:
   3475       case Iop_Mul32F0x4:
   3476       case Iop_Min32F0x4:
   3477       case Iop_Max32F0x4:
   3478       case Iop_Div32F0x4:
   3479       case Iop_CmpLT32F0x4:
   3480       case Iop_CmpLE32F0x4:
   3481       case Iop_CmpEQ32F0x4:
   3482       case Iop_CmpUN32F0x4:
   3483       case Iop_Add32F0x4:
   3484          return binary32F0x4(mce, vatom1, vatom2);
   3485 
   3486       case Iop_QShlNsatSU8x16:
   3487       case Iop_QShlNsatUU8x16:
   3488       case Iop_QShlNsatSS8x16:
   3489          complainIfUndefined(mce, atom2, NULL);
   3490          return mkPCast8x16(mce, vatom1);
   3491 
   3492       case Iop_QShlNsatSU16x8:
   3493       case Iop_QShlNsatUU16x8:
   3494       case Iop_QShlNsatSS16x8:
   3495          complainIfUndefined(mce, atom2, NULL);
   3496          return mkPCast16x8(mce, vatom1);
   3497 
   3498       case Iop_QShlNsatSU32x4:
   3499       case Iop_QShlNsatUU32x4:
   3500       case Iop_QShlNsatSS32x4:
   3501          complainIfUndefined(mce, atom2, NULL);
   3502          return mkPCast32x4(mce, vatom1);
   3503 
   3504       case Iop_QShlNsatSU64x2:
   3505       case Iop_QShlNsatUU64x2:
   3506       case Iop_QShlNsatSS64x2:
   3507          complainIfUndefined(mce, atom2, NULL);
   3508          return mkPCast32x4(mce, vatom1);
    3509          return mkPCast64x2(mce, vatom1); /* whole 64-bit lanes */
   3510       /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
   3511          To make this simpler, do the following:
   3512          * complain if the shift amount (the I8) is undefined
   3513          * pcast each lane at the wide width
   3514          * truncate each lane to half width
   3515          * pcast the resulting 64-bit value to a single bit and use
   3516            that as the least significant bit of the upper half of the
   3517            result. */
   3518       case Iop_QandQShrNnarrow64Uto32Ux2:
   3519       case Iop_QandQSarNnarrow64Sto32Sx2:
   3520       case Iop_QandQSarNnarrow64Sto32Ux2:
   3521       case Iop_QandQRShrNnarrow64Uto32Ux2:
   3522       case Iop_QandQRSarNnarrow64Sto32Sx2:
   3523       case Iop_QandQRSarNnarrow64Sto32Ux2:
   3524       case Iop_QandQShrNnarrow32Uto16Ux4:
   3525       case Iop_QandQSarNnarrow32Sto16Sx4:
   3526       case Iop_QandQSarNnarrow32Sto16Ux4:
   3527       case Iop_QandQRShrNnarrow32Uto16Ux4:
   3528       case Iop_QandQRSarNnarrow32Sto16Sx4:
   3529       case Iop_QandQRSarNnarrow32Sto16Ux4:
   3530       case Iop_QandQShrNnarrow16Uto8Ux8:
   3531       case Iop_QandQSarNnarrow16Sto8Sx8:
   3532       case Iop_QandQSarNnarrow16Sto8Ux8:
   3533       case Iop_QandQRShrNnarrow16Uto8Ux8:
   3534       case Iop_QandQRSarNnarrow16Sto8Sx8:
   3535       case Iop_QandQRSarNnarrow16Sto8Ux8:
   3536       {
   3537          IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
   3538          IROp opNarrow = Iop_INVALID;
   3539          switch (op) {
   3540             case Iop_QandQShrNnarrow64Uto32Ux2:
   3541             case Iop_QandQSarNnarrow64Sto32Sx2:
   3542             case Iop_QandQSarNnarrow64Sto32Ux2:
   3543             case Iop_QandQRShrNnarrow64Uto32Ux2:
   3544             case Iop_QandQRSarNnarrow64Sto32Sx2:
   3545             case Iop_QandQRSarNnarrow64Sto32Ux2:
   3546                fnPessim = mkPCast64x2;
   3547                opNarrow = Iop_NarrowUn64to32x2;
   3548                break;
   3549             case Iop_QandQShrNnarrow32Uto16Ux4:
   3550             case Iop_QandQSarNnarrow32Sto16Sx4:
   3551             case Iop_QandQSarNnarrow32Sto16Ux4:
   3552             case Iop_QandQRShrNnarrow32Uto16Ux4:
   3553             case Iop_QandQRSarNnarrow32Sto16Sx4:
   3554             case Iop_QandQRSarNnarrow32Sto16Ux4:
   3555                fnPessim = mkPCast32x4;
   3556                opNarrow = Iop_NarrowUn32to16x4;
   3557                break;
   3558             case Iop_QandQShrNnarrow16Uto8Ux8:
   3559             case Iop_QandQSarNnarrow16Sto8Sx8:
   3560             case Iop_QandQSarNnarrow16Sto8Ux8:
   3561             case Iop_QandQRShrNnarrow16Uto8Ux8:
   3562             case Iop_QandQRSarNnarrow16Sto8Sx8:
   3563             case Iop_QandQRSarNnarrow16Sto8Ux8:
   3564                fnPessim = mkPCast16x8;
   3565                opNarrow = Iop_NarrowUn16to8x8;
   3566                break;
   3567             default:
   3568                tl_assert(0);
   3569          }
   3570          complainIfUndefined(mce, atom2, NULL);
   3571          // Pessimised shift result
   3572          IRAtom* shV
   3573             = fnPessim(mce, vatom1);
   3574          // Narrowed, pessimised shift result
   3575          IRAtom* shVnarrowed
   3576             = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
   3577          // Generates: Def--(63)--Def PCast-to-I1(narrowed)
   3578          IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
   3579          // and assemble the result
   3580          return assignNew('V', mce, Ity_V128,
   3581                           binop(Iop_64HLtoV128, qV, shVnarrowed));
   3582       }
   3583 
   3584       case Iop_Mull32Sx2:
   3585       case Iop_Mull32Ux2:
   3586       case Iop_QDMull32Sx2:
   3587          return vectorWidenI64(mce, Iop_Widen32Sto64x2,
   3588                                     mkUifU64(mce, vatom1, vatom2));
   3589 
   3590       case Iop_Mull16Sx4:
   3591       case Iop_Mull16Ux4:
   3592       case Iop_QDMull16Sx4:
   3593          return vectorWidenI64(mce, Iop_Widen16Sto32x4,
   3594                                     mkUifU64(mce, vatom1, vatom2));
   3595 
   3596       case Iop_Mull8Sx8:
   3597       case Iop_Mull8Ux8:
   3598       case Iop_PolynomialMull8x8:
   3599          return vectorWidenI64(mce, Iop_Widen8Sto16x8,
   3600                                     mkUifU64(mce, vatom1, vatom2));
   3601 
   3602       case Iop_PwAdd32x4:
   3603          return mkPCast32x4(mce,
   3604                assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
   3605                      mkPCast32x4(mce, vatom2))));
   3606 
   3607       case Iop_PwAdd16x8:
   3608          return mkPCast16x8(mce,
   3609                assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
   3610                      mkPCast16x8(mce, vatom2))));
   3611 
   3612       case Iop_PwAdd8x16:
   3613          return mkPCast8x16(mce,
   3614                assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
   3615                      mkPCast8x16(mce, vatom2))));
   3616 
   3617       /* V128-bit data-steering */
   3618       case Iop_SetV128lo32:
   3619       case Iop_SetV128lo64:
   3620       case Iop_64HLtoV128:
   3621       case Iop_InterleaveLO64x2:
   3622       case Iop_InterleaveLO32x4:
   3623       case Iop_InterleaveLO16x8:
   3624       case Iop_InterleaveLO8x16:
   3625       case Iop_InterleaveHI64x2:
   3626       case Iop_InterleaveHI32x4:
   3627       case Iop_InterleaveHI16x8:
   3628       case Iop_InterleaveHI8x16:
   3629       case Iop_CatOddLanes8x16:
   3630       case Iop_CatOddLanes16x8:
   3631       case Iop_CatOddLanes32x4:
   3632       case Iop_CatEvenLanes8x16:
   3633       case Iop_CatEvenLanes16x8:
   3634       case Iop_CatEvenLanes32x4:
   3635       case Iop_InterleaveOddLanes8x16:
   3636       case Iop_InterleaveOddLanes16x8:
   3637       case Iop_InterleaveOddLanes32x4:
   3638       case Iop_InterleaveEvenLanes8x16:
   3639       case Iop_InterleaveEvenLanes16x8:
   3640       case Iop_InterleaveEvenLanes32x4:
   3641          return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
   3642 
   3643       case Iop_GetElem8x16:
   3644          complainIfUndefined(mce, atom2, NULL);
   3645          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
   3646       case Iop_GetElem16x8:
   3647          complainIfUndefined(mce, atom2, NULL);
   3648          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
   3649       case Iop_GetElem32x4:
   3650          complainIfUndefined(mce, atom2, NULL);
   3651          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
   3652       case Iop_GetElem64x2:
   3653          complainIfUndefined(mce, atom2, NULL);
   3654          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
   3655 
   3656      /* Perm8x16: rearrange values in left arg using steering values
   3657         from right arg.  So rearrange the vbits in the same way but
   3658         pessimise wrt steering values.  Perm32x4 ditto. */
   3659       case Iop_Perm8x16:
   3660          return mkUifUV128(
   3661                    mce,
   3662                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3663                    mkPCast8x16(mce, vatom2)
   3664                 );
   3665       case Iop_Perm32x4:
   3666          return mkUifUV128(
   3667                    mce,
   3668                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3669                    mkPCast32x4(mce, vatom2)
   3670                 );
   3671 
    3672      /* These two take the lower 16-bit half of each 32-bit lane,
    3673         sign/zero extend it to 32 bits, and multiply them together,
    3674         producing a 32x4 result (and implicitly ignoring half the
    3675         operand bits).  So treat it as a bunch of independent 16x8
    3676         operations, but then do 32-bit shifts left-right to copy the
    3677         lower half results (which are all 0s or all 1s due to
    3678         PCasting in binary16Ix8) into the upper half of each result lane. */
   3679       case Iop_MullEven16Ux8:
   3680       case Iop_MullEven16Sx8: {
   3681          IRAtom* at;
   3682          at = binary16Ix8(mce,vatom1,vatom2);
   3683          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
   3684          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
    3685          return at;
   3686       }
   3687 
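         /* Worked model, illustration only: the Shl/Sar trick above on one
            32-bit result lane of V bits.  After binary16Ix8 each 16-bit
            half is already all-0s or all-1s, and the trick copies the even
            (lower) half over the odd (upper) half, e.g.
            0x0000FFFF -> 0xFFFFFFFF and 0xFFFF0000 -> 0x00000000. */
         #if 0
         #include <stdint.h>

         static uint32_t model_mulleven_lane_vbits ( uint32_t lane_vbits )
         {
            uint32_t t = lane_vbits << 16;        /* ShlN32x4 by 16 */
            return (uint32_t)((int32_t)t >> 16);  /* SarN32x4 by 16 */
         }
         #endif
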
   3688       /* Same deal as Iop_MullEven16{S,U}x8 */
   3689       case Iop_MullEven8Ux16:
   3690       case Iop_MullEven8Sx16: {
   3691          IRAtom* at;
   3692          at = binary8Ix16(mce,vatom1,vatom2);
   3693          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
   3694          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
    3695          return at;
   3696       }
   3697 
   3698       /* Same deal as Iop_MullEven16{S,U}x8 */
   3699       case Iop_MullEven32Ux4:
   3700       case Iop_MullEven32Sx4: {
   3701          IRAtom* at;
   3702          at = binary32Ix4(mce,vatom1,vatom2);
   3703          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
   3704          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
   3705          return at;
   3706       }
   3707 
   3708       /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
   3709          32x4 -> 16x8 laneage, discarding the upper half of each lane.
    3710          Simply apply the same op to the V bits, since this is
    3711          really no more than a data-steering operation. */
   3712       case Iop_NarrowBin32to16x8:
   3713       case Iop_NarrowBin16to8x16:
   3714       case Iop_NarrowBin64to32x4:
   3715          return assignNew('V', mce, Ity_V128,
   3716                                     binop(op, vatom1, vatom2));
   3717 
   3718       case Iop_ShrV128:
   3719       case Iop_ShlV128:
   3720          /* Same scheme as with all other shifts.  Note: 10 Nov 05:
   3721             this is wrong now, scalar shifts are done properly lazily.
   3722             Vector shifts should be fixed too. */
   3723          complainIfUndefined(mce, atom2, NULL);
   3724          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3725 
   3726       /* SHA Iops */
   3727       case Iop_SHA256:
   3728       case Iop_SHA512:
   3729          complainIfUndefined(mce, atom2, NULL);
   3730          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3731 
   3732       /* I128-bit data-steering */
   3733       case Iop_64HLto128:
   3734          return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
   3735 
   3736       /* V256-bit SIMD */
   3737 
   3738       case Iop_Max64Fx4:
   3739       case Iop_Min64Fx4:
   3740          return binary64Fx4(mce, vatom1, vatom2);
   3741 
   3742       case Iop_Max32Fx8:
   3743       case Iop_Min32Fx8:
   3744          return binary32Fx8(mce, vatom1, vatom2);
   3745 
   3746       /* V256-bit data-steering */
   3747       case Iop_V128HLtoV256:
   3748          return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
   3749 
   3750       /* Scalar floating point */
   3751 
   3752       case Iop_F32toI64S:
   3753       case Iop_F32toI64U:
   3754          /* I32(rm) x F32 -> I64 */
   3755          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3756 
   3757       case Iop_I64StoF32:
   3758          /* I32(rm) x I64 -> F32 */
   3759          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3760 
   3761       case Iop_RoundF64toInt:
   3762       case Iop_RoundF64toF32:
   3763       case Iop_F64toI64S:
   3764       case Iop_F64toI64U:
   3765       case Iop_I64StoF64:
   3766       case Iop_I64UtoF64:
   3767       case Iop_SinF64:
   3768       case Iop_CosF64:
   3769       case Iop_TanF64:
   3770       case Iop_2xm1F64:
   3771       case Iop_SqrtF64:
   3772       case Iop_RecpExpF64:
   3773          /* I32(rm) x I64/F64 -> I64/F64 */
   3774          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3775 
   3776       case Iop_ShlD64:
   3777       case Iop_ShrD64:
   3778       case Iop_RoundD64toInt:
   3779          /* I32(rm) x D64 -> D64 */
   3780          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3781 
   3782       case Iop_ShlD128:
   3783       case Iop_ShrD128:
   3784       case Iop_RoundD128toInt:
   3785          /* I32(rm) x D128 -> D128 */
   3786          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3787 
   3788       case Iop_D64toI64S:
   3789       case Iop_D64toI64U:
   3790       case Iop_I64StoD64:
   3791       case Iop_I64UtoD64:
   3792          /* I32(rm) x I64/D64 -> D64/I64 */
   3793          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3794 
   3795       case Iop_F32toD32:
   3796       case Iop_F64toD32:
   3797       case Iop_F128toD32:
   3798       case Iop_D32toF32:
   3799       case Iop_D64toF32:
   3800       case Iop_D128toF32:
   3801          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
   3802          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3803 
   3804       case Iop_F32toD64:
   3805       case Iop_F64toD64:
   3806       case Iop_F128toD64:
   3807       case Iop_D32toF64:
   3808       case Iop_D64toF64:
   3809       case Iop_D128toF64:
   3810          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
   3811          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3812 
   3813       case Iop_F32toD128:
   3814       case Iop_F64toD128:
   3815       case Iop_F128toD128:
   3816       case Iop_D32toF128:
   3817       case Iop_D64toF128:
   3818       case Iop_D128toF128:
   3819          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
   3820          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3821 
   3822       case Iop_RoundF32toInt:
   3823       case Iop_SqrtF32:
   3824       case Iop_RecpExpF32:
   3825          /* I32(rm) x I32/F32 -> I32/F32 */
   3826          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3827 
   3828       case Iop_SqrtF128:
   3829          /* I32(rm) x F128 -> F128 */
   3830          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3831 
   3832       case Iop_I32StoF32:
   3833       case Iop_I32UtoF32:
   3834       case Iop_F32toI32S:
   3835       case Iop_F32toI32U:
   3836          /* First arg is I32 (rounding mode), second is F32/I32 (data). */
   3837          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3838 
   3839       case Iop_F64toF16:
   3840       case Iop_F32toF16:
   3841          /* First arg is I32 (rounding mode), second is F64/F32 (data). */
   3842          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
   3843 
   3844       case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32  */
   3845       case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32  */
   3846       case Iop_F128toF32:  /* IRRoundingMode(I32) x F128 -> F32         */
   3847       case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32  */
   3848       case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32  */
   3849          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3850 
   3851       case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64  */
   3852       case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64  */
   3853       case Iop_F128toF64:  /* IRRoundingMode(I32) x F128 -> F64         */
    3854       case Iop_D128toD64:  /* IRRoundingMode(I32) x D128 -> D64 */
    3855       case Iop_D128toI64S: /* IRRoundingMode(I32) x D128 -> signed I64  */
   3856       case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64  */
   3857          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3858 
   3859       case Iop_F64HLtoF128:
   3860       case Iop_D64HLtoD128:
   3861          return assignNew('V', mce, Ity_I128,
   3862                           binop(Iop_64HLto128, vatom1, vatom2));
   3863 
   3864       case Iop_F64toI32U:
   3865       case Iop_F64toI32S:
   3866       case Iop_F64toF32:
   3867       case Iop_I64UtoF32:
   3868       case Iop_D64toI32U:
   3869       case Iop_D64toI32S:
    3870          /* First arg is I32 (rounding mode), second is F64/D64/I64 (data). */
   3871          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3872 
   3873       case Iop_D64toD32:
   3874          /* First arg is I32 (rounding mode), second is D64 (data). */
   3875          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3876 
   3877       case Iop_F64toI16S:
   3878          /* First arg is I32 (rounding mode), second is F64 (data). */
   3879          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
   3880 
   3881       case Iop_InsertExpD64:
    3882          /* I64 x D64 -> D64 */
   3883          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3884 
   3885       case Iop_InsertExpD128:
    3886          /* I64 x D128 -> D128 */
   3887          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3888 
   3889       case Iop_CmpF32:
   3890       case Iop_CmpF64:
   3891       case Iop_CmpF128:
   3892       case Iop_CmpD64:
   3893       case Iop_CmpD128:
   3894       case Iop_CmpExpD64:
   3895       case Iop_CmpExpD128:
   3896          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3897 
   3898       /* non-FP after here */
   3899 
   3900       case Iop_DivModU64to32:
   3901       case Iop_DivModS64to32:
   3902          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3903 
   3904       case Iop_DivModU128to64:
   3905       case Iop_DivModS128to64:
   3906          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3907 
   3908       case Iop_8HLto16:
   3909          return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
   3910       case Iop_16HLto32:
   3911          return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
   3912       case Iop_32HLto64:
   3913          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
   3914 
   3915       case Iop_DivModS64to64:
   3916       case Iop_MullS64:
   3917       case Iop_MullU64: {
   3918          IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
   3919          IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
   3920          return assignNew('V', mce, Ity_I128,
   3921                           binop(Iop_64HLto128, vHi64, vLo64));
   3922       }
   3923 
   3924       case Iop_MullS32:
   3925       case Iop_MullU32: {
   3926          IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
   3927          IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
   3928          return assignNew('V', mce, Ity_I64,
   3929                           binop(Iop_32HLto64, vHi32, vLo32));
   3930       }
   3931 
   3932       case Iop_MullS16:
   3933       case Iop_MullU16: {
   3934          IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
   3935          IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
   3936          return assignNew('V', mce, Ity_I32,
   3937                           binop(Iop_16HLto32, vHi16, vLo16));
   3938       }
   3939 
   3940       case Iop_MullS8:
   3941       case Iop_MullU8: {
   3942          IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
   3943          IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
   3944          return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
   3945       }
   3946 
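         /* Illustrative sketch only: a model of the widening-multiply
            scheme above, for MullU32/MullS32.  mkLeft32 smears
            undefinedness towards the MSB, since a carry out of an undefined
            bit can disturb any higher bit (the Left operation is x | -x),
            and the high half of the widened shadow is the PCast of the low
            half. */
         #if 0
         #include <stdint.h>

         static uint64_t model_mull32_vbits ( uint32_t v1, uint32_t v2 )
         {
            uint32_t u   = v1 | v2;                    /* UifU32 */
            uint32_t vLo = u | (0u - u);               /* mkLeft32: x | -x */
            uint32_t vHi = vLo != 0 ? 0xFFFFFFFFu : 0; /* mkPCastTo I32 */
            return ((uint64_t)vHi << 32) | vLo;        /* Iop_32HLto64 */
         }
         #endif
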
    3947       case Iop_Sad8Ux4: /* maybe we could do better?  For the moment, do mkLazy2. */
   3948       case Iop_DivS32:
   3949       case Iop_DivU32:
   3950       case Iop_DivU32E:
   3951       case Iop_DivS32E:
   3952       case Iop_QAdd32S: /* could probably do better */
   3953       case Iop_QSub32S: /* could probably do better */
   3954          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3955 
   3956       case Iop_DivS64:
   3957       case Iop_DivU64:
   3958       case Iop_DivS64E:
   3959       case Iop_DivU64E:
   3960          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3961 
   3962       case Iop_Add32:
   3963          if (mce->bogusLiterals || mce->useLLVMworkarounds)
   3964             return expensiveAddSub(mce,True,Ity_I32,
   3965                                    vatom1,vatom2, atom1,atom2);
   3966          else
   3967             goto cheap_AddSub32;
   3968       case Iop_Sub32:
   3969          if (mce->bogusLiterals)
   3970             return expensiveAddSub(mce,False,Ity_I32,
   3971                                    vatom1,vatom2, atom1,atom2);
   3972          else
   3973             goto cheap_AddSub32;
   3974 
   3975       cheap_AddSub32:
   3976       case Iop_Mul32:
   3977          return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
   3978 
   3979       case Iop_CmpORD32S:
   3980       case Iop_CmpORD32U:
   3981       case Iop_CmpORD64S:
   3982       case Iop_CmpORD64U:
   3983          return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
   3984 
   3985       case Iop_Add64:
   3986          if (mce->bogusLiterals || mce->useLLVMworkarounds)
   3987             return expensiveAddSub(mce,True,Ity_I64,
   3988                                    vatom1,vatom2, atom1,atom2);
   3989          else
   3990             goto cheap_AddSub64;
   3991       case Iop_Sub64:
   3992          if (mce->bogusLiterals)
   3993             return expensiveAddSub(mce,False,Ity_I64,
   3994                                    vatom1,vatom2, atom1,atom2);
   3995          else
   3996             goto cheap_AddSub64;
   3997 
   3998       cheap_AddSub64:
   3999       case Iop_Mul64:
   4000          return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
   4001 
   4002       case Iop_Mul16:
   4003       case Iop_Add16:
   4004       case Iop_Sub16:
   4005          return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
   4006 
   4007       case Iop_Mul8:
   4008       case Iop_Sub8:
   4009       case Iop_Add8:
   4010          return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
   4011 
   4012       case Iop_CmpEQ64:
   4013       case Iop_CmpNE64:
   4014          if (mce->bogusLiterals)
   4015             goto expensive_cmp64;
   4016          else
   4017             goto cheap_cmp64;
   4018 
   4019       expensive_cmp64:
   4020       case Iop_ExpCmpNE64:
   4021          return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
   4022 
   4023       cheap_cmp64:
   4024       case Iop_CmpLE64S: case Iop_CmpLE64U:
   4025       case Iop_CmpLT64U: case Iop_CmpLT64S:
   4026          return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
   4027 
   4028       case Iop_CmpEQ32:
   4029       case Iop_CmpNE32:
   4030          if (mce->bogusLiterals)
   4031             goto expensive_cmp32;
   4032          else
   4033             goto cheap_cmp32;
   4034 
   4035       expensive_cmp32:
   4036       case Iop_ExpCmpNE32:
   4037          return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
   4038 
   4039       cheap_cmp32:
   4040       case Iop_CmpLE32S: case Iop_CmpLE32U:
   4041       case Iop_CmpLT32U: case Iop_CmpLT32S:
   4042          return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
   4043 
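         /* Illustrative sketch only: a model of the cheap comparison scheme
            above.  The I1 shadow of the result is undefined iff any bit of
            either operand is undefined; the expensive variant is reserved
            for the bogusLiterals case, where this pessimism would be too
            noisy. */
         #if 0
         #include <stdint.h>

         static int model_cheap_cmp32_vbit ( uint32_t v1, uint32_t v2 )
         {
            return (v1 | v2) != 0;   /* 1 = result bit undefined */
         }
         #endif
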
   4044       case Iop_CmpEQ16: case Iop_CmpNE16:
   4045          return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
   4046 
   4047       case Iop_ExpCmpNE16:
   4048          return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
   4049 
   4050       case Iop_CmpEQ8: case Iop_CmpNE8:
   4051          return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
   4052 
   4053       case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
   4054       case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
   4055       case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
   4056       case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
   4057          /* Just say these all produce a defined result, regardless
   4058             of their arguments.  See COMMENT_ON_CasCmpEQ in this file. */
   4059          return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
   4060 
   4061       case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
   4062          return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
   4063 
   4064       case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
   4065          return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
   4066 
   4067       case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
   4068          return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
   4069 
   4070       case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
   4071          return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
   4072 
   4073       case Iop_AndV256:
   4074          uifu = mkUifUV256; difd = mkDifDV256;
   4075          and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
   4076       case Iop_AndV128:
   4077          uifu = mkUifUV128; difd = mkDifDV128;
   4078          and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
   4079       case Iop_And64:
   4080          uifu = mkUifU64; difd = mkDifD64;
   4081          and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
   4082       case Iop_And32:
   4083          uifu = mkUifU32; difd = mkDifD32;
   4084          and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
   4085       case Iop_And16:
   4086          uifu = mkUifU16; difd = mkDifD16;
   4087          and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
   4088       case Iop_And8:
   4089          uifu = mkUifU8; difd = mkDifD8;
   4090          and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
   4091 
   4092       case Iop_OrV256:
   4093          uifu = mkUifUV256; difd = mkDifDV256;
   4094          and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
   4095       case Iop_OrV128:
   4096          uifu = mkUifUV128; difd = mkDifDV128;
   4097          and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
   4098       case Iop_Or64:
   4099          uifu = mkUifU64; difd = mkDifD64;
   4100          and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
   4101       case Iop_Or32:
   4102          uifu = mkUifU32; difd = mkDifD32;
   4103          and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
   4104       case Iop_Or16:
   4105          uifu = mkUifU16; difd = mkDifD16;
   4106          and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
   4107       case Iop_Or8:
   4108          uifu = mkUifU8; difd = mkDifD8;
   4109          and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
   4110 
   4111       do_And_Or:
   4112          return
   4113          assignNew(
   4114             'V', mce,
   4115             and_or_ty,
   4116             difd(mce, uifu(mce, vatom1, vatom2),
   4117                       difd(mce, improve(mce, atom1, vatom1),
   4118                                 improve(mce, atom2, vatom2) ) ) );
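
                  /* For the AND cases, a worked 8-bit example (a vbit of 1
                     means "undefined"; uifu = Or, difd = And, and
                     improve(data,vbits) = data | vbits):
                        a1 = 0xA0, v1 = 0x0F   (low 4 bits of arg1 undefined)
                        a2 = 0x0C, v2 = 0x00   (arg2 fully defined)
                     gives
                        uifu(v1,v2)    = 0x0F
                        improve(a1,v1) = 0xAF
                        improve(a2,v2) = 0x0C
                        result vbits   = 0x0F & 0xAF & 0x0C = 0x0C
                     so only bits 3:2 are undefined.  Everywhere else, either
                     both args are defined or a definite 0 in a2 forces the
                     AND result to a definite 0.  The OR cases are dual, with
                     improve(data,vbits) = ~data | vbits, so that a definite
                     1 forces a definite 1. */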
   4119 
   4120       case Iop_Xor8:
   4121          return mkUifU8(mce, vatom1, vatom2);
   4122       case Iop_Xor16:
   4123          return mkUifU16(mce, vatom1, vatom2);
   4124       case Iop_Xor32:
   4125          return mkUifU32(mce, vatom1, vatom2);
   4126       case Iop_Xor64:
   4127          return mkUifU64(mce, vatom1, vatom2);
   4128       case Iop_XorV128:
   4129          return mkUifUV128(mce, vatom1, vatom2);
   4130       case Iop_XorV256:
   4131          return mkUifUV256(mce, vatom1, vatom2);
   4132 
    4133       /* 256-bit SIMD */
   4134 
   4135       case Iop_ShrN16x16:
   4136       case Iop_ShrN32x8:
   4137       case Iop_ShrN64x4:
   4138       case Iop_SarN16x16:
   4139       case Iop_SarN32x8:
   4140       case Iop_ShlN16x16:
   4141       case Iop_ShlN32x8:
   4142       case Iop_ShlN64x4:
    4143          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
    4144             this is out of date now -- scalar shifts are done properly
    4145             lazily.  Vector shifts should be fixed too. */
   4146          complainIfUndefined(mce, atom2, NULL);
   4147          return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
   4148 
   4149       case Iop_QSub8Ux32:
   4150       case Iop_QSub8Sx32:
   4151       case Iop_Sub8x32:
   4152       case Iop_Min8Ux32:
   4153       case Iop_Min8Sx32:
   4154       case Iop_Max8Ux32:
   4155       case Iop_Max8Sx32:
   4156       case Iop_CmpGT8Sx32:
   4157       case Iop_CmpEQ8x32:
   4158       case Iop_Avg8Ux32:
   4159       case Iop_QAdd8Ux32:
   4160       case Iop_QAdd8Sx32:
   4161       case Iop_Add8x32:
   4162          return binary8Ix32(mce, vatom1, vatom2);
   4163 
   4164       case Iop_QSub16Ux16:
   4165       case Iop_QSub16Sx16:
   4166       case Iop_Sub16x16:
   4167       case Iop_Mul16x16:
   4168       case Iop_MulHi16Sx16:
   4169       case Iop_MulHi16Ux16:
   4170       case Iop_Min16Sx16:
   4171       case Iop_Min16Ux16:
   4172       case Iop_Max16Sx16:
   4173       case Iop_Max16Ux16:
   4174       case Iop_CmpGT16Sx16:
   4175       case Iop_CmpEQ16x16:
   4176       case Iop_Avg16Ux16:
   4177       case Iop_QAdd16Ux16:
   4178       case Iop_QAdd16Sx16:
   4179       case Iop_Add16x16:
   4180          return binary16Ix16(mce, vatom1, vatom2);
   4181 
   4182       case Iop_Sub32x8:
   4183       case Iop_CmpGT32Sx8:
   4184       case Iop_CmpEQ32x8:
   4185       case Iop_Add32x8:
   4186       case Iop_Max32Ux8:
   4187       case Iop_Max32Sx8:
   4188       case Iop_Min32Ux8:
   4189       case Iop_Min32Sx8:
   4190       case Iop_Mul32x8:
   4191          return binary32Ix8(mce, vatom1, vatom2);
   4192 
   4193       case Iop_Sub64x4:
   4194       case Iop_Add64x4:
   4195       case Iop_CmpEQ64x4:
   4196       case Iop_CmpGT64Sx4:
   4197          return binary64Ix4(mce, vatom1, vatom2);
   4198 
    4199       /* Perm32x8: rearrange values in left arg using steering values
    4200          from right arg.  So rearrange the vbits in the same way, but
    4201          pessimise w.r.t. the steering values. */
   4202       case Iop_Perm32x8:
   4203          return mkUifUV256(
   4204                    mce,
   4205                    assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
   4206                    mkPCast32x8(mce, vatom2)
   4207                 );
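                  /* Hence if any bit of lane i of the steering vector is
                     undefined, mkPCast32x8 turns that whole lane into
                     all-ones and the UifU marks all 32 bits of output lane i
                     as undefined; fully defined steering lanes contribute
                     nothing, leaving just the permuted input vbits. */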
   4208 
   4209       /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
   4210          Handle the shifted results in the same way that other
    4211          binary Q ops are handled, e.g. QSub: UifU the two args,
    4212          then pessimise -- which is binaryNIxM.  But for the upper
    4213          V128, we need to generate just 1 bit, which is the
    4214          pessimised shift result, with 127 defined zeroes above it.
    4215 
    4216          Note that this is overly pessimistic, in that in fact only the
    4217          bottom 8 bits of each lane of the second arg determine the shift
    4218          amount.  Really we ought to ignore any undefinedness in the
    4219          rest of each lane of the second arg. */
   4220       case Iop_QandSQsh64x2:  case Iop_QandUQsh64x2:
   4221       case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
   4222       case Iop_QandSQsh32x4:  case Iop_QandUQsh32x4:
   4223       case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
   4224       case Iop_QandSQsh16x8:  case Iop_QandUQsh16x8:
   4225       case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
   4226       case Iop_QandSQsh8x16:  case Iop_QandUQsh8x16:
   4227       case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
   4228       {
   4229          // The function to generate the pessimised shift result
   4230          IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
   4231          switch (op) {
   4232             case Iop_QandSQsh64x2:
   4233             case Iop_QandUQsh64x2:
   4234             case Iop_QandSQRsh64x2:
   4235             case Iop_QandUQRsh64x2:
   4236                binaryNIxM = binary64Ix2;
   4237                break;
   4238             case Iop_QandSQsh32x4:
   4239             case Iop_QandUQsh32x4:
   4240             case Iop_QandSQRsh32x4:
   4241             case Iop_QandUQRsh32x4:
   4242                binaryNIxM = binary32Ix4;
   4243                break;
   4244             case Iop_QandSQsh16x8:
   4245             case Iop_QandUQsh16x8:
   4246             case Iop_QandSQRsh16x8:
   4247             case Iop_QandUQRsh16x8:
   4248                binaryNIxM = binary16Ix8;
   4249                break;
   4250             case Iop_QandSQsh8x16:
   4251             case Iop_QandUQsh8x16:
   4252             case Iop_QandSQRsh8x16:
   4253             case Iop_QandUQRsh8x16:
   4254                binaryNIxM = binary8Ix16;
   4255                break;
   4256             default:
   4257                tl_assert(0);
   4258          }
   4259          tl_assert(binaryNIxM);
   4260          // Pessimised shift result, shV[127:0]
   4261          IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
   4262          // Generates: Def--(127)--Def PCast-to-I1(shV)
   4263          IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
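                  // That is, bit 0 of qV is undefined iff any bit of shV is
                  // undefined, and bits 127:1 of qV are marked defined.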
   4264          // and assemble the result
   4265          return assignNew('V', mce, Ity_V256,
   4266                           binop(Iop_V128HLtoV256, qV, shV));
   4267       }
   4268 
   4269       default:
   4270          ppIROp(op);
   4271          VG_(tool_panic)("memcheck:expr2vbits_Binop");
   4272    }
   4273 }
   4274 
   4275 
   4276 static
   4277 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
   4278 {
   4279    /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
   4280       selection of shadow operation implicitly duplicates the logic in
   4281       do_shadow_LoadG and should be kept in sync (in the very unlikely
   4282       event that the interpretation of such widening ops changes in
   4283       future).  See comment in do_shadow_LoadG. */
   4284    IRAtom* vatom = expr2vbits( mce, atom );
   4285    tl_assert(isOriginalAtom(mce,atom));
   4286    switch (op) {
   4287 
   4288       case Iop_Abs64Fx2:
   4289       case Iop_Neg64Fx2:
   4290       case Iop_RSqrtEst64Fx2:
   4291       case Iop_RecipEst64Fx2:
   4292          return unary64Fx2(mce, vatom);
   4293 
   4294       case Iop_Sqrt64F0x2:
   4295          return unary64F0x2(mce, vatom);
   4296 
   4297       case Iop_Sqrt32Fx8:
   4298       case Iop_RSqrtEst32Fx8:
   4299       case Iop_RecipEst32Fx8:
   4300          return unary32Fx8(mce, vatom);
   4301 
   4302       case Iop_Sqrt64Fx4:
   4303          return unary64Fx4(mce, vatom);
   4304 
   4305       case Iop_RecipEst32Fx4:
   4306       case Iop_I32UtoFx4:
   4307       case Iop_I32StoFx4:
   4308       case Iop_QFtoI32Ux4_RZ:
   4309       case Iop_QFtoI32Sx4_RZ:
   4310       case Iop_RoundF32x4_RM:
   4311       case Iop_RoundF32x4_RP:
   4312       case Iop_RoundF32x4_RN:
   4313       case Iop_RoundF32x4_RZ:
   4314       case Iop_RecipEst32Ux4:
   4315       case Iop_Abs32Fx4:
   4316       case Iop_Neg32Fx4:
   4317       case Iop_RSqrtEst32Fx4:
   4318          return unary32Fx4(mce, vatom);
   4319 
   4320       case Iop_I32UtoFx2:
   4321       case Iop_I32StoFx2:
   4322       case Iop_RecipEst32Fx2:
   4323       case Iop_RecipEst32Ux2:
   4324       case Iop_Abs32Fx2:
   4325       case Iop_Neg32Fx2:
   4326       case Iop_RSqrtEst32Fx2:
   4327          return unary32Fx2(mce, vatom);
   4328 
   4329       case Iop_Sqrt32F0x4:
   4330       case Iop_RSqrtEst32F0x4:
   4331       case Iop_RecipEst32F0x4:
   4332          return unary32F0x4(mce, vatom);
   4333 
   4334       case Iop_32UtoV128:
   4335       case Iop_64UtoV128:
   4336       case Iop_Dup8x16:
   4337       case Iop_Dup16x8:
   4338       case Iop_Dup32x4:
   4339       case Iop_Reverse1sIn8_x16:
   4340       case Iop_Reverse8sIn16_x8:
   4341       case Iop_Reverse8sIn32_x4:
   4342       case Iop_Reverse16sIn32_x4:
   4343       case Iop_Reverse8sIn64_x2:
   4344       case Iop_Reverse16sIn64_x2:
   4345       case Iop_Reverse32sIn64_x2:
   4346       case Iop_V256toV128_1: case Iop_V256toV128_0:
   4347       case Iop_ZeroHI64ofV128:
   4348       case Iop_ZeroHI96ofV128:
   4349       case Iop_ZeroHI112ofV128:
   4350       case Iop_ZeroHI120ofV128:
   4351          return assignNew('V', mce, Ity_V128, unop(op, vatom));
   4352 
   4353       case Iop_F128HItoF64:  /* F128 -> high half of F128 */
   4354       case Iop_D128HItoD64:  /* D128 -> high half of D128 */
   4355          return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
   4356       case Iop_F128LOtoF64:  /* F128 -> low  half of F128 */
   4357       case Iop_D128LOtoD64:  /* D128 -> low  half of D128 */
   4358          return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
   4359 
   4360       case Iop_NegF128:
   4361       case Iop_AbsF128:
   4362          return mkPCastTo(mce, Ity_I128, vatom);
   4363 
   4364       case Iop_I32StoF128: /* signed I32 -> F128 */
   4365       case Iop_I64StoF128: /* signed I64 -> F128 */
   4366       case Iop_I32UtoF128: /* unsigned I32 -> F128 */
   4367       case Iop_I64UtoF128: /* unsigned I64 -> F128 */
   4368       case Iop_F32toF128:  /* F32 -> F128 */
   4369       case Iop_F64toF128:  /* F64 -> F128 */
    4370       case Iop_I32StoD128: /* signed I32 -> D128 */
   4371       case Iop_I64StoD128: /* signed I64 -> D128 */
   4372       case Iop_I32UtoD128: /* unsigned I32 -> D128 */
   4373       case Iop_I64UtoD128: /* unsigned I64 -> D128 */
   4374          return mkPCastTo(mce, Ity_I128, vatom);
   4375 
   4376       case Iop_F16toF64:
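               /* These all produce an all-or-nothing result: e.g. for
                  Iop_Clz64, if any bit of the operand is undefined then the
                  leading-zero count cannot be known, so all 64 bits of the
                  result are pessimistically flagged undefined. */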
   4377       case Iop_F32toF64:
   4378       case Iop_I32StoF64:
   4379       case Iop_I32UtoF64:
   4380       case Iop_NegF64:
   4381       case Iop_AbsF64:
   4382       case Iop_RSqrtEst5GoodF64:
   4383       case Iop_RoundF64toF64_NEAREST:
   4384       case Iop_RoundF64toF64_NegINF:
   4385       case Iop_RoundF64toF64_PosINF:
   4386       case Iop_RoundF64toF64_ZERO:
   4387       case Iop_Clz64:
   4388       case Iop_D32toD64:
   4389       case Iop_I32StoD64:
   4390       case Iop_I32UtoD64:
   4391       case Iop_ExtractExpD64:    /* D64  -> I64 */
   4392       case Iop_ExtractExpD128:   /* D128 -> I64 */
   4393       case Iop_ExtractSigD64:    /* D64  -> I64 */
   4394       case Iop_ExtractSigD128:   /* D128 -> I64 */
   4395       case Iop_DPBtoBCD:
   4396       case Iop_BCDtoDPB:
   4397          return mkPCastTo(mce, Ity_I64, vatom);
   4398 
   4399       case Iop_D64toD128:
   4400          return mkPCastTo(mce, Ity_I128, vatom);
   4401 
   4402       case Iop_Clz32:
   4403       case Iop_TruncF64asF32:
   4404       case Iop_NegF32:
   4405       case Iop_AbsF32:
   4406       case Iop_F16toF32:
   4407          return mkPCastTo(mce, Ity_I32, vatom);
   4408 
   4409       case Iop_Ctz32:
   4410       case Iop_Ctz64:
   4411          return expensiveCountTrailingZeroes(mce, op, atom, vatom);
   4412 
   4413       case Iop_1Uto64:
   4414       case Iop_1Sto64:
   4415       case Iop_8Uto64:
   4416       case Iop_8Sto64:
   4417       case Iop_16Uto64:
   4418       case Iop_16Sto64:
   4419       case Iop_32Sto64:
   4420       case Iop_32Uto64:
   4421       case Iop_V128to64:
   4422       case Iop_V128HIto64:
   4423       case Iop_128HIto64:
   4424       case Iop_128to64:
   4425       case Iop_Dup8x8:
   4426       case Iop_Dup16x4:
   4427       case Iop_Dup32x2:
   4428       case Iop_Reverse8sIn16_x4:
   4429       case Iop_Reverse8sIn32_x2:
   4430       case Iop_Reverse16sIn32_x2:
   4431       case Iop_Reverse8sIn64_x1:
   4432       case Iop_Reverse16sIn64_x1:
   4433       case Iop_Reverse32sIn64_x1:
   4434       case Iop_V256to64_0: case Iop_V256to64_1:
   4435       case Iop_V256to64_2: case Iop_V256to64_3:
   4436          return assignNew('V', mce, Ity_I64, unop(op, vatom));
   4437 
   4438       case Iop_64to32:
   4439       case Iop_64HIto32:
   4440       case Iop_1Uto32:
   4441       case Iop_1Sto32:
   4442       case Iop_8Uto32:
   4443       case Iop_16Uto32:
   4444       case Iop_16Sto32:
   4445       case Iop_8Sto32:
   4446       case Iop_V128to32:
   4447          return assignNew('V', mce, Ity_I32, unop(op, vatom));
   4448 
   4449       case Iop_8Sto16:
   4450       case Iop_8Uto16:
   4451       case Iop_32to16:
   4452       case Iop_32HIto16:
   4453       case Iop_64to16:
   4454       case Iop_GetMSBs8x16:
   4455          return assignNew('V', mce, Ity_I16, unop(op, vatom));
   4456 
   4457       case Iop_1Uto8:
   4458       case Iop_1Sto8:
   4459       case Iop_16to8:
   4460       case Iop_16HIto8:
   4461       case Iop_32to8:
   4462       case Iop_64to8:
   4463       case Iop_GetMSBs8x8:
   4464          return assignNew('V', mce, Ity_I8, unop(op, vatom));
   4465 
   4466       case Iop_32to1:
   4467          return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
   4468 
   4469       case Iop_64to1:
   4470          return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
   4471 
   4472       case Iop_ReinterpF64asI64:
   4473       case Iop_ReinterpI64asF64:
   4474       case Iop_ReinterpI32asF32:
   4475       case Iop_ReinterpF32asI32:
   4476       case Iop_ReinterpI64asD64:
   4477       case Iop_ReinterpD64asI64:
   4478       case Iop_NotV256:
   4479       case Iop_NotV128:
   4480       case Iop_Not64:
   4481       case Iop_Not32:
   4482       case Iop_Not16:
   4483       case Iop_Not8:
   4484       case Iop_Not1:
   4485          return vatom;
   4486 
   4487       case Iop_CmpNEZ8x8:
   4488       case Iop_Cnt8x8:
   4489       case Iop_Clz8x8:
   4490       case Iop_Cls8x8:
   4491       case Iop_Abs8x8:
   4492          return mkPCast8x8(mce, vatom);
   4493 
   4494       case Iop_CmpNEZ8x16:
   4495       case Iop_Cnt8x16:
   4496       case Iop_Clz8x16:
   4497       case Iop_Cls8x16:
   4498       case Iop_Abs8x16:
   4499          return mkPCast8x16(mce, vatom);
   4500 
   4501       case Iop_CmpNEZ16x4:
   4502       case Iop_Clz16x4:
   4503       case Iop_Cls16x4:
   4504       case Iop_Abs16x4:
   4505          return mkPCast16x4(mce, vatom);
   4506 
   4507       case Iop_CmpNEZ16x8:
   4508       case Iop_Clz16x8:
   4509       case Iop_Cls16x8:
   4510       case Iop_Abs16x8:
   4511          return mkPCast16x8(mce, vatom);
   4512 
   4513       case Iop_CmpNEZ32x2:
   4514       case Iop_Clz32x2:
   4515       case Iop_Cls32x2:
   4516       case Iop_FtoI32Ux2_RZ:
   4517       case Iop_FtoI32Sx2_RZ:
   4518       case Iop_Abs32x2:
   4519          return mkPCast32x2(mce, vatom);
   4520 
   4521       case Iop_CmpNEZ32x4:
   4522       case Iop_Clz32x4:
   4523       case Iop_Cls32x4:
   4524       case Iop_FtoI32Ux4_RZ:
   4525       case Iop_FtoI32Sx4_RZ:
   4526       case Iop_Abs32x4:
   4527       case Iop_RSqrtEst32Ux4:
   4528          return mkPCast32x4(mce, vatom);
   4529 
   4530       case Iop_CmpwNEZ32:
   4531          return mkPCastTo(mce, Ity_I32, vatom);
   4532 
   4533       case Iop_CmpwNEZ64:
   4534          return mkPCastTo(mce, Ity_I64, vatom);
   4535 
   4536       case Iop_CmpNEZ64x2:
   4537       case Iop_CipherSV128:
   4538       case Iop_Clz64x2:
   4539       case Iop_Abs64x2:
   4540          return mkPCast64x2(mce, vatom);
   4541 
   4542       case Iop_PwBitMtxXpose64x2:
   4543          return assignNew('V', mce, Ity_V128, unop(op, vatom));
   4544 
   4545       case Iop_NarrowUn16to8x8:
   4546       case Iop_NarrowUn32to16x4:
   4547       case Iop_NarrowUn64to32x2:
   4548       case Iop_QNarrowUn16Sto8Sx8:
   4549       case Iop_QNarrowUn16Sto8Ux8:
   4550       case Iop_QNarrowUn16Uto8Ux8:
   4551       case Iop_QNarrowUn32Sto16Sx4:
   4552       case Iop_QNarrowUn32Sto16Ux4:
   4553       case Iop_QNarrowUn32Uto16Ux4:
   4554       case Iop_QNarrowUn64Sto32Sx2:
   4555       case Iop_QNarrowUn64Sto32Ux2:
   4556       case Iop_QNarrowUn64Uto32Ux2:
   4557          return vectorNarrowUnV128(mce, op, vatom);
   4558 
   4559       case Iop_Widen8Sto16x8:
   4560       case Iop_Widen8Uto16x8:
   4561       case Iop_Widen16Sto32x4:
   4562       case Iop_Widen16Uto32x4:
   4563       case Iop_Widen32Sto64x2:
   4564       case Iop_Widen32Uto64x2:
   4565          return vectorWidenI64(mce, op, vatom);
   4566 
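               /* Pairwise-widening adds.  Each source lane is first
                  pessimised to all-0s or all-1s; applying the actual PwAddL
                  op to those patterns yields a nonzero widened lane exactly
                  when either of its two source lanes was undefined, and the
                  outer PCast then pessimises each widened lane accordingly. */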
   4567       case Iop_PwAddL32Ux2:
   4568       case Iop_PwAddL32Sx2:
   4569          return mkPCastTo(mce, Ity_I64,
   4570                assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
   4571 
   4572       case Iop_PwAddL16Ux4:
   4573       case Iop_PwAddL16Sx4:
   4574          return mkPCast32x2(mce,
   4575                assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
   4576 
   4577       case Iop_PwAddL8Ux8:
   4578       case Iop_PwAddL8Sx8:
   4579          return mkPCast16x4(mce,
   4580                assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
   4581 
   4582       case Iop_PwAddL32Ux4:
   4583       case Iop_PwAddL32Sx4:
   4584          return mkPCast64x2(mce,
   4585                assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
   4586 
   4587       case Iop_PwAddL16Ux8:
   4588       case Iop_PwAddL16Sx8:
   4589          return mkPCast32x4(mce,
   4590                assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
   4591 
   4592       case Iop_PwAddL8Ux16:
   4593       case Iop_PwAddL8Sx16:
   4594          return mkPCast16x8(mce,
   4595                assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
   4596 
   4597       case Iop_I64UtoF32:
   4598       default:
   4599          ppIROp(op);
   4600          VG_(tool_panic)("memcheck:expr2vbits_Unop");
   4601    }
   4602 }
   4603 
   4604 
   4605 /* Worker function -- do not call directly.  See comments on
   4606    expr2vbits_Load for the meaning of |guard|.
   4607 
   4608    Generates IR to (1) perform a definedness test of |addr|, (2)
   4609    perform a validity test of |addr|, and (3) return the Vbits for the
   4610    location indicated by |addr|.  All of this only happens when
   4611    |guard| is NULL or |guard| evaluates to True at run time.
   4612 
   4613    If |guard| evaluates to False at run time, the returned value is
    4614    the IR-mandated 0x55..55 value, and neither checks nor shadow loads are
   4615    performed.
   4616 
   4617    The definedness of |guard| itself is not checked.  That is assumed
    4618    to have been done before this point by the caller. */
   4619 static
   4620 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
   4621                               IREndness end, IRType ty,
   4622                               IRAtom* addr, UInt bias, IRAtom* guard )
   4623 {
   4624    tl_assert(isOriginalAtom(mce,addr));
   4625    tl_assert(end == Iend_LE || end == Iend_BE);
   4626 
   4627    /* First, emit a definedness test for the address.  This also sets
   4628       the address (shadow) to 'defined' following the test. */
   4629    complainIfUndefined( mce, addr, guard );
   4630 
   4631    /* Now cook up a call to the relevant helper function, to read the
   4632       data V bits from shadow memory. */
   4633    ty = shadowTypeV(ty);
   4634 
   4635    void*        helper           = NULL;
   4636    const HChar* hname            = NULL;
   4637    Bool         ret_via_outparam = False;
   4638 
   4639    if (end == Iend_LE) {
   4640       switch (ty) {
   4641          case Ity_V256: helper = &MC_(helperc_LOADV256le);
   4642                         hname = "MC_(helperc_LOADV256le)";
   4643                         ret_via_outparam = True;
   4644                         break;
   4645          case Ity_V128: helper = &MC_(helperc_LOADV128le);
   4646                         hname = "MC_(helperc_LOADV128le)";
   4647                         ret_via_outparam = True;
   4648                         break;
   4649          case Ity_I64:  helper = &MC_(helperc_LOADV64le);
   4650                         hname = "MC_(helperc_LOADV64le)";
   4651                         break;
   4652          case Ity_I32:  helper = &MC_(helperc_LOADV32le);
   4653                         hname = "MC_(helperc_LOADV32le)";
   4654                         break;
   4655          case Ity_I16:  helper = &MC_(helperc_LOADV16le);
   4656                         hname = "MC_(helperc_LOADV16le)";
   4657                         break;
   4658          case Ity_I8:   helper = &MC_(helperc_LOADV8);
   4659                         hname = "MC_(helperc_LOADV8)";
   4660                         break;
   4661          default:       ppIRType(ty);
   4662                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
   4663       }
   4664    } else {
   4665       switch (ty) {
   4666          case Ity_V256: helper = &MC_(helperc_LOADV256be);
   4667                         hname = "MC_(helperc_LOADV256be)";
   4668                         ret_via_outparam = True;
   4669                         break;
   4670          case Ity_V128: helper = &MC_(helperc_LOADV128be);
   4671                         hname = "MC_(helperc_LOADV128be)";
   4672                         ret_via_outparam = True;
   4673                         break;
   4674          case Ity_I64:  helper = &MC_(helperc_LOADV64be);
   4675                         hname = "MC_(helperc_LOADV64be)";
   4676                         break;
   4677          case Ity_I32:  helper = &MC_(helperc_LOADV32be);
   4678                         hname = "MC_(helperc_LOADV32be)";
   4679                         break;
   4680          case Ity_I16:  helper = &MC_(helperc_LOADV16be);
   4681                         hname = "MC_(helperc_LOADV16be)";
   4682                         break;
   4683          case Ity_I8:   helper = &MC_(helperc_LOADV8);
   4684                         hname = "MC_(helperc_LOADV8)";
   4685                         break;
   4686          default:       ppIRType(ty);
   4687                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
   4688       }
   4689    }
   4690 
   4691    tl_assert(helper);
   4692    tl_assert(hname);
   4693 
   4694    /* Generate the actual address into addrAct. */
   4695    IRAtom* addrAct;
   4696    if (bias == 0) {
   4697       addrAct = addr;
   4698    } else {
   4699       IROp    mkAdd;
   4700       IRAtom* eBias;
   4701       IRType  tyAddr  = mce->hWordTy;
   4702       tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
   4703       mkAdd   = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
   4704       eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
   4705       addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
   4706    }
   4707 
   4708    /* We need to have a place to park the V bits we're just about to
   4709       read. */
   4710    IRTemp datavbits = newTemp(mce, ty, VSh);
   4711 
   4712    /* Here's the call. */
   4713    IRDirty* di;
   4714    if (ret_via_outparam) {
   4715       di = unsafeIRDirty_1_N( datavbits,
   4716                               2/*regparms*/,
   4717                               hname, VG_(fnptr_to_fnentry)( helper ),
   4718                               mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
   4719    } else {
   4720       di = unsafeIRDirty_1_N( datavbits,
   4721                               1/*regparms*/,
   4722                               hname, VG_(fnptr_to_fnentry)( helper ),
   4723                               mkIRExprVec_1( addrAct ) );
   4724    }
   4725 
   4726    setHelperAnns( mce, di );
   4727    if (guard) {
   4728       di->guard = guard;
   4729       /* Ideally the didn't-happen return value here would be all-ones
   4730          (all-undefined), so it'd be obvious if it got used
    4731          inadvertently.  We can get by with the IR-mandated default
   4732          value (0b01 repeating, 0x55 etc) as that'll still look pretty
   4733          undefined if it ever leaks out. */
   4734    }
   4735    stmt( 'V', mce, IRStmt_Dirty(di) );
   4736 
   4737    return mkexpr(datavbits);
   4738 }
   4739 
   4740 
   4741 /* Generate IR to do a shadow load.  The helper is expected to check
   4742    the validity of the address and return the V bits for that address.
   4743    This can optionally be controlled by a guard, which is assumed to
   4744    be True if NULL.  In the case where the guard is False at runtime,
   4745    the helper will return the didn't-do-the-call value of 0x55..55.
   4746    Since that means "completely undefined result", the caller of
    4747    Since that reads as a mostly-undefined result, the caller of
   4748    case.
   4749 
   4750    Caller of this function is also expected to have checked the
   4751    definedness of |guard| before this point.
   4752 */
   4753 static
   4754 IRAtom* expr2vbits_Load ( MCEnv* mce,
   4755                           IREndness end, IRType ty,
   4756                           IRAtom* addr, UInt bias,
   4757                           IRAtom* guard )
   4758 {
   4759    tl_assert(end == Iend_LE || end == Iend_BE);
   4760    switch (shadowTypeV(ty)) {
   4761       case Ity_I8:
   4762       case Ity_I16:
   4763       case Ity_I32:
   4764       case Ity_I64:
   4765       case Ity_V128:
   4766       case Ity_V256:
   4767          return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
   4768       default:
   4769          VG_(tool_panic)("expr2vbits_Load");
   4770    }
   4771 }
   4772 
   4773 
   4774 /* The most general handler for guarded loads.  Assumes the
   4775    definedness of GUARD has already been checked by the caller.  A
   4776    GUARD of NULL is assumed to mean "always True".  Generates code to
   4777    check the definedness and validity of ADDR.
   4778 
   4779    Generate IR to do a shadow load from ADDR and return the V bits.
   4780    The loaded type is TY.  The loaded data is then (shadow) widened by
   4781    using VWIDEN, which can be Iop_INVALID to denote a no-op.  If GUARD
   4782    evaluates to False at run time then the returned Vbits are simply
   4783    VALT instead.  Note therefore that the argument type of VWIDEN must
   4784    be TY and the result type of VWIDEN must equal the type of VALT.
   4785 */
   4786 static
   4787 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
   4788                                           IREndness end, IRType ty,
   4789                                           IRAtom* addr, UInt bias,
   4790                                           IRAtom* guard,
   4791                                           IROp vwiden, IRAtom* valt )
   4792 {
   4793    /* Sanity check the conversion operation, and also set TYWIDE. */
   4794    IRType tyWide = Ity_INVALID;
   4795    switch (vwiden) {
   4796       case Iop_INVALID:
   4797          tyWide = ty;
   4798          break;
   4799       case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
   4800          tyWide = Ity_I32;
   4801          break;
   4802       default:
   4803          VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
   4804    }
   4805 
   4806    /* If the guard evaluates to True, this will hold the loaded V bits
    4807       at TY.  If the guard evaluates to False, this will be the
    4808       IR-mandated default value (0x55..55, i.e. mostly undefined), in
    4809       which case we will have to replace it using an ITE below. */
   4810    IRAtom* iftrue1
   4811       = assignNew('V', mce, ty,
   4812                   expr2vbits_Load(mce, end, ty, addr, bias, guard));
   4813    /* Now (shadow-) widen the loaded V bits to the desired width.  In
   4814       the guard-is-False case, the allowable widening operators will
   4815       in the worst case (unsigned widening) at least leave the
   4816       pre-widened part as being marked all-undefined, and in the best
   4817       case (signed widening) mark the whole widened result as
   4818       undefined.  Anyway, it doesn't matter really, since in this case
   4819       we will replace said value with the default value |valt| using an
   4820       ITE. */
   4821    IRAtom* iftrue2
   4822       = vwiden == Iop_INVALID
   4823            ? iftrue1
   4824            : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
   4825    /* These are the V bits we will return if the load doesn't take
   4826       place. */
   4827    IRAtom* iffalse
   4828       = valt;
   4829    /* Prepare the cond for the ITE.  Convert a NULL cond into
   4830       something that iropt knows how to fold out later. */
   4831    IRAtom* cond
   4832       = guard == NULL  ? mkU1(1)  : guard;
   4833    /* And assemble the final result. */
   4834    return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
   4835 }
   4836 
   4837 
   4838 /* A simpler handler for guarded loads, in which there is no
   4839    conversion operation, and the default V bit return (when the guard
   4840    evaluates to False at runtime) is "all defined".  If there is no
    4841    guard expression, or the guard is always TRUE, this function behaves
    4842    like expr2vbits_Load.  It is assumed that the definedness of GUARD has
   4843    already been checked at the call site. */
   4844 static
   4845 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
   4846                                          IREndness end, IRType ty,
   4847                                          IRAtom* addr, UInt bias,
   4848                                          IRAtom *guard )
   4849 {
   4850    return expr2vbits_Load_guarded_General(
   4851              mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
   4852           );
   4853 }
   4854 
   4855 
   4856 static
   4857 IRAtom* expr2vbits_ITE ( MCEnv* mce,
   4858                          IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
   4859 {
   4860    IRAtom *vbitsC, *vbits0, *vbits1;
   4861    IRType ty;
   4862    /* Given ITE(cond, iftrue,  iffalse),  generate
   4863             ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
   4864       That is, steer the V bits like the originals, but trash the
   4865       result if the steering value is undefined.  This gives
   4866       lazy propagation. */
   4867    tl_assert(isOriginalAtom(mce, cond));
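            /* For example, if cond# is 1 (undefined), mkPCastTo expands it
               to an all-ones value at ty, and the UifU (a bitwise Or) then
               marks every bit of the result as undefined; if cond# is 0,
               the PCast is all zeroes and the result vbits are exactly
               those of the selected arm. */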
   4868    tl_assert(isOriginalAtom(mce, iftrue));
   4869    tl_assert(isOriginalAtom(mce, iffalse));
   4870 
   4871    vbitsC = expr2vbits(mce, cond);
   4872    vbits1 = expr2vbits(mce, iftrue);
   4873    vbits0 = expr2vbits(mce, iffalse);
   4874    ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
   4875 
   4876    return
   4877       mkUifU(mce, ty, assignNew('V', mce, ty,
   4878                                      IRExpr_ITE(cond, vbits1, vbits0)),
   4879                       mkPCastTo(mce, ty, vbitsC) );
   4880 }
   4881 
   4882 /* --------- This is the main expression-handling function. --------- */
   4883 
   4884 static
   4885 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e )
   4886 {
   4887    switch (e->tag) {
   4888 
   4889       case Iex_Get:
   4890          return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
   4891 
   4892       case Iex_GetI:
   4893          return shadow_GETI( mce, e->Iex.GetI.descr,
   4894                                   e->Iex.GetI.ix, e->Iex.GetI.bias );
   4895 
   4896       case Iex_RdTmp:
   4897          return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
   4898 
   4899       case Iex_Const:
   4900          return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
   4901 
   4902       case Iex_Qop:
   4903          return expr2vbits_Qop(
   4904                    mce,
   4905                    e->Iex.Qop.details->op,
   4906                    e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
   4907                    e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
   4908                 );
   4909 
   4910       case Iex_Triop:
   4911          return expr2vbits_Triop(
   4912                    mce,
   4913                    e->Iex.Triop.details->op,
   4914                    e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
   4915                    e->Iex.Triop.details->arg3
   4916                 );
   4917 
   4918       case Iex_Binop:
   4919          return expr2vbits_Binop(
   4920                    mce,
   4921                    e->Iex.Binop.op,
   4922                    e->Iex.Binop.arg1, e->Iex.Binop.arg2
   4923                 );
   4924 
   4925       case Iex_Unop:
   4926          return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
   4927 
   4928       case Iex_Load:
   4929          return expr2vbits_Load( mce, e->Iex.Load.end,
   4930                                       e->Iex.Load.ty,
   4931                                       e->Iex.Load.addr, 0/*addr bias*/,
   4932                                       NULL/* guard == "always True"*/ );
   4933 
   4934       case Iex_CCall:
   4935          return mkLazyN( mce, e->Iex.CCall.args,
   4936                               e->Iex.CCall.retty,
   4937                               e->Iex.CCall.cee );
   4938 
   4939       case Iex_ITE:
   4940          return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
   4941                                      e->Iex.ITE.iffalse);
   4942 
   4943       default:
   4944          VG_(printf)("\n");
   4945          ppIRExpr(e);
   4946          VG_(printf)("\n");
   4947          VG_(tool_panic)("memcheck: expr2vbits");
   4948    }
   4949 }
   4950 
   4951 /*------------------------------------------------------------*/
   4952 /*--- Generate shadow stmts from all kinds of IRStmts.     ---*/
   4953 /*------------------------------------------------------------*/
   4954 
   4955 /* Widen a value to the host word size. */
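         /* The widening is unsigned, so the freshly created high bits are
            zeroes, i.e. "defined".  That is what we want here: the store
            helpers are only expected to look at the low-order bits that
            correspond to the size of the store. */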
   4956 
   4957 static
   4958 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
   4959 {
   4960    IRType ty, tyH;
   4961 
    4962    /* vatom is a vbits-value and as such can only have a shadow type. */
   4963    tl_assert(isShadowAtom(mce,vatom));
   4964 
   4965    ty  = typeOfIRExpr(mce->sb->tyenv, vatom);
   4966    tyH = mce->hWordTy;
   4967 
   4968    if (tyH == Ity_I32) {
   4969       switch (ty) {
   4970          case Ity_I32:
   4971             return vatom;
   4972          case Ity_I16:
   4973             return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
   4974          case Ity_I8:
   4975             return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
   4976          default:
   4977             goto unhandled;
   4978       }
   4979    } else
   4980    if (tyH == Ity_I64) {
   4981       switch (ty) {
   4982          case Ity_I32:
   4983             return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
   4984          case Ity_I16:
   4985             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
   4986                    assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
   4987          case Ity_I8:
   4988             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
   4989                    assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
   4990          default:
   4991             goto unhandled;
   4992       }
   4993    } else {
   4994       goto unhandled;
   4995    }
   4996   unhandled:
   4997    VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
   4998    VG_(tool_panic)("zwidenToHostWord");
   4999 }
   5000 
   5001 
   5002 /* Generate a shadow store.  |addr| is always the original address
   5003    atom.  You can pass in either originals or V-bits for the data
   5004    atom, but obviously not both.  This function generates a check for
   5005    the definedness and (indirectly) the validity of |addr|, but only
   5006    when |guard| evaluates to True at run time (or is NULL).
   5007 
   5008    |guard| :: Ity_I1 controls whether the store really happens; NULL
   5009    means it unconditionally does.  Note that |guard| itself is not
   5010    checked for definedness; the caller of this function must do that
   5011    if necessary.
   5012 */
   5013 static
   5014 void do_shadow_Store ( MCEnv* mce,
   5015                        IREndness end,
   5016                        IRAtom* addr, UInt bias,
   5017                        IRAtom* data, IRAtom* vdata,
   5018                        IRAtom* guard )
   5019 {
   5020    IROp     mkAdd;
   5021    IRType   ty, tyAddr;
   5022    void*    helper = NULL;
   5023    const HChar* hname = NULL;
   5024    IRConst* c;
   5025 
   5026    tyAddr = mce->hWordTy;
   5027    mkAdd  = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
   5028    tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
   5029    tl_assert( end == Iend_LE || end == Iend_BE );
   5030 
   5031    if (data) {
   5032       tl_assert(!vdata);
   5033       tl_assert(isOriginalAtom(mce, data));
   5034       tl_assert(bias == 0);
   5035       vdata = expr2vbits( mce, data );
   5036    } else {
   5037       tl_assert(vdata);
   5038    }
   5039 
   5040    tl_assert(isOriginalAtom(mce,addr));
   5041    tl_assert(isShadowAtom(mce,vdata));
   5042 
   5043    if (guard) {
   5044       tl_assert(isOriginalAtom(mce, guard));
   5045       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
   5046    }
   5047 
   5048    ty = typeOfIRExpr(mce->sb->tyenv, vdata);
   5049 
   5050    // If we're not doing undefined value checking, pretend that this value
   5051    // is "all valid".  That lets Vex's optimiser remove some of the V bit
   5052    // shadow computation ops that precede it.
   5053    if (MC_(clo_mc_level) == 1) {
   5054       switch (ty) {
   5055          case Ity_V256: // V256 weirdness -- used four times
   5056                         c = IRConst_V256(V_BITS32_DEFINED); break;
   5057          case Ity_V128: // V128 weirdness -- used twice
   5058                         c = IRConst_V128(V_BITS16_DEFINED); break;
   5059          case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
   5060          case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
   5061          case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
   5062          case Ity_I8:   c = IRConst_U8  (V_BITS8_DEFINED);  break;
   5063          default:       VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
   5064       }
   5065       vdata = IRExpr_Const( c );
   5066    }
   5067 
   5068    /* First, emit a definedness test for the address.  This also sets
   5069       the address (shadow) to 'defined' following the test.  Both of
   5070       those actions are gated on |guard|. */
   5071    complainIfUndefined( mce, addr, guard );
   5072 
   5073    /* Now decide which helper function to call to write the data V
   5074       bits into shadow memory. */
   5075    if (end == Iend_LE) {
   5076       switch (ty) {
   5077          case Ity_V256: /* we'll use the helper four times */
   5078          case Ity_V128: /* we'll use the helper twice */
   5079          case Ity_I64: helper = &MC_(helperc_STOREV64le);
   5080                        hname = "MC_(helperc_STOREV64le)";
   5081                        break;
   5082          case Ity_I32: helper = &MC_(helperc_STOREV32le);
   5083                        hname = "MC_(helperc_STOREV32le)";
   5084                        break;
   5085          case Ity_I16: helper = &MC_(helperc_STOREV16le);
   5086                        hname = "MC_(helperc_STOREV16le)";
   5087                        break;
   5088          case Ity_I8:  helper = &MC_(helperc_STOREV8);
   5089                        hname = "MC_(helperc_STOREV8)";
   5090                        break;
   5091          default:      VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
   5092       }
   5093    } else {
   5094       switch (ty) {
   5095          case Ity_V128: /* we'll use the helper twice */
   5096          case Ity_I64: helper = &MC_(helperc_STOREV64be);
   5097                        hname = "MC_(helperc_STOREV64be)";
   5098                        break;
   5099          case Ity_I32: helper = &MC_(helperc_STOREV32be);
   5100                        hname = "MC_(helperc_STOREV32be)";
   5101                        break;
   5102          case Ity_I16: helper = &MC_(helperc_STOREV16be);
   5103                        hname = "MC_(helperc_STOREV16be)";
   5104                        break;
   5105          case Ity_I8:  helper = &MC_(helperc_STOREV8);
   5106                        hname = "MC_(helperc_STOREV8)";
   5107                        break;
    5108          /* Note, no V256 case here, because no big-endian target that
    5109             we support has 256-bit vectors. */
   5110          default:      VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
   5111       }
   5112    }
   5113 
   5114    if (UNLIKELY(ty == Ity_V256)) {
   5115 
    5116       /* V256-bit case -- phrased in terms of 64-bit units (Qs), with
   5117          Q3 being the most significant lane. */
   5118       /* These are the offsets of the Qs in memory. */
   5119       Int     offQ0, offQ1, offQ2, offQ3;
   5120 
   5121       /* Various bits for constructing the 4 lane helper calls */
   5122       IRDirty *diQ0,    *diQ1,    *diQ2,    *diQ3;
   5123       IRAtom  *addrQ0,  *addrQ1,  *addrQ2,  *addrQ3;
   5124       IRAtom  *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
   5125       IRAtom  *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
   5126 
   5127       if (end == Iend_LE) {
   5128          offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
   5129       } else {
   5130          offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
   5131       }
   5132 
   5133       eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
   5134       addrQ0  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
   5135       vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
   5136       diQ0    = unsafeIRDirty_0_N(
   5137                    1/*regparms*/,
   5138                    hname, VG_(fnptr_to_fnentry)( helper ),
   5139                    mkIRExprVec_2( addrQ0, vdataQ0 )
   5140                 );
   5141 
   5142       eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
   5143       addrQ1  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
   5144       vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
   5145       diQ1    = unsafeIRDirty_0_N(
   5146                    1/*regparms*/,
   5147                    hname, VG_(fnptr_to_fnentry)( helper ),
   5148                    mkIRExprVec_2( addrQ1, vdataQ1 )
   5149                 );
   5150 
   5151       eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
   5152       addrQ2  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
   5153       vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
   5154       diQ2    = unsafeIRDirty_0_N(
   5155                    1/*regparms*/,
   5156                    hname, VG_(fnptr_to_fnentry)( helper ),
   5157                    mkIRExprVec_2( addrQ2, vdataQ2 )
   5158                 );
   5159 
   5160       eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
   5161       addrQ3  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
   5162       vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
   5163       diQ3    = unsafeIRDirty_0_N(
   5164                    1/*regparms*/,
   5165                    hname, VG_(fnptr_to_fnentry)( helper ),
   5166                    mkIRExprVec_2( addrQ3, vdataQ3 )
   5167                 );
   5168 
   5169       if (guard)
   5170          diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
   5171 
   5172       setHelperAnns( mce, diQ0 );
   5173       setHelperAnns( mce, diQ1 );
   5174       setHelperAnns( mce, diQ2 );
   5175       setHelperAnns( mce, diQ3 );
   5176       stmt( 'V', mce, IRStmt_Dirty(diQ0) );
   5177       stmt( 'V', mce, IRStmt_Dirty(diQ1) );
   5178       stmt( 'V', mce, IRStmt_Dirty(diQ2) );
   5179       stmt( 'V', mce, IRStmt_Dirty(diQ3) );
   5180 
   5181    }
   5182    else if (UNLIKELY(ty == Ity_V128)) {
   5183 
   5184       /* V128-bit case */
   5185       /* See comment in next clause re 64-bit regparms */
   5186       /* also, need to be careful about endianness */
   5187 
   5188       Int     offLo64, offHi64;
   5189       IRDirty *diLo64, *diHi64;
   5190       IRAtom  *addrLo64, *addrHi64;
   5191       IRAtom  *vdataLo64, *vdataHi64;
   5192       IRAtom  *eBiasLo64, *eBiasHi64;
   5193 
   5194       if (end == Iend_LE) {
   5195          offLo64 = 0;
   5196          offHi64 = 8;
   5197       } else {
   5198          offLo64 = 8;
   5199          offHi64 = 0;
   5200       }
   5201 
   5202       eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
   5203       addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
   5204       vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
   5205       diLo64    = unsafeIRDirty_0_N(
   5206                      1/*regparms*/,
   5207                      hname, VG_(fnptr_to_fnentry)( helper ),
   5208                      mkIRExprVec_2( addrLo64, vdataLo64 )
   5209                   );
   5210       eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
   5211       addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
   5212       vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
   5213       diHi64    = unsafeIRDirty_0_N(
   5214                      1/*regparms*/,
   5215                      hname, VG_(fnptr_to_fnentry)( helper ),
   5216                      mkIRExprVec_2( addrHi64, vdataHi64 )
   5217                   );
   5218       if (guard) diLo64->guard = guard;
   5219       if (guard) diHi64->guard = guard;
   5220       setHelperAnns( mce, diLo64 );
   5221       setHelperAnns( mce, diHi64 );
   5222       stmt( 'V', mce, IRStmt_Dirty(diLo64) );
   5223       stmt( 'V', mce, IRStmt_Dirty(diHi64) );
   5224 
   5225    } else {
   5226 
   5227       IRDirty *di;
   5228       IRAtom  *addrAct;
   5229 
   5230       /* 8/16/32/64-bit cases */
   5231       /* Generate the actual address into addrAct. */
   5232       if (bias == 0) {
   5233          addrAct = addr;
   5234       } else {
   5235          IRAtom* eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
   5236          addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
   5237       }
   5238 
   5239       if (ty == Ity_I64) {
   5240          /* We can't do this with regparm 2 on 32-bit platforms, since
   5241             the back ends aren't clever enough to handle 64-bit
   5242             regparm args.  Therefore be different. */
   5243          di = unsafeIRDirty_0_N(
   5244                  1/*regparms*/,
   5245                  hname, VG_(fnptr_to_fnentry)( helper ),
   5246                  mkIRExprVec_2( addrAct, vdata )
   5247               );
   5248       } else {
   5249          di = unsafeIRDirty_0_N(
   5250                  2/*regparms*/,
   5251                  hname, VG_(fnptr_to_fnentry)( helper ),
   5252                  mkIRExprVec_2( addrAct,
   5253                                 zwidenToHostWord( mce, vdata ))
   5254               );
   5255       }
   5256       if (guard) di->guard = guard;
   5257       setHelperAnns( mce, di );
   5258       stmt( 'V', mce, IRStmt_Dirty(di) );
   5259    }
   5260 
   5261 }
   5262 
   5263 
   5264 /* Do lazy pessimistic propagation through a dirty helper call, by
   5265    looking at the annotations on it.  This is the most complex part of
   5266    Memcheck. */
   5267 
   5268 static IRType szToITy ( Int n )
   5269 {
   5270    switch (n) {
   5271       case 1: return Ity_I8;
   5272       case 2: return Ity_I16;
   5273       case 4: return Ity_I32;
   5274       case 8: return Ity_I64;
   5275       default: VG_(tool_panic)("szToITy(memcheck)");
   5276    }
   5277 }
   5278 
   5279 static
   5280 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
   5281 {
   5282    Int       i, k, n, toDo, gSz, gOff;
   5283    IRAtom    *src, *here, *curr;
   5284    IRType    tySrc, tyDst;
   5285    IRTemp    dst;
   5286    IREndness end;
   5287 
   5288    /* What's the native endianness?  We need to know this. */
   5289 #  if defined(VG_BIGENDIAN)
   5290    end = Iend_BE;
   5291 #  elif defined(VG_LITTLEENDIAN)
   5292    end = Iend_LE;
   5293 #  else
   5294 #    error "Unknown endianness"
   5295 #  endif
   5296 
   5297    /* First check the guard. */
   5298    complainIfUndefined(mce, d->guard, NULL);
   5299 
   5300    /* Now round up all inputs and PCast over them. */
   5301    curr = definedOfType(Ity_I32);
   5302 
   5303    /* Inputs: unmasked args
   5304       Note: arguments are evaluated REGARDLESS of the guard expression */
   5305    for (i = 0; d->args[i]; i++) {
   5306       IRAtom* arg = d->args[i];
   5307       if ( (d->cee->mcx_mask & (1<<i))
   5308            || UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)) ) {
   5309          /* ignore this arg */
   5310       } else {
   5311          here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg) );
   5312          curr = mkUifU32(mce, here, curr);
   5313       }
   5314    }
   5315 
   5316    /* Inputs: guest state that we read. */
   5317    for (i = 0; i < d->nFxState; i++) {
   5318       tl_assert(d->fxState[i].fx != Ifx_None);
   5319       if (d->fxState[i].fx == Ifx_Write)
   5320          continue;
   5321 
   5322       /* Enumerate the described state segments */
   5323       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   5324          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   5325          gSz  = d->fxState[i].size;
   5326 
   5327          /* Ignore any sections marked as 'always defined'. */
   5328          if (isAlwaysDefd(mce, gOff, gSz)) {
   5329             if (0)
   5330             VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
   5331                         gOff, gSz);
   5332             continue;
   5333          }
   5334 
   5335          /* This state element is read or modified.  So we need to
   5336             consider it.  If larger than 8 bytes, deal with it in
   5337             8-byte chunks. */
   5338          while (True) {
   5339             tl_assert(gSz >= 0);
   5340             if (gSz == 0) break;
   5341             n = gSz <= 8 ? gSz : 8;
   5342             /* update 'curr' with UifU of the state slice
   5343                gOff .. gOff+n-1 */
   5344             tySrc = szToITy( n );
   5345 
   5346             /* Observe the guard expression. If it is false use an
   5347                all-bits-defined bit pattern */
   5348             IRAtom *cond, *iffalse, *iftrue;
   5349 
   5350             cond    = assignNew('V', mce, Ity_I1, d->guard);
   5351             iftrue  = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
   5352             iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
   5353             src     = assignNew('V', mce, tySrc,
   5354                                 IRExpr_ITE(cond, iftrue, iffalse));
   5355 
   5356             here = mkPCastTo( mce, Ity_I32, src );
   5357             curr = mkUifU32(mce, here, curr);
   5358             gSz -= n;
   5359             gOff += n;
   5360          }
   5361       }
   5362    }
   5363 
   5364    /* Inputs: memory.  First set up some info needed regardless of
   5365       whether we're doing reads or writes. */
   5366 
   5367    if (d->mFx != Ifx_None) {
   5368       /* Because we may do multiple shadow loads/stores from the same
   5369          base address, it's best to do a single test of its
   5370          definedness right now.  Post-instrumentation optimisation
   5371          should remove all but this test. */
   5372       IRType tyAddr;
   5373       tl_assert(d->mAddr);
   5374       complainIfUndefined(mce, d->mAddr, d->guard);
   5375 
   5376       tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
   5377       tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
   5378       tl_assert(tyAddr == mce->hWordTy); /* not really right */
   5379    }
   5380 
   5381    /* Deal with memory inputs (reads or modifies) */
   5382    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
   5383       toDo   = d->mSize;
   5384       /* chew off 32-bit chunks.  We don't care about the endianness
   5385          since it's all going to be condensed down to a single bit,
   5386          but nevertheless choose an endianness which is hopefully
   5387          native to the platform. */
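               /* For example, a 7-byte region is read as an I32 at offset 0,
                  then an I16 at offset 4, then an I8 at offset 6; each piece
                  is PCast-ed to 32 bits and UifU'd into 'curr'. */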
   5388       while (toDo >= 4) {
   5389          here = mkPCastTo(
   5390                    mce, Ity_I32,
   5391                    expr2vbits_Load_guarded_Simple(
   5392                       mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
   5393                 );
   5394          curr = mkUifU32(mce, here, curr);
   5395          toDo -= 4;
   5396       }
   5397       /* chew off 16-bit chunks */
   5398       while (toDo >= 2) {
   5399          here = mkPCastTo(
   5400                    mce, Ity_I32,
   5401                    expr2vbits_Load_guarded_Simple(
   5402                       mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
   5403                 );
   5404          curr = mkUifU32(mce, here, curr);
   5405          toDo -= 2;
   5406       }
   5407       /* chew off the remaining 8-bit chunk, if any */
   5408       if (toDo == 1) {
   5409          here = mkPCastTo(
   5410                    mce, Ity_I32,
   5411                    expr2vbits_Load_guarded_Simple(
   5412                       mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
   5413                 );
   5414          curr = mkUifU32(mce, here, curr);
   5415          toDo -= 1;
   5416       }
   5417       tl_assert(toDo == 0);
   5418    }
   5419 
   5420    /* Whew!  So curr is a 32-bit V-value summarising pessimistically
   5421       all the inputs to the helper.  Now we need to re-distribute the
   5422       results to all destinations. */
   5423 
   5424    /* Outputs: the destination temporary, if there is one. */
   5425    if (d->tmp != IRTemp_INVALID) {
   5426       dst   = findShadowTmpV(mce, d->tmp);
   5427       tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
   5428       assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
   5429    }
   5430 
   5431    /* Outputs: guest state that we write or modify. */
   5432    for (i = 0; i < d->nFxState; i++) {
   5433       tl_assert(d->fxState[i].fx != Ifx_None);
   5434       if (d->fxState[i].fx == Ifx_Read)
   5435          continue;
   5436 
   5437       /* Enumerate the described state segments */
   5438       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   5439          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   5440          gSz  = d->fxState[i].size;
   5441 
   5442          /* Ignore any sections marked as 'always defined'. */
   5443          if (isAlwaysDefd(mce, gOff, gSz))
   5444             continue;
   5445 
   5446          /* This state element is written or modified.  So we need to
   5447             consider it.  If larger than 8 bytes, deal with it in
   5448             8-byte chunks. */
   5449          while (True) {
   5450             tl_assert(gSz >= 0);
   5451             if (gSz == 0) break;
   5452             n = gSz <= 8 ? gSz : 8;
   5453             /* Write suitably-casted 'curr' to the state slice
   5454                gOff .. gOff+n-1 */
   5455             tyDst = szToITy( n );
   5456             do_shadow_PUT( mce, gOff,
   5457                                 NULL, /* original atom */
   5458                                 mkPCastTo( mce, tyDst, curr ), d->guard );
   5459             gSz -= n;
   5460             gOff += n;
   5461          }
   5462       }
   5463    }
   5464 
   5465    /* Outputs: memory that we write or modify.  Same comments about
   5466       endianness as above apply. */
   5467    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
   5468       toDo   = d->mSize;
   5469       /* chew off 32-bit chunks */
   5470       while (toDo >= 4) {
   5471          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5472                           NULL, /* original data */
   5473                           mkPCastTo( mce, Ity_I32, curr ),
   5474                           d->guard );
   5475          toDo -= 4;
   5476       }
   5477       /* chew off 16-bit chunks */
   5478       while (toDo >= 2) {
   5479          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5480                           NULL, /* original data */
   5481                           mkPCastTo( mce, Ity_I16, curr ),
   5482                           d->guard );
   5483          toDo -= 2;
   5484       }
   5485       /* chew off the remaining 8-bit chunk, if any */
   5486       if (toDo == 1) {
   5487          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5488                           NULL, /* original data */
   5489                           mkPCastTo( mce, Ity_I8, curr ),
   5490                           d->guard );
   5491          toDo -= 1;
   5492       }
   5493       tl_assert(toDo == 0);
   5494    }
   5495 
   5496 }
   5497 
   5498 
   5499 /* We have an ABI hint telling us that [base .. base+len-1] is to
   5500    become undefined ("writable").  Generate code to call a helper to
   5501    notify the A/V bit machinery of this fact.
   5502 
   5503    We call
   5504    void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
   5505                                                     Addr nia );
   5506 */
   5507 static
   5508 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
   5509 {
   5510    IRDirty* di;
   5511    /* Minor optimisation: if not doing origin tracking, ignore the
   5512       supplied nia and pass zero instead.  This is on the basis that
   5513       MC_(helperc_MAKE_STACK_UNINIT) will ignore it anyway, and we can
   5514       almost always generate a shorter instruction to put zero into a
   5515       register than any other value. */
   5516    if (MC_(clo_mc_level) < 3)
   5517       nia = mkIRExpr_HWord(0);
   5518 
   5519    di = unsafeIRDirty_0_N(
   5520            0/*regparms*/,
   5521            "MC_(helperc_MAKE_STACK_UNINIT)",
   5522            VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT) ),
   5523            mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
   5524         );
   5525    stmt( 'V', mce, IRStmt_Dirty(di) );
   5526 }
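
         /* For example (a hypothetical sketch): on a 64-bit target a
            function return typically yields an ABI hint covering the
            128-byte area just below the stack pointer, for which the code
            above emits, at mc_level < 3, something like

               DIRTY 0 ::: MC_(helperc_MAKE_STACK_UNINIT)(sp-128, 128, 0x0)

            with the nia argument zeroed as per the optimisation above. */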
   5527 
   5528 
   5529 /* ------ Dealing with IRCAS (big and complex) ------ */
   5530 
   5531 /* FWDS */
   5532 static IRAtom* gen_load_b  ( MCEnv* mce, Int szB,
   5533                              IRAtom* baseaddr, Int offset );
   5534 static IRAtom* gen_maxU32  ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
   5535 static void    gen_store_b ( MCEnv* mce, Int szB,
   5536                              IRAtom* baseaddr, Int offset, IRAtom* dataB,
   5537                              IRAtom* guard );
   5538 
   5539 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
   5540 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
   5541 
   5542 
   5543 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
   5544    IRExpr.Consts, else this asserts.  If they are both Consts, it
   5545    doesn't do anything.  So that just leaves the RdTmp case.
   5546 
   5547    In which case: this assigns the shadow value SHADOW to the IR
   5548    shadow temporary associated with ORIG.  That is, ORIG, being an
   5549    original temporary, will have a shadow temporary associated with
   5550    it.  However, in the case envisaged here, there will so far have
   5551    been no IR emitted to actually write a shadow value into that
   5552    temporary.  What this routine does is to (emit IR to) copy the
   5553    value in SHADOW into said temporary, so that after this call,
   5554    IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
   5555    value in SHADOW.
   5556 
   5557    Point is to allow callers to compute "by hand" a shadow value for
   5558    ORIG, and force it to be associated with ORIG.
   5559 
    5560    How do we know that the shadow associated with ORIG has not so far
    5561    been assigned to?  Well, we don't per se know that, but suppose it
    5562    had been.  Then this routine would create a second assignment to it,
    5563    and later the IR sanity checker would barf.  But that never
    5564    happens.  QED.
   5565 */
   5566 static void bind_shadow_tmp_to_orig ( UChar how,
   5567                                       MCEnv* mce,
   5568                                       IRAtom* orig, IRAtom* shadow )
   5569 {
   5570    tl_assert(isOriginalAtom(mce, orig));
   5571    tl_assert(isShadowAtom(mce, shadow));
   5572    switch (orig->tag) {
   5573       case Iex_Const:
   5574          tl_assert(shadow->tag == Iex_Const);
   5575          break;
   5576       case Iex_RdTmp:
   5577          tl_assert(shadow->tag == Iex_RdTmp);
   5578          if (how == 'V') {
   5579             assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
   5580                    shadow);
   5581          } else {
   5582             tl_assert(how == 'B');
   5583             assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
   5584                    shadow);
   5585          }
   5586          break;
   5587       default:
   5588          tl_assert(0);
   5589    }
   5590 }
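
         /* Illustrative use (temp names hypothetical): having computed a
            V-bits value 'vnew' for original temp t7 "by hand", a caller
            does

               bind_shadow_tmp_to_orig('V', mce, mkexpr(t7), vnew);

            after which IRExpr.RdTmps of t7's V shadow pick up 'vnew'.
            This is exactly how do_shadow_CAS binds vold/bold below. */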
   5591 
   5592 
   5593 static
   5594 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
   5595 {
   5596    /* Scheme is (both single- and double- cases):
   5597 
   5598       1. fetch data#,dataB (the proposed new value)
   5599 
   5600       2. fetch expd#,expdB (what we expect to see at the address)
   5601 
   5602       3. check definedness of address
   5603 
   5604       4. load old#,oldB from shadow memory; this also checks
    5605          addressability of the address
   5606 
   5607       5. the CAS itself
   5608 
   5609       6. compute "expected == old".  See COMMENT_ON_CasCmpEQ below.
   5610 
   5611       7. if "expected == old" (as computed by (6))
   5612             store data#,dataB to shadow memory
   5613 
   5614       Note that 5 reads 'old' but 4 reads 'old#'.  Similarly, 5 stores
   5615       'data' but 7 stores 'data#'.  Hence it is possible for the
   5616       shadow data to be incorrectly checked and/or updated:
   5617 
   5618       * 7 is at least gated correctly, since the 'expected == old'
   5619         condition is derived from outputs of 5.  However, the shadow
   5620         write could happen too late: imagine after 5 we are
   5621         descheduled, a different thread runs, writes a different
   5622         (shadow) value at the address, and then we resume, hence
   5623         overwriting the shadow value written by the other thread.
   5624 
   5625       Because the original memory access is atomic, there's no way to
   5626       make both the original and shadow accesses into a single atomic
   5627       thing, hence this is unavoidable.
   5628 
   5629       At least as Valgrind stands, I don't think it's a problem, since
   5630       we're single threaded *and* we guarantee that there are no
   5631       context switches during the execution of any specific superblock
   5632       -- context switches can only happen at superblock boundaries.
   5633 
   5634       If Valgrind ever becomes MT in the future, then it might be more
   5635       of a problem.  A possible kludge would be to artificially
    5636       associate a lock with the location, which we must acquire and
    5637       release around the transaction as a whole.  Hmm, that probably
    5638       wouldn't work properly, since it only guards us against other
   5639       threads doing CASs on the same location, not against other
   5640       threads doing normal reads and writes.
   5641 
   5642       ------------------------------------------------------------
   5643 
   5644       COMMENT_ON_CasCmpEQ:
   5645 
   5646       Note two things.  Firstly, in the sequence above, we compute
   5647       "expected == old", but we don't check definedness of it.  Why
   5648       not?  Also, the x86 and amd64 front ends use
   5649       Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
   5650       determination (expected == old ?) for themselves, and we also
   5651       don't check definedness for those primops; we just say that the
   5652       result is defined.  Why?  Details follow.
   5653 
   5654       x86/amd64 contains various forms of locked insns:
    5655       * lock prefix before any basic arithmetic insn;
   5656         eg lock xorl %reg1,(%reg2)
   5657       * atomic exchange reg-mem
   5658       * compare-and-swaps
   5659 
   5660       Rather than attempt to represent them all, which would be a
   5661       royal PITA, I used a result from Maurice Herlihy
   5662       (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
   5663       demonstrates that compare-and-swap is a primitive more general
   5664       than the other two, and so can be used to represent all of them.
   5665       So the translation scheme for (eg) lock incl (%reg) is as
   5666       follows:
   5667 
   5668         again:
   5669          old = * %reg
   5670          new = old + 1
   5671          atomically { if (* %reg == old) { * %reg = new } else { goto again } }
   5672 
   5673       The "atomically" is the CAS bit.  The scheme is always the same:
   5674       get old value from memory, compute new value, atomically stuff
   5675       new value back in memory iff the old value has not changed (iow,
   5676       no other thread modified it in the meantime).  If it has changed
   5677       then we've been out-raced and we have to start over.
   5678 
   5679       Now that's all very neat, but it has the bad side effect of
   5680       introducing an explicit equality test into the translation.
   5681       Consider the behaviour of said code on a memory location which
   5682       is uninitialised.  We will wind up doing a comparison on
   5683       uninitialised data, and mc duly complains.
   5684 
   5685       What's difficult about this is, the common case is that the
   5686       location is uncontended, and so we're usually comparing the same
   5687       value (* %reg) with itself.  So we shouldn't complain even if it
   5688       is undefined.  But mc doesn't know that.
   5689 
   5690       My solution is to mark the == in the IR specially, so as to tell
   5691       mc that it almost certainly compares a value with itself, and we
   5692       should just regard the result as always defined.  Rather than
   5693       add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
   5694       Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
   5695 
   5696       So there's always the question of, can this give a false
   5697       negative?  eg, imagine that initially, * %reg is defined; and we
   5698       read that; but then in the gap between the read and the CAS, a
   5699       different thread writes an undefined (and different) value at
   5700       the location.  Then the CAS in this thread will fail and we will
   5701       go back to "again:", but without knowing that the trip back
   5702       there was based on an undefined comparison.  No matter; at least
   5703       the other thread won the race and the location is correctly
   5704       marked as undefined.  What if it wrote an uninitialised version
   5705       of the same value that was there originally, though?
   5706 
   5707       etc etc.  Seems like there's a small corner case in which we
   5708       might lose the fact that something's defined -- we're out-raced
   5709       in between the "old = * reg" and the "atomically {", _and_ the
   5710       other thread is writing in an undefined version of what's
   5711       already there.  Well, that seems pretty unlikely.
   5712 
   5713       ---
   5714 
   5715       If we ever need to reinstate it .. code which generates a
   5716       definedness test for "expected == old" was removed at r10432 of
   5717       this file.
   5718    */
   5719    if (cas->oldHi == IRTemp_INVALID) {
   5720       do_shadow_CAS_single( mce, cas );
   5721    } else {
   5722       do_shadow_CAS_double( mce, cas );
   5723    }
   5724 }
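
         /* A rough sketch of the emitted IR for a single 32-bit CAS
            (temp names hypothetical).  For "old = CASle(addr, expd, data)",
            steps 1..7 above produce approximately:

               vdata = vbits-of(data)                  -- 1
               vexpd = vbits-of(expd)                  -- 2
               vold  = shadow-load:I32(addr)           -- 3,4 (checks addr)
               old   = CASle(addr, expd, data)         -- 5, the CAS itself
               eq    = CasCmpEQ32(expd, old)           -- 6, always defined
               if (eq) shadow-store:I32(addr, vdata)   -- 7
         */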
   5725 
   5726 
   5727 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
   5728 {
   5729    IRAtom *vdataLo = NULL, *bdataLo = NULL;
   5730    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
   5731    IRAtom *voldLo  = NULL, *boldLo  = NULL;
   5732    IRAtom *expd_eq_old = NULL;
   5733    IROp   opCasCmpEQ;
   5734    Int    elemSzB;
   5735    IRType elemTy;
   5736    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
   5737 
   5738    /* single CAS */
   5739    tl_assert(cas->oldHi == IRTemp_INVALID);
   5740    tl_assert(cas->expdHi == NULL);
   5741    tl_assert(cas->dataHi == NULL);
   5742 
   5743    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
   5744    switch (elemTy) {
   5745       case Ity_I8:  elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8;  break;
   5746       case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
   5747       case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
   5748       case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
   5749       default: tl_assert(0); /* IR defn disallows any other types */
   5750    }
   5751 
   5752    /* 1. fetch data# (the proposed new value) */
   5753    tl_assert(isOriginalAtom(mce, cas->dataLo));
   5754    vdataLo
   5755       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
   5756    tl_assert(isShadowAtom(mce, vdataLo));
   5757    if (otrak) {
   5758       bdataLo
   5759          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
   5760       tl_assert(isShadowAtom(mce, bdataLo));
   5761    }
   5762 
   5763    /* 2. fetch expected# (what we expect to see at the address) */
   5764    tl_assert(isOriginalAtom(mce, cas->expdLo));
   5765    vexpdLo
   5766       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
   5767    tl_assert(isShadowAtom(mce, vexpdLo));
   5768    if (otrak) {
   5769       bexpdLo
   5770          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
   5771       tl_assert(isShadowAtom(mce, bexpdLo));
   5772    }
   5773 
   5774    /* 3. check definedness of address */
   5775    /* 4. fetch old# from shadow memory; this also checks
    5776          addressability of the address */
   5777    voldLo
   5778       = assignNew(
   5779            'V', mce, elemTy,
   5780            expr2vbits_Load(
   5781               mce,
   5782               cas->end, elemTy, cas->addr, 0/*Addr bias*/,
   5783               NULL/*always happens*/
   5784         ));
   5785    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
   5786    if (otrak) {
   5787       boldLo
   5788          = assignNew('B', mce, Ity_I32,
   5789                      gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
   5790       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
   5791    }
   5792 
   5793    /* 5. the CAS itself */
   5794    stmt( 'C', mce, IRStmt_CAS(cas) );
   5795 
   5796    /* 6. compute "expected == old" */
    5797    /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
   5798    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
   5799       tree, but it's not copied from the input block. */
   5800    expd_eq_old
   5801       = assignNew('C', mce, Ity_I1,
   5802                   binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
   5803 
   5804    /* 7. if "expected == old"
   5805             store data# to shadow memory */
   5806    do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
   5807                     NULL/*data*/, vdataLo/*vdata*/,
   5808                     expd_eq_old/*guard for store*/ );
   5809    if (otrak) {
   5810       gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
   5811                    bdataLo/*bdata*/,
   5812                    expd_eq_old/*guard for store*/ );
   5813    }
   5814 }
   5815 
   5816 
   5817 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
   5818 {
   5819    IRAtom *vdataHi = NULL, *bdataHi = NULL;
   5820    IRAtom *vdataLo = NULL, *bdataLo = NULL;
   5821    IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
   5822    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
   5823    IRAtom *voldHi  = NULL, *boldHi  = NULL;
   5824    IRAtom *voldLo  = NULL, *boldLo  = NULL;
   5825    IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
   5826    IRAtom *expd_eq_old = NULL, *zero = NULL;
   5827    IROp   opCasCmpEQ, opOr, opXor;
   5828    Int    elemSzB, memOffsLo, memOffsHi;
   5829    IRType elemTy;
   5830    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
   5831 
   5832    /* double CAS */
   5833    tl_assert(cas->oldHi != IRTemp_INVALID);
   5834    tl_assert(cas->expdHi != NULL);
   5835    tl_assert(cas->dataHi != NULL);
   5836 
   5837    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
   5838    switch (elemTy) {
   5839       case Ity_I8:
   5840          opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
   5841          elemSzB = 1; zero = mkU8(0);
   5842          break;
   5843       case Ity_I16:
   5844          opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
   5845          elemSzB = 2; zero = mkU16(0);
   5846          break;
   5847       case Ity_I32:
   5848          opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
   5849          elemSzB = 4; zero = mkU32(0);
   5850          break;
   5851       case Ity_I64:
   5852          opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
   5853          elemSzB = 8; zero = mkU64(0);
   5854          break;
   5855       default:
   5856          tl_assert(0); /* IR defn disallows any other types */
   5857    }
   5858 
   5859    /* 1. fetch data# (the proposed new value) */
   5860    tl_assert(isOriginalAtom(mce, cas->dataHi));
   5861    tl_assert(isOriginalAtom(mce, cas->dataLo));
   5862    vdataHi
   5863       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi));
   5864    vdataLo
   5865       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
   5866    tl_assert(isShadowAtom(mce, vdataHi));
   5867    tl_assert(isShadowAtom(mce, vdataLo));
   5868    if (otrak) {
   5869       bdataHi
   5870          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
   5871       bdataLo
   5872          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
   5873       tl_assert(isShadowAtom(mce, bdataHi));
   5874       tl_assert(isShadowAtom(mce, bdataLo));
   5875    }
   5876 
   5877    /* 2. fetch expected# (what we expect to see at the address) */
   5878    tl_assert(isOriginalAtom(mce, cas->expdHi));
   5879    tl_assert(isOriginalAtom(mce, cas->expdLo));
   5880    vexpdHi
   5881       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi));
   5882    vexpdLo
   5883       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
   5884    tl_assert(isShadowAtom(mce, vexpdHi));
   5885    tl_assert(isShadowAtom(mce, vexpdLo));
   5886    if (otrak) {
   5887       bexpdHi
   5888          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
   5889       bexpdLo
   5890          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
   5891       tl_assert(isShadowAtom(mce, bexpdHi));
   5892       tl_assert(isShadowAtom(mce, bexpdLo));
   5893    }
   5894 
   5895    /* 3. check definedness of address */
   5896    /* 4. fetch old# from shadow memory; this also checks
    5897          addressability of the address */
   5898    if (cas->end == Iend_LE) {
   5899       memOffsLo = 0;
   5900       memOffsHi = elemSzB;
   5901    } else {
   5902       tl_assert(cas->end == Iend_BE);
   5903       memOffsLo = elemSzB;
   5904       memOffsHi = 0;
   5905    }
   5906    voldHi
   5907       = assignNew(
   5908            'V', mce, elemTy,
   5909            expr2vbits_Load(
   5910               mce,
   5911               cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
   5912               NULL/*always happens*/
   5913         ));
   5914    voldLo
   5915       = assignNew(
   5916            'V', mce, elemTy,
   5917            expr2vbits_Load(
   5918               mce,
   5919               cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
   5920               NULL/*always happens*/
   5921         ));
   5922    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
   5923    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
   5924    if (otrak) {
   5925       boldHi
   5926          = assignNew('B', mce, Ity_I32,
   5927                      gen_load_b(mce, elemSzB, cas->addr,
   5928                                 memOffsHi/*addr bias*/));
   5929       boldLo
   5930          = assignNew('B', mce, Ity_I32,
   5931                      gen_load_b(mce, elemSzB, cas->addr,
   5932                                 memOffsLo/*addr bias*/));
   5933       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
   5934       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
   5935    }
   5936 
   5937    /* 5. the CAS itself */
   5938    stmt( 'C', mce, IRStmt_CAS(cas) );
   5939 
   5940    /* 6. compute "expected == old" */
    5941    /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
   5942    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
   5943       tree, but it's not copied from the input block. */
   5944    /*
   5945       xHi = oldHi ^ expdHi;
   5946       xLo = oldLo ^ expdLo;
   5947       xHL = xHi | xLo;
   5948       expd_eq_old = xHL == 0;
   5949    */
   5950    xHi = assignNew('C', mce, elemTy,
   5951                    binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
   5952    xLo = assignNew('C', mce, elemTy,
   5953                    binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
   5954    xHL = assignNew('C', mce, elemTy,
   5955                    binop(opOr, xHi, xLo));
   5956    expd_eq_old
   5957       = assignNew('C', mce, Ity_I1,
   5958                   binop(opCasCmpEQ, xHL, zero));
   5959 
   5960    /* 7. if "expected == old"
   5961             store data# to shadow memory */
   5962    do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
   5963                     NULL/*data*/, vdataHi/*vdata*/,
   5964                     expd_eq_old/*guard for store*/ );
   5965    do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
   5966                     NULL/*data*/, vdataLo/*vdata*/,
   5967                     expd_eq_old/*guard for store*/ );
   5968    if (otrak) {
   5969       gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
   5970                    bdataHi/*bdata*/,
   5971                    expd_eq_old/*guard for store*/ );
   5972       gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
   5973                    bdataLo/*bdata*/,
   5974                    expd_eq_old/*guard for store*/ );
   5975    }
   5976 }
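
         /* Note on the layout above (illustrative): for a little-endian
            2 x 32-bit DCAS, memOffsLo == 0 and memOffsHi == 4, so the Lo
            half's shadow covers addr+0..3 and the Hi half's covers
            addr+4..7; a big-endian DCAS simply swaps the two offsets. */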
   5977 
   5978 
   5979 /* ------ Dealing with LL/SC (not difficult) ------ */
   5980 
   5981 static void do_shadow_LLSC ( MCEnv*    mce,
   5982                              IREndness stEnd,
   5983                              IRTemp    stResult,
   5984                              IRExpr*   stAddr,
   5985                              IRExpr*   stStoredata )
   5986 {
   5987    /* In short: treat a load-linked like a normal load followed by an
   5988       assignment of the loaded (shadow) data to the result temporary.
   5989       Treat a store-conditional like a normal store, and mark the
   5990       result temporary as defined. */
   5991    IRType resTy  = typeOfIRTemp(mce->sb->tyenv, stResult);
   5992    IRTemp resTmp = findShadowTmpV(mce, stResult);
   5993 
   5994    tl_assert(isIRAtom(stAddr));
   5995    if (stStoredata)
   5996       tl_assert(isIRAtom(stStoredata));
   5997 
   5998    if (stStoredata == NULL) {
   5999       /* Load Linked */
   6000       /* Just treat this as a normal load, followed by an assignment of
   6001          the value to .result. */
   6002       /* Stay sane */
   6003       tl_assert(resTy == Ity_I64 || resTy == Ity_I32
   6004                 || resTy == Ity_I16 || resTy == Ity_I8);
   6005       assign( 'V', mce, resTmp,
   6006                    expr2vbits_Load(
   6007                       mce, stEnd, resTy, stAddr, 0/*addr bias*/,
   6008                       NULL/*always happens*/) );
   6009    } else {
   6010       /* Store Conditional */
   6011       /* Stay sane */
   6012       IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
   6013                                    stStoredata);
   6014       tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
   6015                 || dataTy == Ity_I16 || dataTy == Ity_I8);
   6016       do_shadow_Store( mce, stEnd,
   6017                             stAddr, 0/* addr bias */,
   6018                             stStoredata,
   6019                             NULL /* shadow data */,
   6020                             NULL/*guard*/ );
   6021       /* This is a store conditional, so it writes to .result a value
   6022          indicating whether or not the store succeeded.  Just claim
   6023          this value is always defined.  In the PowerPC interpretation
   6024          of store-conditional, definedness of the success indication
   6025          depends on whether the address of the store matches the
   6026          reservation address.  But we can't tell that here (and
   6027          anyway, we're not being PowerPC-specific).  At least we are
   6028          guaranteed that the definedness of the store address, and its
    6029          addressability, will be checked as per normal.  So it seems
   6030          pretty safe to just say that the success indication is always
   6031          defined.
   6032 
   6033          In schemeS, for origin tracking, we must correspondingly set
   6034          a no-origin value for the origin shadow of .result.
   6035       */
   6036       tl_assert(resTy == Ity_I1);
   6037       assign( 'V', mce, resTmp, definedOfType(resTy) );
   6038    }
   6039 }
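
         /* In sketch form (illustrative): a load-linked
            "result = LD-Linked(addr)" becomes
            "result# = shadow-load(addr)", whilst a store-conditional
            emits an unguarded shadow store of the V bits of .storedata
            plus "result# = defined:I1", as per the rationale above. */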
   6040 
   6041 
   6042 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
   6043 
   6044 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
   6045 {
   6046    complainIfUndefined(mce, sg->guard, NULL);
   6047    /* do_shadow_Store will generate code to check the definedness and
   6048       validity of sg->addr, in the case where sg->guard evaluates to
   6049       True at run-time. */
   6050    do_shadow_Store( mce, sg->end,
   6051                     sg->addr, 0/* addr bias */,
   6052                     sg->data,
   6053                     NULL /* shadow data */,
   6054                     sg->guard );
   6055 }
   6056 
   6057 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
   6058 {
   6059    complainIfUndefined(mce, lg->guard, NULL);
   6060    /* expr2vbits_Load_guarded_General will generate code to check the
   6061       definedness and validity of lg->addr, in the case where
   6062       lg->guard evaluates to True at run-time. */
   6063 
   6064    /* Look at the LoadG's built-in conversion operation, to determine
   6065       the source (actual loaded data) type, and the equivalent IROp.
   6066       NOTE that implicitly we are taking a widening operation to be
   6067       applied to original atoms and producing one that applies to V
   6068       bits.  Since signed and unsigned widening are self-shadowing,
   6069       this is a straight copy of the op (modulo swapping from the
   6070       IRLoadGOp form to the IROp form).  Note also therefore that this
   6071       implicitly duplicates the logic to do with said widening ops in
   6072       expr2vbits_Unop.  See comment at the start of expr2vbits_Unop. */
   6073    IROp   vwiden   = Iop_INVALID;
   6074    IRType loadedTy = Ity_INVALID;
   6075    switch (lg->cvt) {
   6076       case ILGop_Ident64: loadedTy = Ity_I64; vwiden = Iop_INVALID; break;
   6077       case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
   6078       case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
   6079       case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
   6080       case ILGop_8Uto32:  loadedTy = Ity_I8;  vwiden = Iop_8Uto32;  break;
   6081       case ILGop_8Sto32:  loadedTy = Ity_I8;  vwiden = Iop_8Sto32;  break;
   6082       default: VG_(tool_panic)("do_shadow_LoadG");
   6083    }
   6084 
   6085    IRAtom* vbits_alt
   6086       = expr2vbits( mce, lg->alt );
   6087    IRAtom* vbits_final
   6088       = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
   6089                                         lg->addr, 0/*addr bias*/,
   6090                                         lg->guard, vwiden, vbits_alt );
   6091    /* And finally, bind the V bits to the destination temporary. */
   6092    assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
   6093 }
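
         /* For example (hypothetical temps): for a guarded widening load
            "t3 = if (guard) ILGop_16Uto32(LD16le(addr)) else alt", the
            code above computes, roughly,

               t3# = if (guard) 16Uto32(shadow-load:I16(addr))
                     else vbits-of(alt)

            that is, the widening op is applied unchanged to the V bits,
            since unsigned/signed widening is self-shadowing. */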
   6094 
   6095 
   6096 /*------------------------------------------------------------*/
   6097 /*--- Memcheck main                                        ---*/
   6098 /*------------------------------------------------------------*/
   6099 
   6100 static void schemeS ( MCEnv* mce, IRStmt* st );
   6101 
   6102 static Bool isBogusAtom ( IRAtom* at )
   6103 {
   6104    ULong n = 0;
   6105    IRConst* con;
   6106    tl_assert(isIRAtom(at));
   6107    if (at->tag == Iex_RdTmp)
   6108       return False;
   6109    tl_assert(at->tag == Iex_Const);
   6110    con = at->Iex.Const.con;
   6111    switch (con->tag) {
   6112       case Ico_U1:   return False;
   6113       case Ico_U8:   n = (ULong)con->Ico.U8; break;
   6114       case Ico_U16:  n = (ULong)con->Ico.U16; break;
   6115       case Ico_U32:  n = (ULong)con->Ico.U32; break;
   6116       case Ico_U64:  n = (ULong)con->Ico.U64; break;
   6117       case Ico_F32:  return False;
   6118       case Ico_F64:  return False;
   6119       case Ico_F32i: return False;
   6120       case Ico_F64i: return False;
   6121       case Ico_V128: return False;
   6122       case Ico_V256: return False;
   6123       default: ppIRExpr(at); tl_assert(0);
   6124    }
   6125    /* VG_(printf)("%llx\n", n); */
   6126    return (/*32*/    n == 0xFEFEFEFFULL
   6127            /*32*/ || n == 0x80808080ULL
   6128            /*32*/ || n == 0x7F7F7F7FULL
   6129            /*32*/ || n == 0x7EFEFEFFULL
   6130            /*32*/ || n == 0x81010100ULL
   6131            /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
   6132            /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
   6133            /*64*/ || n == 0x0000000000008080ULL
   6134            /*64*/ || n == 0x8080808080808080ULL
   6135            /*64*/ || n == 0x0101010101010101ULL
   6136           );
   6137 }
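
         /* The values above are characteristic of hand-optimised,
            word-at-a-time string code: for instance, glibc-style strlen
            implementations use magic constants such as 0x7EFEFEFF,
            0x80808080 and 0x01010101 to detect a zero byte within a
            word, and such code deliberately operates on partially
            undefined words.  Hence meeting one of these literals is a
            hint that the expensive interpretation is worth enabling. */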
   6138 
   6139 static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
   6140 {
   6141    Int      i;
   6142    IRExpr*  e;
   6143    IRDirty* d;
   6144    IRCAS*   cas;
   6145    switch (st->tag) {
   6146       case Ist_WrTmp:
   6147          e = st->Ist.WrTmp.data;
   6148          switch (e->tag) {
   6149             case Iex_Get:
   6150             case Iex_RdTmp:
   6151                return False;
   6152             case Iex_Const:
   6153                return isBogusAtom(e);
   6154             case Iex_Unop:
   6155                return isBogusAtom(e->Iex.Unop.arg)
   6156                       || e->Iex.Unop.op == Iop_GetMSBs8x16;
   6157             case Iex_GetI:
   6158                return isBogusAtom(e->Iex.GetI.ix);
   6159             case Iex_Binop:
   6160                return isBogusAtom(e->Iex.Binop.arg1)
   6161                       || isBogusAtom(e->Iex.Binop.arg2);
   6162             case Iex_Triop:
   6163                return isBogusAtom(e->Iex.Triop.details->arg1)
   6164                       || isBogusAtom(e->Iex.Triop.details->arg2)
   6165                       || isBogusAtom(e->Iex.Triop.details->arg3);
   6166             case Iex_Qop:
   6167                return isBogusAtom(e->Iex.Qop.details->arg1)
   6168                       || isBogusAtom(e->Iex.Qop.details->arg2)
   6169                       || isBogusAtom(e->Iex.Qop.details->arg3)
   6170                       || isBogusAtom(e->Iex.Qop.details->arg4);
   6171             case Iex_ITE:
   6172                return isBogusAtom(e->Iex.ITE.cond)
   6173                       || isBogusAtom(e->Iex.ITE.iftrue)
   6174                       || isBogusAtom(e->Iex.ITE.iffalse);
   6175             case Iex_Load:
   6176                return isBogusAtom(e->Iex.Load.addr);
   6177             case Iex_CCall:
   6178                for (i = 0; e->Iex.CCall.args[i]; i++)
   6179                   if (isBogusAtom(e->Iex.CCall.args[i]))
   6180                      return True;
   6181                return False;
   6182             default:
   6183                goto unhandled;
   6184          }
   6185       case Ist_Dirty:
   6186          d = st->Ist.Dirty.details;
   6187          for (i = 0; d->args[i]; i++) {
   6188             IRAtom* atom = d->args[i];
   6189             if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(atom))) {
   6190                if (isBogusAtom(atom))
   6191                   return True;
   6192             }
   6193          }
   6194          if (isBogusAtom(d->guard))
   6195             return True;
   6196          if (d->mAddr && isBogusAtom(d->mAddr))
   6197             return True;
   6198          return False;
   6199       case Ist_Put:
   6200          return isBogusAtom(st->Ist.Put.data);
   6201       case Ist_PutI:
   6202          return isBogusAtom(st->Ist.PutI.details->ix)
   6203                 || isBogusAtom(st->Ist.PutI.details->data);
   6204       case Ist_Store:
   6205          return isBogusAtom(st->Ist.Store.addr)
   6206                 || isBogusAtom(st->Ist.Store.data);
   6207       case Ist_StoreG: {
   6208          IRStoreG* sg = st->Ist.StoreG.details;
   6209          return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
   6210                 || isBogusAtom(sg->guard);
   6211       }
   6212       case Ist_LoadG: {
   6213          IRLoadG* lg = st->Ist.LoadG.details;
   6214          return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
   6215                 || isBogusAtom(lg->guard);
   6216       }
   6217       case Ist_Exit:
   6218          return isBogusAtom(st->Ist.Exit.guard);
   6219       case Ist_AbiHint:
   6220          return isBogusAtom(st->Ist.AbiHint.base)
   6221                 || isBogusAtom(st->Ist.AbiHint.nia);
   6222       case Ist_NoOp:
   6223       case Ist_IMark:
   6224       case Ist_MBE:
   6225          return False;
   6226       case Ist_CAS:
   6227          cas = st->Ist.CAS.details;
   6228          return isBogusAtom(cas->addr)
   6229                 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
   6230                 || isBogusAtom(cas->expdLo)
   6231                 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
   6232                 || isBogusAtom(cas->dataLo);
   6233       case Ist_LLSC:
   6234          return isBogusAtom(st->Ist.LLSC.addr)
   6235                 || (st->Ist.LLSC.storedata
   6236                        ? isBogusAtom(st->Ist.LLSC.storedata)
   6237                        : False);
   6238       default:
   6239       unhandled:
   6240          ppIRStmt(st);
    6241          VG_(tool_panic)("checkForBogusLiterals");
   6242    }
   6243 }
   6244 
   6245 
   6246 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
   6247                         IRSB* sb_in,
   6248                         const VexGuestLayout* layout,
   6249                         const VexGuestExtents* vge,
   6250                         const VexArchInfo* archinfo_host,
   6251                         IRType gWordTy, IRType hWordTy )
   6252 {
   6253    Bool    verboze = 0||False;
   6254    Bool    bogus;
   6255    Int     i, j, first_stmt;
   6256    IRStmt* st;
   6257    MCEnv   mce;
   6258    IRSB*   sb_out;
   6259 
   6260    if (gWordTy != hWordTy) {
   6261       /* We don't currently support this case. */
   6262       VG_(tool_panic)("host/guest word size mismatch");
   6263    }
   6264 
   6265    /* Check we're not completely nuts */
   6266    tl_assert(sizeof(UWord)  == sizeof(void*));
   6267    tl_assert(sizeof(Word)   == sizeof(void*));
   6268    tl_assert(sizeof(Addr)   == sizeof(void*));
   6269    tl_assert(sizeof(ULong)  == 8);
   6270    tl_assert(sizeof(Long)   == 8);
   6271    tl_assert(sizeof(UInt)   == 4);
   6272    tl_assert(sizeof(Int)    == 4);
   6273 
   6274    tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
   6275 
   6276    /* Set up SB */
   6277    sb_out = deepCopyIRSBExceptStmts(sb_in);
   6278 
   6279    /* Set up the running environment.  Both .sb and .tmpMap are
   6280       modified as we go along.  Note that tmps are added to both
   6281       .sb->tyenv and .tmpMap together, so the valid index-set for
   6282       those two arrays should always be identical. */
   6283    VG_(memset)(&mce, 0, sizeof(mce));
   6284    mce.sb             = sb_out;
   6285    mce.trace          = verboze;
   6286    mce.layout         = layout;
   6287    mce.hWordTy        = hWordTy;
   6288    mce.bogusLiterals  = False;
   6289 
   6290    /* Do expensive interpretation for Iop_Add32 and Iop_Add64 on
   6291       Darwin.  10.7 is mostly built with LLVM, which uses these for
   6292       bitfield inserts, and we get a lot of false errors if the cheap
   6293       interpretation is used, alas.  Could solve this much better if
   6294       we knew which of such adds came from x86/amd64 LEA instructions,
   6295       since these are the only ones really needing the expensive
   6296       interpretation, but that would require some way to tag them in
   6297       the _toIR.c front ends, which is a lot of faffing around.  So
   6298       for now just use the slow and blunt-instrument solution. */
   6299    mce.useLLVMworkarounds = False;
   6300 #  if defined(VGO_darwin)
   6301    mce.useLLVMworkarounds = True;
   6302 #  endif
   6303 
   6304    mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
   6305                             sizeof(TempMapEnt));
   6306    VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
   6307    for (i = 0; i < sb_in->tyenv->types_used; i++) {
   6308       TempMapEnt ent;
   6309       ent.kind    = Orig;
   6310       ent.shadowV = IRTemp_INVALID;
   6311       ent.shadowB = IRTemp_INVALID;
   6312       VG_(addToXA)( mce.tmpMap, &ent );
   6313    }
   6314    tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
   6315 
   6316    /* Make a preliminary inspection of the statements, to see if there
   6317       are any dodgy-looking literals.  If there are, we generate
   6318       extra-detailed (hence extra-expensive) instrumentation in
    6319       places.  Scan the whole bb even if dodginess is found earlier,
   6320       so that the flatness assertion is applied to all stmts. */
   6321 
   6322    bogus = False;
   6323 
   6324    for (i = 0; i < sb_in->stmts_used; i++) {
   6325 
   6326       st = sb_in->stmts[i];
   6327       tl_assert(st);
   6328       tl_assert(isFlatIRStmt(st));
   6329 
   6330       if (!bogus) {
   6331          bogus = checkForBogusLiterals(st);
   6332          if (0 && bogus) {
   6333             VG_(printf)("bogus: ");
   6334             ppIRStmt(st);
   6335             VG_(printf)("\n");
   6336          }
   6337       }
   6338 
   6339    }
   6340 
   6341    mce.bogusLiterals = bogus;
   6342 
   6343    /* Copy verbatim any IR preamble preceding the first IMark */
   6344 
   6345    tl_assert(mce.sb == sb_out);
   6346    tl_assert(mce.sb != sb_in);
   6347 
   6348    i = 0;
   6349    while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
   6350 
   6351       st = sb_in->stmts[i];
   6352       tl_assert(st);
   6353       tl_assert(isFlatIRStmt(st));
   6354 
   6355       stmt( 'C', &mce, sb_in->stmts[i] );
   6356       i++;
   6357    }
   6358 
   6359    /* Nasty problem.  IR optimisation of the pre-instrumented IR may
   6360       cause the IR following the preamble to contain references to IR
   6361       temporaries defined in the preamble.  Because the preamble isn't
   6362       instrumented, these temporaries don't have any shadows.
   6363       Nevertheless uses of them following the preamble will cause
   6364       memcheck to generate references to their shadows.  End effect is
   6365       to cause IR sanity check failures, due to references to
   6366       non-existent shadows.  This is only evident for the complex
   6367       preambles used for function wrapping on TOC-afflicted platforms
   6368       (ppc64-linux).
   6369 
   6370       The following loop therefore scans the preamble looking for
   6371       assignments to temporaries.  For each one found it creates an
   6372       assignment to the corresponding (V) shadow temp, marking it as
   6373       'defined'.  This is the same resulting IR as if the main
    6374       instrumentation loop below had been applied to the statement
   6375       'tmp = CONSTANT'.
   6376 
   6377       Similarly, if origin tracking is enabled, we must generate an
   6378       assignment for the corresponding origin (B) shadow, claiming
   6379       no-origin, as appropriate for a defined value.
   6380    */
   6381    for (j = 0; j < i; j++) {
   6382       if (sb_in->stmts[j]->tag == Ist_WrTmp) {
   6383          /* findShadowTmpV checks its arg is an original tmp;
   6384             no need to assert that here. */
   6385          IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
   6386          IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
   6387          IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
   6388          assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
   6389          if (MC_(clo_mc_level) == 3) {
   6390             IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
   6391             tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
   6392             assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
   6393          }
   6394          if (0) {
   6395             VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
   6396             ppIRType( ty_v );
   6397             VG_(printf)("\n");
   6398          }
   6399       }
   6400    }
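
            /* E.g. (an illustrative preamble stmt): if the preamble
               contains "t2 = GET:I64(520)", the loop above emits
               "t2# = 0x0:I64" (all-defined V bits) and, when origin
               tracking is enabled, "t2B = 0x0:I32" (no origin). */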
   6401 
   6402    /* Iterate over the remaining stmts to generate instrumentation. */
   6403 
   6404    tl_assert(sb_in->stmts_used > 0);
   6405    tl_assert(i >= 0);
   6406    tl_assert(i < sb_in->stmts_used);
   6407    tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
   6408 
   6409    for (/* use current i*/; i < sb_in->stmts_used; i++) {
   6410 
   6411       st = sb_in->stmts[i];
   6412       first_stmt = sb_out->stmts_used;
   6413 
   6414       if (verboze) {
   6415          VG_(printf)("\n");
   6416          ppIRStmt(st);
   6417          VG_(printf)("\n");
   6418       }
   6419 
   6420       if (MC_(clo_mc_level) == 3) {
   6421          /* See comments on case Ist_CAS below. */
   6422          if (st->tag != Ist_CAS)
   6423             schemeS( &mce, st );
   6424       }
   6425 
   6426       /* Generate instrumentation code for each stmt ... */
   6427 
   6428       switch (st->tag) {
   6429 
   6430          case Ist_WrTmp:
   6431             assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
   6432                                expr2vbits( &mce, st->Ist.WrTmp.data) );
   6433             break;
   6434 
   6435          case Ist_Put:
   6436             do_shadow_PUT( &mce,
   6437                            st->Ist.Put.offset,
   6438                            st->Ist.Put.data,
   6439                            NULL /* shadow atom */, NULL /* guard */ );
   6440             break;
   6441 
   6442          case Ist_PutI:
   6443             do_shadow_PUTI( &mce, st->Ist.PutI.details);
   6444             break;
   6445 
   6446          case Ist_Store:
   6447             do_shadow_Store( &mce, st->Ist.Store.end,
   6448                                    st->Ist.Store.addr, 0/* addr bias */,
   6449                                    st->Ist.Store.data,
   6450                                    NULL /* shadow data */,
   6451                                    NULL/*guard*/ );
   6452             break;
   6453 
   6454          case Ist_StoreG:
   6455             do_shadow_StoreG( &mce, st->Ist.StoreG.details );
   6456             break;
   6457 
   6458          case Ist_LoadG:
   6459             do_shadow_LoadG( &mce, st->Ist.LoadG.details );
   6460             break;
   6461 
   6462          case Ist_Exit:
   6463             complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
   6464             break;
   6465 
   6466          case Ist_IMark:
   6467             break;
   6468 
   6469          case Ist_NoOp:
   6470          case Ist_MBE:
   6471             break;
   6472 
   6473          case Ist_Dirty:
   6474             do_shadow_Dirty( &mce, st->Ist.Dirty.details );
   6475             break;
   6476 
   6477          case Ist_AbiHint:
   6478             do_AbiHint( &mce, st->Ist.AbiHint.base,
   6479                               st->Ist.AbiHint.len,
   6480                               st->Ist.AbiHint.nia );
   6481             break;
   6482 
   6483          case Ist_CAS:
   6484             do_shadow_CAS( &mce, st->Ist.CAS.details );
   6485             /* Note, do_shadow_CAS copies the CAS itself to the output
   6486                block, because it needs to add instrumentation both
   6487                before and after it.  Hence skip the copy below.  Also
   6488                skip the origin-tracking stuff (call to schemeS) above,
   6489                since that's all tangled up with it too; do_shadow_CAS
   6490                does it all. */
   6491             break;
   6492 
   6493          case Ist_LLSC:
   6494             do_shadow_LLSC( &mce,
   6495                             st->Ist.LLSC.end,
   6496                             st->Ist.LLSC.result,
   6497                             st->Ist.LLSC.addr,
   6498                             st->Ist.LLSC.storedata );
   6499             break;
   6500 
   6501          default:
   6502             VG_(printf)("\n");
   6503             ppIRStmt(st);
   6504             VG_(printf)("\n");
   6505             VG_(tool_panic)("memcheck: unhandled IRStmt");
   6506 
   6507       } /* switch (st->tag) */
   6508 
   6509       if (0 && verboze) {
   6510          for (j = first_stmt; j < sb_out->stmts_used; j++) {
   6511             VG_(printf)("   ");
   6512             ppIRStmt(sb_out->stmts[j]);
   6513             VG_(printf)("\n");
   6514          }
   6515          VG_(printf)("\n");
   6516       }
   6517 
   6518       /* ... and finally copy the stmt itself to the output.  Except,
   6519          skip the copy of IRCASs; see comments on case Ist_CAS
   6520          above. */
   6521       if (st->tag != Ist_CAS)
   6522          stmt('C', &mce, st);
   6523    }
   6524 
   6525    /* Now we need to complain if the jump target is undefined. */
   6526    first_stmt = sb_out->stmts_used;
   6527 
   6528    if (verboze) {
   6529       VG_(printf)("sb_in->next = ");
   6530       ppIRExpr(sb_in->next);
   6531       VG_(printf)("\n\n");
   6532    }
   6533 
   6534    complainIfUndefined( &mce, sb_in->next, NULL );
   6535 
   6536    if (0 && verboze) {
   6537       for (j = first_stmt; j < sb_out->stmts_used; j++) {
   6538          VG_(printf)("   ");
   6539          ppIRStmt(sb_out->stmts[j]);
   6540          VG_(printf)("\n");
   6541       }
   6542       VG_(printf)("\n");
   6543    }
   6544 
    6545    /* If this fails, there's been some serious snafu with tmp management
    6546       that should be investigated. */
   6547    tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
   6548    VG_(deleteXA)( mce.tmpMap );
   6549 
   6550    tl_assert(mce.sb == sb_out);
   6551    return sb_out;
   6552 }
   6553 
   6554 /*------------------------------------------------------------*/
   6555 /*--- Post-tree-build final tidying                        ---*/
   6556 /*------------------------------------------------------------*/
   6557 
   6558 /* This exploits the observation that Memcheck often produces
   6559    repeated conditional calls of the form
   6560 
   6561    Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
   6562 
   6563    with the same guard expression G guarding the same helper call.
   6564    The second and subsequent calls are redundant.  This usually
   6565    results from instrumentation of guest code containing multiple
   6566    memory references at different constant offsets from the same base
   6567    register.  After optimisation of the instrumentation, you get a
   6568    test for the definedness of the base register for each memory
   6569    reference, which is kinda pointless.  MC_(final_tidy) therefore
   6570    looks for such repeated calls and removes all but the first. */
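
         /* For example (a hypothetical fragment): two guest loads at
            0(%reg) and 8(%reg) can each give rise, after optimisation
            of the instrumentation, to

               if (guard) DIRTY MC_(helperc_value_check8_fail_no_o)()

            with literally the same guard expression; the pass below
            replaces the second and subsequent such calls with NoOps. */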
   6571 
   6572 /* A struct for recording which (helper, guard) pairs we have already
   6573    seen. */
   6574 typedef
   6575    struct { void* entry; IRExpr* guard; }
   6576    Pair;
   6577 
   6578 /* Return True if e1 and e2 definitely denote the same value (used to
   6579    compare guards).  Return False if unknown; False is the safe
   6580    answer.  Since guest registers and guest memory do not have the
   6581    SSA property we must return False if any Gets or Loads appear in
   6582    the expression. */
   6583 
   6584 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
   6585 {
   6586    if (e1->tag != e2->tag)
   6587       return False;
   6588    switch (e1->tag) {
   6589       case Iex_Const:
   6590          return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
   6591       case Iex_Binop:
   6592          return e1->Iex.Binop.op == e2->Iex.Binop.op
   6593                 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
   6594                 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
   6595       case Iex_Unop:
   6596          return e1->Iex.Unop.op == e2->Iex.Unop.op
   6597                 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
   6598       case Iex_RdTmp:
   6599          return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
   6600       case Iex_ITE:
   6601          return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
   6602                 && sameIRValue( e1->Iex.ITE.iftrue,  e2->Iex.ITE.iftrue )
   6603                 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
   6604       case Iex_Qop:
   6605       case Iex_Triop:
   6606       case Iex_CCall:
   6607          /* be lazy.  Could define equality for these, but they never
   6608             appear to be used. */
   6609          return False;
   6610       case Iex_Get:
   6611       case Iex_GetI:
   6612       case Iex_Load:
   6613          /* be conservative - these may not give the same value each
   6614             time */
   6615          return False;
   6616       case Iex_Binder:
   6617          /* should never see this */
   6618          /* fallthrough */
   6619       default:
   6620          VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
   6621          ppIRExpr(e1);
   6622          VG_(tool_panic)("memcheck:sameIRValue");
   6623          return False;
   6624    }
   6625 }
   6626 
   6627 /* See if 'pairs' already has an entry for (entry, guard).  Return
   6628    True if so.  If not, add an entry. */
   6629 
   6630 static
   6631 Bool check_or_add ( XArray* /*of Pair*/ pairs, IRExpr* guard, void* entry )
   6632 {
   6633    Pair  p;
   6634    Pair* pp;
   6635    Int   i, n = VG_(sizeXA)( pairs );
   6636    for (i = 0; i < n; i++) {
   6637       pp = VG_(indexXA)( pairs, i );
   6638       if (pp->entry == entry && sameIRValue(pp->guard, guard))
   6639          return True;
   6640    }
   6641    p.guard = guard;
   6642    p.entry = entry;
   6643    VG_(addToXA)( pairs, &p );
   6644    return False;
   6645 }
   6646 
   6647 static Bool is_helperc_value_checkN_fail ( const HChar* name )
   6648 {
   6649    return
   6650       0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_no_o)")
   6651       || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_no_o)")
   6652       || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_no_o)")
   6653       || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_no_o)")
   6654       || 0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_w_o)")
   6655       || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_w_o)")
   6656       || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_w_o)")
   6657       || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_w_o)");
   6658 }
   6659 
   6660 IRSB* MC_(final_tidy) ( IRSB* sb_in )
   6661 {
   6662    Int i;
   6663    IRStmt*   st;
   6664    IRDirty*  di;
   6665    IRExpr*   guard;
   6666    IRCallee* cee;
   6667    Bool      alreadyPresent;
   6668    XArray*   pairs = VG_(newXA)( VG_(malloc), "mc.ft.1",
   6669                                  VG_(free), sizeof(Pair) );
   6670    /* Scan forwards through the statements.  Each time a call to one
   6671       of the relevant helpers is seen, check if we have made a
   6672       previous call to the same helper using the same guard
   6673       expression, and if so, delete the call. */
   6674    for (i = 0; i < sb_in->stmts_used; i++) {
   6675       st = sb_in->stmts[i];
   6676       tl_assert(st);
   6677       if (st->tag != Ist_Dirty)
   6678          continue;
   6679       di = st->Ist.Dirty.details;
   6680       guard = di->guard;
   6681       tl_assert(guard);
   6682       if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
   6683       cee = di->cee;
   6684       if (!is_helperc_value_checkN_fail( cee->name ))
   6685          continue;
    6686       /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
    6687          guard 'guard'.  Check if we have already seen a call to this
    6688          function with the same guard.  If so, delete it.  If not,
    6689          add it to the set of calls we do know about. */
   6690       alreadyPresent = check_or_add( pairs, guard, cee->addr );
   6691       if (alreadyPresent) {
   6692          sb_in->stmts[i] = IRStmt_NoOp();
   6693          if (0) VG_(printf)("XX\n");
   6694       }
   6695    }
   6696    VG_(deleteXA)( pairs );
   6697    return sb_in;
   6698 }
   6699 
   6700 
   6701 /*------------------------------------------------------------*/
   6702 /*--- Origin tracking stuff                                ---*/
   6703 /*------------------------------------------------------------*/
   6704 
   6705 /* Almost identical to findShadowTmpV. */
   6706 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
   6707 {
   6708    TempMapEnt* ent;
   6709    /* VG_(indexXA) range-checks 'orig', hence no need to check
   6710       here. */
   6711    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   6712    tl_assert(ent->kind == Orig);
   6713    if (ent->shadowB == IRTemp_INVALID) {
   6714       IRTemp tmpB
   6715         = newTemp( mce, Ity_I32, BSh );
   6716       /* newTemp may cause mce->tmpMap to resize, hence previous results
   6717          from VG_(indexXA) are invalid. */
   6718       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   6719       tl_assert(ent->kind == Orig);
   6720       tl_assert(ent->shadowB == IRTemp_INVALID);
   6721       ent->shadowB = tmpB;
   6722    }
   6723    return ent->shadowB;
   6724 }
   6725 
   6726 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
   6727 {
   6728    return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
   6729 }
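
/* Note that Max32U keeps the result equal to one of its inputs, so
   combining two otags this way still yields a plausible otag (which
   an OR of the two bit patterns would not), and the result is zero
   -- "no origin information" -- only when both inputs are zero. */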
   6730 
   6731 
   6732 /* Make a guarded origin load, with no special handling in the
   6733    didn't-happen case.  A GUARD of NULL is assumed to mean "always
   6734    True".
   6735 
   6736    Generate IR to do a shadow origins load from BASEADDR+OFFSET and
   6737    return the otag.  The loaded size is SZB.  If GUARD evaluates to
   6738    False at run time then the returned otag is zero.
   6739 */
   6740 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
   6741                                     IRAtom* baseaddr,
   6742                                     Int offset, IRExpr* guard )
   6743 {
   6744    void*    hFun;
   6745    const HChar* hName;
   6746    IRTemp   bTmp;
   6747    IRDirty* di;
   6748    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
   6749    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
   6750    IRAtom*  ea    = baseaddr;
   6751    if (offset != 0) {
   6752       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
   6753                                    : mkU64( (Long)(Int)offset );
   6754       ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
   6755    }
   6756    bTmp = newTemp(mce, mce->hWordTy, BSh);
   6757 
   6758    switch (szB) {
   6759       case 1: hFun  = (void*)&MC_(helperc_b_load1);
   6760               hName = "MC_(helperc_b_load1)";
   6761               break;
   6762       case 2: hFun  = (void*)&MC_(helperc_b_load2);
   6763               hName = "MC_(helperc_b_load2)";
   6764               break;
   6765       case 4: hFun  = (void*)&MC_(helperc_b_load4);
   6766               hName = "MC_(helperc_b_load4)";
   6767               break;
   6768       case 8: hFun  = (void*)&MC_(helperc_b_load8);
   6769               hName = "MC_(helperc_b_load8)";
   6770               break;
   6771       case 16: hFun  = (void*)&MC_(helperc_b_load16);
   6772                hName = "MC_(helperc_b_load16)";
   6773                break;
   6774       case 32: hFun  = (void*)&MC_(helperc_b_load32);
   6775                hName = "MC_(helperc_b_load32)";
   6776                break;
   6777       default:
   6778          VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
   6779          tl_assert(0);
   6780    }
   6781    di = unsafeIRDirty_1_N(
   6782            bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
   6783            mkIRExprVec_1( ea )
   6784         );
   6785    if (guard) {
   6786       di->guard = guard;
   6787       /* Ideally the didn't-happen return value here would be
   6788          all-zeroes (unknown-origin), so it'd be harmless if it got
    6789          used inadvertently.  We slum it out with the IR-mandated
   6790          default value (0b01 repeating, 0x55 etc) as that'll probably
   6791          trump all legitimate otags via Max32, and it's pretty
   6792          obviously bogus. */
   6793    }
   6794    /* no need to mess with any annotations.  This call accesses
   6795       neither guest state nor guest memory. */
   6796    stmt( 'B', mce, IRStmt_Dirty(di) );
   6797    if (mce->hWordTy == Ity_I64) {
   6798       /* 64-bit host */
   6799       IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
   6800       assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
   6801       return mkexpr(bTmp32);
   6802    } else {
   6803       /* 32-bit host */
   6804       return mkexpr(bTmp);
   6805    }
   6806 }
   6807 
   6808 
   6809 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET.  The
   6810    loaded size is SZB.  The load is regarded as unconditional (always
   6811    happens).
   6812 */
   6813 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
   6814                             Int offset )
   6815 {
   6816    return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
   6817 }
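
/* A minimal usage sketch, for a 64-bit host: asking for the origin
   of an unconditional 8-byte load,

      IRAtom* otag = gen_load_b( mce, 8, addr, 0 );

   emits, roughly (names illustrative),

      t_b   = DIRTY ::: MC_(helperc_b_load8)(addr)    -- t_b :: Ity_I64
      t_b32 = 64to32(t_b)

   and returns t_b32, the 32-bit otag.  A nonzero OFFSET would first
   fold an Add64 into the effective address. */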
   6818 
   6819 
   6820 /* The most general handler for guarded origin loads.  A GUARD of NULL
   6821    is assumed to mean "always True".
   6822 
   6823    Generate IR to do a shadow origin load from ADDR+BIAS and return
   6824    the B bits.  The loaded type is TY.  If GUARD evaluates to False at
   6825    run time then the returned B bits are simply BALT instead.
   6826 */
   6827 static
   6828 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
   6829                                         IRType ty,
   6830                                         IRAtom* addr, UInt bias,
   6831                                         IRAtom* guard, IRAtom* balt )
   6832 {
   6833    /* If the guard evaluates to True, this will hold the loaded
   6834       origin.  If the guard evaluates to False, this will be zero,
   6835       meaning "unknown origin", in which case we will have to replace
   6836       it using an ITE below. */
   6837    IRAtom* iftrue
   6838       = assignNew('B', mce, Ity_I32,
   6839                   gen_guarded_load_b(mce, sizeofIRType(ty),
   6840                                      addr, bias, guard));
   6841    /* These are the bits we will return if the load doesn't take
   6842       place. */
   6843    IRAtom* iffalse
   6844       = balt;
   6845    /* Prepare the cond for the ITE.  Convert a NULL cond into
   6846       something that iropt knows how to fold out later. */
   6847    IRAtom* cond
   6848       = guard == NULL  ? mkU1(1)  : guard;
   6849    /* And assemble the final result. */
   6850    return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
   6851 }
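
/* Schematically, for a 4-byte guarded load this routine produces

      t_iftrue = <guarded origin load of 4 bytes at ADDR+BIAS>
      t_res    = ITE(GUARD, t_iftrue, BALT)

   and returns t_res.  With a NULL GUARD the condition degenerates to
   the constant 1:I1, and iropt folds the ITE away, leaving just the
   unconditional load. */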
   6852 
   6853 
   6854 /* Generate a shadow origins store.  guard :: Ity_I1 controls whether
   6855    the store really happens; NULL means it unconditionally does. */
   6856 static void gen_store_b ( MCEnv* mce, Int szB,
   6857                           IRAtom* baseaddr, Int offset, IRAtom* dataB,
   6858                           IRAtom* guard )
   6859 {
   6860    void*    hFun;
   6861    const HChar* hName;
   6862    IRDirty* di;
   6863    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
   6864    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
   6865    IRAtom*  ea    = baseaddr;
   6866    if (guard) {
   6867       tl_assert(isOriginalAtom(mce, guard));
   6868       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
   6869    }
   6870    if (offset != 0) {
   6871       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
   6872                                    : mkU64( (Long)(Int)offset );
    6873       ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
   6874    }
   6875    if (mce->hWordTy == Ity_I64)
   6876       dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
   6877 
   6878    switch (szB) {
   6879       case 1: hFun  = (void*)&MC_(helperc_b_store1);
   6880               hName = "MC_(helperc_b_store1)";
   6881               break;
   6882       case 2: hFun  = (void*)&MC_(helperc_b_store2);
   6883               hName = "MC_(helperc_b_store2)";
   6884               break;
   6885       case 4: hFun  = (void*)&MC_(helperc_b_store4);
   6886               hName = "MC_(helperc_b_store4)";
   6887               break;
   6888       case 8: hFun  = (void*)&MC_(helperc_b_store8);
   6889               hName = "MC_(helperc_b_store8)";
   6890               break;
   6891       case 16: hFun  = (void*)&MC_(helperc_b_store16);
   6892                hName = "MC_(helperc_b_store16)";
   6893                break;
   6894       case 32: hFun  = (void*)&MC_(helperc_b_store32);
   6895                hName = "MC_(helperc_b_store32)";
   6896                break;
   6897       default:
   6898          tl_assert(0);
   6899    }
   6900    di = unsafeIRDirty_0_N( 2/*regparms*/,
   6901            hName, VG_(fnptr_to_fnentry)( hFun ),
   6902            mkIRExprVec_2( ea, dataB )
   6903         );
   6904    /* no need to mess with any annotations.  This call accesses
   6905       neither guest state nor guest memory. */
   6906    if (guard) di->guard = guard;
   6907    stmt( 'B', mce, IRStmt_Dirty(di) );
   6908 }
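
/* Usage sketch: an unconditional 4-byte origin store on a 64-bit
   host, gen_store_b(mce, 4, baseaddr, 0, dataB, NULL), boils down to

      t_d = 32Uto64(dataB)
      DIRTY ::: MC_(helperc_b_store4)(baseaddr, t_d)

   A non-NULL guard ends up in di->guard, making the helper call
   conditional at run time, and a nonzero offset adds an Add64 (or
   Add32) to form the effective address first. */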
   6909 
   6910 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
   6911    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
   6912    if (eTy == Ity_I64)
   6913       return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
   6914    if (eTy == Ity_I32)
   6915       return e;
   6916    tl_assert(0);
   6917 }
   6918 
   6919 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
   6920    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
   6921    tl_assert(eTy == Ity_I32);
   6922    if (dstTy == Ity_I64)
   6923       return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
   6924    tl_assert(0);
   6925 }
   6926 
   6927 
   6928 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
   6929 {
   6930    tl_assert(MC_(clo_mc_level) == 3);
   6931 
   6932    switch (e->tag) {
   6933 
   6934       case Iex_GetI: {
   6935          IRRegArray* descr_b;
   6936          IRAtom      *t1, *t2, *t3, *t4;
   6937          IRRegArray* descr      = e->Iex.GetI.descr;
   6938          IRType equivIntTy
   6939             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
   6940          /* If this array is unshadowable for whatever reason, use the
   6941             usual approximation. */
   6942          if (equivIntTy == Ity_INVALID)
   6943             return mkU32(0);
   6944          tl_assert(sizeofIRType(equivIntTy) >= 4);
   6945          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
   6946          descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
   6947                                  equivIntTy, descr->nElems );
   6948          /* Do a shadow indexed get of the same size, giving t1.  Take
   6949             the bottom 32 bits of it, giving t2.  Compute into t3 the
   6950             origin for the index (almost certainly zero, but there's
   6951             no harm in being completely general here, since iropt will
   6952             remove any useless code), and fold it in, giving a final
   6953             value t4. */
   6954          t1 = assignNew( 'B', mce, equivIntTy,
   6955                           IRExpr_GetI( descr_b, e->Iex.GetI.ix,
   6956                                                 e->Iex.GetI.bias ));
   6957          t2 = narrowTo32( mce, t1 );
   6958          t3 = schemeE( mce, e->Iex.GetI.ix );
   6959          t4 = gen_maxU32( mce, t2, t3 );
   6960          return t4;
   6961       }
   6962       case Iex_CCall: {
   6963          Int i;
   6964          IRAtom*  here;
   6965          IRExpr** args = e->Iex.CCall.args;
   6966          IRAtom*  curr = mkU32(0);
   6967          for (i = 0; args[i]; i++) {
   6968             tl_assert(i < 32);
   6969             tl_assert(isOriginalAtom(mce, args[i]));
   6970             /* Only take notice of this arg if the callee's
   6971                mc-exclusion mask does not say it is to be excluded. */
   6972             if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
   6973                /* the arg is to be excluded from definedness checking.
   6974                   Do nothing. */
   6975                if (0) VG_(printf)("excluding %s(%d)\n",
   6976                                   e->Iex.CCall.cee->name, i);
   6977             } else {
   6978                /* calculate the arg's definedness, and pessimistically
   6979                   merge it in. */
   6980                here = schemeE( mce, args[i] );
   6981                curr = gen_maxU32( mce, curr, here );
   6982             }
   6983          }
   6984          return curr;
   6985       }
   6986       case Iex_Load: {
   6987          Int dszB;
   6988          dszB = sizeofIRType(e->Iex.Load.ty);
   6989          /* assert that the B value for the address is already
   6990             available (somewhere) */
   6991          tl_assert(isIRAtom(e->Iex.Load.addr));
   6992          tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
   6993          return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
   6994       }
   6995       case Iex_ITE: {
   6996          IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
   6997          IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
   6998          IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
   6999          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
   7000       }
   7001       case Iex_Qop: {
   7002          IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
   7003          IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
   7004          IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
   7005          IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
   7006          return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
   7007                                  gen_maxU32( mce, b3, b4 ) );
   7008       }
   7009       case Iex_Triop: {
   7010          IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
   7011          IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
   7012          IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
   7013          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
   7014       }
   7015       case Iex_Binop: {
   7016          switch (e->Iex.Binop.op) {
   7017             case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
   7018             case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
   7019             case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
   7020             case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
   7021                /* Just say these all produce a defined result,
   7022                   regardless of their arguments.  See
   7023                   COMMENT_ON_CasCmpEQ in this file. */
   7024                return mkU32(0);
   7025             default: {
   7026                IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
   7027                IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
   7028                return gen_maxU32( mce, b1, b2 );
   7029             }
   7030          }
   7031          tl_assert(0);
   7032          /*NOTREACHED*/
   7033       }
   7034       case Iex_Unop: {
   7035          IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
   7036          return b1;
   7037       }
   7038       case Iex_Const:
   7039          return mkU32(0);
   7040       case Iex_RdTmp:
   7041          return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
   7042       case Iex_Get: {
   7043          Int b_offset = MC_(get_otrack_shadow_offset)(
   7044                            e->Iex.Get.offset,
   7045                            sizeofIRType(e->Iex.Get.ty)
   7046                         );
   7047          tl_assert(b_offset >= -1
   7048                    && b_offset <= mce->layout->total_sizeB -4);
   7049          if (b_offset >= 0) {
   7050             /* FIXME: this isn't an atom! */
   7051             return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
   7052                                Ity_I32 );
   7053          }
   7054          return mkU32(0);
   7055       }
   7056       default:
   7057          VG_(printf)("mc_translate.c: schemeE: unhandled: ");
   7058          ppIRExpr(e);
   7059          VG_(tool_panic)("memcheck:schemeE");
   7060    }
   7061 }
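
/* A worked example of how schemeE composes.  For the expression

      Add32(t7, LDle:I32(t9))

   the Iex_Binop case recurses into both arguments: t7 contributes
   its B-shadow b7 (Iex_RdTmp), and the load contributes a 4-byte
   origin load at t9 (Iex_Load), giving, informally,

      Max32U( b7, <origin load of 4 bytes at t9> )

   Constant leaves contribute mkU32(0) ("defined"), so a
   fully-constant subtree yields zero, which iropt can then fold
   away. */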
   7062 
   7063 
   7064 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
   7065 {
   7066    // This is a hacked version of do_shadow_Dirty
   7067    Int       i, k, n, toDo, gSz, gOff;
   7068    IRAtom    *here, *curr;
   7069    IRTemp    dst;
   7070 
   7071    /* First check the guard. */
   7072    curr = schemeE( mce, d->guard );
   7073 
   7074    /* Now round up all inputs and maxU32 over them. */
   7075 
   7076    /* Inputs: unmasked args
   7077       Note: arguments are evaluated REGARDLESS of the guard expression */
   7078    for (i = 0; d->args[i]; i++) {
   7079       IRAtom* arg = d->args[i];
   7080       if ( (d->cee->mcx_mask & (1<<i))
   7081            || UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)) ) {
   7082          /* ignore this arg */
   7083       } else {
   7084          here = schemeE( mce, arg );
   7085          curr = gen_maxU32( mce, curr, here );
   7086       }
   7087    }
   7088 
   7089    /* Inputs: guest state that we read. */
   7090    for (i = 0; i < d->nFxState; i++) {
   7091       tl_assert(d->fxState[i].fx != Ifx_None);
   7092       if (d->fxState[i].fx == Ifx_Write)
   7093          continue;
   7094 
   7095       /* Enumerate the described state segments */
   7096       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   7097          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   7098          gSz  = d->fxState[i].size;
   7099 
   7100          /* Ignore any sections marked as 'always defined'. */
   7101          if (isAlwaysDefd(mce, gOff, gSz)) {
    7102             if (0)
    7103                VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
    7104                            gOff, gSz);
   7105             continue;
   7106          }
   7107 
   7108          /* This state element is read or modified.  So we need to
   7109             consider it.  If larger than 4 bytes, deal with it in
   7110             4-byte chunks. */
   7111          while (True) {
   7112             Int b_offset;
   7113             tl_assert(gSz >= 0);
   7114             if (gSz == 0) break;
   7115             n = gSz <= 4 ? gSz : 4;
   7116             /* update 'curr' with maxU32 of the state slice
   7117                gOff .. gOff+n-1 */
   7118             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
   7119             if (b_offset != -1) {
   7120                /* Observe the guard expression. If it is false use 0, i.e.
   7121                   nothing is known about the origin */
   7122                IRAtom *cond, *iffalse, *iftrue;
   7123 
   7124                cond = assignNew( 'B', mce, Ity_I1, d->guard);
   7125                iffalse = mkU32(0);
   7126                iftrue  = assignNew( 'B', mce, Ity_I32,
   7127                                     IRExpr_Get(b_offset
   7128                                                  + 2*mce->layout->total_sizeB,
   7129                                                Ity_I32));
   7130                here = assignNew( 'B', mce, Ity_I32,
   7131                                  IRExpr_ITE(cond, iftrue, iffalse));
   7132                curr = gen_maxU32( mce, curr, here );
   7133             }
   7134             gSz -= n;
   7135             gOff += n;
   7136          }
   7137       }
   7138    }
   7139 
   7140    /* Inputs: memory */
   7141 
   7142    if (d->mFx != Ifx_None) {
   7143       /* Because we may do multiple shadow loads/stores from the same
   7144          base address, it's best to do a single test of its
   7145          definedness right now.  Post-instrumentation optimisation
   7146          should remove all but this test. */
   7147       tl_assert(d->mAddr);
   7148       here = schemeE( mce, d->mAddr );
   7149       curr = gen_maxU32( mce, curr, here );
   7150    }
   7151 
   7152    /* Deal with memory inputs (reads or modifies) */
   7153    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
   7154       toDo   = d->mSize;
   7155       /* chew off 32-bit chunks.  We don't care about the endianness
    7156          since it's all going to be condensed down to a single otag,
   7157          but nevertheless choose an endianness which is hopefully
   7158          native to the platform. */
   7159       while (toDo >= 4) {
   7160          here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
   7161                                     d->guard );
   7162          curr = gen_maxU32( mce, curr, here );
   7163          toDo -= 4;
   7164       }
   7165       /* handle possible 16-bit excess */
   7166       while (toDo >= 2) {
   7167          here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
   7168                                     d->guard );
   7169          curr = gen_maxU32( mce, curr, here );
   7170          toDo -= 2;
   7171       }
   7172       /* chew off the remaining 8-bit chunk, if any */
   7173       if (toDo == 1) {
   7174          here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
   7175                                     d->guard );
   7176          curr = gen_maxU32( mce, curr, here );
   7177          toDo -= 1;
   7178       }
   7179       tl_assert(toDo == 0);
   7180    }
   7181 
   7182    /* Whew!  So curr is a 32-bit B-value which should give an origin
   7183       of some use if any of the inputs to the helper are undefined.
   7184       Now we need to re-distribute the results to all destinations. */
   7185 
   7186    /* Outputs: the destination temporary, if there is one. */
   7187    if (d->tmp != IRTemp_INVALID) {
   7188       dst   = findShadowTmpB(mce, d->tmp);
    7189       assign( 'B', mce, dst, curr );
   7190    }
   7191 
   7192    /* Outputs: guest state that we write or modify. */
   7193    for (i = 0; i < d->nFxState; i++) {
   7194       tl_assert(d->fxState[i].fx != Ifx_None);
   7195       if (d->fxState[i].fx == Ifx_Read)
   7196          continue;
   7197 
   7198       /* Enumerate the described state segments */
   7199       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   7200          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   7201          gSz  = d->fxState[i].size;
   7202 
   7203          /* Ignore any sections marked as 'always defined'. */
   7204          if (isAlwaysDefd(mce, gOff, gSz))
   7205             continue;
   7206 
   7207          /* This state element is written or modified.  So we need to
   7208             consider it.  If larger than 4 bytes, deal with it in
   7209             4-byte chunks. */
   7210          while (True) {
   7211             Int b_offset;
   7212             tl_assert(gSz >= 0);
   7213             if (gSz == 0) break;
   7214             n = gSz <= 4 ? gSz : 4;
   7215             /* Write 'curr' to the state slice gOff .. gOff+n-1 */
   7216             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
   7217             if (b_offset != -1) {
   7218 
   7219                /* If the guard expression evaluates to false we simply Put
   7220                   the value that is already stored in the guest state slot */
   7221                IRAtom *cond, *iffalse;
   7222 
   7223                cond    = assignNew('B', mce, Ity_I1,
   7224                                    d->guard);
   7225                iffalse = assignNew('B', mce, Ity_I32,
   7226                                    IRExpr_Get(b_offset +
   7227                                               2*mce->layout->total_sizeB,
   7228                                               Ity_I32));
    7229                curr = assignNew('B', mce, Ity_I32,
   7230                                 IRExpr_ITE(cond, curr, iffalse));
   7231 
   7232                stmt( 'B', mce, IRStmt_Put(b_offset
   7233                                           + 2*mce->layout->total_sizeB,
   7234                                           curr ));
   7235             }
   7236             gSz -= n;
   7237             gOff += n;
   7238          }
   7239       }
   7240    }
   7241 
   7242    /* Outputs: memory that we write or modify.  Same comments about
   7243       endianness as above apply. */
   7244    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
   7245       toDo   = d->mSize;
   7246       /* chew off 32-bit chunks */
   7247       while (toDo >= 4) {
   7248          gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
   7249                       d->guard );
   7250          toDo -= 4;
   7251       }
   7252       /* handle possible 16-bit excess */
   7253       while (toDo >= 2) {
   7254          gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
   7255                       d->guard );
   7256          toDo -= 2;
   7257       }
   7258       /* chew off the remaining 8-bit chunk, if any */
   7259       if (toDo == 1) {
   7260          gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
   7261                       d->guard );
   7262          toDo -= 1;
   7263       }
   7264       tl_assert(toDo == 0);
   7265    }
   7266 }
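
/* Chunking sketch for the memory loops above: with d->mSize == 7,
   the offsets passed as (d->mSize - toDo) work out to

      4 bytes at offset 0
      2 bytes at offset 4
      1 byte  at offset 6

   so the region is covered exactly once.  On the input side each
   chunk's otag is folded into 'curr' via gen_maxU32; on the output
   side 'curr' is stored to each chunk; in both cases the accesses
   are conditional on d->guard. */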
   7267 
   7268 
   7269 /* Generate IR for origin shadowing for a general guarded store. */
   7270 static void do_origins_Store_guarded ( MCEnv* mce,
   7271                                        IREndness stEnd,
   7272                                        IRExpr* stAddr,
   7273                                        IRExpr* stData,
   7274                                        IRExpr* guard )
   7275 {
   7276    Int     dszB;
   7277    IRAtom* dataB;
   7278    /* assert that the B value for the address is already available
   7279       (somewhere), since the call to schemeE will want to see it.
   7280       XXXX how does this actually ensure that?? */
   7281    tl_assert(isIRAtom(stAddr));
   7282    tl_assert(isIRAtom(stData));
   7283    dszB  = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
   7284    dataB = schemeE( mce, stData );
   7285    gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
   7286 }
   7287 
   7288 
   7289 /* Generate IR for origin shadowing for a plain store. */
   7290 static void do_origins_Store_plain ( MCEnv* mce,
   7291                                      IREndness stEnd,
   7292                                      IRExpr* stAddr,
   7293                                      IRExpr* stData )
   7294 {
   7295    do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
   7296                               NULL/*guard*/ );
   7297 }
   7298 
   7299 
   7300 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
   7301 
   7302 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
   7303 {
   7304    do_origins_Store_guarded( mce, sg->end, sg->addr,
   7305                              sg->data, sg->guard );
   7306 }
   7307 
   7308 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
   7309 {
   7310    IRType loadedTy = Ity_INVALID;
   7311    switch (lg->cvt) {
   7312       case ILGop_Ident64: loadedTy = Ity_I64; break;
   7313       case ILGop_Ident32: loadedTy = Ity_I32; break;
   7314       case ILGop_16Uto32: loadedTy = Ity_I16; break;
   7315       case ILGop_16Sto32: loadedTy = Ity_I16; break;
   7316       case ILGop_8Uto32:  loadedTy = Ity_I8;  break;
   7317       case ILGop_8Sto32:  loadedTy = Ity_I8;  break;
   7318       default: VG_(tool_panic)("schemeS.IRLoadG");
   7319    }
   7320    IRAtom* ori_alt
    7321       = schemeE( mce, lg->alt );
   7322    IRAtom* ori_final
   7323       = expr2ori_Load_guarded_General(mce, loadedTy,
   7324                                       lg->addr, 0/*addr bias*/,
   7325                                       lg->guard, ori_alt );
   7326    /* And finally, bind the origin to the destination temporary. */
   7327    assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
   7328 }
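
/* Example: for a guarded widening load written informally as

      t11 = if (t10) 8Uto32(LDle:I8(t9)) else t2

   loadedTy is Ity_I8, ori_alt is schemeE(t2), and the origin bound
   to t11's B-shadow is, per expr2ori_Load_guarded_General above,

      ITE(t10, <1-byte origin load at t9>, b2)

   so a failed guard propagates the origin of the alternative value
   rather than that of a load which never happened. */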
   7329 
   7330 
   7331 static void schemeS ( MCEnv* mce, IRStmt* st )
   7332 {
   7333    tl_assert(MC_(clo_mc_level) == 3);
   7334 
   7335    switch (st->tag) {
   7336 
   7337       case Ist_AbiHint:
   7338          /* The value-check instrumenter handles this - by arranging
   7339             to pass the address of the next instruction to
   7340             MC_(helperc_MAKE_STACK_UNINIT).  This is all that needs to
   7341             happen for origin tracking w.r.t. AbiHints.  So there is
   7342             nothing to do here. */
   7343          break;
   7344 
   7345       case Ist_PutI: {
   7346          IRPutI *puti = st->Ist.PutI.details;
   7347          IRRegArray* descr_b;
   7348          IRAtom      *t1, *t2, *t3, *t4;
   7349          IRRegArray* descr = puti->descr;
   7350          IRType equivIntTy
   7351             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
   7352          /* If this array is unshadowable for whatever reason,
   7353             generate no code. */
   7354          if (equivIntTy == Ity_INVALID)
   7355             break;
   7356          tl_assert(sizeofIRType(equivIntTy) >= 4);
   7357          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
   7358          descr_b
   7359             = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
   7360                             equivIntTy, descr->nElems );
   7361          /* Compute a value to Put - the conjoinment of the origin for
   7362             the data to be Put-ted (obviously) and of the index value
   7363             (not so obviously). */
   7364          t1 = schemeE( mce, puti->data );
   7365          t2 = schemeE( mce, puti->ix );
   7366          t3 = gen_maxU32( mce, t1, t2 );
   7367          t4 = zWidenFrom32( mce, equivIntTy, t3 );
   7368          stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
   7369                                                puti->bias, t4) ));
   7370          break;
   7371       }
   7372 
   7373       case Ist_Dirty:
   7374          do_origins_Dirty( mce, st->Ist.Dirty.details );
   7375          break;
   7376 
   7377       case Ist_Store:
   7378          do_origins_Store_plain( mce, st->Ist.Store.end,
   7379                                       st->Ist.Store.addr,
   7380                                       st->Ist.Store.data );
   7381          break;
   7382 
   7383       case Ist_StoreG:
   7384          do_origins_StoreG( mce, st->Ist.StoreG.details );
   7385          break;
   7386 
   7387       case Ist_LoadG:
   7388          do_origins_LoadG( mce, st->Ist.LoadG.details );
   7389          break;
   7390 
   7391       case Ist_LLSC: {
   7392          /* In short: treat a load-linked like a normal load followed
    7393             by an assignment of the loaded (shadow) data to the result
   7394             temporary.  Treat a store-conditional like a normal store,
   7395             and mark the result temporary as defined. */
   7396          if (st->Ist.LLSC.storedata == NULL) {
   7397             /* Load Linked */
   7398             IRType resTy
   7399                = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
   7400             IRExpr* vanillaLoad
   7401                = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
   7402             tl_assert(resTy == Ity_I64 || resTy == Ity_I32
   7403                       || resTy == Ity_I16 || resTy == Ity_I8);
   7404             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
   7405                               schemeE(mce, vanillaLoad));
   7406          } else {
   7407             /* Store conditional */
   7408             do_origins_Store_plain( mce, st->Ist.LLSC.end,
   7409                                     st->Ist.LLSC.addr,
   7410                                     st->Ist.LLSC.storedata );
   7411             /* For the rationale behind this, see comments at the
   7412                place where the V-shadow for .result is constructed, in
   7413                do_shadow_LLSC.  In short, we regard .result as
   7414                always-defined. */
   7415             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
   7416                               mkU32(0) );
   7417          }
   7418          break;
   7419       }
   7420 
   7421       case Ist_Put: {
   7422          Int b_offset
   7423             = MC_(get_otrack_shadow_offset)(
   7424                  st->Ist.Put.offset,
   7425                  sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
   7426               );
   7427          if (b_offset >= 0) {
   7428             /* FIXME: this isn't an atom! */
   7429             stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
   7430                                        schemeE( mce, st->Ist.Put.data )) );
   7431          }
   7432          break;
   7433       }
   7434 
   7435       case Ist_WrTmp:
   7436          assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
   7437                            schemeE(mce, st->Ist.WrTmp.data) );
   7438          break;
   7439 
   7440       case Ist_MBE:
   7441       case Ist_NoOp:
   7442       case Ist_Exit:
   7443       case Ist_IMark:
   7444          break;
   7445 
   7446       default:
   7447          VG_(printf)("mc_translate.c: schemeS: unhandled: ");
   7448          ppIRStmt(st);
   7449          VG_(tool_panic)("memcheck:schemeS");
   7450    }
   7451 }
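
/* To make the Ist_Put case concrete: for a guest-state write
   PUT(off) = t5, where MC_(get_otrack_shadow_offset) maps 'off' to a
   trackable slot b_off (>= 0), the emitted statement is, informally,

      PUT(b_off + 2 * total_sizeB) = schemeE(t5)

   that is, the origin shadow area sits at a fixed displacement of
   twice the guest state size -- the same displacement used by the
   Iex_Get, Iex_GetI and Ist_PutI cases. */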
   7452 
   7453 
   7454 /*--------------------------------------------------------------------*/
   7455 /*--- end                                           mc_translate.c ---*/
   7456 /*--------------------------------------------------------------------*/
   7457