Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                     guest_amd64_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2012 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates AMD64 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 64-bit value is being written.
     42 
     43    x87 FP Limitations:
     44 
     45    * all arithmetic done at 64 bits
     46 
     47    * no FP exceptions, except for handling stack over/underflow
     48 
     49    * FP rounding mode observed only for float->int conversions and
     50      int->float conversions which could lose accuracy, and for
     51      float-to-float rounding.  For all other operations,
     52      round-to-nearest is used, regardless.
     53 
     54    * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
     55      simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
     56      even when it isn't.
     57 
     58    * some of the FCOM cases could do with testing -- not convinced
     59      that the args are the right way round.
     60 
     61    * FSAVE does not re-initialise the FPU; it should do
     62 
     63    * FINIT not only initialises the FPU environment, it also zeroes
     64      all the FP registers.  It should leave the registers unchanged.
     65 
     66     RDTSC returns zero, always.
     67 
     68     SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     69     per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     70     only way to observe eflags[1], a proper fix would be to make that
     71     bit be set by PUSHF.
     72 
     73     This module uses global variables and so is not MT-safe (if that
     74     should ever become relevant).
     75 */
     76 
     77 /* Notes re address size overrides (0x67).
     78 
     79    According to the AMD documentation (24594 Rev 3.09, Sept 2003,
     80    "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
     81    and System Instructions"), Section 1.2.3 ("Address-Size Override
     82    Prefix"):
     83 
     84    0x67 applies to all explicit memory references, causing the top
     85    32 bits of the effective address to become zero.
     86 
     87    0x67 has no effect on stack references (push/pop); these always
     88    use a 64-bit address.
     89 
     90    0x67 changes the interpretation of instructions which implicitly
     91    reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
     92    instead.  These are:
     93 
     94       cmp{s,sb,sw,sd,sq}
     95       in{s,sb,sw,sd}
     96       jcxz, jecxz, jrcxz
     97       lod{s,sb,sw,sd,sq}
     98       loop{,e,bz,be,z}
     99       mov{s,sb,sw,sd,sq}
    100       out{s,sb,sw,sd}
    101       rep{,e,ne,nz}
    102       sca{s,sb,sw,sd,sq}
    103       sto{s,sb,sw,sd,sq}
    104       xlat{,b} */
    105 
    106 /* "Special" instructions.
    107 
    108    This instruction decoder can decode three special instructions
    109    which mean nothing natively (are no-ops as far as regs/mem are
    110    concerned) but have meaning for supporting Valgrind.  A special
    111    instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
    112    48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
    113    $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
    114    Following that, one of the following 3 are allowed (standard
    115    interpretation in parentheses):
    116 
    117       4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
    118       4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
    119       4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
    120 
    121    Any other bytes following the 16-byte preamble are illegal and
    122    constitute a failure in instruction decoding.  This all assumes
    123    that the preamble will never occur except in specific code
    124    fragments designed for Valgrind to catch.
    125 
    126    No prefixes may precede a "Special" instruction.
    127 */
    128 
    129 /* casLE (implementation of lock-prefixed insns) and rep-prefixed
    130    insns: the side-exit back to the start of the insn is done with
    131    Ijk_Boring.  This is quite wrong, it should be done with
    132    Ijk_NoRedir, since otherwise the side exit, which is intended to
    133    restart the instruction for whatever reason, could go somewhere
    134    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    135    no-redir jumps performance critical, at least for rep-prefixed
    136    instructions, since all iterations thereof would involve such a
    137    jump.  It's not such a big deal with casLE since the side exit is
    138    only taken if the CAS fails, that is, the location is contended,
    139    which is relatively unlikely.
    140 
    141    Note also, the test for CAS success vs failure is done using
    142    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    143    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    144    shouldn't definedness-check these comparisons.  See
    145    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    146    background/rationale.
    147 */
    148 
    149 /* LOCK prefixed instructions.  These are translated using IR-level
    150    CAS statements (IRCAS) and are believed to preserve atomicity, even
    151    from the point of view of some other process racing against a
    152    simulated one (presumably they communicate via a shared memory
    153    segment).
    154 
    155    Handlers which are aware of LOCK prefixes are:
    156       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    157       dis_cmpxchg_G_E  (cmpxchg)
    158       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    159       dis_Grp3         (not, neg)
    160       dis_Grp4         (inc, dec)
    161       dis_Grp5         (inc, dec)
    162       dis_Grp8_Imm     (bts, btc, btr)
    163       dis_bt_G_E       (bts, btc, btr)
    164       dis_xadd_G_E     (xadd)
    165 */
    166 
    167 
    168 #include "libvex_basictypes.h"
    169 #include "libvex_ir.h"
    170 #include "libvex.h"
    171 #include "libvex_guest_amd64.h"
    172 
    173 #include "main_util.h"
    174 #include "main_globals.h"
    175 #include "guest_generic_bb_to_IR.h"
    176 #include "guest_generic_x87.h"
    177 #include "guest_amd64_defs.h"
    178 
    179 
    180 /*------------------------------------------------------------*/
    181 /*--- Globals                                              ---*/
    182 /*------------------------------------------------------------*/
    183 
    184 /* These are set at the start of the translation of an insn, right
    185    down in disInstr_AMD64, so that we don't have to pass them around
    186    endlessly.  They are all constant during the translation of any
    187    given insn. */
    188 
    189 /* These are set at the start of the translation of a BB, so
    190    that we don't have to pass them around endlessly. */
    191 
    192 /* We need to know this to do sub-register accesses correctly. */
    193 static Bool host_is_bigendian;
    194 
    195 /* Pointer to the guest code area (points to start of BB, not to the
    196    insn being processed). */
    197 static UChar* guest_code;
    198 
    199 /* The guest address corresponding to guest_code[0]. */
    200 static Addr64 guest_RIP_bbstart;
    201 
    202 /* The guest address for the instruction currently being
    203    translated. */
    204 static Addr64 guest_RIP_curr_instr;
    205 
    206 /* The IRSB* into which we're generating code. */
    207 static IRSB* irsb;
    208 
    209 /* For ensuring that %rip-relative addressing is done right.  A read
    210    of %rip generates the address of the next instruction.  It may be
    211    that we don't conveniently know that inside disAMode().  For sanity
    212    checking, if the next insn %rip is needed, we make a guess at what
    213    it is, record that guess here, and set the accompanying Bool to
    214    indicate that -- after this insn's decode is finished -- that guess
    215    needs to be checked.  */
    216 
    217 /* At the start of each insn decode, is set to (0, False).
    218    After the decode, if _mustcheck is now True, _assumed is
    219    checked. */
    220 
    221 static Addr64 guest_RIP_next_assumed;
    222 static Bool   guest_RIP_next_mustcheck;
    223 
    224 
    225 /*------------------------------------------------------------*/
    226 /*--- Helpers for constructing IR.                         ---*/
    227 /*------------------------------------------------------------*/
    228 
    229 /* Generate a new temporary of the given type. */
    230 static IRTemp newTemp ( IRType ty )
    231 {
    232    vassert(isPlausibleIRType(ty));
    233    return newIRTemp( irsb->tyenv, ty );
    234 }
    235 
    236 /* Add a statement to the list held by "irsb". */
    237 static void stmt ( IRStmt* st )
    238 {
    239    addStmtToIRSB( irsb, st );
    240 }
    241 
    242 /* Generate a statement "dst := e". */
    243 static void assign ( IRTemp dst, IRExpr* e )
    244 {
    245    stmt( IRStmt_WrTmp(dst, e) );
    246 }
    247 
    248 static IRExpr* unop ( IROp op, IRExpr* a )
    249 {
    250    return IRExpr_Unop(op, a);
    251 }
    252 
    253 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
    254 {
    255    return IRExpr_Binop(op, a1, a2);
    256 }
    257 
    258 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
    259 {
    260    return IRExpr_Triop(op, a1, a2, a3);
    261 }
    262 
    263 static IRExpr* mkexpr ( IRTemp tmp )
    264 {
    265    return IRExpr_RdTmp(tmp);
    266 }
    267 
    268 static IRExpr* mkU8 ( ULong i )
    269 {
    270    vassert(i < 256);
    271    return IRExpr_Const(IRConst_U8( (UChar)i ));
    272 }
    273 
    274 static IRExpr* mkU16 ( ULong i )
    275 {
    276    vassert(i < 0x10000ULL);
    277    return IRExpr_Const(IRConst_U16( (UShort)i ));
    278 }
    279 
    280 static IRExpr* mkU32 ( ULong i )
    281 {
    282    vassert(i < 0x100000000ULL);
    283    return IRExpr_Const(IRConst_U32( (UInt)i ));
    284 }
    285 
    286 static IRExpr* mkU64 ( ULong i )
    287 {
    288    return IRExpr_Const(IRConst_U64(i));
    289 }
    290 
    291 static IRExpr* mkU ( IRType ty, ULong i )
    292 {
    293    switch (ty) {
    294       case Ity_I8:  return mkU8(i);
    295       case Ity_I16: return mkU16(i);
    296       case Ity_I32: return mkU32(i);
    297       case Ity_I64: return mkU64(i);
    298       default: vpanic("mkU(amd64)");
    299    }
    300 }
    301 
    302 static void storeLE ( IRExpr* addr, IRExpr* data )
    303 {
    304    stmt( IRStmt_Store(Iend_LE, addr, data) );
    305 }
    306 
    307 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
    308 {
    309    return IRExpr_Load(Iend_LE, ty, addr);
    310 }
    311 
/* Widen an 8-bit IROp to the operation of the same kind at the width
   implied by |ty|.  This relies on the IROp enumeration laying out
   the 8/16/32/64-bit variants of each accepted operation
   consecutively, in that order, so the widened op is obtained by
   adding a small offset to |op8|.  Only the ops listed in the
   assertion are known to have that layout. */
static IROp mkSizedOp ( IRType ty, IROp op8 )
{
   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
           || op8 == Iop_Mul8
           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
           || op8 == Iop_CasCmpNE8
           || op8 == Iop_Not8 );
   /* Offset 0/1/2/3 selects the 8/16/32/64-bit variant respectively. */
   switch (ty) {
      case Ity_I8:  return 0 +op8;
      case Ity_I16: return 1 +op8;
      case Ity_I32: return 2 +op8;
      case Ity_I64: return 3 +op8;
      default: vpanic("mkSizedOp(amd64)");
   }
}
    329 
    330 static
    331 IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
    332 {
    333    if (szSmall == 1 && szBig == 4) {
    334       return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
    335    }
    336    if (szSmall == 1 && szBig == 2) {
    337       return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
    338    }
    339    if (szSmall == 2 && szBig == 4) {
    340       return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
    341    }
    342    if (szSmall == 1 && szBig == 8 && !signd) {
    343       return unop(Iop_8Uto64, src);
    344    }
    345    if (szSmall == 1 && szBig == 8 && signd) {
    346       return unop(Iop_8Sto64, src);
    347    }
    348    if (szSmall == 2 && szBig == 8 && !signd) {
    349       return unop(Iop_16Uto64, src);
    350    }
    351    if (szSmall == 2 && szBig == 8 && signd) {
    352       return unop(Iop_16Sto64, src);
    353    }
    354    vpanic("doScalarWidening(amd64)");
    355 }
    356 
    357 
    358 
    359 /*------------------------------------------------------------*/
    360 /*--- Debugging output                                     ---*/
    361 /*------------------------------------------------------------*/
    362 
/* Bomb out if we can't handle something.  |str| names the missing
   feature and is handed to vpanic, so it must be a NUL-terminated
   string.  Never returns. */
__attribute__ ((noreturn))
static void unimplemented ( HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}
    370 
    371 #define DIP(format, args...)           \
    372    if (vex_traceflags & VEX_TRACE_FE)  \
    373       vex_printf(format, ## args)
    374 
    375 #define DIS(buf, format, args...)      \
    376    if (vex_traceflags & VEX_TRACE_FE)  \
    377       vex_sprintf(buf, format, ## args)
    378 
    379 
    380 /*------------------------------------------------------------*/
    381 /*--- Offsets of various parts of the amd64 guest state.   ---*/
    382 /*------------------------------------------------------------*/
    383 
    384 #define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
    385 #define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
    386 #define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
    387 #define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
    388 #define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
    389 #define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
    390 #define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
    391 #define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
    392 #define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
    393 #define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
    394 #define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
    395 #define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
    396 #define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
    397 #define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
    398 #define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
    399 #define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)
    400 
    401 #define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)
    402 
    403 #define OFFB_FS_ZERO   offsetof(VexGuestAMD64State,guest_FS_ZERO)
    404 #define OFFB_GS_0x60   offsetof(VexGuestAMD64State,guest_GS_0x60)
    405 
    406 #define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
    407 #define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
    408 #define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
    409 #define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)
    410 
    411 #define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
    412 #define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
    413 #define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
    414 #define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
    415 #define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
    416 #define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
    417 #define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
    418 #define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)
    419 
    420 #define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
    421 #define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
    422 #define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
    423 #define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
    424 #define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
    425 #define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
    426 #define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
    427 #define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
    428 #define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
    429 #define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
    430 #define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
    431 #define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
    432 #define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
    433 #define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
    434 #define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
    435 #define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
    436 #define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
    437 #define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)
    438 
    439 #define OFFB_EMWARN    offsetof(VexGuestAMD64State,guest_EMWARN)
    440 #define OFFB_TISTART   offsetof(VexGuestAMD64State,guest_TISTART)
    441 #define OFFB_TILEN     offsetof(VexGuestAMD64State,guest_TILEN)
    442 
    443 #define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)
    444 
    445 
    446 /*------------------------------------------------------------*/
    447 /*--- Helper bits and pieces for deconstructing the        ---*/
    448 /*--- amd64 insn stream.                                   ---*/
    449 /*------------------------------------------------------------*/
    450 
    451 /* This is the AMD64 register encoding -- integer regs. */
    452 #define R_RAX 0
    453 #define R_RCX 1
    454 #define R_RDX 2
    455 #define R_RBX 3
    456 #define R_RSP 4
    457 #define R_RBP 5
    458 #define R_RSI 6
    459 #define R_RDI 7
    460 #define R_R8  8
    461 #define R_R9  9
    462 #define R_R10 10
    463 #define R_R11 11
    464 #define R_R12 12
    465 #define R_R13 13
    466 #define R_R14 14
    467 #define R_R15 15
    468 
    469 /* This is the Intel register encoding -- segment regs. */
    470 #define R_ES 0
    471 #define R_CS 1
    472 #define R_SS 2
    473 #define R_DS 3
    474 #define R_FS 4
    475 #define R_GS 5
    476 
    477 
    478 /* Various simple conversions */
    479 
    480 static ULong extend_s_8to64 ( UChar x )
    481 {
    482    return (ULong)((((Long)x) << 56) >> 56);
    483 }
    484 
    485 static ULong extend_s_16to64 ( UShort x )
    486 {
    487    return (ULong)((((Long)x) << 48) >> 48);
    488 }
    489 
    490 static ULong extend_s_32to64 ( UInt x )
    491 {
    492    return (ULong)((((Long)x) << 32) >> 32);
    493 }
    494 
    495 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    496    register or memory.  If so, the byte will have the form 11XXXYYY,
    497    where YYY is the register number. */
    498 inline
    499 static Bool epartIsReg ( UChar mod_reg_rm )
    500 {
    501    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    502 }
    503 
    504 /* Extract the 'g' field from a modRM byte.  This only produces 3
    505    bits, which is not a complete register number.  You should avoid
    506    this function if at all possible. */
    507 inline
    508 static Int gregLO3ofRM ( UChar mod_reg_rm )
    509 {
    510    return (Int)( (mod_reg_rm >> 3) & 7 );
    511 }
    512 
    513 /* Ditto the 'e' field of a modRM byte. */
    514 inline
    515 static Int eregLO3ofRM ( UChar mod_reg_rm )
    516 {
    517    return (Int)(mod_reg_rm & 0x7);
    518 }
    519 
    520 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    521 
    522 static inline UChar getUChar ( Long delta )
    523 {
    524    UChar v = guest_code[delta+0];
    525    return v;
    526 }
    527 
    528 static UInt getUDisp16 ( Long delta )
    529 {
    530    UInt v = guest_code[delta+1]; v <<= 8;
    531    v |= guest_code[delta+0];
    532    return v & 0xFFFF;
    533 }
    534 
    535 //.. static UInt getUDisp ( Int size, Long delta )
    536 //.. {
    537 //..    switch (size) {
    538 //..       case 4: return getUDisp32(delta);
    539 //..       case 2: return getUDisp16(delta);
    540 //..       case 1: return getUChar(delta);
    541 //..       default: vpanic("getUDisp(x86)");
    542 //..    }
    543 //..    return 0; /*notreached*/
    544 //.. }
    545 
    546 
    547 /* Get a byte value out of the insn stream and sign-extend to 64
    548    bits. */
    549 static Long getSDisp8 ( Long delta )
    550 {
    551    return extend_s_8to64( guest_code[delta] );
    552 }
    553 
    554 /* Get a 16-bit value out of the insn stream and sign-extend to 64
    555    bits. */
    556 static Long getSDisp16 ( Long delta )
    557 {
    558    UInt v = guest_code[delta+1]; v <<= 8;
    559    v |= guest_code[delta+0];
    560    return extend_s_16to64( (UShort)v );
    561 }
    562 
    563 /* Get a 32-bit value out of the insn stream and sign-extend to 64
    564    bits. */
    565 static Long getSDisp32 ( Long delta )
    566 {
    567    UInt v = guest_code[delta+3]; v <<= 8;
    568    v |= guest_code[delta+2]; v <<= 8;
    569    v |= guest_code[delta+1]; v <<= 8;
    570    v |= guest_code[delta+0];
    571    return extend_s_32to64( v );
    572 }
    573 
    574 /* Get a 64-bit value out of the insn stream. */
    575 static Long getDisp64 ( Long delta )
    576 {
    577    ULong v = 0;
    578    v |= guest_code[delta+7]; v <<= 8;
    579    v |= guest_code[delta+6]; v <<= 8;
    580    v |= guest_code[delta+5]; v <<= 8;
    581    v |= guest_code[delta+4]; v <<= 8;
    582    v |= guest_code[delta+3]; v <<= 8;
    583    v |= guest_code[delta+2]; v <<= 8;
    584    v |= guest_code[delta+1]; v <<= 8;
    585    v |= guest_code[delta+0];
    586    return v;
    587 }
    588 
    589 /* Note: because AMD64 doesn't allow 64-bit literals, it is an error
    590    if this is called with size==8.  Should not happen. */
    591 static Long getSDisp ( Int size, Long delta )
    592 {
    593    switch (size) {
    594       case 4: return getSDisp32(delta);
    595       case 2: return getSDisp16(delta);
    596       case 1: return getSDisp8(delta);
    597       default: vpanic("getSDisp(amd64)");
    598   }
    599 }
    600 
    601 static ULong mkSizeMask ( Int sz )
    602 {
    603    switch (sz) {
    604       case 1: return 0x00000000000000FFULL;
    605       case 2: return 0x000000000000FFFFULL;
    606       case 4: return 0x00000000FFFFFFFFULL;
    607       case 8: return 0xFFFFFFFFFFFFFFFFULL;
    608       default: vpanic("mkSzMask(amd64)");
    609    }
    610 }
    611 
    612 static Int imin ( Int a, Int b )
    613 {
    614    return (a < b) ? a : b;
    615 }
    616 
    617 static IRType szToITy ( Int n )
    618 {
    619    switch (n) {
    620       case 1: return Ity_I8;
    621       case 2: return Ity_I16;
    622       case 4: return Ity_I32;
    623       case 8: return Ity_I64;
    624       default: vex_printf("\nszToITy(%d)\n", n);
    625                vpanic("szToITy(amd64)");
    626    }
    627 }
    628 
    629 
    630 /*------------------------------------------------------------*/
    631 /*--- For dealing with prefixes.                           ---*/
    632 /*------------------------------------------------------------*/
    633 
    634 /* The idea is to pass around an int holding a bitmask summarising
    635    info from the prefixes seen on the current instruction, including
    636    info from the REX byte.  This info is used in various places, but
    637    most especially when making sense of register fields in
    638    instructions.
    639 
    640    The top 8 bits of the prefix are 0x55, just as a hacky way to
    641    ensure it really is a valid prefix.
    642 
    643    Things you can safely assume about a well-formed prefix:
    644    * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
    645    * if REX is not present then REXW,REXR,REXX,REXB will read
    646      as zero.
    647    * F2 and F3 will not both be 1.
    648 */
    649 
    650 typedef UInt  Prefix;
    651 
    652 #define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
    653 #define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
    654 #define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
    655 #define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
    656 #define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
    657 #define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
    658 #define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
    659 #define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
#define PFX_F2     (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
#define PFX_F3     (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
    662 #define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
    663 #define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
    664 #define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
    665 #define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
    666 #define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
    667 #define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
    668 #define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
    669 #define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
    670 /* The extra register field VEX.vvvv is encoded (after not-ing it) as
    671    PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
    672    positions. */
    673 #define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
    674 #define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
    675 #define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
    676 #define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */
    677 
    678 
    679 #define PFX_EMPTY 0x55000000
    680 
    681 static Bool IS_VALID_PFX ( Prefix pfx ) {
    682    return toBool((pfx & 0xFF000000) == PFX_EMPTY);
    683 }
    684 
    685 static Bool haveREX ( Prefix pfx ) {
    686    return toBool(pfx & PFX_REX);
    687 }
    688 
    689 static Int getRexW ( Prefix pfx ) {
    690    return (pfx & PFX_REXW) ? 1 : 0;
    691 }
    692 static Int getRexR ( Prefix pfx ) {
    693    return (pfx & PFX_REXR) ? 1 : 0;
    694 }
    695 static Int getRexX ( Prefix pfx ) {
    696    return (pfx & PFX_REXX) ? 1 : 0;
    697 }
    698 static Int getRexB ( Prefix pfx ) {
    699    return (pfx & PFX_REXB) ? 1 : 0;
    700 }
    701 
    702 /* Check a prefix doesn't have F2 or F3 set in it, since usually that
    703    completely changes what instruction it really is. */
    704 static Bool haveF2orF3 ( Prefix pfx ) {
    705    return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
    706 }
    707 static Bool haveF2 ( Prefix pfx ) {
    708    return toBool((pfx & PFX_F2) > 0);
    709 }
    710 static Bool haveF3 ( Prefix pfx ) {
    711    return toBool((pfx & PFX_F3) > 0);
    712 }
    713 
    714 static Bool have66 ( Prefix pfx ) {
    715    return toBool((pfx & PFX_66) > 0);
    716 }
    717 static Bool haveASO ( Prefix pfx ) {
    718    return toBool((pfx & PFX_ASO) > 0);
    719 }
    720 
    721 /* Return True iff pfx has 66 set and F2 and F3 clear */
    722 static Bool have66noF2noF3 ( Prefix pfx )
    723 {
    724   return
    725      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
    726 }
    727 
    728 /* Return True iff pfx has F2 set and 66 and F3 clear */
    729 static Bool haveF2no66noF3 ( Prefix pfx )
    730 {
    731   return
    732      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
    733 }
    734 
    735 /* Return True iff pfx has F3 set and 66 and F2 clear */
    736 static Bool haveF3no66noF2 ( Prefix pfx )
    737 {
    738   return
    739      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
    740 }
    741 
    742 /* Return True iff pfx has F3 set and F2 clear */
    743 static Bool haveF3noF2 ( Prefix pfx )
    744 {
    745   return
    746      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
    747 }
    748 
    749 /* Return True iff pfx has F2 set and F3 clear */
    750 static Bool haveF2noF3 ( Prefix pfx )
    751 {
    752   return
    753      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
    754 }
    755 
    756 /* Return True iff pfx has 66, F2 and F3 clear */
    757 static Bool haveNo66noF2noF3 ( Prefix pfx )
    758 {
    759   return
    760      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
    761 }
    762 
    763 /* Return True iff pfx has any of 66, F2 and F3 set */
    764 static Bool have66orF2orF3 ( Prefix pfx )
    765 {
    766   return toBool( ! haveNo66noF2noF3(pfx) );
    767 }
    768 
    769 /* Return True iff pfx has 66 or F2 set */
    770 static Bool have66orF2 ( Prefix pfx )
    771 {
    772    return toBool((pfx & (PFX_66|PFX_F2)) > 0);
    773 }
    774 
    775 /* Clear all the segment-override bits in a prefix. */
    776 static Prefix clearSegBits ( Prefix p )
    777 {
    778    return
    779       p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
    780 }
    781 
    782 /* Get the (inverted, hence back to "normal") VEX.vvvv field. */
    783 static UInt getVexNvvvv ( Prefix pfx ) {
    784    UInt r = (UInt)pfx;
    785    r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
    786    return r & 0xF;
    787 }
    788 
    789 static Bool haveVEX ( Prefix pfx ) {
    790    return toBool(pfx & PFX_VEX);
    791 }
    792 
    793 static Int getVexL ( Prefix pfx ) {
    794    return (pfx & PFX_VEXL) ? 1 : 0;
    795 }
    796 
    797 
    798 /*------------------------------------------------------------*/
    799 /*--- For dealing with escapes                             ---*/
    800 /*------------------------------------------------------------*/
    801 
    802 
    803 /* Escapes come after the prefixes, but before the primary opcode
    804    byte.  They escape the primary opcode byte into a bigger space.
    805    The 0xF0000000 isn't significant, except so as to make it not
    806    overlap valid Prefix values, for sanity checking.
    807 */
    808 
/* Escape state decoded from the bytes between the prefixes and the
   primary opcode; selects which opcode table the primary byte indexes. */
typedef
   enum {
      ESC_NONE=0xF0000000, // no escape byte seen
      ESC_0F,              // 0F
      ESC_0F38,            // 0F 38
      ESC_0F3A             // 0F 3A
   }
   Escape;
    817 
    818 
    819 /*------------------------------------------------------------*/
    820 /*--- For dealing with integer registers                   ---*/
    821 /*------------------------------------------------------------*/
    822 
    823 /* This is somewhat complex.  The rules are:
    824 
    825    For 64, 32 and 16 bit register references, the e or g fields in the
    826    modrm bytes supply the low 3 bits of the register number.  The
    827    fourth (most-significant) bit of the register number is supplied by
    828    the REX byte, if it is present; else that bit is taken to be zero.
    829 
    830    The REX.R bit supplies the high bit corresponding to the g register
    831    field, and the REX.B bit supplies the high bit corresponding to the
    832    e register field (when the mod part of modrm indicates that modrm's
    833    e component refers to a register and not to memory).
    834 
    835    The REX.X bit supplies a high register bit for certain registers
    836    in SIB address modes, and is generally rarely used.
    837 
    838    For 8 bit register references, the presence of the REX byte itself
    839    has significance.  If there is no REX present, then the 3-bit
    840    number extracted from the modrm e or g field is treated as an index
    841    into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
    842    old x86 encoding scheme.
    843 
    844    But if there is a REX present, the register reference is
    845    interpreted in the same way as for 64/32/16-bit references: a high
    846    bit is extracted from REX, giving a 4-bit number, and the denoted
    847    register is the lowest 8 bits of the 16 integer registers denoted
    848    by the number.  In particular, values 3 through 7 of this sequence
    849    do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
    850    %rsp %rbp %rsi %rdi.
    851 
    852    The REX.W bit has no bearing at all on register numbers.  Instead
    853    its presence indicates that the operand size is to be overridden
    854    from its default value (32 bits) to 64 bits instead.  This is in
    855    the same fashion that an 0x66 prefix indicates the operand size is
    856    to be overridden from 32 bits down to 16 bits.  When both REX.W and
    857    0x66 are present there is a conflict, and REX.W takes precedence.
    858 
    859    Rather than try to handle this complexity using a single huge
    860    function, several smaller ones are provided.  The aim is to make it
    861    as difficult as possible to screw up register decoding in a subtle
    862    and hard-to-track-down way.
    863 
    864    Because these routines fish around in the host's memory (that is,
    865    in the guest state area) for sub-parts of guest registers, their
    866    correctness depends on the host's endianness.  So far these
    867    routines only work for little-endian hosts.  Those for which
    868    endianness is important have assertions to ensure sanity.
    869 */
    870 
    871 
    872 /* About the simplest question you can ask: where do the 64-bit
    873    integer registers live (in the guest state) ? */
    874 
static Int integerGuestReg64Offset ( UInt reg )
{
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      /* Encoded register numbers are 4 bits; anything else here is a
         decoder bug. */
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}
    897 
    898 
    899 /* Produce the name of an integer register, for printing purposes.
    900    reg is a number in the range 0 .. 15 that has been generated from a
    901    3-bit reg-field number and a REX extension bit.  irregular denotes
    902    the case where sz==1 and no REX byte is present. */
    903 
static
HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
{
   /* All names are static constants; callers must not modify or free
      the returned string. */
   static HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   static HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
         "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
   static HChar* ireg16_names[16]
     = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
         "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
   static HChar* ireg8_names[16]
     = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
         "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
   /* Used when sz == 1 and no REX byte is present: the old x86
      byte-register naming, where encodings 4 .. 7 are %ah .. %bh. */
   static HChar* ireg8_irregular[8]
     = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };

   vassert(reg < 16);
   /* 'irregular' only makes sense for byte-sized accesses, and then
      only for the first 8 encodings. */
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   switch (sz) {
      case 8: return ireg64_names[reg];
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: if (irregular) {
                 return ireg8_irregular[reg];
              } else {
                 return ireg8_names[reg];
              }
      default: vpanic("nameIReg(amd64)");
   }
}
    942 
    943 /* Using the same argument conventions as nameIReg, produce the
    944    guest state offset of an integer register. */
    945 
static
Int offsetIReg ( Int sz, UInt reg, Bool irregular )
{
   vassert(reg < 16);
   /* 'irregular' is only meaningful for sz == 1 (see nameIReg). */
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   /* Deal with irregular case -- sz==1 and no REX present */
   if (sz == 1 && irregular) {
      /* %ah %ch %dh %bh live at byte 1 of RAX/RCX/RDX/RBX in the
         (little-endian) guest state layout. */
      switch (reg) {
         case R_RSP: return 1+ OFFB_RAX;
         case R_RBP: return 1+ OFFB_RCX;
         case R_RSI: return 1+ OFFB_RDX;
         case R_RDI: return 1+ OFFB_RBX;
         default:    break; /* use the normal case */
      }
   }

   /* Normal case */
   return integerGuestReg64Offset(reg);
}
    971 
    972 
    973 /* Read the %CL register :: Ity_I8, for shift/rotate operations. */
    974 
static IRExpr* getIRegCL ( void )
{
   /* %cl is the low byte of RCX in the little-endian guest layout. */
   vassert(!host_is_bigendian);
   return IRExpr_Get( OFFB_RCX, Ity_I8 );
}


/* Write to the %AH register. */

static void putIRegAH ( IRExpr* e )
{
   /* %ah is byte 1 of RAX in the little-endian guest layout. */
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
   stmt( IRStmt_Put( OFFB_RAX+1, e ) );
}
    990 
    991 
    992 /* Read/write various widths of %RAX, as it has various
    993    special-purpose uses. */
    994 
static HChar* nameIRegRAX ( Int sz )
{
   switch (sz) {
      case 1: return "%al";
      case 2: return "%ax";
      case 4: return "%eax";
      case 8: return "%rax";
      default: vpanic("nameIRegRAX(amd64)");
   }
}

/* Read the given width of %RAX.  The 32-bit read is done as a 64-bit
   Get followed by a narrow, since the guest state holds the full
   64-bit register. */
static IRExpr* getIRegRAX ( Int sz )
{
   vassert(!host_is_bigendian);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
      default: vpanic("getIRegRAX(amd64)");
   }
}
   1017 
   1018 static void putIRegRAX ( Int sz, IRExpr* e )
   1019 {
   1020    IRType ty = typeOfIRExpr(irsb->tyenv, e);
   1021    vassert(!host_is_bigendian);
   1022    switch (sz) {
   1023       case 8: vassert(ty == Ity_I64);
   1024               stmt( IRStmt_Put( OFFB_RAX, e ));
   1025               break;
   1026       case 4: vassert(ty == Ity_I32);
   1027               stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
   1028               break;
   1029       case 2: vassert(ty == Ity_I16);
   1030               stmt( IRStmt_Put( OFFB_RAX, e ));
   1031               break;
   1032       case 1: vassert(ty == Ity_I8);
   1033               stmt( IRStmt_Put( OFFB_RAX, e ));
   1034               break;
   1035       default: vpanic("putIRegRAX(amd64)");
   1036    }
   1037 }
   1038 
   1039 
   1040 /* Read/write various widths of %RDX, as it has various
   1041    special-purpose uses. */
   1042 
static HChar* nameIRegRDX ( Int sz )
{
   switch (sz) {
      case 1: return "%dl";
      case 2: return "%dx";
      case 4: return "%edx";
      case 8: return "%rdx";
      default: vpanic("nameIRegRDX(amd64)");
   }
}

/* Read the given width of %RDX.  The 32-bit read is a 64-bit Get
   followed by a narrow, since the guest state holds all 64 bits. */
static IRExpr* getIRegRDX ( Int sz )
{
   vassert(!host_is_bigendian);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
      default: vpanic("getIRegRDX(amd64)");
   }
}

/* Write the given width of %RDX; a 32-bit write is zero-extended to
   the full 64 bits. */
static void putIRegRDX ( Int sz, IRExpr* e )
{
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   switch (sz) {
      case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
              break;
      case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      default: vpanic("putIRegRDX(amd64)");
   }
}
   1082 
   1083 
   1084 /* Simplistic functions to deal with the integer registers as a
   1085    straightforward bank of 16 64-bit regs. */
   1086 
/* Read an integer register as a full 64-bit value. */
static IRExpr* getIReg64 ( UInt regno )
{
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I64 );
}

/* Write a full 64-bit value to an integer register. */
static void putIReg64 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
}

/* Name of the 64-bit view of an integer register. */
static HChar* nameIReg64 ( UInt regno )
{
   return nameIReg( 8, regno, False );
}
   1103 
   1104 
   1105 /* Simplistic functions to deal with the lower halves of integer
   1106    registers as a straightforward bank of 16 32-bit regs. */
   1107 
/* Read the low 32 bits of an integer register (64-bit Get followed
   by a narrow). */
static IRExpr* getIReg32 ( UInt regno )
{
   vassert(!host_is_bigendian);
   return unop(Iop_64to32,
               IRExpr_Get( integerGuestReg64Offset(regno),
                           Ity_I64 ));
}

/* Write the low 32 bits of an integer register, zero-extending into
   the full 64 bits. */
static void putIReg32 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_32Uto64,e) ) );
}

/* Name of the 32-bit view of an integer register. */
static HChar* nameIReg32 ( UInt regno )
{
   return nameIReg( 4, regno, False );
}
   1127 
   1128 
   1129 /* Simplistic functions to deal with the lower quarters of integer
   1130    registers as a straightforward bank of 16 16-bit regs. */
   1131 
/* Read the low 16 bits of an integer register. */
static IRExpr* getIReg16 ( UInt regno )
{
   vassert(!host_is_bigendian);
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I16 );
}

/* Write 16 bits to an integer register.
   NOTE(review): the Iop_16Uto64 widen makes this a full 64-bit Put,
   zeroing the upper 48 bits -- unlike e.g. putIRegRDX's sz==2 case,
   which does a partial 16-bit Put that preserves the upper bits.
   Presumably callers only use this where zeroing is acceptable or
   intended -- verify at call sites. */
static void putIReg16 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_16Uto64,e) ) );
}

/* Name of the 16-bit view of an integer register. */
static HChar* nameIReg16 ( UInt regno )
{
   return nameIReg( 2, regno, False );
}
   1150 
   1151 
   1152 /* Sometimes what we know is a 3-bit register number, a REX byte, and
   1153    which field of the REX byte is to be used to extend to a 4-bit
   1154    number.  These functions cater for that situation.
   1155 */
/* Read the 64-bit register denoted by lo3bits extended by REX.X. */
static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return getIReg64( lo3bits | (getRexX(pfx) << 3) );
}

/* Name of the 64-bit register denoted by lo3bits extended by REX.X. */
static HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
}

/* Name the sz-wide view of the register denoted by lo3bits extended
   by REX.B.  The irregular flag is set when sz == 1 and no REX byte
   is present, selecting the old %ah .. %bh naming. */
static HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1178 
   1179 static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1180 {
   1181    vassert(lo3bits < 8);
   1182    vassert(IS_VALID_PFX(pfx));
   1183    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1184    if (sz == 4) {
   1185       sz = 8;
   1186       return unop(Iop_64to32,
   1187                   IRExpr_Get(
   1188                      offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1189                                      toBool(sz==1 && !haveREX(pfx)) ),
   1190                      szToITy(sz)
   1191                  )
   1192              );
   1193    } else {
   1194       return IRExpr_Get(
   1195                 offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1196                                 toBool(sz==1 && !haveREX(pfx)) ),
   1197                 szToITy(sz)
   1198              );
   1199    }
   1200 }
   1201 
/* Write the sz-wide view of the register denoted by lo3bits extended
   by REX.B; a 32-bit write is zero-extended to the full 64 bits. */
static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   stmt( IRStmt_Put(
            offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                            toBool(sz==1 && !haveREX(pfx)) ),
            sz==4 ? unop(Iop_32Uto64,e) : e
   ));
}
   1214 
   1215 
   1216 /* Functions for getting register numbers from modrm bytes and REX
   1217    when we don't have to consider the complexities of integer subreg
   1218    accesses.
   1219 */
   1220 /* Extract the g reg field from a modRM byte, and augment it using the
   1221    REX.R bit from the supplied REX byte.  The R bit usually is
   1222    associated with the g register field.
   1223 */
   1224 static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1225 {
   1226    Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   1227    reg += (pfx & PFX_REXR) ? 8 : 0;
   1228    return reg;
   1229 }
   1230 
   1231 /* Extract the e reg field from a modRM byte, and augment it using the
   1232    REX.B bit from the supplied REX byte.  The B bit usually is
   1233    associated with the e register field (when modrm indicates e is a
   1234    register, that is).
   1235 */
   1236 static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1237 {
   1238    Int rm;
   1239    vassert(epartIsReg(mod_reg_rm));
   1240    rm = (Int)(mod_reg_rm & 0x7);
   1241    rm += (pfx & PFX_REXB) ? 8 : 0;
   1242    return rm;
   1243 }
   1244 
   1245 
   1246 /* General functions for dealing with integer register access. */
   1247 
   1248 /* Produce the guest state offset for a reference to the 'g' register
   1249    field in a modrm byte, taking into account REX (or its absence),
   1250    and the size of the access.
   1251 */
static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(!host_is_bigendian);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = gregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

/* Read the 'g' register.  A 32-bit read is a 64-bit Get followed by
   a narrow, since the guest state holds the full 64-bit register. */
static
IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

/* Write the 'g' register; a 32-bit write is zero-extended to 64
   bits. */
static
void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
}

/* Name of the 'g' register at the given access width. */
static
HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1292 
   1293 
   1294 /* Produce the guest state offset for a reference to the 'e' register
   1295    field in a modrm byte, taking into account REX (or its absence),
   1296    and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   1297    denotes a memory access rather than a register access.
   1298 */
static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(!host_is_bigendian);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   /* eregOfRexRM asserts that modrm denotes a register, not memory. */
   reg = eregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

/* Read the 'e' register.  A 32-bit read is a 64-bit Get followed by
   a narrow, since the guest state holds the full 64-bit register. */
static
IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

/* Write the 'e' register; a 32-bit write is zero-extended to 64
   bits. */
static
void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
}

/* Name of the 'e' register at the given access width. */
static
HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1339 
   1340 
   1341 /*------------------------------------------------------------*/
   1342 /*--- For dealing with XMM registers                       ---*/
   1343 /*------------------------------------------------------------*/
   1344 
/* Guest-state offset of a whole YMM register (0 .. 15). */
static Int ymmGuestRegOffset ( UInt ymmreg )
{
   switch (ymmreg) {
      case 0:  return OFFB_YMM0;
      case 1:  return OFFB_YMM1;
      case 2:  return OFFB_YMM2;
      case 3:  return OFFB_YMM3;
      case 4:  return OFFB_YMM4;
      case 5:  return OFFB_YMM5;
      case 6:  return OFFB_YMM6;
      case 7:  return OFFB_YMM7;
      case 8:  return OFFB_YMM8;
      case 9:  return OFFB_YMM9;
      case 10: return OFFB_YMM10;
      case 11: return OFFB_YMM11;
      case 12: return OFFB_YMM12;
      case 13: return OFFB_YMM13;
      case 14: return OFFB_YMM14;
      case 15: return OFFB_YMM15;
      default: vpanic("ymmGuestRegOffset(amd64)");
   }
}
   1367 
/* Guest-state offset of an XMM register -- the low 128 bits of the
   corresponding YMM register, so on a little-endian host the two
   offsets coincide. */
static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   return ymmGuestRegOffset( xmmreg );
}
   1374 
   1375 /* Lanes of vector registers are always numbered from zero being the
   1376    least significant lane (rightmost in the register).  */
   1377 
/* Offset of 16-bit lane 'laneno' (0 = least significant) of an XMM
   register. */
static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
}

/* Offset of 32-bit lane 'laneno' of an XMM register. */
static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
}

/* Offset of 64-bit lane 'laneno' of an XMM register. */
static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
}

/* Offset of 128-bit lane 'laneno' of a YMM register. */
static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
}

/* Offset of 64-bit lane 'laneno' of a YMM register. */
static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
}

/* Offset of 32-bit lane 'laneno' of a YMM register. */
static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
}
   1425 
/* Read a whole XMM register. */
static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}

/* Read one 64-bit lane of an XMM register, as an integer. */
static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}

/* As getXMMRegLane64, but viewed as a 64-bit float. */
static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}

/* Read one 32-bit lane of an XMM register, as an integer. */
static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}

/* As getXMMRegLane32, but viewed as a 32-bit float. */
static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}

/* Read one 16-bit lane of an XMM register. */
static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
{
  return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
}
   1455 
/* Write a whole XMM register.  e :: Ity_V128. */
static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}

/* Write one 64-bit lane of an XMM register.  e :: Ity_I64. */
static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

/* As putXMMRegLane64, but e :: Ity_F64. */
static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

/* Write one 32-bit lane of an XMM register.  e :: Ity_F32. */
static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

/* As putXMMRegLane32F, but e :: Ity_I32. */
static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
   1485 
/* Read a whole YMM register. */
static IRExpr* getYMMReg ( UInt xmmreg )
{
   return IRExpr_Get( ymmGuestRegOffset(xmmreg), Ity_V256 );
}

/* Read one 128-bit lane of a YMM register. */
static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
}

/* Read one 64-bit lane of a YMM register. */
static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
}

/* Read one 32-bit lane of a YMM register. */
static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
}
   1505 
/* Write a whole YMM register.  e :: Ity_V256. */
static void putYMMReg ( UInt ymmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
}

/* Write one 128-bit lane of a YMM register.  e :: Ity_V128. */
static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
}

/* Write one 64-bit lane of a YMM register.  e :: Ity_F64. */
static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

/* As putYMMRegLane64F, but e :: Ity_I64. */
static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

/* Write one 32-bit lane of a YMM register.  e :: Ity_F32. */
static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}

/* As putYMMRegLane32F, but e :: Ity_I32. */
static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}
   1541 
/* Build a 128-bit vector constant from a 16-bit descriptor, with the
   bit-per-byte expansion defined by IRConst_V128 -- so mkV128(0)
   yields all zeroes. */
static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}
   1546 
/* Write the low half of a YMM reg and zero out the upper half.
   e :: Ity_V128. */
static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
{
   putYMMRegLane128( ymmreg, 0, e );
   putYMMRegLane128( ymmreg, 1, mkV128(0) );
}
   1553 
/* 1-bit logical AND: both bits are widened to 64 bits, And64-ed,
   and the result narrowed back -- presumably because no 1-bit And
   operation is available at the IR level. */
static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
{
   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   return unop(Iop_64to1,
               binop(Iop_And64,
                     unop(Iop_1Uto64,x),
                     unop(Iop_1Uto64,y)));
}
   1563 
   1564 /* Generate a compare-and-swap operation, operating on memory at
   1565    'addr'.  The expected value is 'expVal' and the new value is
   1566    'newVal'.  If the operation fails, then transfer control (with a
   1567    no-redir jump (XXX no -- see comment at top of this file)) to
   1568    'restart_point', which is presumably the address of the guest
   1569    instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   /* Expected and new values must have the same integer type. */
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   /* Snapshot the expected value so it can be compared against the
      old memory contents after the CAS. */
   assign(expTmp, expVal);
   /* Single (IRTemp_INVALID / NULL hi parts) little-endian CAS;
      oldTmp receives the prior memory contents. */
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   /* If memory did not hold the expected value, the CAS failed:
      transfer control back to restart_point to retry. */
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point ),
            OFFB_RIP
         ));
}
   1593 
   1594 
   1595 /*------------------------------------------------------------*/
   1596 /*--- Helpers for %rflags.                                 ---*/
   1597 /*------------------------------------------------------------*/
   1598 
   1599 /* -------------- Evaluating the flags-thunk. -------------- */
   1600 
   1601 /* Build IR to calculate all the eflags from stored
   1602    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1603    Ity_I64. */
static IRExpr* mk_amd64g_calculate_rflags_all ( void )
{
   /* Argument order matches the amd64g_calculate_rflags_all helper:
      OP, DEP1, DEP2, NDEP. */
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2.  (Bit i of mcx_mask corresponds
      to args[i]: here args[0] is OP and args[3] is NDEP.) */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
   1623 
   1624 /* Build IR to calculate some particular condition from stored
   1625    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1626    Ity_Bit. */
   1627 static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
   1628 {
   1629    IRExpr** args
   1630       = mkIRExprVec_5( mkU64(cond),
   1631                        IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1632                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1633                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1634                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1635    IRExpr* call
   1636       = mkIRExprCCall(
   1637            Ity_I64,
   1638            0/*regparm*/,
   1639            "amd64g_calculate_condition", &amd64g_calculate_condition,
   1640            args
   1641         );
   1642    /* Exclude the requested condition, OP and NDEP from definedness
   1643       checking.  We're only interested in DEP1 and DEP2. */
   1644    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   1645    return unop(Iop_64to1, call);
   1646 }
   1647 
   1648 /* Build IR to calculate just the carry flag from stored
   1649    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
   1650 static IRExpr* mk_amd64g_calculate_rflags_c ( void )
   1651 {
   1652    IRExpr** args
   1653       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1654                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1655                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1656                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1657    IRExpr* call
   1658       = mkIRExprCCall(
   1659            Ity_I64,
   1660            0/*regparm*/,
   1661            "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
   1662            args
   1663         );
   1664    /* Exclude OP and NDEP from definedness checking.  We're only
   1665       interested in DEP1 and DEP2. */
   1666    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1667    return call;
   1668 }
   1669 
   1670 
   1671 /* -------------- Building the flags-thunk. -------------- */
   1672 
   1673 /* The machinery in this section builds the flag-thunk following a
   1674    flag-setting operation.  Hence the various setFlags_* functions.
   1675 */
   1676 
   1677 static Bool isAddSub ( IROp op8 )
   1678 {
   1679    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1680 }
   1681 
   1682 static Bool isLogic ( IROp op8 )
   1683 {
   1684    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
   1685 }
   1686 
   1687 /* U-widen 8/16/32/64 bit int expr to 64. */
   1688 static IRExpr* widenUto64 ( IRExpr* e )
   1689 {
   1690    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1691       case Ity_I64: return e;
   1692       case Ity_I32: return unop(Iop_32Uto64, e);
   1693       case Ity_I16: return unop(Iop_16Uto64, e);
   1694       case Ity_I8:  return unop(Iop_8Uto64, e);
   1695       default: vpanic("widenUto64");
   1696    }
   1697 }
   1698 
   1699 /* S-widen 8/16/32/64 bit int expr to 32. */
   1700 static IRExpr* widenSto64 ( IRExpr* e )
   1701 {
   1702    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1703       case Ity_I64: return e;
   1704       case Ity_I32: return unop(Iop_32Sto64, e);
   1705       case Ity_I16: return unop(Iop_16Sto64, e);
   1706       case Ity_I8:  return unop(Iop_8Sto64, e);
   1707       default: vpanic("widenSto64");
   1708    }
   1709 }
   1710 
   1711 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   1712    of these combinations make sense. */
   1713 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
   1714 {
   1715    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   1716    if (src_ty == dst_ty)
   1717       return e;
   1718    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
   1719       return unop(Iop_32to16, e);
   1720    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
   1721       return unop(Iop_32to8, e);
   1722    if (src_ty == Ity_I64 && dst_ty == Ity_I32)
   1723       return unop(Iop_64to32, e);
   1724    if (src_ty == Ity_I64 && dst_ty == Ity_I16)
   1725       return unop(Iop_64to16, e);
   1726    if (src_ty == Ity_I64 && dst_ty == Ity_I8)
   1727       return unop(Iop_64to8, e);
   1728 
   1729    vex_printf("\nsrc, dst tys are: ");
   1730    ppIRType(src_ty);
   1731    vex_printf(", ");
   1732    ppIRType(dst_ty);
   1733    vex_printf("\n");
   1734    vpanic("narrowTo(amd64)");
   1735 }
   1736 
   1737 
   1738 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   1739    auto-sized up to the real op. */
   1740 
   1741 static
   1742 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
   1743 {
   1744    Int ccOp = 0;
   1745    switch (ty) {
   1746       case Ity_I8:  ccOp = 0; break;
   1747       case Ity_I16: ccOp = 1; break;
   1748       case Ity_I32: ccOp = 2; break;
   1749       case Ity_I64: ccOp = 3; break;
   1750       default: vassert(0);
   1751    }
   1752    switch (op8) {
   1753       case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
   1754       case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
   1755       default:       ppIROp(op8);
   1756                      vpanic("setFlags_DEP1_DEP2(amd64)");
   1757    }
   1758    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1759    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1760    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
   1761 }
   1762 
   1763 
   1764 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
   1765 
   1766 static
   1767 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
   1768 {
   1769    Int ccOp = 0;
   1770    switch (ty) {
   1771       case Ity_I8:  ccOp = 0; break;
   1772       case Ity_I16: ccOp = 1; break;
   1773       case Ity_I32: ccOp = 2; break;
   1774       case Ity_I64: ccOp = 3; break;
   1775       default: vassert(0);
   1776    }
   1777    switch (op8) {
   1778       case Iop_Or8:
   1779       case Iop_And8:
   1780       case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
   1781       default:       ppIROp(op8);
   1782                      vpanic("setFlags_DEP1(amd64)");
   1783    }
   1784    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1785    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1786    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1787 }
   1788 
   1789 
   1790 /* For shift operations, we put in the result and the undershifted
   1791    result.  Except if the shift amount is zero, the thunk is left
   1792    unchanged. */
   1793 
static void setFlags_DEP1_DEP2_shift ( IROp    op64,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   /* 0..3 selects the B/W/L/Q variant of the base thunk op. */
   Int ccOp = 0;
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }

   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op64) {
      case Iop_Shr64:
      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
      default:        ppIROp(op64);
                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   }

   /* DEP1 contains the result, DEP2 contains the undershifted value. */
   /* Each field is written via Mux0X on 'guard': when guard is 0 the
      field's current value is re-Put, so the thunk is left unchanged
      (the zero-shift-amount case noted in the comment above). */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                   mkU64(ccOp))) );
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                   widenUto64(mkexpr(res)))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                   widenUto64(mkexpr(resUS)))) );
}
   1835 
   1836 
   1837 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1838    the former value of the carry flag, which unfortunately we have to
   1839    compute. */
   1840 
   1841 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1842 {
   1843    Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;
   1844 
   1845    switch (ty) {
   1846       case Ity_I8:  ccOp += 0; break;
   1847       case Ity_I16: ccOp += 1; break;
   1848       case Ity_I32: ccOp += 2; break;
   1849       case Ity_I64: ccOp += 3; break;
   1850       default: vassert(0);
   1851    }
   1852 
   1853    /* This has to come first, because calculating the C flag
   1854       may require reading all four thunk fields. */
   1855    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   1856    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1857    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   1858    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1859 }
   1860 
   1861 
   1862 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1863    two arguments. */
   1864 
   1865 static
   1866 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
   1867 {
   1868    switch (ty) {
   1869       case Ity_I8:
   1870          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
   1871          break;
   1872       case Ity_I16:
   1873          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
   1874          break;
   1875       case Ity_I32:
   1876          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
   1877          break;
   1878       case Ity_I64:
   1879          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
   1880          break;
   1881       default:
   1882          vpanic("setFlags_MUL(amd64)");
   1883    }
   1884    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   1885    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
   1886 }
   1887 
   1888 
   1889 /* -------------- Condition codes. -------------- */
   1890 
   1891 /* Condition codes, using the AMD encoding.  */
   1892 
/* Return a printable mnemonic for 'cond'.  Where two spellings exist
   for the same encoding, the preferred one is returned and the
   AMD-encoding spelling is kept alongside in a comment. */
static HChar* name_AMD64Condcode ( AMD64Condcode cond )
{
   switch (cond) {
      case AMD64CondO:      return "o";
      case AMD64CondNO:     return "no";
      case AMD64CondB:      return "b";
      case AMD64CondNB:     return "ae"; /*"nb";*/
      case AMD64CondZ:      return "e"; /*"z";*/
      case AMD64CondNZ:     return "ne"; /*"nz";*/
      case AMD64CondBE:     return "be";
      case AMD64CondNBE:    return "a"; /*"nbe";*/
      case AMD64CondS:      return "s";
      case AMD64CondNS:     return "ns";
      case AMD64CondP:      return "p";
      case AMD64CondNP:     return "np";
      case AMD64CondL:      return "l";
      case AMD64CondNL:     return "ge"; /*"nl";*/
      case AMD64CondLE:     return "le";
      case AMD64CondNLE:    return "g"; /*"nle";*/
      case AMD64CondAlways: return "ALWAYS";
      default: vpanic("name_AMD64Condcode");
   }
}
   1916 
   1917 static
   1918 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
   1919                                           /*OUT*/Bool*   needInvert )
   1920 {
   1921    vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   1922    if (cond & 1) {
   1923       *needInvert = True;
   1924       return cond-1;
   1925    } else {
   1926       *needInvert = False;
   1927       return cond;
   1928    }
   1929 }
   1930 
   1931 
   1932 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1933 
   1934 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1935    appropriately.
   1936 
   1937    Optionally, generate a store for the 'tres' value.  This can either
   1938    be a normal store, or it can be a cas-with-possible-failure style
   1939    store:
   1940 
   1941    if taddr is IRTemp_INVALID, then no store is generated.
   1942 
   1943    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1944    the address) is generated:
   1945 
   1946      if texpVal is IRTemp_INVALID then a normal store is
   1947      generated, and restart_point must be zero (it is irrelevant).
   1948 
   1949      if texpVal is not IRTemp_INVALID then a cas-style store is
   1950      generated.  texpVal is the expected value, restart_point
   1951      is the restart point if the store fails, and texpVal must
   1952      have the same type as tres.
   1953 
   1954 */
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry, as a 64-bit value */
   IRTemp  oldcn = newTemp(ty);        /* old carry, narrowed to 'ty' */
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Select the size-specific ADC thunk opcode. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc,  binop(Iop_And64,
                        mk_amd64g_calculate_rflags_c(),
                        mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 + ta2 + old carry */
   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Fill the thunk: DEP1 = first arg, DEP2 = second arg XORed with
      the old (narrowed) carry, NDEP = the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2008 
   2009 
   2010 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   2011    appropriately.  As with helper_ADC, possibly generate a store of
   2012    the result -- see comments on helper_ADC for details.
   2013 */
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry, as a 64-bit value */
   IRTemp  oldcn = newTemp(ty);        /* old carry, narrowed to 'ty' */
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Select the size-specific SBB thunk opcode. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc, binop(Iop_And64,
                       mk_amd64g_calculate_rflags_c(),
                       mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 - ta2 - old carry */
   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Fill the thunk: DEP1 = first arg, DEP2 = second arg XORed with
      the old (narrowed) carry, NDEP = the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2067 
   2068 
   2069 /* -------------- Helpers for disassembly printing. -------------- */
   2070 
   2071 static HChar* nameGrp1 ( Int opc_aux )
   2072 {
   2073    static HChar* grp1_names[8]
   2074      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   2075    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   2076    return grp1_names[opc_aux];
   2077 }
   2078 
   2079 static HChar* nameGrp2 ( Int opc_aux )
   2080 {
   2081    static HChar* grp2_names[8]
   2082      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   2083    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   2084    return grp2_names[opc_aux];
   2085 }
   2086 
   2087 static HChar* nameGrp4 ( Int opc_aux )
   2088 {
   2089    static HChar* grp4_names[8]
   2090      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   2091    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   2092    return grp4_names[opc_aux];
   2093 }
   2094 
   2095 static HChar* nameGrp5 ( Int opc_aux )
   2096 {
   2097    static HChar* grp5_names[8]
   2098      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   2099    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   2100    return grp5_names[opc_aux];
   2101 }
   2102 
   2103 static HChar* nameGrp8 ( Int opc_aux )
   2104 {
   2105    static HChar* grp8_names[8]
   2106       = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   2107    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   2108    return grp8_names[opc_aux];
   2109 }
   2110 
   2111 //.. static HChar* nameSReg ( UInt sreg )
   2112 //.. {
   2113 //..    switch (sreg) {
   2114 //..       case R_ES: return "%es";
   2115 //..       case R_CS: return "%cs";
   2116 //..       case R_SS: return "%ss";
   2117 //..       case R_DS: return "%ds";
   2118 //..       case R_FS: return "%fs";
   2119 //..       case R_GS: return "%gs";
   2120 //..       default: vpanic("nameSReg(x86)");
   2121 //..    }
   2122 //.. }
   2123 
   2124 static HChar* nameMMXReg ( Int mmxreg )
   2125 {
   2126    static HChar* mmx_names[8]
   2127      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   2128    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   2129    return mmx_names[mmxreg];
   2130 }
   2131 
   2132 static HChar* nameXMMReg ( Int xmmreg )
   2133 {
   2134    static HChar* xmm_names[16]
   2135      = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
   2136          "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   2137          "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
   2138          "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   2139    if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   2140    return xmm_names[xmmreg];
   2141 }
   2142 
   2143 static HChar* nameMMXGran ( Int gran )
   2144 {
   2145    switch (gran) {
   2146       case 0: return "b";
   2147       case 1: return "w";
   2148       case 2: return "d";
   2149       case 3: return "q";
   2150       default: vpanic("nameMMXGran(amd64,guest)");
   2151    }
   2152 }
   2153 
   2154 static HChar nameISize ( Int size )
   2155 {
   2156    switch (size) {
   2157       case 8: return 'q';
   2158       case 4: return 'l';
   2159       case 2: return 'w';
   2160       case 1: return 'b';
   2161       default: vpanic("nameISize(amd64)");
   2162    }
   2163 }
   2164 
   2165 static HChar* nameYMMReg ( Int ymmreg )
   2166 {
   2167    static HChar* ymm_names[16]
   2168      = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
   2169          "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
   2170          "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
   2171          "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
   2172    if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
   2173    return ymm_names[ymmreg];
   2174 }
   2175 
   2176 
   2177 /*------------------------------------------------------------*/
   2178 /*--- JMP helpers                                          ---*/
   2179 /*------------------------------------------------------------*/
   2180 
/* End the translation unit with an unconditional jump of kind 'kind'
   to the literal guest address 'd64': mark 'dres' as stopped-here and
   write the destination to the guest RIP. */
static void jmp_lit( /*MOD*/DisResult* dres,
                     IRJumpKind kind, Addr64 d64 )
{
   /* The caller must not already have finalised this instruction. */
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = kind;
   stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
}
   2192 
/* End the translation unit with an unconditional jump of kind 'kind'
   to the guest address held in temporary 't': mark 'dres' as
   stopped-here and write the destination to the guest RIP. */
static void jmp_treg( /*MOD*/DisResult* dres,
                      IRJumpKind kind, IRTemp t )
{
   /* The caller must not already have finalised this instruction. */
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = kind;
   stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
}
   2204 
/* Generate a two-way conditional branch: control goes to 'd64_true'
   if 'cond' holds and to 'd64_false' otherwise.  The condition is
   first reduced to its positive form so that the side exit always
   tests a positive condition; if that reduction required an
   inversion, the two destinations are swapped to compensate. */
static
void jcc_01 ( /*MOD*/DisResult* dres,
              AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
{
   Bool          invert;
   AMD64Condcode condPos;
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = Ijk_Boring;
   condPos = positiveIse_AMD64Condcode ( cond, &invert );
   if (invert) {
      /* condPos is the negation of 'cond': exit to the false arm when
         it holds, fall through to the true arm. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_false),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
   } else {
      /* condPos is 'cond' itself: exit to the true arm when it holds,
         fall through to the false arm. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_true),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
   }
}
   2232 
   2233 /* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   2234    guest address of the next instruction to be executed.
   2235 
   2236    This function generates an AbiHint to say that -128(%rsp)
   2237    .. -1(%rsp) should now be regarded as uninitialised.
   2238 */
static
void make_redzone_AbiHint ( VexAbiInfo* vbi,
                            IRTemp new_rsp, IRTemp nia, HChar* who )
{
   Int szB = vbi->guest_stack_redzone_size;
   vassert(szB >= 0);

   /* A bit of a kludge.  Currently the only ABI we've guested AMD64
      for is ELF.  So just check it's the expected 128 value
      (paranoia). */
   vassert(szB == 128);

   if (0) vex_printf("AbiHint: %s\n", who);
   vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   if (szB > 0)
      /* Declare [new_rsp - szB, new_rsp) -- the redzone -- to be
         undefined, and 'nia' as the address of the next instruction. */
      stmt( IRStmt_AbiHint(
               binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
               szB,
               mkexpr(nia)
            ));
}
   2261 
   2262 
   2263 /*------------------------------------------------------------*/
   2264 /*--- Disassembling addressing modes                       ---*/
   2265 /*------------------------------------------------------------*/
   2266 
   2267 static
   2268 HChar* segRegTxt ( Prefix pfx )
   2269 {
   2270    if (pfx & PFX_CS) return "%cs:";
   2271    if (pfx & PFX_DS) return "%ds:";
   2272    if (pfx & PFX_ES) return "%es:";
   2273    if (pfx & PFX_FS) return "%fs:";
   2274    if (pfx & PFX_GS) return "%gs:";
   2275    if (pfx & PFX_SS) return "%ss:";
   2276    return ""; /* no override */
   2277 }
   2278 
   2279 
   2280 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   2281    linear address by adding any required segment override as indicated
   2282    by sorb, and also dealing with any address size override
   2283    present. */
static
IRExpr* handleAddrOverrides ( VexAbiInfo* vbi,
                              Prefix pfx, IRExpr* virtual )
{
   /* --- segment overrides --- */
   if (pfx & PFX_FS) {
      if (vbi->guest_amd64_assume_fs_is_zero) {
         /* Note that this is a linux-kernel specific hack that relies
            on the assumption that %fs is always zero. */
         /* return virtual + guest_FS_ZERO. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_FS_ZERO, Ity_I64));
      } else {
         unimplemented("amd64 %fs segment override");
      }
   }

   if (pfx & PFX_GS) {
      if (vbi->guest_amd64_assume_gs_is_0x60) {
         /* Note that this is a darwin-kernel specific hack that relies
            on the assumption that %gs is always 0x60. */
         /* return virtual + guest_GS_0x60. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_GS_0x60, Ity_I64));
      } else {
         unimplemented("amd64 %gs segment override");
      }
   }

   /* cs, ds, es and ss are simply ignored in 64-bit mode. */

   /* --- address size override --- */
   if (haveASO(pfx))
      /* Clip to the low 32 bits, then zero-extend back to 64. */
      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));

   return virtual;
}
   2321 
   2322 //.. {
   2323 //..    Int    sreg;
   2324 //..    IRType hWordTy;
   2325 //..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   2326 //..
   2327 //..    if (sorb == 0)
   2328 //..       /* the common case - no override */
   2329 //..       return virtual;
   2330 //..
   2331 //..    switch (sorb) {
   2332 //..       case 0x3E: sreg = R_DS; break;
   2333 //..       case 0x26: sreg = R_ES; break;
   2334 //..       case 0x64: sreg = R_FS; break;
   2335 //..       case 0x65: sreg = R_GS; break;
   2336 //..       default: vpanic("handleAddrOverrides(x86,guest)");
   2337 //..    }
   2338 //..
   2339 //..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   2340 //..
   2341 //..    seg_selector = newTemp(Ity_I32);
   2342 //..    ldt_ptr      = newTemp(hWordTy);
   2343 //..    gdt_ptr      = newTemp(hWordTy);
   2344 //..    r64          = newTemp(Ity_I64);
   2345 //..
   2346 //..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   2347 //..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   2348 //..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   2349 //..
   2350 //..    /*
   2351 //..    Call this to do the translation and limit checks:
   2352 //..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2353 //..                                  UInt seg_selector, UInt virtual_addr )
   2354 //..    */
   2355 //..    assign(
   2356 //..       r64,
   2357 //..       mkIRExprCCall(
   2358 //..          Ity_I64,
   2359 //..          0/*regparms*/,
   2360 //..          "x86g_use_seg_selector",
   2361 //..          &x86g_use_seg_selector,
   2362 //..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   2363 //..                         mkexpr(seg_selector), virtual)
   2364 //..       )
   2365 //..    );
   2366 //..
   2367 //..    /* If the high 32 of the result are non-zero, there was a
   2368 //..       failure in address translation.  In which case, make a
   2369 //..       quick exit.
   2370 //..    */
   2371 //..    stmt(
   2372 //..       IRStmt_Exit(
   2373 //..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   2374 //..          Ijk_MapFail,
   2375 //..          IRConst_U32( guest_eip_curr_instr )
   2376 //..       )
   2377 //..    );
   2378 //..
   2379 //..    /* otherwise, here's the translated result. */
   2380 //..    return unop(Iop_64to32, mkexpr(r64));
   2381 //.. }
   2382 
   2383 
   2384 /* Generate IR to calculate an address indicated by a ModRM and
   2385    following SIB bytes.  The expression, and the number of bytes in
   2386    the address mode, are returned (the latter in *len).  Note that
   2387    this fn should not be called if the R/M part of the address denotes
   2388    a register instead of memory.  If print_codegen is true, text of
   2389    the addressing mode is placed in buf.
   2390 
   2391    The computed address is stored in a new tempreg, and the
   2392    identity of the tempreg is returned.
   2393 
   2394    extra_bytes holds the number of bytes after the amode, as supplied
   2395    by the caller.  This is needed to make sense of %rip-relative
   2396    addresses.  Note that the value that *len is set to is only the
   2397    length of the amode itself and does not include the value supplied
   2398    in extra_bytes.
   2399  */
   2400 
   2401 static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
   2402 {
   2403    IRTemp tmp = newTemp(Ity_I64);
   2404    assign( tmp, addr64 );
   2405    return tmp;
   2406 }
   2407 
static
IRTemp disAMode ( /*OUT*/Int* len,
                  VexAbiInfo* vbi, Prefix pfx, Long delta,
                  /*OUT*/HChar* buf, Int extra_bytes )
{
   UChar mod_reg_rm = getUChar(delta);
   delta++;   /* step past the modRM byte itself */

   /* Disassembly text accumulates into the caller's buffer. */
   buf[0] = (UChar)0;
   vassert(extra_bytes >= 0 && extra_bytes < 10);

   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
      jump table seems a bit excessive.  The result is (mod << 3) | rm,
      i.e. a 5-bit selector.
   */
   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
                                               /* is now XX0XXYYY */
   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   switch (mod_reg_rm) {

      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
      */
      case 0x00: case 0x01: case 0x02: case 0x03:
      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
         { UChar rm = toUChar(mod_reg_rm & 7);
           DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           *len = 1;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
         }

      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
      */
      case 0x08: case 0x09: case 0x0A: case 0x0B:
      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long d   = getSDisp8(delta);
           if (d == 0) {
              DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           } else {
              DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           }
           *len = 2;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
      */
      case 0x10: case 0x11: case 0x12: case 0x13:
      /* ! 14 */ case 0x15: case 0x16: case 0x17:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long  d  = getSDisp32(delta);
           DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           *len = 5;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
      case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
         vpanic("disAMode(amd64): not an addr!");

      /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
         correctly at the start of handling each instruction. */
      case 0x05:
         { Long d = getSDisp32(delta);
           *len = 5;
           DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
           /* We need to know the next instruction's start address.
              Try and figure out what it is, record the guess, and ask
              the top-level driver logic (bbToIR_AMD64) to check we
              guessed right, after the instruction is completely
              decoded.  delta currently points at the d32; the guessed
              next-insn address is 4 bytes further on, plus whatever
              trailing bytes the caller told us about. */
           guest_RIP_next_mustcheck = True;
           guest_RIP_next_assumed = guest_RIP_bbstart
                                    + delta+4 + extra_bytes;
           return disAMode_copy2tmp(
                     handleAddrOverrides(vbi, pfx,
                        binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
                                         mkU64(d))));
         }

      case 0x04: {
         /* SIB, with no displacement.  Special cases:
            -- %rsp cannot act as an index value.
               If index_r indicates %rsp, zero is used for the index.
            -- when mod is zero and base indicates RBP or R13, base is
               instead a 32-bit sign-extended literal.
            It's all madness, I tell you.  Extract %index, %base and
            scale from the SIB byte.  The value denoted is then:
               | %index == %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte
               | %index == %RSP && !(%base == %RBP || %base == %R13)
               = %base
               | %index != %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte + (%index << scale)
               | %index != %RSP && !(%base == %RBP || %base == %R13)
               = %base + (%index << scale)
         */
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         /* correct since #(R13) == 8 + #(RBP) */
         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
         /* index==%rsp counts as "no index" only when REX.X==0;
            with REX.X==1 it denotes %r12, which is a valid index. */
         Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
         delta++;

         if ((!index_is_SP) && (!base_is_BPor13)) {
            if (scale == 0) {
               DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 2;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        getIRegRexB(8,pfx,base_r),
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                              mkU8(scale)))));
         }

         if ((!index_is_SP) && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
                      nameIReg64rexX(pfx,index_r), 1<<scale);
            *len = 6;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                                         mkU8(scale)),
                        mkU64(d))));
         }

         if (index_is_SP && (!base_is_BPor13)) {
            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
            *len = 2;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
         }

         if (index_is_SP && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld", segRegTxt(pfx), d);
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, mkU64(d)));
         }

         vassert(0); /*NOTREACHED*/
      }

      /* SIB, with 8-bit displacement.  Special cases:
         -- %esp cannot act as an index value.
            If index_r indicates %esp, zero is used for the index.
         Denoted value is:
            | %index == %ESP
            = d8 + %base
            | %index != %ESP
            = d8 + %base + (%index << scale)
      */
      case 0x0C: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp8(delta+1);

         /* index==%rsp with REX.X==0 encodes "no index register". */
         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 3;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 3;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      /* SIB, with 32-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d32 + %base
            | %index != %RSP
            = d32 + %base + (%index << scale)
      */
      case 0x14: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp32(delta+1);

         /* index==%rsp with REX.X==0 encodes "no index register". */
         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 6;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      default:
         vpanic("disAMode(amd64)");
         return 0; /*notreached*/
   }
}
   2675 
   2676 
   2677 /* Figure out the number of (insn-stream) bytes constituting the amode
   2678    beginning at delta.  Is useful for getting hold of literals beyond
   2679    the end of the amode before it has been disassembled.  */
   2680 
   2681 static UInt lengthAMode ( Prefix pfx, Long delta )
   2682 {
   2683    UChar mod_reg_rm = getUChar(delta);
   2684    delta++;
   2685 
   2686    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2687       jump table seems a bit excessive.
   2688    */
   2689    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2690    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2691                                                /* is now XX0XXYYY */
   2692    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2693    switch (mod_reg_rm) {
   2694 
   2695       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2696          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2697       */
   2698       case 0x00: case 0x01: case 0x02: case 0x03:
   2699       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2700          return 1;
   2701 
   2702       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2703          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2704       */
   2705       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2706       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2707          return 2;
   2708 
   2709       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2710          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2711       */
   2712       case 0x10: case 0x11: case 0x12: case 0x13:
   2713       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2714          return 5;
   2715 
   2716       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2717       /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
   2718       /* Not an address, but still handled. */
   2719       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2720       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2721          return 1;
   2722 
   2723       /* RIP + disp32. */
   2724       case 0x05:
   2725          return 5;
   2726 
   2727       case 0x04: {
   2728          /* SIB, with no displacement. */
   2729          UChar sib     = getUChar(delta);
   2730          UChar base_r  = toUChar(sib & 7);
   2731          /* correct since #(R13) == 8 + #(RBP) */
   2732          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2733 
   2734          if (base_is_BPor13) {
   2735             return 6;
   2736          } else {
   2737             return 2;
   2738          }
   2739       }
   2740 
   2741       /* SIB, with 8-bit displacement. */
   2742       case 0x0C:
   2743          return 3;
   2744 
   2745       /* SIB, with 32-bit displacement. */
   2746       case 0x14:
   2747          return 6;
   2748 
   2749       default:
   2750          vpanic("lengthAMode(amd64)");
   2751          return 0; /*notreached*/
   2752    }
   2753 }
   2754 
   2755 
   2756 /*------------------------------------------------------------*/
   2757 /*--- Disassembling common idioms                          ---*/
   2758 /*------------------------------------------------------------*/
   2759 
   2760 /* Handle binary integer instructions of the form
   2761       op E, G  meaning
   2762       op reg-or-mem, reg
   2763    Is passed the a ptr to the modRM byte, the actual operation, and the
   2764    data size.  Returns the address advanced completely over this
   2765    instruction.
   2766 
   2767    E(src) is reg-or-mem
   2768    G(dst) is reg.
   2769 
   2770    If E is reg, -->    GET %G,  tmp
   2771                        OP %E,   tmp
   2772                        PUT tmp, %G
   2773 
   2774    If E is mem and OP is not reversible,
   2775                 -->    (getAddr E) -> tmpa
   2776                        LD (tmpa), tmpa
   2777                        GET %G, tmp2
   2778                        OP tmpa, tmp2
   2779                        PUT tmp2, %G
   2780 
   2781    If E is mem and OP is reversible
   2782                 -->    (getAddr E) -> tmpa
   2783                        LD (tmpa), tmpa
   2784                        OP %G, tmpa
   2785                        PUT tmpa, %G
   2786 */
static
ULong dis_op2_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);   /* the operation's result */
   IRTemp  src  = newTemp(ty);   /* the E operand */
   IRTemp  dst0 = newTemp(ty);   /* original value of G */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  The register is zeroed up front, so the reads
         below observe a defined zero and the result/flags are then
         computed from 0 op 0, as required. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         if (False && op8 == Iop_Sub8)
            vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
	 putIRegG(size,pfx,rm, mkU(ty,0));
      }

      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  getIRegE(size,pfx,rm) );

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: the helper computes dst1 and sets the flag thunk. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise via the subtract-with-borrow helper. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep==False means the result is discarded (flags-only
            forms, e.g. CMP); the flags above are still set. */
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegE(size,pfx,rm),
                          nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      /* E refers to memory */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          dis_buf, nameIRegG(size, pfx, rm));
      return len+delta0;
   }
}
   2881 
   2882 
   2883 
   2884 /* Handle binary integer instructions of the form
   2885       op G, E  meaning
   2886       op reg, reg-or-mem
   2887    Is passed the a ptr to the modRM byte, the actual operation, and the
   2888    data size.  Returns the address advanced completely over this
   2889    instruction.
   2890 
   2891    G(src) is reg.
   2892    E(dst) is reg-or-mem
   2893 
   2894    If E is reg, -->    GET %E,  tmp
   2895                        OP %G,   tmp
   2896                        PUT tmp, %E
   2897 
   2898    If E is mem, -->    (getAddr E) -> tmpa
   2899                        LD (tmpa), tmpv
   2900                        OP %G, tmpv
   2901                        ST tmpv, (tmpa)
   2902 */
static
ULong dis_op2_G_E ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);   /* the operation's result */
   IRTemp  src  = newTemp(ty);   /* the G operand */
   IRTemp  dst0 = newTemp(ty);   /* original value of E */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg.  Zeroing the register first
         means the reads below see a defined zero. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         putIRegE(size,pfx,rm, mkU(ty,0));
      }

      assign(dst0, getIRegE(size,pfx,rm));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: the helper computes dst1 and sets the flag thunk. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise via the subtract-with-borrow helper. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep==False means the result is discarded (flags-only
            forms, e.g. CMP); the flags above are still set. */
         if (keep)
            putIRegE(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm),
                          nameIRegE(size,pfx,rm));
      return 1+delta0;
   }

   /* E refers to memory */
   {
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIRegG(size,pfx,rm));

      /* When a LOCK prefix is present, the store back to memory is
         done compare-and-swap style, so the read-modify-write appears
         atomic. */
      if (addSubCarry && op8 == Iop_Add8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (pfx & PFX_LOCK) {
               if (0) vex_printf("locked case\n" );
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm), dis_buf);
      return len+delta0;
   }
}
   3018 
   3019 
   3020 /* Handle move instructions of the form
   3021       mov E, G  meaning
   3022       mov reg-or-mem, reg
   3023    Is passed the a ptr to the modRM byte, and the data size.  Returns
   3024    the address advanced completely over this instruction.
   3025 
   3026    E(src) is reg-or-mem
   3027    G(dst) is reg.
   3028 
   3029    If E is reg, -->    GET %E,  tmpv
   3030                        PUT tmpv, %G
   3031 
   3032    If E is mem  -->    (getAddr E) -> tmpa
   3033                        LD (tmpa), tmpb
   3034                        PUT tmpb, %G
   3035 */
   3036 static
   3037 ULong dis_mov_E_G ( VexAbiInfo* vbi,
   3038                     Prefix      pfx,
   3039                     Int         size,
   3040                     Long        delta0 )
   3041 {
   3042    Int len;
   3043    UChar rm = getUChar(delta0);
   3044    HChar dis_buf[50];
   3045 
   3046    if (epartIsReg(rm)) {
   3047       putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
   3048       DIP("mov%c %s,%s\n", nameISize(size),
   3049                            nameIRegE(size,pfx,rm),
   3050                            nameIRegG(size,pfx,rm));
   3051       return 1+delta0;
   3052    }
   3053 
   3054    /* E refers to memory */
   3055    {
   3056       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3057       putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
   3058       DIP("mov%c %s,%s\n", nameISize(size),
   3059                            dis_buf,
   3060                            nameIRegG(size,pfx,rm));
   3061       return delta0+len;
   3062    }
   3063 }
   3064 
   3065 
   3066 /* Handle move instructions of the form
   3067       mov G, E  meaning
   3068       mov reg, reg-or-mem
   3069    Is passed the a ptr to the modRM byte, and the data size.  Returns
   3070    the address advanced completely over this instruction.
   3071 
   3072    G(src) is reg.
   3073    E(dst) is reg-or-mem
   3074 
   3075    If E is reg, -->    GET %G,  tmp
   3076                        PUT tmp, %E
   3077 
   3078    If E is mem, -->    (getAddr E) -> tmpa
   3079                        GET %G, tmpv
   3080                        ST tmpv, (tmpa)
   3081 */
   3082 static
   3083 ULong dis_mov_G_E ( VexAbiInfo* vbi,
   3084                     Prefix      pfx,
   3085                     Int         size,
   3086                     Long        delta0 )
   3087 {
   3088    Int len;
   3089    UChar rm = getUChar(delta0);
   3090    HChar dis_buf[50];
   3091 
   3092    if (epartIsReg(rm)) {
   3093       putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
   3094       DIP("mov%c %s,%s\n", nameISize(size),
   3095                            nameIRegG(size,pfx,rm),
   3096                            nameIRegE(size,pfx,rm));
   3097       return 1+delta0;
   3098    }
   3099 
   3100    /* E refers to memory */
   3101    {
   3102       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3103       storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
   3104       DIP("mov%c %s,%s\n", nameISize(size),
   3105                            nameIRegG(size,pfx,rm),
   3106                            dis_buf);
   3107       return len+delta0;
   3108    }
   3109 }
   3110 
   3111 
   3112 /* op $immediate, AL/AX/EAX/RAX. */
static
ULong dis_op_imm_A ( Int    size,
                     Bool   carrying,
                     IROp   op8,
                     Bool   keep,
                     Long   delta,
                     HChar* t_amd64opc )
{
   /* The immediate occupies at most 4 insn-stream bytes: 8-byte-sized
      operations take a (sign-extended) 32-bit immediate, hence the
      imin below. */
   Int    size4 = imin(size,4);
   IRType ty    = szToITy(size);
   IRTemp dst0  = newTemp(ty);   /* original RAX (or sub-register) */
   IRTemp src   = newTemp(ty);   /* the immediate */
   IRTemp dst1  = newTemp(ty);   /* the operation's result */
   Long  lit    = getSDisp(size4,delta);
   assign(dst0, getIRegRAX(size));
   /* Truncate the literal to the operation size. */
   assign(src,  mkU(ty,lit & mkSizeMask(size)));

   if (isAddSub(op8) && !carrying) {
      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
      setFlags_DEP1_DEP2(op8, dst0, src, ty);
   }
   else
   if (isLogic(op8)) {
      vassert(!carrying);
      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
      setFlags_DEP1(op8, dst1, ty);
   }
   else
   if (op8 == Iop_Add8 && carrying) {
      /* ADC: the helper computes dst1 and sets the flag thunk. */
      helper_ADC( size, dst1, dst0, src,
                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   }
   else
   if (op8 == Iop_Sub8 && carrying) {
      /* SBB: likewise via the subtract-with-borrow helper. */
      helper_SBB( size, dst1, dst0, src,
                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   }
   else
      vpanic("dis_op_imm_A(amd64,guest)");

   /* keep==False means the result is discarded (flags-only forms,
      e.g. CMP/TEST); the flags above are still set. */
   if (keep)
      putIRegRAX(size, mkexpr(dst1));

   DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
                           lit, nameIRegRAX(size));
   return delta+size4;
}
   3160 
   3161 
   3162 /* Sign- and Zero-extending moves. */
   3163 static
   3164 ULong dis_movx_E_G ( VexAbiInfo* vbi,
   3165                      Prefix pfx,
   3166                      Long delta, Int szs, Int szd, Bool sign_extend )
   3167 {
   3168    UChar rm = getUChar(delta);
   3169    if (epartIsReg(rm)) {
   3170       putIRegG(szd, pfx, rm,
   3171                     doScalarWidening(
   3172                        szs,szd,sign_extend,
   3173                        getIRegE(szs,pfx,rm)));
   3174       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3175                                nameISize(szs),
   3176                                nameISize(szd),
   3177                                nameIRegE(szs,pfx,rm),
   3178                                nameIRegG(szd,pfx,rm));
   3179       return 1+delta;
   3180    }
   3181 
   3182    /* E refers to memory */
   3183    {
   3184       Int    len;
   3185       HChar  dis_buf[50];
   3186       IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   3187       putIRegG(szd, pfx, rm,
   3188                     doScalarWidening(
   3189                        szs,szd,sign_extend,
   3190                        loadLE(szToITy(szs),mkexpr(addr))));
   3191       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3192                                nameISize(szs),
   3193                                nameISize(szd),
   3194                                dis_buf,
   3195                                nameIRegG(szd,pfx,rm));
   3196       return len+delta;
   3197    }
   3198 }
   3199 
   3200 
   3201 /* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   3202    the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* special-case the 64-bit case */
   if (sz == 8) {
      /* RDX:RAX / t -> quotient in RAX, remainder in RDX.  The
         DivMod ops return the remainder in the high half and the
         quotient in the low half of the 128-bit result. */
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      /* 8/16/32-bit cases are all done via a 64/32 divide, widening
         the dividend (and, for the narrow cases, the divisor) first. */
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         /* EDX:EAX / t -> quotient EAX, remainder EDX. */
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         /* DX:AX / t -> quotient AX, remainder DX.  Widen the 32-bit
            dividend and the 16-bit divisor up to the op's widths. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         /* AX / t -> quotient AL, remainder AH. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
   3266 
/* Group 1 extended opcodes: ADD/OR/ADC/SBB/AND/SUB/XOR/CMP with an
   immediate operand (d64, d_sz bytes).  delta points at the modrm
   byte (am_sz bytes of addressing mode follow).  gregLO3ofRM(modrm)
   selects the operation; 7 is CMP, which sets flags but does not
   write the result back.  Honours the LOCK prefix for the memory
   forms via compare-and-swap stores.  Returns the updated delta. */
static
ULong dis_Grp1 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   /* ADC/SBB (cases 2 and 3) leave op8 INVALID; they are handled by
      helpers below rather than a plain binop. */
   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      /* Register destination: no store, so LOCK is irrelevant. */
      vassert(am_sz == 1);

      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      /* 7 == CMP: flags only, no writeback. */
      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      /* Memory destination. */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         /* 7 == CMP: flags only, no store. */
         if (gregLO3ofRM(modrm) < 7) {
            if (pfx & PFX_LOCK) {
               /* LOCKed RMW: store via CAS so the access is atomic. */
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
   3372 
   3373 
   3374 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   3375    expression. */
   3376 
static
ULong dis_Grp2 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 HChar* shift_expr_txt, Bool* decode_OK )
{
   /* Group 2: ROL/ROR/RCL/RCR/SHL/SHR/SAL/SAR of a register or memory
      operand by shift_expr (an 8-bit typed expression: CL, an
      immediate, or the constant 1).  delta on entry points at the
      modrm byte.  *decode_OK is currently always set True. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);   /* value before shifting/rotating */
   IRTemp dst1  = newTemp(ty);   /* value after */
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   /* Classify the sub-opcode: 4..7 = SHL/SHR/SAL/SAR, 0..1 = ROL/ROR,
      2..3 = RCL/RCR.  Each switch sets a single flag, so the lack of
      break statements is harmless. */
   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      /* First call: positive sz requests the rotated value. */
      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      /* Second call: negative sz requests the new rflags. */
      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      /* Install the new value and the helper-computed flags verbatim. */
      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);   /* value widened to 64 bits */
      IRTemp res64     = newTemp(Ity_I64);   /* shifted by shift_amt */
      IRTemp res64ss   = newTemp(Ity_I64);   /* shifted by shift_amt-1 */
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      /* Note SAL (6) is the same operation as SHL (4). */
      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 6: op64 = Iop_Shl64; break;
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 32, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
      /* (the "shifted one less" value, needed by the flags thunk to
         recover the last bit shifted out, i.e. the new carry) */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      /* ccOp indexes the size-specific CC_OP (B/W/L/Q variants). */
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);   /* reduced mod the word size */
      IRTemp rot_amt64 = newTemp(Ity_I8);   /* masked with 63/31 only */
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
      /* Each Mux0X keeps the old thunk field when rot_amt64 == 0,
         per the x86/amd64 rule that a zero count leaves flags alone. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                      mkU64(ccOp))) );
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                      widenUto64(mkexpr(dst1)))) );
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                      mkU64(0))) );
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_NDEP,Ity_I64),
                                      mkexpr(oldFlags))) );
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
   3644 
   3645 
   3646 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
static
ULong dis_Grp8_Imm ( VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, UChar modrm,
                     Int am_sz, Int sz, ULong src_val,
                     Bool* decode_OK )
{
   /* src_val denotes a d8.
      And delta on entry points at the modrm byte. */

   IRType ty     = szToITy(sz);
   IRTemp t2     = newTemp(Ity_I64);   /* operand, widened to 64 bits */
   IRTemp t2m    = newTemp(Ity_I64);   /* modified operand (non-BT) */
   IRTemp t_addr = IRTemp_INVALID;
   HChar  dis_buf[50];
   ULong  mask;

   /* we're optimists :-) */
   *decode_OK = True;

   /* Limit src_val -- the bit offset -- to something within a word.
      The Intel docs say that literal offsets larger than a word are
      masked in this way. */
   switch (sz) {
      case 2:  src_val &= 15; break;
      case 4:  src_val &= 31; break;
      case 8:  src_val &= 63; break;
      default: *decode_OK = False; return delta;
   }

   /* Invent a mask suitable for the operation. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */  mask = 0;                  break;
      case 5: /* BTS */ mask = 1ULL << src_val;    break;
      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
      case 7: /* BTC */ mask = 1ULL << src_val;    break;
         /* If this needs to be extended, probably simplest to make a
            new function to handle the other cases (0 .. 3).  The
            Intel docs do however not indicate any use for 0 .. 3, so
            we don't expect this to happen. */
      default: *decode_OK = False; return delta;
   }

   /* Fetch the value to be tested and modified into t2, which is
      64-bits wide regardless of sz. */
   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);
      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
      delta += (am_sz + 1);
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, nameIRegE(sz,pfx,modrm));
   } else {
      Int len;
      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
      delta  += (len+1);
      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, dis_buf);
   }

   /* Compute the new value into t2m, if non-BT. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */
         break;
      case 5: /* BTS */
         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
         break;
      case 6: /* BTR */
         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
         break;
      case 7: /* BTC */
         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
         break;
      default:
         /*NOTREACHED*/ /*the previous switch guards this*/
         vassert(0);
   }

   /* Write the result back, if non-BT. */
   if (gregLO3ofRM(modrm) != 4 /* BT */) {
      if (epartIsReg(modrm)) {
         putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
      } else {
         if (pfx & PFX_LOCK) {
            /* LOCKed RMW: store via CAS so the update is atomic. */
            casLE( mkexpr(t_addr),
                   narrowTo(ty, mkexpr(t2))/*expd*/,
                   narrowTo(ty, mkexpr(t2m))/*new*/,
                   guest_RIP_curr_instr );
         } else {
            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
         }
      }
   }

   /* Copy relevant bit from t2 into the carry flag. */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
                  mkU64(1))
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   3759 
   3760 
   3761 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   3762    value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   3763    RDX:RAX/EDX:EAX/DX:AX/AX.
   3764 */
   3765 static void codegen_mulL_A_D ( Int sz, Bool syned,
   3766                                IRTemp tmp, HChar* tmp_txt )
   3767 {
   3768    IRType ty = szToITy(sz);
   3769    IRTemp t1 = newTemp(ty);
   3770 
   3771    assign( t1, getIRegRAX(sz) );
   3772 
   3773    switch (ty) {
   3774       case Ity_I64: {
   3775          IRTemp res128  = newTemp(Ity_I128);
   3776          IRTemp resHi   = newTemp(Ity_I64);
   3777          IRTemp resLo   = newTemp(Ity_I64);
   3778          IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
   3779          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3780          setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
   3781          assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3782          assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
   3783          assign( resLo, unop(Iop_128to64,mkexpr(res128)));
   3784          putIReg64(R_RDX, mkexpr(resHi));
   3785          putIReg64(R_RAX, mkexpr(resLo));
   3786          break;
   3787       }
   3788       case Ity_I32: {
   3789          IRTemp res64   = newTemp(Ity_I64);
   3790          IRTemp resHi   = newTemp(Ity_I32);
   3791          IRTemp resLo   = newTemp(Ity_I32);
   3792          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   3793          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3794          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   3795          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3796          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   3797          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   3798          putIRegRDX(4, mkexpr(resHi));
   3799          putIRegRAX(4, mkexpr(resLo));
   3800          break;
   3801       }
   3802       case Ity_I16: {
   3803          IRTemp res32   = newTemp(Ity_I32);
   3804          IRTemp resHi   = newTemp(Ity_I16);
   3805          IRTemp resLo   = newTemp(Ity_I16);
   3806          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   3807          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3808          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   3809          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3810          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   3811          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   3812          putIRegRDX(2, mkexpr(resHi));
   3813          putIRegRAX(2, mkexpr(resLo));
   3814          break;
   3815       }
   3816       case Ity_I8: {
   3817          IRTemp res16   = newTemp(Ity_I16);
   3818          IRTemp resHi   = newTemp(Ity_I8);
   3819          IRTemp resLo   = newTemp(Ity_I8);
   3820          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   3821          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3822          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   3823          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3824          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   3825          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   3826          putIRegRAX(2, mkexpr(res16));
   3827          break;
   3828       }
   3829       default:
   3830          ppIRType(ty);
   3831          vpanic("codegen_mulL_A_D(amd64)");
   3832    }
   3833    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   3834 }
   3835 
   3836 
   3837 /* Group 3 extended opcodes. */
   3838 static
   3839 ULong dis_Grp3 ( VexAbiInfo* vbi,
   3840                  Prefix pfx, Int sz, Long delta, Bool* decode_OK )
   3841 {
   3842    Long    d64;
   3843    UChar   modrm;
   3844    HChar   dis_buf[50];
   3845    Int     len;
   3846    IRTemp  addr;
   3847    IRType  ty = szToITy(sz);
   3848    IRTemp  t1 = newTemp(ty);
   3849    IRTemp dst1, src, dst0;
   3850    *decode_OK = True;
   3851    modrm = getUChar(delta);
   3852    if (epartIsReg(modrm)) {
   3853       switch (gregLO3ofRM(modrm)) {
   3854          case 0: { /* TEST */
   3855             delta++;
   3856             d64 = getSDisp(imin(4,sz), delta);
   3857             delta += imin(4,sz);
   3858             dst1 = newTemp(ty);
   3859             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   3860                                getIRegE(sz,pfx,modrm),
   3861                                mkU(ty, d64 & mkSizeMask(sz))));
   3862             setFlags_DEP1( Iop_And8, dst1, ty );
   3863             DIP("test%c $%lld, %s\n",
   3864                 nameISize(sz), d64,
   3865                 nameIRegE(sz, pfx, modrm));
   3866             break;
   3867          }
   3868          case 1:
   3869             *decode_OK = False;
   3870             return delta;
   3871          case 2: /* NOT */
   3872             delta++;
   3873             putIRegE(sz, pfx, modrm,
   3874                               unop(mkSizedOp(ty,Iop_Not8),
   3875                                    getIRegE(sz, pfx, modrm)));
   3876             DIP("not%c %s\n", nameISize(sz),
   3877                               nameIRegE(sz, pfx, modrm));
   3878             break;
   3879          case 3: /* NEG */
   3880             delta++;
   3881             dst0 = newTemp(ty);
   3882             src  = newTemp(ty);
   3883             dst1 = newTemp(ty);
   3884             assign(dst0, mkU(ty,0));
   3885             assign(src,  getIRegE(sz, pfx, modrm));
   3886             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
   3887                                                        mkexpr(src)));
   3888             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   3889             putIRegE(sz, pfx, modrm, mkexpr(dst1));
   3890             DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
   3891             break;
   3892          case 4: /* MUL (unsigned widening) */
   3893             delta++;
   3894             src = newTemp(ty);
   3895             assign(src, getIRegE(sz,pfx,modrm));
   3896             codegen_mulL_A_D ( sz, False, src,
   3897                                nameIRegE(sz,pfx,modrm) );
   3898             break;
   3899          case 5: /* IMUL (signed widening) */
   3900             delta++;
   3901             src = newTemp(ty);
   3902             assign(src, getIRegE(sz,pfx,modrm));
   3903             codegen_mulL_A_D ( sz, True, src,
   3904                                nameIRegE(sz,pfx,modrm) );
   3905             break;
   3906          case 6: /* DIV */
   3907             delta++;
   3908             assign( t1, getIRegE(sz, pfx, modrm) );
   3909             codegen_div ( sz, t1, False );
   3910             DIP("div%c %s\n", nameISize(sz),
   3911                               nameIRegE(sz, pfx, modrm));
   3912             break;
   3913          case 7: /* IDIV */
   3914             delta++;
   3915             assign( t1, getIRegE(sz, pfx, modrm) );
   3916             codegen_div ( sz, t1, True );
   3917             DIP("idiv%c %s\n", nameISize(sz),
   3918                                nameIRegE(sz, pfx, modrm));
   3919             break;
   3920          default:
   3921             /*NOTREACHED*/
   3922             vpanic("Grp3(amd64,R)");
   3923       }
   3924    } else {
   3925       addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
   3926                         /* we have to inform disAMode of any immediate
   3927 			   bytes used */
   3928                         gregLO3ofRM(modrm)==0/*TEST*/
   3929                            ? imin(4,sz)
   3930                            : 0
   3931                       );
   3932       t1   = newTemp(ty);
   3933       delta += len;
   3934       assign(t1, loadLE(ty,mkexpr(addr)));
   3935       switch (gregLO3ofRM(modrm)) {
   3936          case 0: { /* TEST */
   3937             d64 = getSDisp(imin(4,sz), delta);
   3938             delta += imin(4,sz);
   3939             dst1 = newTemp(ty);
   3940             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   3941                                mkexpr(t1),
   3942                                mkU(ty, d64 & mkSizeMask(sz))));
   3943             setFlags_DEP1( Iop_And8, dst1, ty );
   3944             DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
   3945             break;
   3946          }
   3947          case 1:
   3948             *decode_OK = False;
   3949             return delta;
   3950          case 2: /* NOT */
   3951             dst1 = newTemp(ty);
   3952             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
   3953             if (pfx & PFX_LOCK) {
   3954                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   3955                                     guest_RIP_curr_instr );
   3956             } else {
   3957                storeLE( mkexpr(addr), mkexpr(dst1) );
   3958             }
   3959             DIP("not%c %s\n", nameISize(sz), dis_buf);
   3960             break;
   3961          case 3: /* NEG */
   3962             dst0 = newTemp(ty);
   3963             src  = newTemp(ty);
   3964             dst1 = newTemp(ty);
   3965             assign(dst0, mkU(ty,0));
   3966             assign(src,  mkexpr(t1));
   3967             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
   3968                                                        mkexpr(src)));
   3969             if (pfx & PFX_LOCK) {
   3970                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   3971                                     guest_RIP_curr_instr );
   3972             } else {
   3973                storeLE( mkexpr(addr), mkexpr(dst1) );
   3974             }
   3975             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   3976             DIP("neg%c %s\n", nameISize(sz), dis_buf);
   3977             break;
   3978          case 4: /* MUL (unsigned widening) */
   3979             codegen_mulL_A_D ( sz, False, t1, dis_buf );
   3980             break;
   3981          case 5: /* IMUL */
   3982             codegen_mulL_A_D ( sz, True, t1, dis_buf );
   3983             break;
   3984          case 6: /* DIV */
   3985             codegen_div ( sz, t1, False );
   3986             DIP("div%c %s\n", nameISize(sz), dis_buf);
   3987             break;
   3988          case 7: /* IDIV */
   3989             codegen_div ( sz, t1, True );
   3990             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
   3991             break;
   3992          default:
   3993             /*NOTREACHED*/
   3994             vpanic("Grp3(amd64,M)");
   3995       }
   3996    }
   3997    return delta;
   3998 }
   3999 
   4000 
/* Group 4 extended opcodes (0xFE): byte-sized INC/DEC on the E
   operand.  The modrm 'greg' field selects the operation: /0 = INC,
   /1 = DEC; anything else sets *decode_OK to False.  Returns the
   updated instruction offset 'delta'. */
static
ULong dis_Grp4 ( VexAbiInfo* vbi,
                 Prefix pfx, Long delta, Bool* decode_OK )
{
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;        /* Grp4 operations are byte-sized only */
   IRTemp t1 = newTemp(ty);   /* original operand value */
   IRTemp t2 = newTemp(ty);   /* updated operand value */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* Register operand. */
      assign(t1, getIRegE(1, pfx, modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
                      nameIRegE(1, pfx, modrm));
   } else {
      /* Memory operand.  With a LOCK prefix the read-modify-write is
         expressed as a compare-and-swap so it stays atomic. */
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   }
   return delta;
}
   4069 
   4070 
/* Group 5 extended opcodes (0xFF), selected by the modrm 'greg'
   field: /0 = INC Ev, /1 = DEC Ev, /2 = CALL Ev, /4 = JMP Ev,
   /6 = PUSH Ev.  Other encodings set *decode_OK to False.  CALL and
   JMP end the superblock via 'dres'.  Returns the updated delta. */
static
ULong dis_Grp5 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta,
                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
{
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);        /* original operand value */
   IRTemp  t2 = IRTemp_INVALID;     /* updated value / new RSP */
   IRTemp  t3 = IRTemp_INVALID;     /* branch target (call/jmp/push) */
   Bool    showSz = True;           /* print a size suffix in DIP? */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      assign(t1, getIRegE(sz,pfx,modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            /* Push the return address and jump to the target. */
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* jmp Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      /* Only INC/DEC need the old value preloaded; call/jmp/push load
         a full 64 bits themselves. */
      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
                                  && gregLO3ofRM(modrm) != 6) {
         assign(t1, loadLE(ty,mkexpr(addr)));
      }
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            /* LOCK prefix => express the update as a CAS for atomicity. */
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            /* Push the return address and jump to the target. */
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* JMP Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (!(sz == 8 || sz == 2)) goto unhandled;
            if (sz == 8) {
               t3 = newTemp(Ity_I64);
               assign(t3, loadLE(Ity_I64,mkexpr(addr)));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
	    } else {
               goto unhandled; /* awaiting test case */
	    }
         default:
         unhandled:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       dis_buf);
   }
   return delta;
}
   4223 
   4224 
   4225 /*------------------------------------------------------------*/
   4226 /*--- Disassembling string ops (including REP prefixes)    ---*/
   4227 /*------------------------------------------------------------*/
   4228 
   4229 /* Code shared by all the string ops */
   4230 static
   4231 void dis_string_op_increment ( Int sz, IRTemp t_inc )
   4232 {
   4233    UChar logSz;
   4234    if (sz == 8 || sz == 4 || sz == 2) {
   4235       logSz = 1;
   4236       if (sz == 4) logSz = 2;
   4237       if (sz == 8) logSz = 3;
   4238       assign( t_inc,
   4239               binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
   4240                                mkU8(logSz) ) );
   4241    } else {
   4242       assign( t_inc,
   4243               IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   4244    }
   4245 }
   4246 
/* Common driver for the non-REP string insns: compute the
   direction-flag-scaled increment, emit one iteration of the given
   string op, and print its name. */
static
void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
                    Int sz, HChar* name, Prefix pfx )
{
   IRTemp t_inc = newTemp(Ity_I64);
   /* Really we ought to inspect the override prefixes, but we don't.
      The following assertion catches any resulting sillyness. */
   vassert(pfx == clearSegBits(pfx));
   dis_string_op_increment(sz, t_inc);
   dis_OP( sz, t_inc, pfx );
   DIP("%s%c\n", name, nameISize(sz));
}
   4259 
   4260 static
   4261 void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
   4262 {
   4263    IRType ty = szToITy(sz);
   4264    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4265    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4266    IRExpr *incd, *incs;
   4267 
   4268    if (haveASO(pfx)) {
   4269       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4270       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4271    } else {
   4272       assign( td, getIReg64(R_RDI) );
   4273       assign( ts, getIReg64(R_RSI) );
   4274    }
   4275 
   4276    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   4277 
   4278    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4279    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4280    if (haveASO(pfx)) {
   4281       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4282       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4283    }
   4284    putIReg64( R_RDI, incd );
   4285    putIReg64( R_RSI, incs );
   4286 }
   4287 
   4288 static
   4289 void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
   4290 {
   4291    IRType ty = szToITy(sz);
   4292    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4293    IRExpr *incs;
   4294 
   4295    if (haveASO(pfx))
   4296       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4297    else
   4298       assign( ts, getIReg64(R_RSI) );
   4299 
   4300    putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
   4301 
   4302    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4303    if (haveASO(pfx))
   4304       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4305    putIReg64( R_RSI, incs );
   4306 }
   4307 
   4308 static
   4309 void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
   4310 {
   4311    IRType ty = szToITy(sz);
   4312    IRTemp ta = newTemp(ty);        /* rAX */
   4313    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4314    IRExpr *incd;
   4315 
   4316    assign( ta, getIRegRAX(sz) );
   4317 
   4318    if (haveASO(pfx))
   4319       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4320    else
   4321       assign( td, getIReg64(R_RDI) );
   4322 
   4323    storeLE( mkexpr(td), mkexpr(ta) );
   4324 
   4325    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4326    if (haveASO(pfx))
   4327       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4328    putIReg64( R_RDI, incd );
   4329 }
   4330 
   4331 static
   4332 void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
   4333 {
   4334    IRType ty  = szToITy(sz);
   4335    IRTemp tdv = newTemp(ty);      /* (RDI) */
   4336    IRTemp tsv = newTemp(ty);      /* (RSI) */
   4337    IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   4338    IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   4339    IRExpr *incd, *incs;
   4340 
   4341    if (haveASO(pfx)) {
   4342       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4343       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4344    } else {
   4345       assign( td, getIReg64(R_RDI) );
   4346       assign( ts, getIReg64(R_RSI) );
   4347    }
   4348 
   4349    assign( tdv, loadLE(ty,mkexpr(td)) );
   4350 
   4351    assign( tsv, loadLE(ty,mkexpr(ts)) );
   4352 
   4353    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   4354 
   4355    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4356    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4357    if (haveASO(pfx)) {
   4358       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4359       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4360    }
   4361    putIReg64( R_RDI, incd );
   4362    putIReg64( R_RSI, incs );
   4363 }
   4364 
   4365 static
   4366 void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
   4367 {
   4368    IRType ty  = szToITy(sz);
   4369    IRTemp ta  = newTemp(ty);       /*  rAX  */
   4370    IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   4371    IRTemp tdv = newTemp(ty);       /* (RDI) */
   4372    IRExpr *incd;
   4373 
   4374    assign( ta, getIRegRAX(sz) );
   4375 
   4376    if (haveASO(pfx))
   4377       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4378    else
   4379       assign( td, getIReg64(R_RDI) );
   4380 
   4381    assign( tdv, loadLE(ty,mkexpr(td)) );
   4382 
   4383    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   4384 
   4385    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4386    if (haveASO(pfx))
   4387       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4388    putIReg64( R_RDI, incd );
   4389 }
   4390 
   4391 
/* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   the insn is the last one in the basic block, and so emit a jump to
   the next insn, rather than just falling through.  One translated
   superblock performs at most one iteration; looping is achieved by
   re-entering the block. */
static
void dis_REP_op ( /*MOD*/DisResult* dres,
                  AMD64Condcode cond,
                  void (*dis_OP)(Int, IRTemp, Prefix),
                  Int sz, Addr64 rip, Addr64 rip_next, HChar* name,
                  Prefix pfx )
{
   IRTemp t_inc = newTemp(Ity_I64);
   IRTemp tc;        /* the iteration counter, (R|E)CX */
   IRExpr* cmp;

   /* Really we ought to inspect the override prefixes, but we don't.
      The following assertion catches any resulting sillyness. */
   vassert(pfx == clearSegBits(pfx));

   /* With an address-size override the counter is ECX, not RCX. */
   if (haveASO(pfx)) {
      tc = newTemp(Ity_I32);  /*  ECX  */
      assign( tc, getIReg32(R_RCX) );
      cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   } else {
      tc = newTemp(Ity_I64);  /*  RCX  */
      assign( tc, getIReg64(R_RCX) );
      cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   }

   /* If the counter is already zero, skip past the insn entirely. */
   stmt( IRStmt_Exit( cmp, Ijk_Boring,
                      IRConst_U64(rip_next), OFFB_RIP ) );

   /* Decrement the counter ... */
   if (haveASO(pfx))
      putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
  else
      putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );

   /* ... and do one iteration of the wrapped string op. */
   dis_string_op_increment(sz, t_inc);
   dis_OP (sz, t_inc, pfx);

   if (cond == AMD64CondAlways) {
      /* Plain REP: unconditionally loop back to this insn. */
      jmp_lit(dres, Ijk_Boring, rip);
      vassert(dres->whatNext == Dis_StopHere);
   } else {
      /* REPE/REPNE: loop back only while 'cond' holds, else fall
         through to the next insn. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
                         Ijk_Boring,
                         IRConst_U64(rip),
                         OFFB_RIP ) );
      jmp_lit(dres, Ijk_Boring, rip_next);
      vassert(dres->whatNext == Dis_StopHere);
   }
   DIP("%s%c\n", name, nameISize(sz));
}
   4444 
   4445 
   4446 /*------------------------------------------------------------*/
   4447 /*--- Arithmetic, etc.                                     ---*/
   4448 /*------------------------------------------------------------*/
   4449 
   4450 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   4451 static
   4452 ULong dis_mul_E_G ( VexAbiInfo* vbi,
   4453                     Prefix      pfx,
   4454                     Int         size,
   4455                     Long        delta0 )
   4456 {
   4457    Int    alen;
   4458    HChar  dis_buf[50];
   4459    UChar  rm = getUChar(delta0);
   4460    IRType ty = szToITy(size);
   4461    IRTemp te = newTemp(ty);
   4462    IRTemp tg = newTemp(ty);
   4463    IRTemp resLo = newTemp(ty);
   4464 
   4465    assign( tg, getIRegG(size, pfx, rm) );
   4466    if (epartIsReg(rm)) {
   4467       assign( te, getIRegE(size, pfx, rm) );
   4468    } else {
   4469       IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
   4470       assign( te, loadLE(ty,mkexpr(addr)) );
   4471    }
   4472 
   4473    setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
   4474 
   4475    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   4476 
   4477    putIRegG(size, pfx, rm, mkexpr(resLo) );
   4478 
   4479    if (epartIsReg(rm)) {
   4480       DIP("imul%c %s, %s\n", nameISize(size),
   4481                              nameIRegE(size,pfx,rm),
   4482                              nameIRegG(size,pfx,rm));
   4483       return 1+delta0;
   4484    } else {
   4485       DIP("imul%c %s, %s\n", nameISize(size),
   4486                              dis_buf,
   4487                              nameIRegG(size,pfx,rm));
   4488       return alen+delta0;
   4489    }
   4490 }
   4491 
   4492 
   4493 /* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
   4494 static
   4495 ULong dis_imul_I_E_G ( VexAbiInfo* vbi,
   4496                        Prefix      pfx,
   4497                        Int         size,
   4498                        Long        delta,
   4499                        Int         litsize )
   4500 {
   4501    Long   d64;
   4502    Int    alen;
   4503    HChar  dis_buf[50];
   4504    UChar  rm = getUChar(delta);
   4505    IRType ty = szToITy(size);
   4506    IRTemp te = newTemp(ty);
   4507    IRTemp tl = newTemp(ty);
   4508    IRTemp resLo = newTemp(ty);
   4509 
   4510    vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);
   4511 
   4512    if (epartIsReg(rm)) {
   4513       assign(te, getIRegE(size, pfx, rm));
   4514       delta++;
   4515    } else {
   4516       IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   4517                                      imin(4,litsize) );
   4518       assign(te, loadLE(ty, mkexpr(addr)));
   4519       delta += alen;
   4520    }
   4521    d64 = getSDisp(imin(4,litsize),delta);
   4522    delta += imin(4,litsize);
   4523 
   4524    d64 &= mkSizeMask(size);
   4525    assign(tl, mkU(ty,d64));
   4526 
   4527    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
   4528 
   4529    setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );
   4530 
   4531    putIRegG(size, pfx, rm, mkexpr(resLo));
   4532 
   4533    DIP("imul%c $%lld, %s, %s\n",
   4534        nameISize(size), d64,
   4535        ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
   4536        nameIRegG(size,pfx,rm) );
   4537    return delta;
   4538 }
   4539 
   4540 
   4541 /* Generate an IR sequence to do a popcount operation on the supplied
   4542    IRTemp, and return a new IRTemp holding the result.  'ty' may be
   4543    Ity_I16, Ity_I32 or Ity_I64 only. */
   4544 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
   4545 {
   4546    Int i;
   4547    if (ty == Ity_I16) {
   4548       IRTemp old = IRTemp_INVALID;
   4549       IRTemp nyu = IRTemp_INVALID;
   4550       IRTemp mask[4], shift[4];
   4551       for (i = 0; i < 4; i++) {
   4552          mask[i]  = newTemp(ty);
   4553          shift[i] = 1 << i;
   4554       }
   4555       assign(mask[0], mkU16(0x5555));
   4556       assign(mask[1], mkU16(0x3333));
   4557       assign(mask[2], mkU16(0x0F0F));
   4558       assign(mask[3], mkU16(0x00FF));
   4559       old = src;
   4560       for (i = 0; i < 4; i++) {
   4561          nyu = newTemp(ty);
   4562          assign(nyu,
   4563                 binop(Iop_Add16,
   4564                       binop(Iop_And16,
   4565                             mkexpr(old),
   4566                             mkexpr(mask[i])),
   4567                       binop(Iop_And16,
   4568                             binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
   4569                             mkexpr(mask[i]))));
   4570          old = nyu;
   4571       }
   4572       return nyu;
   4573    }
   4574    if (ty == Ity_I32) {
   4575       IRTemp old = IRTemp_INVALID;
   4576       IRTemp nyu = IRTemp_INVALID;
   4577       IRTemp mask[5], shift[5];
   4578       for (i = 0; i < 5; i++) {
   4579          mask[i]  = newTemp(ty);
   4580          shift[i] = 1 << i;
   4581       }
   4582       assign(mask[0], mkU32(0x55555555));
   4583       assign(mask[1], mkU32(0x33333333));
   4584       assign(mask[2], mkU32(0x0F0F0F0F));
   4585       assign(mask[3], mkU32(0x00FF00FF));
   4586       assign(mask[4], mkU32(0x0000FFFF));
   4587       old = src;
   4588       for (i = 0; i < 5; i++) {
   4589          nyu = newTemp(ty);
   4590          assign(nyu,
   4591                 binop(Iop_Add32,
   4592                       binop(Iop_And32,
   4593                             mkexpr(old),
   4594                             mkexpr(mask[i])),
   4595                       binop(Iop_And32,
   4596                             binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
   4597                             mkexpr(mask[i]))));
   4598          old = nyu;
   4599       }
   4600       return nyu;
   4601    }
   4602    if (ty == Ity_I64) {
   4603       IRTemp old = IRTemp_INVALID;
   4604       IRTemp nyu = IRTemp_INVALID;
   4605       IRTemp mask[6], shift[6];
   4606       for (i = 0; i < 6; i++) {
   4607          mask[i]  = newTemp(ty);
   4608          shift[i] = 1 << i;
   4609       }
   4610       assign(mask[0], mkU64(0x5555555555555555ULL));
   4611       assign(mask[1], mkU64(0x3333333333333333ULL));
   4612       assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
   4613       assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
   4614       assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
   4615       assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
   4616       old = src;
   4617       for (i = 0; i < 6; i++) {
   4618          nyu = newTemp(ty);
   4619          assign(nyu,
   4620                 binop(Iop_Add64,
   4621                       binop(Iop_And64,
   4622                             mkexpr(old),
   4623                             mkexpr(mask[i])),
   4624                       binop(Iop_And64,
   4625                             binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
   4626                             mkexpr(mask[i]))));
   4627          old = nyu;
   4628       }
   4629       return nyu;
   4630    }
   4631    /*NOTREACHED*/
   4632    vassert(0);
   4633 }
   4634 
   4635 
   4636 /* Generate an IR sequence to do a count-leading-zeroes operation on
   4637    the supplied IRTemp, and return a new IRTemp holding the result.
   4638    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4639    the argument is zero, return the number of bits in the word (the
   4640    natural semantics). */
static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
{
   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);

   /* Widen the source to 64 bits ... */
   IRTemp src64 = newTemp(Ity_I64);
   assign(src64, widenUto64( mkexpr(src) ));

   /* ... then shift it left so its top bit lands in bit 63.  The
      leading-zero count of this shifted value is then the correct
      count for the original, narrower type. */
   IRTemp src64x = newTemp(Ity_I64);
   assign(src64x,
          binop(Iop_Shl64, mkexpr(src64),
                           mkU8(64 - 8 * sizeofIRType(ty))));

   // Clz64 has undefined semantics when its input is zero, so
   // special-case around that: a zero input yields the bit-width of
   // the type, per the LZCNT definition.
   IRTemp res64 = newTemp(Ity_I64);
   assign(res64,
          IRExpr_Mux0X(
             unop(Iop_1Uto8,
                  binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0))),
             unop(Iop_Clz64, mkexpr(src64x)),
             mkU64(8 * sizeofIRType(ty))
   ));

   /* Narrow the result back to the requested type. */
   IRTemp res = newTemp(ty);
   assign(res, narrowTo(ty, mkexpr(res64)));
   return res;
}
   4668 
   4669 
   4670 /*------------------------------------------------------------*/
   4671 /*---                                                      ---*/
   4672 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   4673 /*---                                                      ---*/
   4674 /*------------------------------------------------------------*/
   4675 
   4676 /* --- Helper functions for dealing with the register stack. --- */
   4677 
   4678 /* --- Set the emulation-warning pseudo-register. --- */
   4679 
/* Write an emulation-warning code into the guest-state
   pseudo-register.  'e' must be :: Ity_I32. */
static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_EMWARN, e ) );
}
   4685 
   4686 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   4687 
static IRExpr* mkQNaN64 ( void )
{
  /* The canonical 64-bit QNaN: sign 0, exponent 2047 (eleven 1 bits),
     fraction MSB 1 followed by 51 zero bits
     == 0x7FF8 0000 0000 0000
   */
   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
}
   4696 
   4697 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
   4698 
/* Read the x87 top-of-stack pointer (FTOP) from the guest state,
   as an Ity_I32 expression. */
static IRExpr* get_ftop ( void )
{
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}
   4703 
/* Write 'e' (which must be :: Ity_I32) to the x87 top-of-stack
   pointer in the guest state. */
static void put_ftop ( IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FTOP, e ) );
}
   4709 
   4710 /* --------- Get/put the C3210 bits. --------- */
   4711 
/* Read the FPU condition-code (C3..C0) field from the guest state. */
static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
{
   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
}
   4716 
/* Write 'e' (which must be :: Ity_I64) to the FPU condition-code
   (C3..C0) field in the guest state. */
static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put( OFFB_FC3210, e ) );
}
   4722 
   4723 /* --------- Get/put the FPU rounding mode. --------- */
/* Read the FPU rounding mode, narrowing the 64-bit guest-state field
   down to Ity_I32. */
static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
{
   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
}
   4728 
/* Write 'e' (which must be :: Ity_I32) to the FPU rounding-mode
   field, zero-extending to the 64-bit guest-state slot. */
static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
}
   4734 
   4735 
   4736 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   4737 /* Produces a value in 0 .. 3, which is encoded as per the type
   4738    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   4739    per IRRoundingMode, we merely need to get it and mask it for
   4740    safety.
   4741 */
static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
{
   /* guest_FPROUND is already encoded per IRRoundingMode, so merely
      mask it down to the valid range 0 .. 3 for safety. */
   return binop( Iop_And32, get_fpround(), mkU32(3) );
}
   4746 
static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
{
   /* Placeholder rounding mode: always round-to-nearest, ignoring
      the guest's actual FPU rounding setting.  Call sites mark this
      with XXXROUNDINGFIXME. */
   return mkU32(Irrm_NEAREST);
}
   4751 
   4752 
   4753 /* --------- Get/set FP register tag bytes. --------- */
   4754 
   4755 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   4756 
   4757 static void put_ST_TAG ( Int i, IRExpr* value )
   4758 {
   4759    IRRegArray* descr;
   4760    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   4761    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   4762    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   4763 }
   4764 
   4765 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   4766    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   4767 
static IRExpr* get_ST_TAG ( Int i )
{
   /* Yield 'ST_TAG(i)': zero means "Empty", nonzero "NonEmpty".
      The tag array is indexed relative to the current FTOP. */
   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}
   4773 
   4774 
   4775 /* --------- Get/set FP registers. --------- */
   4776 
   4777 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   4778    register's tag to indicate the register is full.  The previous
   4779    state of the register is not checked. */
   4780 
   4781 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   4782 {
   4783    IRRegArray* descr;
   4784    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   4785    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   4786    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   4787    /* Mark the register as in-use. */
   4788    put_ST_TAG(i, mkU8(1));
   4789 }
   4790 
   4791 /* Given i, and some expression e, emit
   4792       ST(i) = is_full(i) ? NaN : e
   4793    and set the tag accordingly.
   4794 */
   4795 
static void put_ST ( Int i, IRExpr* value )
{
   /* Emit 'ST(i) = is_full(i) ? QNaN : value' and set the tag
      accordingly: writing to an already-occupied register stores a
      QNaN instead of the value. */
   put_ST_UNCHECKED( i,
                     IRExpr_Mux0X( get_ST_TAG(i),
                                   /* tag 0 (empty): store the value */
                                   value,
                                   /* tag non-0 (full): store a QNaN */
                                   mkQNaN64()
                   )
   );
}
   4807 
   4808 
   4809 /* Given i, generate an expression yielding 'ST(i)'. */
   4810 
static IRExpr* get_ST_UNCHECKED ( Int i )
{
   /* Yield 'ST(i)' without consulting the register's tag. */
   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}
   4816 
   4817 
   4818 /* Given i, generate an expression yielding
   4819   is_full(i) ? ST(i) : NaN
   4820 */
   4821 
static IRExpr* get_ST ( Int i )
{
   /* Yield 'is_full(i) ? ST(i) : QNaN': reading an empty register
      produces a QNaN rather than whatever the slot holds. */
   return
      IRExpr_Mux0X( get_ST_TAG(i),
                    /* tag 0 (empty): yield a QNaN */
                    mkQNaN64(),
                    /* tag non-0 (full): yield the stored value */
                    get_ST_UNCHECKED(i));
}
   4831 
   4832 
   4833 /* Adjust FTOP downwards by one register. */
   4834 
static void fp_push ( void )
{
   /* Adjust FTOP downwards by one register (pushing moves the stack
      top to the next-lower register number). */
   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
}
   4839 
   4840 /* Adjust FTOP upwards by one register, and mark the vacated register
   4841    as empty.  */
   4842 
static void fp_pop ( void )
{
   /* Mark the vacated register (the current top) as empty ... */
   put_ST_TAG(0, mkU8(0));
   /* ... then adjust FTOP upwards by one register. */
   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
}
   4848 
   4849 /* Clear the C2 bit of the FPU status register, for
   4850    sin/cos/tan/sincos. */
   4851 
static void clear_C2 ( void )
{
   /* Clear the C2 bit of the FPU status register; used for
      sin/cos/tan/sincos, which never leave C2 set here. */
   put_C3210( binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2)) );
}
   4856 
   4857 /* Invent a plausible-looking FPU status word value:
   4858       ((ftop & 7) << 11) | (c3210 & 0x4700)
   4859  */
   4860 static IRExpr* get_FPU_sw ( void )
   4861 {
   4862    return
   4863       unop(Iop_32to16,
   4864            binop(Iop_Or32,
   4865                  binop(Iop_Shl32,
   4866                        binop(Iop_And32, get_ftop(), mkU32(7)),
   4867                              mkU8(11)),
   4868                        binop(Iop_And32, unop(Iop_64to32, get_C3210()),
   4869                                         mkU32(0x4700))
   4870       ));
   4871 }
   4872 
   4873 
   4874 /* ------------------------------------------------------- */
   4875 /* Given all that stack-mangling junk, we can now go ahead
   4876    and describe FP instructions.
   4877 */
   4878 
   4879 /* ST(0) = ST(0) `op` mem64/32(addr)
   4880    Need to check ST(0)'s tag on read, but not on write.
   4881 */
   4882 static
   4883 void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   4884                          IROp op, Bool dbl )
   4885 {
   4886    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   4887    if (dbl) {
   4888       put_ST_UNCHECKED(0,
   4889          triop( op,
   4890                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4891                 get_ST(0),
   4892                 loadLE(Ity_F64,mkexpr(addr))
   4893          ));
   4894    } else {
   4895       put_ST_UNCHECKED(0,
   4896          triop( op,
   4897                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4898                 get_ST(0),
   4899                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   4900          ));
   4901    }
   4902 }
   4903 
   4904 
   4905 /* ST(0) = mem64/32(addr) `op` ST(0)
   4906    Need to check ST(0)'s tag on read, but not on write.
   4907 */
   4908 static
   4909 void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   4910                             IROp op, Bool dbl )
   4911 {
   4912    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   4913    if (dbl) {
   4914       put_ST_UNCHECKED(0,
   4915          triop( op,
   4916                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4917                 loadLE(Ity_F64,mkexpr(addr)),
   4918                 get_ST(0)
   4919          ));
   4920    } else {
   4921       put_ST_UNCHECKED(0,
   4922          triop( op,
   4923                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4924                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   4925                 get_ST(0)
   4926          ));
   4927    }
   4928 }
   4929 
   4930 
   4931 /* ST(dst) = ST(dst) `op` ST(src).
   4932    Check dst and src tags when reading but not on write.
   4933 */
   4934 static
   4935 void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   4936                       Bool pop_after )
   4937 {
   4938    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   4939    put_ST_UNCHECKED(
   4940       st_dst,
   4941       triop( op,
   4942              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4943              get_ST(st_dst),
   4944              get_ST(st_src) )
   4945    );
   4946    if (pop_after)
   4947       fp_pop();
   4948 }
   4949 
   4950 /* ST(dst) = ST(src) `op` ST(dst).
   4951    Check dst and src tags when reading but not on write.
   4952 */
   4953 static
   4954 void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   4955                          Bool pop_after )
   4956 {
   4957    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   4958    put_ST_UNCHECKED(
   4959       st_dst,
   4960       triop( op,
   4961              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4962              get_ST(st_src),
   4963              get_ST(st_dst) )
   4964    );
   4965    if (pop_after)
   4966       fp_pop();
   4967 }
   4968 
/* %rflags(Z,P,C) = UCOMI( st(0), st(i) ), optionally popping after. */
static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
{
   DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   /* This is a bit of a hack (and isn't really right).  It sets
      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
      documentation implies A and S are unchanged.
   */
   /* It's also fishy in that it is used both for COMIP and
      UCOMIP, and they aren't the same (although similar). */
   /* Set the flag thunk to COPY, so CC_DEP1 is taken as the flags
      value verbatim. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   /* Compare ST(0) against ST(i) and keep only the bits selected by
      the 0x45 mask for the flags thunk. */
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, get_ST(0), get_ST(i))),
                   mkU64(0x45)
        )));
   if (pop_after)
      fp_pop();
}
   4991 
   4992 
   4993 /* returns
   4994    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
   4995 */
   4996 static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
   4997 {
   4998    IRTemp t32 = newTemp(Ity_I32);
   4999    assign( t32, e32 );
   5000    return
   5001       IRExpr_Mux0X(
   5002          unop(Iop_1Uto8,
   5003               binop(Iop_CmpLT64U,
   5004                     unop(Iop_32Uto64,
   5005                          binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
   5006                     mkU64(65536))),
   5007          mkU16( 0x8000 ),
   5008          unop(Iop_32to16, mkexpr(t32)));
   5009 }
   5010 
   5011 
   5012 static
   5013 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
   5014                 VexAbiInfo* vbi, Prefix pfx, Long delta )
   5015 {
   5016    Int    len;
   5017    UInt   r_src, r_dst;
   5018    HChar  dis_buf[50];
   5019    IRTemp t1, t2;
   5020 
   5021    /* On entry, delta points at the second byte of the insn (the modrm
   5022       byte).*/
   5023    UChar first_opcode = getUChar(delta-1);
   5024    UChar modrm        = getUChar(delta+0);
   5025 
   5026    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   5027 
   5028    if (first_opcode == 0xD8) {
   5029       if (modrm < 0xC0) {
   5030 
   5031          /* bits 5,4,3 are an opcode extension, and the modRM also
   5032            specifies an address. */
   5033          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5034          delta += len;
   5035 
   5036          switch (gregLO3ofRM(modrm)) {
   5037 
   5038             case 0: /* FADD single-real */
   5039                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   5040                break;
   5041 
   5042             case 1: /* FMUL single-real */
   5043                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   5044                break;
   5045 
   5046             case 2: /* FCOM single-real */
   5047                DIP("fcoms %s\n", dis_buf);
   5048                /* This forces C1 to zero, which isn't right. */
   5049                /* The AMD documentation suggests that forcing C1 to
   5050                   zero is correct (Eliot Moss) */
   5051                put_C3210(
   5052                    unop( Iop_32Uto64,
   5053                        binop( Iop_And32,
   5054                               binop(Iop_Shl32,
   5055                                     binop(Iop_CmpF64,
   5056                                           get_ST(0),
   5057                                           unop(Iop_F32toF64,
   5058                                                loadLE(Ity_F32,mkexpr(addr)))),
   5059                                     mkU8(8)),
   5060                               mkU32(0x4500)
   5061                    )));
   5062                break;
   5063 
   5064             case 3: /* FCOMP single-real */
   5065                /* The AMD documentation suggests that forcing C1 to
   5066                   zero is correct (Eliot Moss) */
   5067                DIP("fcomps %s\n", dis_buf);
   5068                /* This forces C1 to zero, which isn't right. */
   5069                put_C3210(
   5070                    unop( Iop_32Uto64,
   5071                        binop( Iop_And32,
   5072                               binop(Iop_Shl32,
   5073                                     binop(Iop_CmpF64,
   5074                                           get_ST(0),
   5075                                           unop(Iop_F32toF64,
   5076                                                loadLE(Ity_F32,mkexpr(addr)))),
   5077                                     mkU8(8)),
   5078                               mkU32(0x4500)
   5079                    )));
   5080                fp_pop();
   5081                break;
   5082 
   5083             case 4: /* FSUB single-real */
   5084                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   5085                break;
   5086 
   5087             case 5: /* FSUBR single-real */
   5088                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   5089                break;
   5090 
   5091             case 6: /* FDIV single-real */
   5092                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   5093                break;
   5094 
   5095             case 7: /* FDIVR single-real */
   5096                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   5097                break;
   5098 
   5099             default:
   5100                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5101                vex_printf("first_opcode == 0xD8\n");
   5102                goto decode_fail;
   5103          }
   5104       } else {
   5105          delta++;
   5106          switch (modrm) {
   5107 
   5108             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   5109                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   5110                break;
   5111 
   5112             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   5113                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   5114                break;
   5115 
   5116             /* Dunno if this is right */
   5117             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   5118                r_dst = (UInt)modrm - 0xD0;
   5119                DIP("fcom %%st(0),%%st(%d)\n", r_dst);
   5120                /* This forces C1 to zero, which isn't right. */
   5121                put_C3210(
   5122                    unop(Iop_32Uto64,
   5123                    binop( Iop_And32,
   5124                           binop(Iop_Shl32,
   5125                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5126                                 mkU8(8)),
   5127                           mkU32(0x4500)
   5128                    )));
   5129                break;
   5130 
   5131             /* Dunno if this is right */
   5132             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   5133                r_dst = (UInt)modrm - 0xD8;
   5134                DIP("fcomp %%st(0),%%st(%d)\n", r_dst);
   5135                /* This forces C1 to zero, which isn't right. */
   5136                put_C3210(
   5137                    unop(Iop_32Uto64,
   5138                    binop( Iop_And32,
   5139                           binop(Iop_Shl32,
   5140                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5141                                 mkU8(8)),
   5142                           mkU32(0x4500)
   5143                    )));
   5144                fp_pop();
   5145                break;
   5146 
   5147             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   5148                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   5149                break;
   5150 
   5151             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   5152                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   5153                break;
   5154 
   5155             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   5156                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   5157                break;
   5158 
   5159             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   5160                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   5161                break;
   5162 
   5163             default:
   5164                goto decode_fail;
   5165          }
   5166       }
   5167    }
   5168 
   5169    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   5170    else
   5171    if (first_opcode == 0xD9) {
   5172       if (modrm < 0xC0) {
   5173 
   5174          /* bits 5,4,3 are an opcode extension, and the modRM also
   5175             specifies an address. */
   5176          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5177          delta += len;
   5178 
   5179          switch (gregLO3ofRM(modrm)) {
   5180 
   5181             case 0: /* FLD single-real */
   5182                DIP("flds %s\n", dis_buf);
   5183                fp_push();
   5184                put_ST(0, unop(Iop_F32toF64,
   5185                               loadLE(Ity_F32, mkexpr(addr))));
   5186                break;
   5187 
   5188             case 2: /* FST single-real */
   5189                DIP("fsts %s\n", dis_buf);
   5190                storeLE(mkexpr(addr),
   5191                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5192                break;
   5193 
   5194             case 3: /* FSTP single-real */
   5195                DIP("fstps %s\n", dis_buf);
   5196                storeLE(mkexpr(addr),
   5197                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5198                fp_pop();
   5199                break;
   5200 
   5201             case 4: { /* FLDENV m28 */
   5202                /* Uses dirty helper:
   5203                      VexEmWarn amd64g_do_FLDENV ( VexGuestX86State*, HWord ) */
   5204                IRTemp    ew = newTemp(Ity_I32);
   5205                IRTemp   w64 = newTemp(Ity_I64);
   5206                IRDirty*   d = unsafeIRDirty_0_N (
   5207                                  0/*regparms*/,
   5208                                  "amd64g_dirtyhelper_FLDENV",
   5209                                  &amd64g_dirtyhelper_FLDENV,
   5210                                  mkIRExprVec_1( mkexpr(addr) )
   5211                               );
   5212                d->needsBBP = True;
   5213                d->tmp      = w64;
   5214                /* declare we're reading memory */
   5215                d->mFx   = Ifx_Read;
   5216                d->mAddr = mkexpr(addr);
   5217                d->mSize = 28;
   5218 
   5219                /* declare we're writing guest state */
   5220                d->nFxState = 4;
   5221                vex_bzero(&d->fxState, sizeof(d->fxState));
   5222 
   5223                d->fxState[0].fx     = Ifx_Write;
   5224                d->fxState[0].offset = OFFB_FTOP;
   5225                d->fxState[0].size   = sizeof(UInt);
   5226 
   5227                d->fxState[1].fx     = Ifx_Write;
   5228                d->fxState[1].offset = OFFB_FPTAGS;
   5229                d->fxState[1].size   = 8 * sizeof(UChar);
   5230 
   5231                d->fxState[2].fx     = Ifx_Write;
   5232                d->fxState[2].offset = OFFB_FPROUND;
   5233                d->fxState[2].size   = sizeof(ULong);
   5234 
   5235                d->fxState[3].fx     = Ifx_Write;
   5236                d->fxState[3].offset = OFFB_FC3210;
   5237                d->fxState[3].size   = sizeof(ULong);
   5238 
   5239                stmt( IRStmt_Dirty(d) );
   5240 
   5241                /* ew contains any emulation warning we may need to
   5242                   issue.  If needed, side-exit to the next insn,
   5243                   reporting the warning, so that Valgrind's dispatcher
   5244                   sees the warning. */
   5245 	       assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   5246                put_emwarn( mkexpr(ew) );
   5247                stmt(
   5248                   IRStmt_Exit(
   5249                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5250                      Ijk_EmWarn,
   5251                      IRConst_U64( guest_RIP_bbstart+delta ),
   5252                      OFFB_RIP
   5253                   )
   5254                );
   5255 
   5256                DIP("fldenv %s\n", dis_buf);
   5257                break;
   5258             }
   5259 
   5260             case 5: {/* FLDCW */
   5261                /* The only thing we observe in the control word is the
   5262                   rounding mode.  Therefore, pass the 16-bit value
   5263                   (x87 native-format control word) to a clean helper,
   5264                   getting back a 64-bit value, the lower half of which
   5265                   is the FPROUND value to store, and the upper half of
   5266                   which is the emulation-warning token which may be
   5267                   generated.
   5268                */
   5269                /* ULong amd64h_check_fldcw ( ULong ); */
   5270                IRTemp t64 = newTemp(Ity_I64);
   5271                IRTemp ew = newTemp(Ity_I32);
   5272                DIP("fldcw %s\n", dis_buf);
   5273                assign( t64, mkIRExprCCall(
   5274                                Ity_I64, 0/*regparms*/,
   5275                                "amd64g_check_fldcw",
   5276                                &amd64g_check_fldcw,
   5277                                mkIRExprVec_1(
   5278                                   unop( Iop_16Uto64,
   5279                                         loadLE(Ity_I16, mkexpr(addr)))
   5280                                )
   5281                             )
   5282                      );
   5283 
   5284                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   5285                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   5286                put_emwarn( mkexpr(ew) );
   5287                /* Finally, if an emulation warning was reported,
   5288                   side-exit to the next insn, reporting the warning,
   5289                   so that Valgrind's dispatcher sees the warning. */
   5290                stmt(
   5291                   IRStmt_Exit(
   5292                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5293                      Ijk_EmWarn,
   5294                      IRConst_U64( guest_RIP_bbstart+delta ),
   5295                      OFFB_RIP
   5296                   )
   5297                );
   5298                break;
   5299             }
   5300 
   5301             case 6: { /* FNSTENV m28 */
   5302                /* Uses dirty helper:
   5303                      void amd64g_do_FSTENV ( VexGuestAMD64State*, HWord ) */
   5304                IRDirty* d = unsafeIRDirty_0_N (
   5305                                0/*regparms*/,
   5306                                "amd64g_dirtyhelper_FSTENV",
   5307                                &amd64g_dirtyhelper_FSTENV,
   5308                                mkIRExprVec_1( mkexpr(addr) )
   5309                             );
   5310                d->needsBBP = True;
   5311                /* declare we're writing memory */
   5312                d->mFx   = Ifx_Write;
   5313                d->mAddr = mkexpr(addr);
   5314                d->mSize = 28;
   5315 
   5316                /* declare we're reading guest state */
   5317                d->nFxState = 4;
   5318                vex_bzero(&d->fxState, sizeof(d->fxState));
   5319 
   5320                d->fxState[0].fx     = Ifx_Read;
   5321                d->fxState[0].offset = OFFB_FTOP;
   5322                d->fxState[0].size   = sizeof(UInt);
   5323 
   5324                d->fxState[1].fx     = Ifx_Read;
   5325                d->fxState[1].offset = OFFB_FPTAGS;
   5326                d->fxState[1].size   = 8 * sizeof(UChar);
   5327 
   5328                d->fxState[2].fx     = Ifx_Read;
   5329                d->fxState[2].offset = OFFB_FPROUND;
   5330                d->fxState[2].size   = sizeof(ULong);
   5331 
   5332                d->fxState[3].fx     = Ifx_Read;
   5333                d->fxState[3].offset = OFFB_FC3210;
   5334                d->fxState[3].size   = sizeof(ULong);
   5335 
   5336                stmt( IRStmt_Dirty(d) );
   5337 
   5338                DIP("fnstenv %s\n", dis_buf);
   5339                break;
   5340             }
   5341 
   5342             case 7: /* FNSTCW */
   5343                /* Fake up a native x87 FPU control word.  The only
   5344                   thing it depends on is FPROUND[1:0], so call a clean
   5345                   helper to cook it up. */
   5346                /* ULong amd64g_create_fpucw ( ULong fpround ) */
   5347                DIP("fnstcw %s\n", dis_buf);
   5348                storeLE(
   5349                   mkexpr(addr),
   5350                   unop( Iop_64to16,
   5351                         mkIRExprCCall(
   5352                            Ity_I64, 0/*regp*/,
   5353                            "amd64g_create_fpucw", &amd64g_create_fpucw,
   5354                            mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
   5355                         )
   5356                   )
   5357                );
   5358                break;
   5359 
   5360             default:
   5361                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5362                vex_printf("first_opcode == 0xD9\n");
   5363                goto decode_fail;
   5364          }
   5365 
   5366       } else {
   5367          delta++;
   5368          switch (modrm) {
   5369 
   5370             case 0xC0 ... 0xC7: /* FLD %st(?) */
   5371                r_src = (UInt)modrm - 0xC0;
   5372                DIP("fld %%st(%u)\n", r_src);
   5373                t1 = newTemp(Ity_F64);
   5374                assign(t1, get_ST(r_src));
   5375                fp_push();
   5376                put_ST(0, mkexpr(t1));
   5377                break;
   5378 
   5379             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   5380                r_src = (UInt)modrm - 0xC8;
   5381                DIP("fxch %%st(%u)\n", r_src);
   5382                t1 = newTemp(Ity_F64);
   5383                t2 = newTemp(Ity_F64);
   5384                assign(t1, get_ST(0));
   5385                assign(t2, get_ST(r_src));
   5386                put_ST_UNCHECKED(0, mkexpr(t2));
   5387                put_ST_UNCHECKED(r_src, mkexpr(t1));
   5388                break;
   5389 
   5390             case 0xE0: /* FCHS */
   5391                DIP("fchs\n");
   5392                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   5393                break;
   5394 
   5395             case 0xE1: /* FABS */
   5396                DIP("fabs\n");
   5397                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   5398                break;
   5399 
   5400             case 0xE5: { /* FXAM */
   5401                /* This is an interesting one.  It examines %st(0),
   5402                   regardless of whether the tag says it's empty or not.
   5403                   Here, just pass both the tag (in our format) and the
   5404                   value (as a double, actually a ULong) to a helper
   5405                   function. */
   5406                IRExpr** args
   5407                   = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
   5408                                    unop(Iop_ReinterpF64asI64,
   5409                                         get_ST_UNCHECKED(0)) );
   5410                put_C3210(mkIRExprCCall(
   5411                             Ity_I64,
   5412                             0/*regparm*/,
   5413                             "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
   5414                             args
   5415                         ));
   5416                DIP("fxam\n");
   5417                break;
   5418             }
   5419 
            /* FLD-constant group.  Each pushes one well-known constant.
               The constants are loaded via their exact IEEE754 bit
               patterns (IRConst_F64i) rather than decimal literals, so
               the pushed value is bit-identical on every host; the
               commented-out F64 forms show the intended decimal value. */

            case 0xE8: /* FLD1 */
               DIP("fld1\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
               break;

            case 0xE9: /* FLDL2T */
               DIP("fldl2t\n");
               fp_push();
               /* log2(10) */
               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
               break;

            case 0xEA: /* FLDL2E */
               DIP("fldl2e\n");
               fp_push();
               /* log2(e) */
               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
               break;

            case 0xEB: /* FLDPI */
               DIP("fldpi\n");
               fp_push();
               /* pi */
               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
               break;

            case 0xEC: /* FLDLG2 */
               DIP("fldlg2\n");
               fp_push();
               /* log10(2) */
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
               break;

            case 0xED: /* FLDLN2 */
               DIP("fldln2\n");
               fp_push();
               /* ln(2) */
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
               break;

            case 0xEE: /* FLDZ */
               DIP("fldz\n");
               fp_push();
               /* +0.0 */
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
               break;
   5468 
            case 0xF0: /* F2XM1 */
               DIP("f2xm1\n");
               /* ST(0) := 2^ST(0) - 1.  Rounding mode is the fake one;
                  see the XXXROUNDINGFIXME markers throughout. */
               put_ST_UNCHECKED(0,
                  binop(Iop_2xm1F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               break;

            case 0xF1: /* FYL2X */
               DIP("fyl2x\n");
               /* ST(1) := ST(1) * log2(ST(0)), then pop the stack. */
               put_ST_UNCHECKED(1,
                  triop(Iop_Yl2xF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(1),
                        get_ST(0)));
               fp_pop();
               break;

            case 0xF2: /* FPTAN */
               DIP("ftan\n");
               /* ST(0) := tan(ST(0)); then 1.0 is pushed, as the real
                  hardware does after a successful FPTAN.
                  NOTE(review): no argument-range reduction check is
                  modelled; C2 is cleared unconditionally (HACK below). */
               put_ST_UNCHECKED(0,
                  binop(Iop_TanF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               fp_push();
               put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
               clear_C2(); /* HACK */
               break;

            case 0xF3: /* FPATAN */
               DIP("fpatan\n");
               /* Two-operand arctangent: ST(1) := atan2-style result of
                  ST(1) and ST(0) (Iop_AtanF64), then pop. */
               put_ST_UNCHECKED(1,
                  triop(Iop_AtanF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(1),
                        get_ST(0)));
               fp_pop();
               break;
   5507 
            case 0xF4: { /* FXTRACT */
               /* Split ST(0) into exponent and significand: the
                  exponent replaces ST(0) and the significand is pushed
                  on top.  Both parts come from one clean helper,
                  x86amd64g_calculate_FXTRACT, called twice on the raw
                  64-bit image of the operand; the second argument
                  selects the part returned (0 = significand,
                  1 = exponent). */
               IRTemp argF = newTemp(Ity_F64);
               IRTemp sigF = newTemp(Ity_F64);
               IRTemp expF = newTemp(Ity_F64);
               IRTemp argI = newTemp(Ity_I64);
               IRTemp sigI = newTemp(Ity_I64);
               IRTemp expI = newTemp(Ity_I64);
               DIP("fxtract\n");
               assign( argF, get_ST(0) );
               /* View the F64 operand as raw bits for the helper. */
               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
               assign( sigI,
                       mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "x86amd64g_calculate_FXTRACT",
                          &x86amd64g_calculate_FXTRACT,
                          mkIRExprVec_2( mkexpr(argI),
                                         mkIRExpr_HWord(0)/*sig*/ ))
               );
               assign( expI,
                       mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "x86amd64g_calculate_FXTRACT",
                          &x86amd64g_calculate_FXTRACT,
                          mkIRExprVec_2( mkexpr(argI),
                                         mkIRExpr_HWord(1)/*exp*/ ))
               );
               /* Reinterpret the helper's I64 results back to F64. */
               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
               /* exponent */
               put_ST_UNCHECKED(0, mkexpr(expF) );
               fp_push();
               /* significand */
               put_ST(0, mkexpr(sigF) );
               break;
            }
   5543 
            case 0xF5: { /* FPREM1 -- IEEE compliant */
               IRTemp a1 = newTemp(Ity_F64);
               IRTemp a2 = newTemp(Ity_F64);
               DIP("fprem1\n");
               /* Do FPREM1 twice, once to get the remainder, and once
                  to get the C3210 flag values.  The operands are first
                  captured in temporaries so both computations see the
                  same pre-update ST(0)/ST(1). */
               assign( a1, get_ST(0) );
               assign( a2, get_ST(1) );
               put_ST_UNCHECKED(0,
                  triop(Iop_PRem1F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)));
               /* Flag-computation result is 32 bits wide; widen to 64
                  before storing into the guest C3210 state. */
               put_C3210(
                  unop(Iop_32Uto64,
                  triop(Iop_PRem1C3210F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)) ));
               break;
            }
   5565 
            case 0xF7: /* FINCSTP */
               DIP("fincstp\n");
               /* Increment the x87 top-of-stack pointer.
                  NOTE(review): no masking/wrap is applied here;
                  presumably put_ftop/get_ftop or later consumers
                  handle wrap-around -- confirm. */
               put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
               break;

            case 0xF8: { /* FPREM -- not IEEE compliant */
               IRTemp a1 = newTemp(Ity_F64);
               IRTemp a2 = newTemp(Ity_F64);
               DIP("fprem\n");
               /* Do FPREM twice, once to get the remainder, and once
                  to get the C3210 flag values.  Same structure as the
                  FPREM1 case above, but with the truncating
                  (non-IEEE) PRem ops. */
               assign( a1, get_ST(0) );
               assign( a2, get_ST(1) );
               put_ST_UNCHECKED(0,
                  triop(Iop_PRemF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)));
               /* Widen the 32-bit flag result for the C3210 store. */
               put_C3210(
                  unop(Iop_32Uto64,
                  triop(Iop_PRemC3210F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)) ));
               break;
            }
   5592 
            case 0xF9: /* FYL2XP1 */
               DIP("fyl2xp1\n");
               /* ST(1) := ST(1) * log2(ST(0) + 1), then pop. */
               put_ST_UNCHECKED(1,
                  triop(Iop_Yl2xp1F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(1),
                        get_ST(0)));
               fp_pop();
               break;

            case 0xFA: /* FSQRT */
               DIP("fsqrt\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_SqrtF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               break;

            case 0xFB: { /* FSINCOS */
               /* Capture the argument first: ST(0) is overwritten with
                  sin(arg), then cos(arg) is pushed, so after the push
                  cos is in ST(0) and sin in ST(1). */
               IRTemp a1 = newTemp(Ity_F64);
               assign( a1, get_ST(0) );
               DIP("fsincos\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_SinF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1)));
               fp_push();
               put_ST(0,
                  binop(Iop_CosF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1)));
               clear_C2(); /* HACK */
               break;
            }

            case 0xFC: /* FRNDINT */
               DIP("frndint\n");
               /* Unlike most ops in this group, this one uses the real
                  guest rounding mode (get_roundingmode), not the fake
                  one. */
               put_ST_UNCHECKED(0,
                  binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
               break;

            case 0xFD: /* FSCALE */
               DIP("fscale\n");
               /* ST(0) := ST(0) scaled by ST(1) (Iop_ScaleF64). */
               put_ST_UNCHECKED(0,
                  triop(Iop_ScaleF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0),
                        get_ST(1)));
               break;

            case 0xFE: /* FSIN */
               DIP("fsin\n");
               /* C2 is cleared unconditionally (HACK): no range-limit
                  failure is modelled for the trig ops. */
               put_ST_UNCHECKED(0,
                  binop(Iop_SinF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               clear_C2(); /* HACK */
               break;

            case 0xFF: /* FCOS */
               DIP("fcos\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_CosF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               clear_C2(); /* HACK */
               break;
   5660 
   5661             default:
   5662                goto decode_fail;
   5663          }
   5664       }
   5665    }
   5666 
   5667    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   5668    else
   5669    if (first_opcode == 0xDA) {
   5670 
   5671       if (modrm < 0xC0) {
   5672 
   5673          /* bits 5,4,3 are an opcode extension, and the modRM also
   5674             specifies an address. */
   5675          IROp   fop;
   5676          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5677          delta += len;
   5678          switch (gregLO3ofRM(modrm)) {
   5679 
            /* Integer-operand arithmetic (FIADD etc): each case only
               selects the F64 operation and the operand order, then
               jumps to shared emission code (do_fop_m32 for
               ST(0) op mem, do_foprev_m32 for mem op ST(0)), which
               loads the m32 integer and converts it to F64. */

            case 0: /* FIADD m32int */ /* ST(0) += m32int */
               DIP("fiaddl %s\n", dis_buf);
               fop = Iop_AddF64;
               goto do_fop_m32;

            case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
               DIP("fimull %s\n", dis_buf);
               fop = Iop_MulF64;
               goto do_fop_m32;

            case 4: /* FISUB m32int */ /* ST(0) -= m32int */
               DIP("fisubl %s\n", dis_buf);
               fop = Iop_SubF64;
               goto do_fop_m32;

            case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
               DIP("fisubrl %s\n", dis_buf);
               fop = Iop_SubF64;
               goto do_foprev_m32;
   5699 
   5700             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   5701                DIP("fisubl %s\n", dis_buf);
   5702                fop = Iop_DivF64;
   5703                goto do_fop_m32;
   5704 
            case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
               DIP("fidivrl %s\n", dis_buf);
               fop = Iop_DivF64;
               goto do_foprev_m32;

            do_fop_m32:
               /* ST(0) := ST(0) `fop` (F64)m32int */
               put_ST_UNCHECKED(0,
                  triop(fop,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0),
                        unop(Iop_I32StoF64,
                             loadLE(Ity_I32, mkexpr(addr)))));
               break;

            do_foprev_m32:
               /* ST(0) := (F64)m32int `fop` ST(0)  (reversed operands) */
               put_ST_UNCHECKED(0,
                  triop(fop,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        unop(Iop_I32StoF64,
                             loadLE(Ity_I32, mkexpr(addr))),
                        get_ST(0)));
               break;
   5727 
   5728             default:
   5729                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5730                vex_printf("first_opcode == 0xDA\n");
   5731                goto decode_fail;
   5732          }
   5733 
   5734       } else {
   5735 
   5736          delta++;
   5737          switch (modrm) {
   5738 
            /* FCMOVcc: conditional move into ST(0) from ST(i).
               Mux0X(c, e0, eX) yields e0 when c is zero, otherwise eX,
               so ST(0) := ST(r_src) when the condition holds, else it
               is rewritten unchanged. */

            case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
               r_src = (UInt)modrm - 0xC0;
               DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(0,
                                IRExpr_Mux0X(
                                    unop(Iop_1Uto8,
                                         mk_amd64g_calculate_condition(AMD64CondB)),
                                    get_ST(0), get_ST(r_src)) );
               break;

            case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
               r_src = (UInt)modrm - 0xC8;
               DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(0,
                                IRExpr_Mux0X(
                                    unop(Iop_1Uto8,
                                         mk_amd64g_calculate_condition(AMD64CondZ)),
                                    get_ST(0), get_ST(r_src)) );
               break;

            case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
               r_src = (UInt)modrm - 0xD0;
               DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(0,
                                IRExpr_Mux0X(
                                    unop(Iop_1Uto8,
                                         mk_amd64g_calculate_condition(AMD64CondBE)),
                                    get_ST(0), get_ST(r_src)) );
               break;

            case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
               r_src = (UInt)modrm - 0xD8;
               DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
               /* "Unordered" maps to the parity flag, hence CondP. */
               put_ST_UNCHECKED(0,
                                IRExpr_Mux0X(
                                    unop(Iop_1Uto8,
                                         mk_amd64g_calculate_condition(AMD64CondP)),
                                    get_ST(0), get_ST(r_src)) );
               break;
   5778 
   5779             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   5780                DIP("fucompp %%st(0),%%st(1)\n");
   5781                /* This forces C1 to zero, which isn't right. */
   5782                put_C3210(
   5783                    unop(Iop_32Uto64,
   5784