Home | History | Annotate | Download | only in priv
      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                                   host_x86_defs.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2012 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "libvex.h"
     38 #include "libvex_trc_values.h"
     39 
     40 #include "main_util.h"
     41 #include "host_generic_regs.h"
     42 #include "host_x86_defs.h"
     43 
     44 
     45 /* --------- Registers. --------- */
     46 
     47 void ppHRegX86 ( HReg reg )
     48 {
     49    Int r;
     50    static HChar* ireg32_names[8]
     51      = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi" };
     52    /* Be generic for all virtual regs. */
     53    if (hregIsVirtual(reg)) {
     54       ppHReg(reg);
     55       return;
     56    }
     57    /* But specific for real regs. */
     58    switch (hregClass(reg)) {
     59       case HRcInt32:
     60          r = hregNumber(reg);
     61          vassert(r >= 0 && r < 8);
     62          vex_printf("%s", ireg32_names[r]);
     63          return;
     64       case HRcFlt64:
     65          r = hregNumber(reg);
     66          vassert(r >= 0 && r < 6);
     67          vex_printf("%%fake%d", r);
     68          return;
     69       case HRcVec128:
     70          r = hregNumber(reg);
     71          vassert(r >= 0 && r < 8);
     72          vex_printf("%%xmm%d", r);
     73          return;
     74       default:
     75          vpanic("ppHRegX86");
     76    }
     77 }
     78 
     79 HReg hregX86_EAX ( void ) { return mkHReg(0, HRcInt32, False); }
     80 HReg hregX86_ECX ( void ) { return mkHReg(1, HRcInt32, False); }
     81 HReg hregX86_EDX ( void ) { return mkHReg(2, HRcInt32, False); }
     82 HReg hregX86_EBX ( void ) { return mkHReg(3, HRcInt32, False); }
     83 HReg hregX86_ESP ( void ) { return mkHReg(4, HRcInt32, False); }
     84 HReg hregX86_EBP ( void ) { return mkHReg(5, HRcInt32, False); }
     85 HReg hregX86_ESI ( void ) { return mkHReg(6, HRcInt32, False); }
     86 HReg hregX86_EDI ( void ) { return mkHReg(7, HRcInt32, False); }
     87 
     88 HReg hregX86_FAKE0 ( void ) { return mkHReg(0, HRcFlt64, False); }
     89 HReg hregX86_FAKE1 ( void ) { return mkHReg(1, HRcFlt64, False); }
     90 HReg hregX86_FAKE2 ( void ) { return mkHReg(2, HRcFlt64, False); }
     91 HReg hregX86_FAKE3 ( void ) { return mkHReg(3, HRcFlt64, False); }
     92 HReg hregX86_FAKE4 ( void ) { return mkHReg(4, HRcFlt64, False); }
     93 HReg hregX86_FAKE5 ( void ) { return mkHReg(5, HRcFlt64, False); }
     94 
     95 HReg hregX86_XMM0 ( void ) { return mkHReg(0, HRcVec128, False); }
     96 HReg hregX86_XMM1 ( void ) { return mkHReg(1, HRcVec128, False); }
     97 HReg hregX86_XMM2 ( void ) { return mkHReg(2, HRcVec128, False); }
     98 HReg hregX86_XMM3 ( void ) { return mkHReg(3, HRcVec128, False); }
     99 HReg hregX86_XMM4 ( void ) { return mkHReg(4, HRcVec128, False); }
    100 HReg hregX86_XMM5 ( void ) { return mkHReg(5, HRcVec128, False); }
    101 HReg hregX86_XMM6 ( void ) { return mkHReg(6, HRcVec128, False); }
    102 HReg hregX86_XMM7 ( void ) { return mkHReg(7, HRcVec128, False); }
    103 
    104 
    105 void getAllocableRegs_X86 ( Int* nregs, HReg** arr )
    106 {
    107    *nregs = 20;
    108    *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
    109    (*arr)[0] = hregX86_EAX();
    110    (*arr)[1] = hregX86_EBX();
    111    (*arr)[2] = hregX86_ECX();
    112    (*arr)[3] = hregX86_EDX();
    113    (*arr)[4] = hregX86_ESI();
    114    (*arr)[5] = hregX86_EDI();
    115    (*arr)[6] = hregX86_FAKE0();
    116    (*arr)[7] = hregX86_FAKE1();
    117    (*arr)[8] = hregX86_FAKE2();
    118    (*arr)[9] = hregX86_FAKE3();
    119    (*arr)[10] = hregX86_FAKE4();
    120    (*arr)[11] = hregX86_FAKE5();
    121    (*arr)[12] = hregX86_XMM0();
    122    (*arr)[13] = hregX86_XMM1();
    123    (*arr)[14] = hregX86_XMM2();
    124    (*arr)[15] = hregX86_XMM3();
    125    (*arr)[16] = hregX86_XMM4();
    126    (*arr)[17] = hregX86_XMM5();
    127    (*arr)[18] = hregX86_XMM6();
    128    (*arr)[19] = hregX86_XMM7();
    129 }
    130 
    131 
    132 /* --------- Condition codes, Intel encoding. --------- */
    133 
    134 HChar* showX86CondCode ( X86CondCode cond )
    135 {
    136    switch (cond) {
    137       case Xcc_O:      return "o";
    138       case Xcc_NO:     return "no";
    139       case Xcc_B:      return "b";
    140       case Xcc_NB:     return "nb";
    141       case Xcc_Z:      return "z";
    142       case Xcc_NZ:     return "nz";
    143       case Xcc_BE:     return "be";
    144       case Xcc_NBE:    return "nbe";
    145       case Xcc_S:      return "s";
    146       case Xcc_NS:     return "ns";
    147       case Xcc_P:      return "p";
    148       case Xcc_NP:     return "np";
    149       case Xcc_L:      return "l";
    150       case Xcc_NL:     return "nl";
    151       case Xcc_LE:     return "le";
    152       case Xcc_NLE:    return "nle";
    153       case Xcc_ALWAYS: return "ALWAYS";
    154       default: vpanic("ppX86CondCode");
    155    }
    156 }
    157 
    158 
    159 /* --------- X86AMode: memory address expressions. --------- */
    160 
    161 X86AMode* X86AMode_IR ( UInt imm32, HReg reg ) {
    162    X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
    163    am->tag = Xam_IR;
    164    am->Xam.IR.imm = imm32;
    165    am->Xam.IR.reg = reg;
    166    return am;
    167 }
    168 X86AMode* X86AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
    169    X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
    170    am->tag = Xam_IRRS;
    171    am->Xam.IRRS.imm = imm32;
    172    am->Xam.IRRS.base = base;
    173    am->Xam.IRRS.index = indEx;
    174    am->Xam.IRRS.shift = shift;
    175    vassert(shift >= 0 && shift <= 3);
    176    return am;
    177 }
    178 
    179 X86AMode* dopyX86AMode ( X86AMode* am ) {
    180    switch (am->tag) {
    181       case Xam_IR:
    182          return X86AMode_IR( am->Xam.IR.imm, am->Xam.IR.reg );
    183       case Xam_IRRS:
    184          return X86AMode_IRRS( am->Xam.IRRS.imm, am->Xam.IRRS.base,
    185                                am->Xam.IRRS.index, am->Xam.IRRS.shift );
    186       default:
    187          vpanic("dopyX86AMode");
    188    }
    189 }
    190 
    191 void ppX86AMode ( X86AMode* am ) {
    192    switch (am->tag) {
    193       case Xam_IR:
    194          if (am->Xam.IR.imm == 0)
    195             vex_printf("(");
    196          else
    197             vex_printf("0x%x(", am->Xam.IR.imm);
    198          ppHRegX86(am->Xam.IR.reg);
    199          vex_printf(")");
    200          return;
    201       case Xam_IRRS:
    202          vex_printf("0x%x(", am->Xam.IRRS.imm);
    203          ppHRegX86(am->Xam.IRRS.base);
    204          vex_printf(",");
    205          ppHRegX86(am->Xam.IRRS.index);
    206          vex_printf(",%d)", 1 << am->Xam.IRRS.shift);
    207          return;
    208       default:
    209          vpanic("ppX86AMode");
    210    }
    211 }
    212 
    213 static void addRegUsage_X86AMode ( HRegUsage* u, X86AMode* am ) {
    214    switch (am->tag) {
    215       case Xam_IR:
    216          addHRegUse(u, HRmRead, am->Xam.IR.reg);
    217          return;
    218       case Xam_IRRS:
    219          addHRegUse(u, HRmRead, am->Xam.IRRS.base);
    220          addHRegUse(u, HRmRead, am->Xam.IRRS.index);
    221          return;
    222       default:
    223          vpanic("addRegUsage_X86AMode");
    224    }
    225 }
    226 
    227 static void mapRegs_X86AMode ( HRegRemap* m, X86AMode* am ) {
    228    switch (am->tag) {
    229       case Xam_IR:
    230          am->Xam.IR.reg = lookupHRegRemap(m, am->Xam.IR.reg);
    231          return;
    232       case Xam_IRRS:
    233          am->Xam.IRRS.base = lookupHRegRemap(m, am->Xam.IRRS.base);
    234          am->Xam.IRRS.index = lookupHRegRemap(m, am->Xam.IRRS.index);
    235          return;
    236       default:
    237          vpanic("mapRegs_X86AMode");
    238    }
    239 }
    240 
    241 /* --------- Operand, which can be reg, immediate or memory. --------- */
    242 
    243 X86RMI* X86RMI_Imm ( UInt imm32 ) {
    244    X86RMI* op         = LibVEX_Alloc(sizeof(X86RMI));
    245    op->tag            = Xrmi_Imm;
    246    op->Xrmi.Imm.imm32 = imm32;
    247    return op;
    248 }
    249 X86RMI* X86RMI_Reg ( HReg reg ) {
    250    X86RMI* op       = LibVEX_Alloc(sizeof(X86RMI));
    251    op->tag          = Xrmi_Reg;
    252    op->Xrmi.Reg.reg = reg;
    253    return op;
    254 }
    255 X86RMI* X86RMI_Mem ( X86AMode* am ) {
    256    X86RMI* op      = LibVEX_Alloc(sizeof(X86RMI));
    257    op->tag         = Xrmi_Mem;
    258    op->Xrmi.Mem.am = am;
    259    return op;
    260 }
    261 
    262 void ppX86RMI ( X86RMI* op ) {
    263    switch (op->tag) {
    264       case Xrmi_Imm:
    265          vex_printf("$0x%x", op->Xrmi.Imm.imm32);
    266          return;
    267       case Xrmi_Reg:
    268          ppHRegX86(op->Xrmi.Reg.reg);
    269          return;
    270       case Xrmi_Mem:
    271          ppX86AMode(op->Xrmi.Mem.am);
    272          return;
    273      default:
    274          vpanic("ppX86RMI");
    275    }
    276 }
    277 
    278 /* An X86RMI can only be used in a "read" context (what would it mean
    279    to write or modify a literal?) and so we enumerate its registers
    280    accordingly. */
    281 static void addRegUsage_X86RMI ( HRegUsage* u, X86RMI* op ) {
    282    switch (op->tag) {
    283       case Xrmi_Imm:
    284          return;
    285       case Xrmi_Reg:
    286          addHRegUse(u, HRmRead, op->Xrmi.Reg.reg);
    287          return;
    288       case Xrmi_Mem:
    289          addRegUsage_X86AMode(u, op->Xrmi.Mem.am);
    290          return;
    291       default:
    292          vpanic("addRegUsage_X86RMI");
    293    }
    294 }
    295 
    296 static void mapRegs_X86RMI ( HRegRemap* m, X86RMI* op ) {
    297    switch (op->tag) {
    298       case Xrmi_Imm:
    299          return;
    300       case Xrmi_Reg:
    301          op->Xrmi.Reg.reg = lookupHRegRemap(m, op->Xrmi.Reg.reg);
    302          return;
    303       case Xrmi_Mem:
    304          mapRegs_X86AMode(m, op->Xrmi.Mem.am);
    305          return;
    306       default:
    307          vpanic("mapRegs_X86RMI");
    308    }
    309 }
    310 
    311 
    312 /* --------- Operand, which can be reg or immediate only. --------- */
    313 
    314 X86RI* X86RI_Imm ( UInt imm32 ) {
    315    X86RI* op         = LibVEX_Alloc(sizeof(X86RI));
    316    op->tag           = Xri_Imm;
    317    op->Xri.Imm.imm32 = imm32;
    318    return op;
    319 }
    320 X86RI* X86RI_Reg ( HReg reg ) {
    321    X86RI* op       = LibVEX_Alloc(sizeof(X86RI));
    322    op->tag         = Xri_Reg;
    323    op->Xri.Reg.reg = reg;
    324    return op;
    325 }
    326 
    327 void ppX86RI ( X86RI* op ) {
    328    switch (op->tag) {
    329       case Xri_Imm:
    330          vex_printf("$0x%x", op->Xri.Imm.imm32);
    331          return;
    332       case Xri_Reg:
    333          ppHRegX86(op->Xri.Reg.reg);
    334          return;
    335      default:
    336          vpanic("ppX86RI");
    337    }
    338 }
    339 
    340 /* An X86RI can only be used in a "read" context (what would it mean
    341    to write or modify a literal?) and so we enumerate its registers
    342    accordingly. */
    343 static void addRegUsage_X86RI ( HRegUsage* u, X86RI* op ) {
    344    switch (op->tag) {
    345       case Xri_Imm:
    346          return;
    347       case Xri_Reg:
    348          addHRegUse(u, HRmRead, op->Xri.Reg.reg);
    349          return;
    350       default:
    351          vpanic("addRegUsage_X86RI");
    352    }
    353 }
    354 
    355 static void mapRegs_X86RI ( HRegRemap* m, X86RI* op ) {
    356    switch (op->tag) {
    357       case Xri_Imm:
    358          return;
    359       case Xri_Reg:
    360          op->Xri.Reg.reg = lookupHRegRemap(m, op->Xri.Reg.reg);
    361          return;
    362       default:
    363          vpanic("mapRegs_X86RI");
    364    }
    365 }
    366 
    367 
    368 /* --------- Operand, which can be reg or memory only. --------- */
    369 
    370 X86RM* X86RM_Reg ( HReg reg ) {
    371    X86RM* op       = LibVEX_Alloc(sizeof(X86RM));
    372    op->tag         = Xrm_Reg;
    373    op->Xrm.Reg.reg = reg;
    374    return op;
    375 }
    376 X86RM* X86RM_Mem ( X86AMode* am ) {
    377    X86RM* op      = LibVEX_Alloc(sizeof(X86RM));
    378    op->tag        = Xrm_Mem;
    379    op->Xrm.Mem.am = am;
    380    return op;
    381 }
    382 
    383 void ppX86RM ( X86RM* op ) {
    384    switch (op->tag) {
    385       case Xrm_Mem:
    386          ppX86AMode(op->Xrm.Mem.am);
    387          return;
    388       case Xrm_Reg:
    389          ppHRegX86(op->Xrm.Reg.reg);
    390          return;
    391      default:
    392          vpanic("ppX86RM");
    393    }
    394 }
    395 
    396 /* Because an X86RM can be both a source or destination operand, we
    397    have to supply a mode -- pertaining to the operand as a whole --
    398    indicating how it's being used. */
    399 static void addRegUsage_X86RM ( HRegUsage* u, X86RM* op, HRegMode mode ) {
    400    switch (op->tag) {
    401       case Xrm_Mem:
    402          /* Memory is read, written or modified.  So we just want to
    403             know the regs read by the amode. */
    404          addRegUsage_X86AMode(u, op->Xrm.Mem.am);
    405          return;
    406       case Xrm_Reg:
    407          /* reg is read, written or modified.  Add it in the
    408             appropriate way. */
    409          addHRegUse(u, mode, op->Xrm.Reg.reg);
    410          return;
    411      default:
    412          vpanic("addRegUsage_X86RM");
    413    }
    414 }
    415 
    416 static void mapRegs_X86RM ( HRegRemap* m, X86RM* op )
    417 {
    418    switch (op->tag) {
    419       case Xrm_Mem:
    420          mapRegs_X86AMode(m, op->Xrm.Mem.am);
    421          return;
    422       case Xrm_Reg:
    423          op->Xrm.Reg.reg = lookupHRegRemap(m, op->Xrm.Reg.reg);
    424          return;
    425      default:
    426          vpanic("mapRegs_X86RM");
    427    }
    428 }
    429 
    430 
    431 /* --------- Instructions. --------- */
    432 
    433 HChar* showX86UnaryOp ( X86UnaryOp op ) {
    434    switch (op) {
    435       case Xun_NOT: return "not";
    436       case Xun_NEG: return "neg";
    437       default: vpanic("showX86UnaryOp");
    438    }
    439 }
    440 
    441 HChar* showX86AluOp ( X86AluOp op ) {
    442    switch (op) {
    443       case Xalu_MOV:  return "mov";
    444       case Xalu_CMP:  return "cmp";
    445       case Xalu_ADD:  return "add";
    446       case Xalu_SUB:  return "sub";
    447       case Xalu_ADC:  return "adc";
    448       case Xalu_SBB:  return "sbb";
    449       case Xalu_AND:  return "and";
    450       case Xalu_OR:   return "or";
    451       case Xalu_XOR:  return "xor";
    452       case Xalu_MUL:  return "mul";
    453       default: vpanic("showX86AluOp");
    454    }
    455 }
    456 
    457 HChar* showX86ShiftOp ( X86ShiftOp op ) {
    458    switch (op) {
    459       case Xsh_SHL: return "shl";
    460       case Xsh_SHR: return "shr";
    461       case Xsh_SAR: return "sar";
    462       default: vpanic("showX86ShiftOp");
    463    }
    464 }
    465 
    466 HChar* showX86FpOp ( X86FpOp op ) {
    467    switch (op) {
    468       case Xfp_ADD:    return "add";
    469       case Xfp_SUB:    return "sub";
    470       case Xfp_MUL:    return "mul";
    471       case Xfp_DIV:    return "div";
    472       case Xfp_SCALE:  return "scale";
    473       case Xfp_ATAN:   return "atan";
    474       case Xfp_YL2X:   return "yl2x";
    475       case Xfp_YL2XP1: return "yl2xp1";
    476       case Xfp_PREM:   return "prem";
    477       case Xfp_PREM1:  return "prem1";
    478       case Xfp_SQRT:   return "sqrt";
    479       case Xfp_ABS:    return "abs";
    480       case Xfp_NEG:    return "chs";
    481       case Xfp_MOV:    return "mov";
    482       case Xfp_SIN:    return "sin";
    483       case Xfp_COS:    return "cos";
    484       case Xfp_TAN:    return "tan";
    485       case Xfp_ROUND:  return "round";
    486       case Xfp_2XM1:   return "2xm1";
    487       default: vpanic("showX86FpOp");
    488    }
    489 }
    490 
    491 HChar* showX86SseOp ( X86SseOp op ) {
    492    switch (op) {
    493       case Xsse_MOV:      return "mov(?!)";
    494       case Xsse_ADDF:     return "add";
    495       case Xsse_SUBF:     return "sub";
    496       case Xsse_MULF:     return "mul";
    497       case Xsse_DIVF:     return "div";
    498       case Xsse_MAXF:     return "max";
    499       case Xsse_MINF:     return "min";
    500       case Xsse_CMPEQF:   return "cmpFeq";
    501       case Xsse_CMPLTF:   return "cmpFlt";
    502       case Xsse_CMPLEF:   return "cmpFle";
    503       case Xsse_CMPUNF:   return "cmpFun";
    504       case Xsse_RCPF:     return "rcp";
    505       case Xsse_RSQRTF:   return "rsqrt";
    506       case Xsse_SQRTF:    return "sqrt";
    507       case Xsse_AND:      return "and";
    508       case Xsse_OR:       return "or";
    509       case Xsse_XOR:      return "xor";
    510       case Xsse_ANDN:     return "andn";
    511       case Xsse_ADD8:     return "paddb";
    512       case Xsse_ADD16:    return "paddw";
    513       case Xsse_ADD32:    return "paddd";
    514       case Xsse_ADD64:    return "paddq";
    515       case Xsse_QADD8U:   return "paddusb";
    516       case Xsse_QADD16U:  return "paddusw";
    517       case Xsse_QADD8S:   return "paddsb";
    518       case Xsse_QADD16S:  return "paddsw";
    519       case Xsse_SUB8:     return "psubb";
    520       case Xsse_SUB16:    return "psubw";
    521       case Xsse_SUB32:    return "psubd";
    522       case Xsse_SUB64:    return "psubq";
    523       case Xsse_QSUB8U:   return "psubusb";
    524       case Xsse_QSUB16U:  return "psubusw";
    525       case Xsse_QSUB8S:   return "psubsb";
    526       case Xsse_QSUB16S:  return "psubsw";
    527       case Xsse_MUL16:    return "pmullw";
    528       case Xsse_MULHI16U: return "pmulhuw";
    529       case Xsse_MULHI16S: return "pmulhw";
    530       case Xsse_AVG8U:    return "pavgb";
    531       case Xsse_AVG16U:   return "pavgw";
    532       case Xsse_MAX16S:   return "pmaxw";
    533       case Xsse_MAX8U:    return "pmaxub";
    534       case Xsse_MIN16S:   return "pminw";
    535       case Xsse_MIN8U:    return "pminub";
    536       case Xsse_CMPEQ8:   return "pcmpeqb";
    537       case Xsse_CMPEQ16:  return "pcmpeqw";
    538       case Xsse_CMPEQ32:  return "pcmpeqd";
    539       case Xsse_CMPGT8S:  return "pcmpgtb";
    540       case Xsse_CMPGT16S: return "pcmpgtw";
    541       case Xsse_CMPGT32S: return "pcmpgtd";
    542       case Xsse_SHL16:    return "psllw";
    543       case Xsse_SHL32:    return "pslld";
    544       case Xsse_SHL64:    return "psllq";
    545       case Xsse_SHR16:    return "psrlw";
    546       case Xsse_SHR32:    return "psrld";
    547       case Xsse_SHR64:    return "psrlq";
    548       case Xsse_SAR16:    return "psraw";
    549       case Xsse_SAR32:    return "psrad";
    550       case Xsse_PACKSSD:  return "packssdw";
    551       case Xsse_PACKSSW:  return "packsswb";
    552       case Xsse_PACKUSW:  return "packuswb";
    553       case Xsse_UNPCKHB:  return "punpckhb";
    554       case Xsse_UNPCKHW:  return "punpckhw";
    555       case Xsse_UNPCKHD:  return "punpckhd";
    556       case Xsse_UNPCKHQ:  return "punpckhq";
    557       case Xsse_UNPCKLB:  return "punpcklb";
    558       case Xsse_UNPCKLW:  return "punpcklw";
    559       case Xsse_UNPCKLD:  return "punpckld";
    560       case Xsse_UNPCKLQ:  return "punpcklq";
    561       default: vpanic("showX86SseOp");
    562    }
    563 }
    564 
    565 X86Instr* X86Instr_Alu32R ( X86AluOp op, X86RMI* src, HReg dst ) {
    566    X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
    567    i->tag            = Xin_Alu32R;
    568    i->Xin.Alu32R.op  = op;
    569    i->Xin.Alu32R.src = src;
    570    i->Xin.Alu32R.dst = dst;
    571    return i;
    572 }
    573 X86Instr* X86Instr_Alu32M ( X86AluOp op, X86RI* src, X86AMode* dst ) {
    574    X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
    575    i->tag            = Xin_Alu32M;
    576    i->Xin.Alu32M.op  = op;
    577    i->Xin.Alu32M.src = src;
    578    i->Xin.Alu32M.dst = dst;
    579    vassert(op != Xalu_MUL);
    580    return i;
    581 }
    582 X86Instr* X86Instr_Sh32 ( X86ShiftOp op, UInt src, HReg dst ) {
    583    X86Instr* i     = LibVEX_Alloc(sizeof(X86Instr));
    584    i->tag          = Xin_Sh32;
    585    i->Xin.Sh32.op  = op;
    586    i->Xin.Sh32.src = src;
    587    i->Xin.Sh32.dst = dst;
    588    return i;
    589 }
    590 X86Instr* X86Instr_Test32 ( UInt imm32, X86RM* dst ) {
    591    X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
    592    i->tag              = Xin_Test32;
    593    i->Xin.Test32.imm32 = imm32;
    594    i->Xin.Test32.dst   = dst;
    595    return i;
    596 }
    597 X86Instr* X86Instr_Unary32 ( X86UnaryOp op, HReg dst ) {
    598    X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
    599    i->tag             = Xin_Unary32;
    600    i->Xin.Unary32.op  = op;
    601    i->Xin.Unary32.dst = dst;
    602    return i;
    603 }
    604 X86Instr* X86Instr_Lea32 ( X86AMode* am, HReg dst ) {
    605    X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
    606    i->tag             = Xin_Lea32;
    607    i->Xin.Lea32.am    = am;
    608    i->Xin.Lea32.dst   = dst;
    609    return i;
    610 }
    611 X86Instr* X86Instr_MulL ( Bool syned, X86RM* src ) {
    612    X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
    613    i->tag             = Xin_MulL;
    614    i->Xin.MulL.syned  = syned;
    615    i->Xin.MulL.src    = src;
    616    return i;
    617 }
    618 X86Instr* X86Instr_Div ( Bool syned, X86RM* src ) {
    619    X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
    620    i->tag           = Xin_Div;
    621    i->Xin.Div.syned = syned;
    622    i->Xin.Div.src   = src;
    623    return i;
    624 }
    625 X86Instr* X86Instr_Sh3232  ( X86ShiftOp op, UInt amt, HReg src, HReg dst ) {
    626    X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
    627    i->tag            = Xin_Sh3232;
    628    i->Xin.Sh3232.op  = op;
    629    i->Xin.Sh3232.amt = amt;
    630    i->Xin.Sh3232.src = src;
    631    i->Xin.Sh3232.dst = dst;
    632    vassert(op == Xsh_SHL || op == Xsh_SHR);
    633    return i;
    634 }
    635 X86Instr* X86Instr_Push( X86RMI* src ) {
    636    X86Instr* i     = LibVEX_Alloc(sizeof(X86Instr));
    637    i->tag          = Xin_Push;
    638    i->Xin.Push.src = src;
    639    return i;
    640 }
    641 X86Instr* X86Instr_Call ( X86CondCode cond, Addr32 target, Int regparms ) {
    642    X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
    643    i->tag               = Xin_Call;
    644    i->Xin.Call.cond     = cond;
    645    i->Xin.Call.target   = target;
    646    i->Xin.Call.regparms = regparms;
    647    vassert(regparms >= 0 && regparms <= 3);
    648    return i;
    649 }
    650 X86Instr* X86Instr_XDirect ( Addr32 dstGA, X86AMode* amEIP,
    651                              X86CondCode cond, Bool toFastEP ) {
    652    X86Instr* i             = LibVEX_Alloc(sizeof(X86Instr));
    653    i->tag                  = Xin_XDirect;
    654    i->Xin.XDirect.dstGA    = dstGA;
    655    i->Xin.XDirect.amEIP    = amEIP;
    656    i->Xin.XDirect.cond     = cond;
    657    i->Xin.XDirect.toFastEP = toFastEP;
    658    return i;
    659 }
    660 X86Instr* X86Instr_XIndir ( HReg dstGA, X86AMode* amEIP,
    661                             X86CondCode cond ) {
    662    X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
    663    i->tag              = Xin_XIndir;
    664    i->Xin.XIndir.dstGA = dstGA;
    665    i->Xin.XIndir.amEIP = amEIP;
    666    i->Xin.XIndir.cond  = cond;
    667    return i;
    668 }
    669 X86Instr* X86Instr_XAssisted ( HReg dstGA, X86AMode* amEIP,
    670                                X86CondCode cond, IRJumpKind jk ) {
    671    X86Instr* i            = LibVEX_Alloc(sizeof(X86Instr));
    672    i->tag                 = Xin_XAssisted;
    673    i->Xin.XAssisted.dstGA = dstGA;
    674    i->Xin.XAssisted.amEIP = amEIP;
    675    i->Xin.XAssisted.cond  = cond;
    676    i->Xin.XAssisted.jk    = jk;
    677    return i;
    678 }
    679 X86Instr* X86Instr_CMov32  ( X86CondCode cond, X86RM* src, HReg dst ) {
    680    X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
    681    i->tag             = Xin_CMov32;
    682    i->Xin.CMov32.cond = cond;
    683    i->Xin.CMov32.src  = src;
    684    i->Xin.CMov32.dst  = dst;
    685    vassert(cond != Xcc_ALWAYS);
    686    return i;
    687 }
    688 X86Instr* X86Instr_LoadEX ( UChar szSmall, Bool syned,
    689                             X86AMode* src, HReg dst ) {
    690    X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
    691    i->tag                = Xin_LoadEX;
    692    i->Xin.LoadEX.szSmall = szSmall;
    693    i->Xin.LoadEX.syned   = syned;
    694    i->Xin.LoadEX.src     = src;
    695    i->Xin.LoadEX.dst     = dst;
    696    vassert(szSmall == 1 || szSmall == 2);
    697    return i;
    698 }
    699 X86Instr* X86Instr_Store ( UChar sz, HReg src, X86AMode* dst ) {
    700    X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
    701    i->tag           = Xin_Store;
    702    i->Xin.Store.sz  = sz;
    703    i->Xin.Store.src = src;
    704    i->Xin.Store.dst = dst;
    705    vassert(sz == 1 || sz == 2);
    706    return i;
    707 }
    708 X86Instr* X86Instr_Set32 ( X86CondCode cond, HReg dst ) {
    709    X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
    710    i->tag            = Xin_Set32;
    711    i->Xin.Set32.cond = cond;
    712    i->Xin.Set32.dst  = dst;
    713    return i;
    714 }
    715 X86Instr* X86Instr_Bsfr32 ( Bool isFwds, HReg src, HReg dst ) {
    716    X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
    717    i->tag               = Xin_Bsfr32;
    718    i->Xin.Bsfr32.isFwds = isFwds;
    719    i->Xin.Bsfr32.src    = src;
    720    i->Xin.Bsfr32.dst    = dst;
    721    return i;
    722 }
    723 X86Instr* X86Instr_MFence ( UInt hwcaps ) {
    724    X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
    725    i->tag               = Xin_MFence;
    726    i->Xin.MFence.hwcaps = hwcaps;
    727    vassert(0 == (hwcaps & ~(VEX_HWCAPS_X86_SSE1
    728                             |VEX_HWCAPS_X86_SSE2
    729                             |VEX_HWCAPS_X86_SSE3
    730                             |VEX_HWCAPS_X86_LZCNT)));
    731    return i;
    732 }
    733 X86Instr* X86Instr_ACAS ( X86AMode* addr, UChar sz ) {
    734    X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
    735    i->tag           = Xin_ACAS;
    736    i->Xin.ACAS.addr = addr;
    737    i->Xin.ACAS.sz   = sz;
    738    vassert(sz == 4 || sz == 2 || sz == 1);
    739    return i;
    740 }
    741 X86Instr* X86Instr_DACAS ( X86AMode* addr ) {
    742    X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
    743    i->tag            = Xin_DACAS;
    744    i->Xin.DACAS.addr = addr;
    745    return i;
    746 }
    747 
    748 X86Instr* X86Instr_FpUnary ( X86FpOp op, HReg src, HReg dst ) {
    749    X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
    750    i->tag             = Xin_FpUnary;
    751    i->Xin.FpUnary.op  = op;
    752    i->Xin.FpUnary.src = src;
    753    i->Xin.FpUnary.dst = dst;
    754    return i;
    755 }
    756 X86Instr* X86Instr_FpBinary ( X86FpOp op, HReg srcL, HReg srcR, HReg dst ) {
    757    X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
    758    i->tag               = Xin_FpBinary;
    759    i->Xin.FpBinary.op   = op;
    760    i->Xin.FpBinary.srcL = srcL;
    761    i->Xin.FpBinary.srcR = srcR;
    762    i->Xin.FpBinary.dst  = dst;
    763    return i;
    764 }
    765 X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* addr ) {
    766    X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
    767    i->tag               = Xin_FpLdSt;
    768    i->Xin.FpLdSt.isLoad = isLoad;
    769    i->Xin.FpLdSt.sz     = sz;
    770    i->Xin.FpLdSt.reg    = reg;
    771    i->Xin.FpLdSt.addr   = addr;
    772    vassert(sz == 4 || sz == 8 || sz == 10);
    773    return i;
    774 }
    775 X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz,
    776                              HReg reg, X86AMode* addr ) {
    777    X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
    778    i->tag                = Xin_FpLdStI;
    779    i->Xin.FpLdStI.isLoad = isLoad;
    780    i->Xin.FpLdStI.sz     = sz;
    781    i->Xin.FpLdStI.reg    = reg;
    782    i->Xin.FpLdStI.addr   = addr;
    783    vassert(sz == 2 || sz == 4 || sz == 8);
    784    return i;
    785 }
    786 X86Instr* X86Instr_Fp64to32 ( HReg src, HReg dst ) {
    787    X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
    788    i->tag              = Xin_Fp64to32;
    789    i->Xin.Fp64to32.src = src;
    790    i->Xin.Fp64to32.dst = dst;
    791    return i;
    792 }
    793 X86Instr* X86Instr_FpCMov ( X86CondCode cond, HReg src, HReg dst ) {
    794    X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
    795    i->tag             = Xin_FpCMov;
    796    i->Xin.FpCMov.cond = cond;
    797    i->Xin.FpCMov.src  = src;
    798    i->Xin.FpCMov.dst  = dst;
    799    vassert(cond != Xcc_ALWAYS);
    800    return i;
    801 }
    802 X86Instr* X86Instr_FpLdCW ( X86AMode* addr ) {
    803    X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
    804    i->tag               = Xin_FpLdCW;
    805    i->Xin.FpLdCW.addr   = addr;
    806    return i;
    807 }
    808 X86Instr* X86Instr_FpStSW_AX ( void ) {
    809    X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
    810    i->tag      = Xin_FpStSW_AX;
    811    return i;
    812 }
    813 X86Instr* X86Instr_FpCmp ( HReg srcL, HReg srcR, HReg dst ) {
    814    X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
    815    i->tag            = Xin_FpCmp;
    816    i->Xin.FpCmp.srcL = srcL;
    817    i->Xin.FpCmp.srcR = srcR;
    818    i->Xin.FpCmp.dst  = dst;
    819    return i;
    820 }
    821 X86Instr* X86Instr_SseConst ( UShort con, HReg dst ) {
    822    X86Instr* i            = LibVEX_Alloc(sizeof(X86Instr));
    823    i->tag                 = Xin_SseConst;
    824    i->Xin.SseConst.con    = con;
    825    i->Xin.SseConst.dst    = dst;
    826    vassert(hregClass(dst) == HRcVec128);
    827    return i;
    828 }
    829 X86Instr* X86Instr_SseLdSt ( Bool isLoad, HReg reg, X86AMode* addr ) {
    830    X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
    831    i->tag                = Xin_SseLdSt;
    832    i->Xin.SseLdSt.isLoad = isLoad;
    833    i->Xin.SseLdSt.reg    = reg;
    834    i->Xin.SseLdSt.addr   = addr;
    835    return i;
    836 }
    837 X86Instr* X86Instr_SseLdzLO  ( Int sz, HReg reg, X86AMode* addr )
    838 {
    839    X86Instr* i           = LibVEX_Alloc(sizeof(X86Instr));
    840    i->tag                = Xin_SseLdzLO;
    841    i->Xin.SseLdzLO.sz    = toUChar(sz);
    842    i->Xin.SseLdzLO.reg   = reg;
    843    i->Xin.SseLdzLO.addr  = addr;
    844    vassert(sz == 4 || sz == 8);
    845    return i;
    846 }
    847 X86Instr* X86Instr_Sse32Fx4 ( X86SseOp op, HReg src, HReg dst ) {
    848    X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
    849    i->tag              = Xin_Sse32Fx4;
    850    i->Xin.Sse32Fx4.op  = op;
    851    i->Xin.Sse32Fx4.src = src;
    852    i->Xin.Sse32Fx4.dst = dst;
    853    vassert(op != Xsse_MOV);
    854    return i;
    855 }
    856 X86Instr* X86Instr_Sse32FLo ( X86SseOp op, HReg src, HReg dst ) {
    857    X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
    858    i->tag              = Xin_Sse32FLo;
    859    i->Xin.Sse32FLo.op  = op;
    860    i->Xin.Sse32FLo.src = src;
    861    i->Xin.Sse32FLo.dst = dst;
    862    vassert(op != Xsse_MOV);
    863    return i;
    864 }
    865 X86Instr* X86Instr_Sse64Fx2 ( X86SseOp op, HReg src, HReg dst ) {
    866    X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
    867    i->tag              = Xin_Sse64Fx2;
    868    i->Xin.Sse64Fx2.op  = op;
    869    i->Xin.Sse64Fx2.src = src;
    870    i->Xin.Sse64Fx2.dst = dst;
    871    vassert(op != Xsse_MOV);
    872    return i;
    873 }
    874 X86Instr* X86Instr_Sse64FLo ( X86SseOp op, HReg src, HReg dst ) {
    875    X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
    876    i->tag              = Xin_Sse64FLo;
    877    i->Xin.Sse64FLo.op  = op;
    878    i->Xin.Sse64FLo.src = src;
    879    i->Xin.Sse64FLo.dst = dst;
    880    vassert(op != Xsse_MOV);
    881    return i;
    882 }
    883 X86Instr* X86Instr_SseReRg ( X86SseOp op, HReg re, HReg rg ) {
    884    X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
    885    i->tag             = Xin_SseReRg;
    886    i->Xin.SseReRg.op  = op;
    887    i->Xin.SseReRg.src = re;
    888    i->Xin.SseReRg.dst = rg;
    889    return i;
    890 }
    891 X86Instr* X86Instr_SseCMov ( X86CondCode cond, HReg src, HReg dst ) {
    892    X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
    893    i->tag              = Xin_SseCMov;
    894    i->Xin.SseCMov.cond = cond;
    895    i->Xin.SseCMov.src  = src;
    896    i->Xin.SseCMov.dst  = dst;
    897    vassert(cond != Xcc_ALWAYS);
    898    return i;
    899 }
    900 X86Instr* X86Instr_SseShuf ( Int order, HReg src, HReg dst ) {
    901    X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
    902    i->tag               = Xin_SseShuf;
    903    i->Xin.SseShuf.order = order;
    904    i->Xin.SseShuf.src   = src;
    905    i->Xin.SseShuf.dst   = dst;
    906    vassert(order >= 0 && order <= 0xFF);
    907    return i;
    908 }
    909 X86Instr* X86Instr_EvCheck ( X86AMode* amCounter,
    910                              X86AMode* amFailAddr ) {
    911    X86Instr* i               = LibVEX_Alloc(sizeof(X86Instr));
    912    i->tag                    = Xin_EvCheck;
    913    i->Xin.EvCheck.amCounter  = amCounter;
    914    i->Xin.EvCheck.amFailAddr = amFailAddr;
    915    return i;
    916 }
    917 X86Instr* X86Instr_ProfInc ( void ) {
    918    X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
    919    i->tag      = Xin_ProfInc;
    920    return i;
    921 }
    922 
    923 void ppX86Instr ( X86Instr* i, Bool mode64 ) {
    924    vassert(mode64 == False);
    925    switch (i->tag) {
    926       case Xin_Alu32R:
    927          vex_printf("%sl ", showX86AluOp(i->Xin.Alu32R.op));
    928          ppX86RMI(i->Xin.Alu32R.src);
    929          vex_printf(",");
    930          ppHRegX86(i->Xin.Alu32R.dst);
    931          return;
    932       case Xin_Alu32M:
    933          vex_printf("%sl ", showX86AluOp(i->Xin.Alu32M.op));
    934          ppX86RI(i->Xin.Alu32M.src);
    935          vex_printf(",");
    936          ppX86AMode(i->Xin.Alu32M.dst);
    937          return;
    938       case Xin_Sh32:
    939          vex_printf("%sl ", showX86ShiftOp(i->Xin.Sh32.op));
    940          if (i->Xin.Sh32.src == 0)
    941            vex_printf("%%cl,");
    942          else
    943             vex_printf("$%d,", (Int)i->Xin.Sh32.src);
    944          ppHRegX86(i->Xin.Sh32.dst);
    945          return;
    946       case Xin_Test32:
    947          vex_printf("testl $%d,", (Int)i->Xin.Test32.imm32);
    948          ppX86RM(i->Xin.Test32.dst);
    949          return;
    950       case Xin_Unary32:
    951          vex_printf("%sl ", showX86UnaryOp(i->Xin.Unary32.op));
    952          ppHRegX86(i->Xin.Unary32.dst);
    953          return;
    954       case Xin_Lea32:
    955          vex_printf("leal ");
    956          ppX86AMode(i->Xin.Lea32.am);
    957          vex_printf(",");
    958          ppHRegX86(i->Xin.Lea32.dst);
    959          return;
    960       case Xin_MulL:
    961          vex_printf("%cmull ", i->Xin.MulL.syned ? 's' : 'u');
    962          ppX86RM(i->Xin.MulL.src);
    963          return;
    964       case Xin_Div:
    965          vex_printf("%cdivl ", i->Xin.Div.syned ? 's' : 'u');
    966          ppX86RM(i->Xin.Div.src);
    967          return;
    968       case Xin_Sh3232:
    969          vex_printf("%sdl ", showX86ShiftOp(i->Xin.Sh3232.op));
    970          if (i->Xin.Sh3232.amt == 0)
    971            vex_printf(" %%cl,");
    972          else
    973             vex_printf(" $%d,", (Int)i->Xin.Sh3232.amt);
    974          ppHRegX86(i->Xin.Sh3232.src);
    975          vex_printf(",");
    976          ppHRegX86(i->Xin.Sh3232.dst);
    977          return;
    978       case Xin_Push:
    979          vex_printf("pushl ");
    980          ppX86RMI(i->Xin.Push.src);
    981          return;
    982       case Xin_Call:
    983          vex_printf("call%s[%d] ",
    984                     i->Xin.Call.cond==Xcc_ALWAYS
    985                        ? "" : showX86CondCode(i->Xin.Call.cond),
    986                     i->Xin.Call.regparms);
    987          vex_printf("0x%x", i->Xin.Call.target);
    988          break;
    989       case Xin_XDirect:
    990          vex_printf("(xDirect) ");
    991          vex_printf("if (%%eflags.%s) { ",
    992                     showX86CondCode(i->Xin.XDirect.cond));
    993          vex_printf("movl $0x%x,", i->Xin.XDirect.dstGA);
    994          ppX86AMode(i->Xin.XDirect.amEIP);
    995          vex_printf("; ");
    996          vex_printf("movl $disp_cp_chain_me_to_%sEP,%%edx; call *%%edx }",
    997                     i->Xin.XDirect.toFastEP ? "fast" : "slow");
    998          return;
    999       case Xin_XIndir:
   1000          vex_printf("(xIndir) ");
   1001          vex_printf("if (%%eflags.%s) { movl ",
   1002                     showX86CondCode(i->Xin.XIndir.cond));
   1003          ppHRegX86(i->Xin.XIndir.dstGA);
   1004          vex_printf(",");
   1005          ppX86AMode(i->Xin.XIndir.amEIP);
   1006          vex_printf("; movl $disp_indir,%%edx; jmp *%%edx }");
   1007          return;
   1008       case Xin_XAssisted:
   1009          vex_printf("(xAssisted) ");
   1010          vex_printf("if (%%eflags.%s) { ",
   1011                     showX86CondCode(i->Xin.XAssisted.cond));
   1012          vex_printf("movl ");
   1013          ppHRegX86(i->Xin.XAssisted.dstGA);
   1014          vex_printf(",");
   1015          ppX86AMode(i->Xin.XAssisted.amEIP);
   1016          vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%ebp",
   1017                     (Int)i->Xin.XAssisted.jk);
   1018          vex_printf("; movl $disp_assisted,%%edx; jmp *%%edx }");
   1019          return;
   1020       case Xin_CMov32:
   1021          vex_printf("cmov%s ", showX86CondCode(i->Xin.CMov32.cond));
   1022          ppX86RM(i->Xin.CMov32.src);
   1023          vex_printf(",");
   1024          ppHRegX86(i->Xin.CMov32.dst);
   1025          return;
   1026       case Xin_LoadEX:
   1027          vex_printf("mov%c%cl ",
   1028                     i->Xin.LoadEX.syned ? 's' : 'z',
   1029                     i->Xin.LoadEX.szSmall==1 ? 'b' : 'w');
   1030          ppX86AMode(i->Xin.LoadEX.src);
   1031          vex_printf(",");
   1032          ppHRegX86(i->Xin.LoadEX.dst);
   1033          return;
   1034       case Xin_Store:
   1035          vex_printf("mov%c ", i->Xin.Store.sz==1 ? 'b' : 'w');
   1036          ppHRegX86(i->Xin.Store.src);
   1037          vex_printf(",");
   1038          ppX86AMode(i->Xin.Store.dst);
   1039          return;
   1040       case Xin_Set32:
   1041          vex_printf("setl%s ", showX86CondCode(i->Xin.Set32.cond));
   1042          ppHRegX86(i->Xin.Set32.dst);
   1043          return;
   1044       case Xin_Bsfr32:
   1045          vex_printf("bs%cl ", i->Xin.Bsfr32.isFwds ? 'f' : 'r');
   1046          ppHRegX86(i->Xin.Bsfr32.src);
   1047          vex_printf(",");
   1048          ppHRegX86(i->Xin.Bsfr32.dst);
   1049          return;
   1050       case Xin_MFence:
   1051          vex_printf("mfence(%s)",
   1052                     LibVEX_ppVexHwCaps(VexArchX86,i->Xin.MFence.hwcaps));
   1053          return;
   1054       case Xin_ACAS:
   1055          vex_printf("lock cmpxchg%c ",
   1056                      i->Xin.ACAS.sz==1 ? 'b'
   1057                                        : i->Xin.ACAS.sz==2 ? 'w' : 'l');
   1058          vex_printf("{%%eax->%%ebx},");
   1059          ppX86AMode(i->Xin.ACAS.addr);
   1060          return;
   1061       case Xin_DACAS:
   1062          vex_printf("lock cmpxchg8b {%%edx:%%eax->%%ecx:%%ebx},");
   1063          ppX86AMode(i->Xin.DACAS.addr);
   1064          return;
   1065       case Xin_FpUnary:
   1066          vex_printf("g%sD ", showX86FpOp(i->Xin.FpUnary.op));
   1067          ppHRegX86(i->Xin.FpUnary.src);
   1068          vex_printf(",");
   1069          ppHRegX86(i->Xin.FpUnary.dst);
   1070          break;
   1071       case Xin_FpBinary:
   1072          vex_printf("g%sD ", showX86FpOp(i->Xin.FpBinary.op));
   1073          ppHRegX86(i->Xin.FpBinary.srcL);
   1074          vex_printf(",");
   1075          ppHRegX86(i->Xin.FpBinary.srcR);
   1076          vex_printf(",");
   1077          ppHRegX86(i->Xin.FpBinary.dst);
   1078          break;
   1079       case Xin_FpLdSt:
   1080          if (i->Xin.FpLdSt.isLoad) {
   1081             vex_printf("gld%c " ,  i->Xin.FpLdSt.sz==10 ? 'T'
   1082                                    : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
   1083             ppX86AMode(i->Xin.FpLdSt.addr);
   1084             vex_printf(", ");
   1085             ppHRegX86(i->Xin.FpLdSt.reg);
   1086          } else {
   1087             vex_printf("gst%c " , i->Xin.FpLdSt.sz==10 ? 'T'
   1088                                   : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
   1089             ppHRegX86(i->Xin.FpLdSt.reg);
   1090             vex_printf(", ");
   1091             ppX86AMode(i->Xin.FpLdSt.addr);
   1092          }
   1093          return;
   1094       case Xin_FpLdStI:
   1095          if (i->Xin.FpLdStI.isLoad) {
   1096             vex_printf("gild%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
   1097                                   i->Xin.FpLdStI.sz==4 ? "l" : "w");
   1098             ppX86AMode(i->Xin.FpLdStI.addr);
   1099             vex_printf(", ");
   1100             ppHRegX86(i->Xin.FpLdStI.reg);
   1101          } else {
   1102             vex_printf("gist%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
   1103                                   i->Xin.FpLdStI.sz==4 ? "l" : "w");
   1104             ppHRegX86(i->Xin.FpLdStI.reg);
   1105             vex_printf(", ");
   1106             ppX86AMode(i->Xin.FpLdStI.addr);
   1107          }
   1108          return;
   1109       case Xin_Fp64to32:
   1110          vex_printf("gdtof ");
   1111          ppHRegX86(i->Xin.Fp64to32.src);
   1112          vex_printf(",");
   1113          ppHRegX86(i->Xin.Fp64to32.dst);
   1114          return;
   1115       case Xin_FpCMov:
   1116          vex_printf("gcmov%s ", showX86CondCode(i->Xin.FpCMov.cond));
   1117          ppHRegX86(i->Xin.FpCMov.src);
   1118          vex_printf(",");
   1119          ppHRegX86(i->Xin.FpCMov.dst);
   1120          return;
   1121       case Xin_FpLdCW:
   1122          vex_printf("fldcw ");
   1123          ppX86AMode(i->Xin.FpLdCW.addr);
   1124          return;
   1125       case Xin_FpStSW_AX:
   1126          vex_printf("fstsw %%ax");
   1127          return;
   1128       case Xin_FpCmp:
   1129          vex_printf("gcmp ");
   1130          ppHRegX86(i->Xin.FpCmp.srcL);
   1131          vex_printf(",");
   1132          ppHRegX86(i->Xin.FpCmp.srcR);
   1133          vex_printf(",");
   1134          ppHRegX86(i->Xin.FpCmp.dst);
   1135          break;
   1136       case Xin_SseConst:
   1137          vex_printf("const $0x%04x,", (Int)i->Xin.SseConst.con);
   1138          ppHRegX86(i->Xin.SseConst.dst);
   1139          break;
   1140       case Xin_SseLdSt:
   1141          vex_printf("movups ");
   1142          if (i->Xin.SseLdSt.isLoad) {
   1143             ppX86AMode(i->Xin.SseLdSt.addr);
   1144             vex_printf(",");
   1145             ppHRegX86(i->Xin.SseLdSt.reg);
   1146          } else {
   1147             ppHRegX86(i->Xin.SseLdSt.reg);
   1148             vex_printf(",");
   1149             ppX86AMode(i->Xin.SseLdSt.addr);
   1150          }
   1151          return;
   1152       case Xin_SseLdzLO:
   1153          vex_printf("movs%s ", i->Xin.SseLdzLO.sz==4 ? "s" : "d");
   1154          ppX86AMode(i->Xin.SseLdzLO.addr);
   1155          vex_printf(",");
   1156          ppHRegX86(i->Xin.SseLdzLO.reg);
   1157          return;
   1158       case Xin_Sse32Fx4:
   1159          vex_printf("%sps ", showX86SseOp(i->Xin.Sse32Fx4.op));
   1160          ppHRegX86(i->Xin.Sse32Fx4.src);
   1161          vex_printf(",");
   1162          ppHRegX86(i->Xin.Sse32Fx4.dst);
   1163          return;
   1164       case Xin_Sse32FLo:
   1165          vex_printf("%sss ", showX86SseOp(i->Xin.Sse32FLo.op));
   1166          ppHRegX86(i->Xin.Sse32FLo.src);
   1167          vex_printf(",");
   1168          ppHRegX86(i->Xin.Sse32FLo.dst);
   1169          return;
   1170       case Xin_Sse64Fx2:
   1171          vex_printf("%spd ", showX86SseOp(i->Xin.Sse64Fx2.op));
   1172          ppHRegX86(i->Xin.Sse64Fx2.src);
   1173          vex_printf(",");
   1174          ppHRegX86(i->Xin.Sse64Fx2.dst);
   1175          return;
   1176       case Xin_Sse64FLo:
   1177          vex_printf("%ssd ", showX86SseOp(i->Xin.Sse64FLo.op));
   1178          ppHRegX86(i->Xin.Sse64FLo.src);
   1179          vex_printf(",");
   1180          ppHRegX86(i->Xin.Sse64FLo.dst);
   1181          return;
   1182       case Xin_SseReRg:
   1183          vex_printf("%s ", showX86SseOp(i->Xin.SseReRg.op));
   1184          ppHRegX86(i->Xin.SseReRg.src);
   1185          vex_printf(",");
   1186          ppHRegX86(i->Xin.SseReRg.dst);
   1187          return;
   1188       case Xin_SseCMov:
   1189          vex_printf("cmov%s ", showX86CondCode(i->Xin.SseCMov.cond));
   1190          ppHRegX86(i->Xin.SseCMov.src);
   1191          vex_printf(",");
   1192          ppHRegX86(i->Xin.SseCMov.dst);
   1193          return;
   1194       case Xin_SseShuf:
   1195          vex_printf("pshufd $0x%x,", i->Xin.SseShuf.order);
   1196          ppHRegX86(i->Xin.SseShuf.src);
   1197          vex_printf(",");
   1198          ppHRegX86(i->Xin.SseShuf.dst);
   1199          return;
   1200       case Xin_EvCheck:
   1201          vex_printf("(evCheck) decl ");
   1202          ppX86AMode(i->Xin.EvCheck.amCounter);
   1203          vex_printf("; jns nofail; jmp *");
   1204          ppX86AMode(i->Xin.EvCheck.amFailAddr);
   1205          vex_printf("; nofail:");
   1206          return;
   1207       case Xin_ProfInc:
   1208          vex_printf("(profInc) addl $1,NotKnownYet; "
   1209                     "adcl $0,NotKnownYet+4");
   1210          return;
   1211       default:
   1212          vpanic("ppX86Instr");
   1213    }
   1214 }
   1215 
   1216 /* --------- Helpers for register allocation. --------- */
   1217 
   1218 void getRegUsage_X86Instr (HRegUsage* u, X86Instr* i, Bool mode64)
   1219 {
   1220    Bool unary;
   1221    vassert(mode64 == False);
   1222    initHRegUsage(u);
   1223    switch (i->tag) {
   1224       case Xin_Alu32R:
   1225          addRegUsage_X86RMI(u, i->Xin.Alu32R.src);
   1226          if (i->Xin.Alu32R.op == Xalu_MOV) {
   1227             addHRegUse(u, HRmWrite, i->Xin.Alu32R.dst);
   1228             return;
   1229          }
   1230          if (i->Xin.Alu32R.op == Xalu_CMP) {
   1231             addHRegUse(u, HRmRead, i->Xin.Alu32R.dst);
   1232             return;
   1233          }
   1234          addHRegUse(u, HRmModify, i->Xin.Alu32R.dst);
   1235          return;
   1236       case Xin_Alu32M:
   1237          addRegUsage_X86RI(u, i->Xin.Alu32M.src);
   1238          addRegUsage_X86AMode(u, i->Xin.Alu32M.dst);
   1239          return;
   1240       case Xin_Sh32:
   1241          addHRegUse(u, HRmModify, i->Xin.Sh32.dst);
   1242          if (i->Xin.Sh32.src == 0)
   1243             addHRegUse(u, HRmRead, hregX86_ECX());
   1244          return;
   1245       case Xin_Test32:
   1246          addRegUsage_X86RM(u, i->Xin.Test32.dst, HRmRead);
   1247          return;
   1248       case Xin_Unary32:
   1249          addHRegUse(u, HRmModify, i->Xin.Unary32.dst);
   1250          return;
   1251       case Xin_Lea32:
   1252          addRegUsage_X86AMode(u, i->Xin.Lea32.am);
   1253          addHRegUse(u, HRmWrite, i->Xin.Lea32.dst);
   1254          return;
   1255       case Xin_MulL:
   1256          addRegUsage_X86RM(u, i->Xin.MulL.src, HRmRead);
   1257          addHRegUse(u, HRmModify, hregX86_EAX());
   1258          addHRegUse(u, HRmWrite, hregX86_EDX());
   1259          return;
   1260       case Xin_Div:
   1261          addRegUsage_X86RM(u, i->Xin.Div.src, HRmRead);
   1262          addHRegUse(u, HRmModify, hregX86_EAX());
   1263          addHRegUse(u, HRmModify, hregX86_EDX());
   1264          return;
   1265       case Xin_Sh3232:
   1266          addHRegUse(u, HRmRead, i->Xin.Sh3232.src);
   1267          addHRegUse(u, HRmModify, i->Xin.Sh3232.dst);
   1268          if (i->Xin.Sh3232.amt == 0)
   1269             addHRegUse(u, HRmRead, hregX86_ECX());
   1270          return;
   1271       case Xin_Push:
   1272          addRegUsage_X86RMI(u, i->Xin.Push.src);
   1273          addHRegUse(u, HRmModify, hregX86_ESP());
   1274          return;
   1275       case Xin_Call:
   1276          /* This is a bit subtle. */
   1277          /* First off, claim it trashes all the caller-saved regs
   1278             which fall within the register allocator's jurisdiction.
   1279             These I believe to be %eax %ecx %edx and all the xmm
   1280             registers. */
   1281          addHRegUse(u, HRmWrite, hregX86_EAX());
   1282          addHRegUse(u, HRmWrite, hregX86_ECX());
   1283          addHRegUse(u, HRmWrite, hregX86_EDX());
   1284          addHRegUse(u, HRmWrite, hregX86_XMM0());
   1285          addHRegUse(u, HRmWrite, hregX86_XMM1());
   1286          addHRegUse(u, HRmWrite, hregX86_XMM2());
   1287          addHRegUse(u, HRmWrite, hregX86_XMM3());
   1288          addHRegUse(u, HRmWrite, hregX86_XMM4());
   1289          addHRegUse(u, HRmWrite, hregX86_XMM5());
   1290          addHRegUse(u, HRmWrite, hregX86_XMM6());
   1291          addHRegUse(u, HRmWrite, hregX86_XMM7());
   1292          /* Now we have to state any parameter-carrying registers
   1293             which might be read.  This depends on the regparmness. */
   1294          switch (i->Xin.Call.regparms) {
   1295             case 3: addHRegUse(u, HRmRead, hregX86_ECX()); /*fallthru*/
   1296             case 2: addHRegUse(u, HRmRead, hregX86_EDX()); /*fallthru*/
   1297             case 1: addHRegUse(u, HRmRead, hregX86_EAX()); break;
   1298             case 0: break;
   1299             default: vpanic("getRegUsage_X86Instr:Call:regparms");
   1300          }
   1301          /* Finally, there is the issue that the insn trashes a
   1302             register because the literal target address has to be
   1303             loaded into a register.  Fortunately, for the 0/1/2
   1304             regparm case, we can use EAX, EDX and ECX respectively, so
   1305             this does not cause any further damage.  For the 3-regparm
   1306             case, we'll have to choose another register arbitrarily --
   1307             since A, D and C are used for parameters -- and so we might
   1308             as well choose EDI. */
   1309          if (i->Xin.Call.regparms == 3)
   1310             addHRegUse(u, HRmWrite, hregX86_EDI());
   1311          /* Upshot of this is that the assembler really must observe
   1312             the here-stated convention of which register to use as an
   1313             address temporary, depending on the regparmness: 0==EAX,
   1314             1==EDX, 2==ECX, 3==EDI. */
   1315          return;
   1316       /* XDirect/XIndir/XAssisted are also a bit subtle.  They
   1317          conditionally exit the block.  Hence we only need to list (1)
   1318          the registers that they read, and (2) the registers that they
   1319          write in the case where the block is not exited.  (2) is
   1320          empty, hence only (1) is relevant here. */
   1321       case Xin_XDirect:
   1322          addRegUsage_X86AMode(u, i->Xin.XDirect.amEIP);
   1323          return;
   1324       case Xin_XIndir:
   1325          addHRegUse(u, HRmRead, i->Xin.XIndir.dstGA);
   1326          addRegUsage_X86AMode(u, i->Xin.XIndir.amEIP);
   1327          return;
   1328       case Xin_XAssisted:
   1329          addHRegUse(u, HRmRead, i->Xin.XAssisted.dstGA);
   1330          addRegUsage_X86AMode(u, i->Xin.XAssisted.amEIP);
   1331          return;
   1332       case Xin_CMov32:
   1333          addRegUsage_X86RM(u, i->Xin.CMov32.src, HRmRead);
   1334          addHRegUse(u, HRmModify, i->Xin.CMov32.dst);
   1335          return;
   1336       case Xin_LoadEX:
   1337          addRegUsage_X86AMode(u, i->Xin.LoadEX.src);
   1338          addHRegUse(u, HRmWrite, i->Xin.LoadEX.dst);
   1339          return;
   1340       case Xin_Store:
   1341          addHRegUse(u, HRmRead, i->Xin.Store.src);
   1342          addRegUsage_X86AMode(u, i->Xin.Store.dst);
   1343          return;
   1344       case Xin_Set32:
   1345          addHRegUse(u, HRmWrite, i->Xin.Set32.dst);
   1346          return;
   1347       case Xin_Bsfr32:
   1348          addHRegUse(u, HRmRead, i->Xin.Bsfr32.src);
   1349          addHRegUse(u, HRmWrite, i->Xin.Bsfr32.dst);
   1350          return;
   1351       case Xin_MFence:
   1352          return;
   1353       case Xin_ACAS:
   1354          addRegUsage_X86AMode(u, i->Xin.ACAS.addr);
   1355          addHRegUse(u, HRmRead, hregX86_EBX());
   1356          addHRegUse(u, HRmModify, hregX86_EAX());
   1357          return;
   1358       case Xin_DACAS:
   1359          addRegUsage_X86AMode(u, i->Xin.DACAS.addr);
   1360          addHRegUse(u, HRmRead, hregX86_ECX());
   1361          addHRegUse(u, HRmRead, hregX86_EBX());
   1362          addHRegUse(u, HRmModify, hregX86_EDX());
   1363          addHRegUse(u, HRmModify, hregX86_EAX());
   1364          return;
   1365       case Xin_FpUnary:
   1366          addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
   1367          addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
   1368          return;
   1369       case Xin_FpBinary:
   1370          addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL);
   1371          addHRegUse(u, HRmRead, i->Xin.FpBinary.srcR);
   1372          addHRegUse(u, HRmWrite, i->Xin.FpBinary.dst);
   1373          return;
   1374       case Xin_FpLdSt:
   1375          addRegUsage_X86AMode(u, i->Xin.FpLdSt.addr);
   1376          addHRegUse(u, i->Xin.FpLdSt.isLoad ? HRmWrite : HRmRead,
   1377                        i->Xin.FpLdSt.reg);
   1378          return;
   1379       case Xin_FpLdStI:
   1380          addRegUsage_X86AMode(u, i->Xin.FpLdStI.addr);
   1381          addHRegUse(u, i->Xin.FpLdStI.isLoad ? HRmWrite : HRmRead,
   1382                        i->Xin.FpLdStI.reg);
   1383          return;
   1384       case Xin_Fp64to32:
   1385          addHRegUse(u, HRmRead,  i->Xin.Fp64to32.src);
   1386          addHRegUse(u, HRmWrite, i->Xin.Fp64to32.dst);
   1387          return;
   1388       case Xin_FpCMov:
   1389          addHRegUse(u, HRmRead,   i->Xin.FpCMov.src);
   1390          addHRegUse(u, HRmModify, i->Xin.FpCMov.dst);
   1391          return;
   1392       case Xin_FpLdCW:
   1393          addRegUsage_X86AMode(u, i->Xin.FpLdCW.addr);
   1394          return;
   1395       case Xin_FpStSW_AX:
   1396          addHRegUse(u, HRmWrite, hregX86_EAX());
   1397          return;
   1398       case Xin_FpCmp:
   1399          addHRegUse(u, HRmRead, i->Xin.FpCmp.srcL);
   1400          addHRegUse(u, HRmRead, i->Xin.FpCmp.srcR);
   1401          addHRegUse(u, HRmWrite, i->Xin.FpCmp.dst);
   1402          addHRegUse(u, HRmWrite, hregX86_EAX());
   1403          return;
   1404       case Xin_SseLdSt:
   1405          addRegUsage_X86AMode(u, i->Xin.SseLdSt.addr);
   1406          addHRegUse(u, i->Xin.SseLdSt.isLoad ? HRmWrite : HRmRead,
   1407                        i->Xin.SseLdSt.reg);
   1408          return;
   1409       case Xin_SseLdzLO:
   1410          addRegUsage_X86AMode(u, i->Xin.SseLdzLO.addr);
   1411          addHRegUse(u, HRmWrite, i->Xin.SseLdzLO.reg);
   1412          return;
   1413       case Xin_SseConst:
   1414          addHRegUse(u, HRmWrite, i->Xin.SseConst.dst);
   1415          return;
   1416       case Xin_Sse32Fx4:
   1417          vassert(i->Xin.Sse32Fx4.op != Xsse_MOV);
   1418          unary = toBool( i->Xin.Sse32Fx4.op == Xsse_RCPF
   1419                          || i->Xin.Sse32Fx4.op == Xsse_RSQRTF
   1420                          || i->Xin.Sse32Fx4.op == Xsse_SQRTF );
   1421          addHRegUse(u, HRmRead, i->Xin.Sse32Fx4.src);
   1422          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1423                        i->Xin.Sse32Fx4.dst);
   1424          return;
   1425       case Xin_Sse32FLo:
   1426          vassert(i->Xin.Sse32FLo.op != Xsse_MOV);
   1427          unary = toBool( i->Xin.Sse32FLo.op == Xsse_RCPF
   1428                          || i->Xin.Sse32FLo.op == Xsse_RSQRTF
   1429                          || i->Xin.Sse32FLo.op == Xsse_SQRTF );
   1430          addHRegUse(u, HRmRead, i->Xin.Sse32FLo.src);
   1431          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1432                        i->Xin.Sse32FLo.dst);
   1433          return;
   1434       case Xin_Sse64Fx2:
   1435          vassert(i->Xin.Sse64Fx2.op != Xsse_MOV);
   1436          unary = toBool( i->Xin.Sse64Fx2.op == Xsse_RCPF
   1437                          || i->Xin.Sse64Fx2.op == Xsse_RSQRTF
   1438                          || i->Xin.Sse64Fx2.op == Xsse_SQRTF );
   1439          addHRegUse(u, HRmRead, i->Xin.Sse64Fx2.src);
   1440          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1441                        i->Xin.Sse64Fx2.dst);
   1442          return;
   1443       case Xin_Sse64FLo:
   1444          vassert(i->Xin.Sse64FLo.op != Xsse_MOV);
   1445          unary = toBool( i->Xin.Sse64FLo.op == Xsse_RCPF
   1446                          || i->Xin.Sse64FLo.op == Xsse_RSQRTF
   1447                          || i->Xin.Sse64FLo.op == Xsse_SQRTF );
   1448          addHRegUse(u, HRmRead, i->Xin.Sse64FLo.src);
   1449          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1450                        i->Xin.Sse64FLo.dst);
   1451          return;
   1452       case Xin_SseReRg:
   1453          if (i->Xin.SseReRg.op == Xsse_XOR
   1454              && i->Xin.SseReRg.src == i->Xin.SseReRg.dst) {
   1455             /* reg-alloc needs to understand 'xor r,r' as a write of r */
   1456             /* (as opposed to a rite of passage :-) */
   1457             addHRegUse(u, HRmWrite, i->Xin.SseReRg.dst);
   1458          } else {
   1459             addHRegUse(u, HRmRead, i->Xin.SseReRg.src);
   1460             addHRegUse(u, i->Xin.SseReRg.op == Xsse_MOV
   1461                              ? HRmWrite : HRmModify,
   1462                           i->Xin.SseReRg.dst);
   1463          }
   1464          return;
   1465       case Xin_SseCMov:
   1466          addHRegUse(u, HRmRead,   i->Xin.SseCMov.src);
   1467          addHRegUse(u, HRmModify, i->Xin.SseCMov.dst);
   1468          return;
   1469       case Xin_SseShuf:
   1470          addHRegUse(u, HRmRead,  i->Xin.SseShuf.src);
   1471          addHRegUse(u, HRmWrite, i->Xin.SseShuf.dst);
   1472          return;
   1473       case Xin_EvCheck:
   1474          /* We expect both amodes only to mention %ebp, so this is in
   1475             fact pointless, since %ebp isn't allocatable, but anyway.. */
   1476          addRegUsage_X86AMode(u, i->Xin.EvCheck.amCounter);
   1477          addRegUsage_X86AMode(u, i->Xin.EvCheck.amFailAddr);
   1478          return;
   1479       case Xin_ProfInc:
   1480          /* does not use any registers. */
   1481          return;
   1482       default:
   1483          ppX86Instr(i, False);
   1484          vpanic("getRegUsage_X86Instr");
   1485    }
   1486 }
   1487 
   1488 /* local helper */
   1489 static void mapReg( HRegRemap* m, HReg* r )
   1490 {
   1491    *r = lookupHRegRemap(m, *r);
   1492 }
   1493 
   1494 void mapRegs_X86Instr ( HRegRemap* m, X86Instr* i, Bool mode64 )
   1495 {
   1496    vassert(mode64 == False);
   1497    switch (i->tag) {
   1498       case Xin_Alu32R:
   1499          mapRegs_X86RMI(m, i->Xin.Alu32R.src);
   1500          mapReg(m, &i->Xin.Alu32R.dst);
   1501          return;
   1502       case Xin_Alu32M:
   1503          mapRegs_X86RI(m, i->Xin.Alu32M.src);
   1504          mapRegs_X86AMode(m, i->Xin.Alu32M.dst);
   1505          return;
   1506       case Xin_Sh32:
   1507          mapReg(m, &i->Xin.Sh32.dst);
   1508          return;
   1509       case Xin_Test32:
   1510          mapRegs_X86RM(m, i->Xin.Test32.dst);
   1511          return;
   1512       case Xin_Unary32:
   1513          mapReg(m, &i->Xin.Unary32.dst);
   1514          return;
   1515       case Xin_Lea32:
   1516          mapRegs_X86AMode(m, i->Xin.Lea32.am);
   1517          mapReg(m, &i->Xin.Lea32.dst);
   1518          return;
   1519       case Xin_MulL:
   1520          mapRegs_X86RM(m, i->Xin.MulL.src);
   1521          return;
   1522       case Xin_Div:
   1523          mapRegs_X86RM(m, i->Xin.Div.src);
   1524          return;
   1525       case Xin_Sh3232:
   1526          mapReg(m, &i->Xin.Sh3232.src);
   1527          mapReg(m, &i->Xin.Sh3232.dst);
   1528          return;
   1529       case Xin_Push:
   1530          mapRegs_X86RMI(m, i->Xin.Push.src);
   1531          return;
   1532       case Xin_Call:
   1533          return;
   1534       case Xin_XDirect:
   1535          mapRegs_X86AMode(m, i->Xin.XDirect.amEIP);
   1536          return;
   1537       case Xin_XIndir:
   1538          mapReg(m, &i->Xin.XIndir.dstGA);
   1539          mapRegs_X86AMode(m, i->Xin.XIndir.amEIP);
   1540          return;
   1541       case Xin_XAssisted:
   1542          mapReg(m, &i->Xin.XAssisted.dstGA);
   1543          mapRegs_X86AMode(m, i->Xin.XAssisted.amEIP);
   1544          return;
   1545       case Xin_CMov32:
   1546          mapRegs_X86RM(m, i->Xin.CMov32.src);
   1547          mapReg(m, &i->Xin.CMov32.dst);
   1548          return;
   1549       case Xin_LoadEX:
   1550          mapRegs_X86AMode(m, i->Xin.LoadEX.src);
   1551          mapReg(m, &i->Xin.LoadEX.dst);
   1552          return;
   1553       case Xin_Store:
   1554          mapReg(m, &i->Xin.Store.src);
   1555          mapRegs_X86AMode(m, i->Xin.Store.dst);
   1556          return;
   1557       case Xin_Set32:
   1558          mapReg(m, &i->Xin.Set32.dst);
   1559          return;
   1560       case Xin_Bsfr32:
   1561          mapReg(m, &i->Xin.Bsfr32.src);
   1562          mapReg(m, &i->Xin.Bsfr32.dst);
   1563          return;
   1564       case Xin_MFence:
   1565          return;
   1566       case Xin_ACAS:
   1567          mapRegs_X86AMode(m, i->Xin.ACAS.addr);
   1568          return;
   1569       case Xin_DACAS:
   1570          mapRegs_X86AMode(m, i->Xin.DACAS.addr);
   1571          return;
   1572       case Xin_FpUnary:
   1573          mapReg(m, &i->Xin.FpUnary.src);
   1574          mapReg(m, &i->Xin.FpUnary.dst);
   1575          return;
   1576       case Xin_FpBinary:
   1577          mapReg(m, &i->Xin.FpBinary.srcL);
   1578          mapReg(m, &i->Xin.FpBinary.srcR);
   1579          mapReg(m, &i->Xin.FpBinary.dst);
   1580          return;
   1581       case Xin_FpLdSt:
   1582          mapRegs_X86AMode(m, i->Xin.FpLdSt.addr);
   1583          mapReg(m, &i->Xin.FpLdSt.reg);
   1584          return;
   1585       case Xin_FpLdStI:
   1586          mapRegs_X86AMode(m, i->Xin.FpLdStI.addr);
   1587          mapReg(m, &i->Xin.FpLdStI.reg);
   1588          return;
   1589       case Xin_Fp64to32:
   1590          mapReg(m, &i->Xin.Fp64to32.src);
   1591          mapReg(m, &i->Xin.Fp64to32.dst);
   1592          return;
   1593       case Xin_FpCMov:
   1594          mapReg(m, &i->Xin.FpCMov.src);
   1595          mapReg(m, &i->Xin.FpCMov.dst);
   1596          return;
   1597       case Xin_FpLdCW:
   1598          mapRegs_X86AMode(m, i->Xin.FpLdCW.addr);
   1599          return;
   1600       case Xin_FpStSW_AX:
   1601          return;
   1602       case Xin_FpCmp:
   1603          mapReg(m, &i->Xin.FpCmp.srcL);
   1604          mapReg(m, &i->Xin.FpCmp.srcR);
   1605          mapReg(m, &i->Xin.FpCmp.dst);
   1606          return;
   1607       case Xin_SseConst:
   1608          mapReg(m, &i->Xin.SseConst.dst);
   1609          return;
   1610       case Xin_SseLdSt:
   1611          mapReg(m, &i->Xin.SseLdSt.reg);
   1612          mapRegs_X86AMode(m, i->Xin.SseLdSt.addr);
   1613          break;
   1614       case Xin_SseLdzLO:
   1615          mapReg(m, &i->Xin.SseLdzLO.reg);
   1616          mapRegs_X86AMode(m, i->Xin.SseLdzLO.addr);
   1617          break;
   1618       case Xin_Sse32Fx4:
   1619          mapReg(m, &i->Xin.Sse32Fx4.src);
   1620          mapReg(m, &i->Xin.Sse32Fx4.dst);
   1621          return;
   1622       case Xin_Sse32FLo:
   1623          mapReg(m, &i->Xin.Sse32FLo.src);
   1624          mapReg(m, &i->Xin.Sse32FLo.dst);
   1625          return;
   1626       case Xin_Sse64Fx2:
   1627          mapReg(m, &i->Xin.Sse64Fx2.src);
   1628          mapReg(m, &i->Xin.Sse64Fx2.dst);
   1629          return;
   1630       case Xin_Sse64FLo:
   1631          mapReg(m, &i->Xin.Sse64FLo.src);
   1632          mapReg(m, &i->Xin.Sse64FLo.dst);
   1633          return;
   1634       case Xin_SseReRg:
   1635          mapReg(m, &i->Xin.SseReRg.src);
   1636          mapReg(m, &i->Xin.SseReRg.dst);
   1637          return;
   1638       case Xin_SseCMov:
   1639          mapReg(m, &i->Xin.SseCMov.src);
   1640          mapReg(m, &i->Xin.SseCMov.dst);
   1641          return;
   1642       case Xin_SseShuf:
   1643          mapReg(m, &i->Xin.SseShuf.src);
   1644          mapReg(m, &i->Xin.SseShuf.dst);
   1645          return;
   1646       case Xin_EvCheck:
   1647          /* We expect both amodes only to mention %ebp, so this is in
   1648             fact pointless, since %ebp isn't allocatable, but anyway.. */
   1649          mapRegs_X86AMode(m, i->Xin.EvCheck.amCounter);
   1650          mapRegs_X86AMode(m, i->Xin.EvCheck.amFailAddr);
   1651          return;
   1652       case Xin_ProfInc:
   1653          /* does not use any registers. */
   1654          return;
   1655 
   1656       default:
   1657          ppX86Instr(i, mode64);
   1658          vpanic("mapRegs_X86Instr");
   1659    }
   1660 }
   1661 
   1662 /* Figure out if i represents a reg-reg move, and if so assign the
   1663    source and destination to *src and *dst.  If in doubt say No.  Used
   1664    by the register allocator to do move coalescing.
   1665 */
   1666 Bool isMove_X86Instr ( X86Instr* i, HReg* src, HReg* dst )
   1667 {
   1668    /* Moves between integer regs */
   1669    if (i->tag == Xin_Alu32R) {
   1670       if (i->Xin.Alu32R.op != Xalu_MOV)
   1671          return False;
   1672       if (i->Xin.Alu32R.src->tag != Xrmi_Reg)
   1673          return False;
   1674       *src = i->Xin.Alu32R.src->Xrmi.Reg.reg;
   1675       *dst = i->Xin.Alu32R.dst;
   1676       return True;
   1677    }
   1678    /* Moves between FP regs */
   1679    if (i->tag == Xin_FpUnary) {
   1680       if (i->Xin.FpUnary.op != Xfp_MOV)
   1681          return False;
   1682       *src = i->Xin.FpUnary.src;
   1683       *dst = i->Xin.FpUnary.dst;
   1684       return True;
   1685    }
   1686    if (i->tag == Xin_SseReRg) {
   1687       if (i->Xin.SseReRg.op != Xsse_MOV)
   1688          return False;
   1689       *src = i->Xin.SseReRg.src;
   1690       *dst = i->Xin.SseReRg.dst;
   1691       return True;
   1692    }
   1693    return False;
   1694 }
   1695 
   1696 
   1697 /* Generate x86 spill/reload instructions under the direction of the
   1698    register allocator.  Note it's critical these don't write the
   1699    condition codes. */
   1700 
   1701 void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1702                     HReg rreg, Int offsetB, Bool mode64 )
   1703 {
   1704    X86AMode* am;
   1705    vassert(offsetB >= 0);
   1706    vassert(!hregIsVirtual(rreg));
   1707    vassert(mode64 == False);
   1708    *i1 = *i2 = NULL;
   1709    am = X86AMode_IR(offsetB, hregX86_EBP());
   1710    switch (hregClass(rreg)) {
   1711       case HRcInt32:
   1712          *i1 = X86Instr_Alu32M ( Xalu_MOV, X86RI_Reg(rreg), am );
   1713          return;
   1714       case HRcFlt64:
   1715          *i1 = X86Instr_FpLdSt ( False/*store*/, 10, rreg, am );
   1716          return;
   1717       case HRcVec128:
   1718          *i1 = X86Instr_SseLdSt ( False/*store*/, rreg, am );
   1719          return;
   1720       default:
   1721          ppHRegClass(hregClass(rreg));
   1722          vpanic("genSpill_X86: unimplemented regclass");
   1723    }
   1724 }
   1725 
   1726 void genReload_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1727                      HReg rreg, Int offsetB, Bool mode64 )
   1728 {
   1729    X86AMode* am;
   1730    vassert(offsetB >= 0);
   1731    vassert(!hregIsVirtual(rreg));
   1732    vassert(mode64 == False);
   1733    *i1 = *i2 = NULL;
   1734    am = X86AMode_IR(offsetB, hregX86_EBP());
   1735    switch (hregClass(rreg)) {
   1736       case HRcInt32:
   1737          *i1 = X86Instr_Alu32R ( Xalu_MOV, X86RMI_Mem(am), rreg );
   1738          return;
   1739       case HRcFlt64:
   1740          *i1 = X86Instr_FpLdSt ( True/*load*/, 10, rreg, am );
   1741          return;
   1742       case HRcVec128:
   1743          *i1 = X86Instr_SseLdSt ( True/*load*/, rreg, am );
   1744          return;
   1745       default:
   1746          ppHRegClass(hregClass(rreg));
   1747          vpanic("genReload_X86: unimplemented regclass");
   1748    }
   1749 }
   1750 
   1751 /* The given instruction reads the specified vreg exactly once, and
   1752    that vreg is currently located at the given spill offset.  If
   1753    possible, return a variant of the instruction to one which instead
   1754    references the spill slot directly. */
   1755 
   1756 X86Instr* directReload_X86( X86Instr* i, HReg vreg, Short spill_off )
   1757 {
   1758    vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
   1759 
   1760    /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
   1761       Convert to: src=RMI_Mem, dst=Reg
   1762    */
   1763    if (i->tag == Xin_Alu32R
   1764        && (i->Xin.Alu32R.op == Xalu_MOV || i->Xin.Alu32R.op == Xalu_OR
   1765            || i->Xin.Alu32R.op == Xalu_XOR)
   1766        && i->Xin.Alu32R.src->tag == Xrmi_Reg
   1767        && i->Xin.Alu32R.src->Xrmi.Reg.reg == vreg) {
   1768       vassert(i->Xin.Alu32R.dst != vreg);
   1769       return X86Instr_Alu32R(
   1770                 i->Xin.Alu32R.op,
   1771                 X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP())),
   1772                 i->Xin.Alu32R.dst
   1773              );
   1774    }
   1775 
   1776    /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
   1777       Convert to: src=RI_Imm, dst=Mem
   1778    */
   1779    if (i->tag == Xin_Alu32R
   1780        && (i->Xin.Alu32R.op == Xalu_CMP)
   1781        && i->Xin.Alu32R.src->tag == Xrmi_Imm
   1782        && i->Xin.Alu32R.dst == vreg) {
   1783       return X86Instr_Alu32M(
   1784                 i->Xin.Alu32R.op,
   1785 		X86RI_Imm( i->Xin.Alu32R.src->Xrmi.Imm.imm32 ),
   1786                 X86AMode_IR( spill_off, hregX86_EBP())
   1787              );
   1788    }
   1789 
   1790    /* Deal with form: Push(RMI_Reg)
   1791       Convert to: Push(RMI_Mem)
   1792    */
   1793    if (i->tag == Xin_Push
   1794        && i->Xin.Push.src->tag == Xrmi_Reg
   1795        && i->Xin.Push.src->Xrmi.Reg.reg == vreg) {
   1796       return X86Instr_Push(
   1797                 X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP()))
   1798              );
   1799    }
   1800 
   1801    /* Deal with form: CMov32(src=RM_Reg, dst) where vreg == src
   1802       Convert to CMov32(RM_Mem, dst) */
   1803    if (i->tag == Xin_CMov32
   1804        && i->Xin.CMov32.src->tag == Xrm_Reg
   1805        && i->Xin.CMov32.src->Xrm.Reg.reg == vreg) {
   1806       vassert(i->Xin.CMov32.dst != vreg);
   1807       return X86Instr_CMov32(
   1808                 i->Xin.CMov32.cond,
   1809                 X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() )),
   1810                 i->Xin.CMov32.dst
   1811              );
   1812    }
   1813 
   1814    /* Deal with form: Test32(imm,RM_Reg vreg) -> Test32(imm,amode) */
   1815    if (i->tag == Xin_Test32
   1816        && i->Xin.Test32.dst->tag == Xrm_Reg
   1817        && i->Xin.Test32.dst->Xrm.Reg.reg == vreg) {
   1818       return X86Instr_Test32(
   1819                 i->Xin.Test32.imm32,
   1820                 X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() ) )
   1821              );
   1822    }
   1823 
   1824    return NULL;
   1825 }
   1826 
   1827 
   1828 /* --------- The x86 assembler (bleh.) --------- */
   1829 
   1830 static UChar iregNo ( HReg r )
   1831 {
   1832    UInt n;
   1833    vassert(hregClass(r) == HRcInt32);
   1834    vassert(!hregIsVirtual(r));
   1835    n = hregNumber(r);
   1836    vassert(n <= 7);
   1837    return toUChar(n);
   1838 }
   1839 
   1840 static UInt fregNo ( HReg r )
   1841 {
   1842    UInt n;
   1843    vassert(hregClass(r) == HRcFlt64);
   1844    vassert(!hregIsVirtual(r));
   1845    n = hregNumber(r);
   1846    vassert(n <= 5);
   1847    return n;
   1848 }
   1849 
   1850 static UInt vregNo ( HReg r )
   1851 {
   1852    UInt n;
   1853    vassert(hregClass(r) == HRcVec128);
   1854    vassert(!hregIsVirtual(r));
   1855    n = hregNumber(r);
   1856    vassert(n <= 7);
   1857    return n;
   1858 }
   1859 
   1860 static UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem )
   1861 {
   1862    return toUChar( ((mod & 3) << 6)
   1863                    | ((reg & 7) << 3)
   1864                    | (regmem & 7) );
   1865 }
   1866 
   1867 static UChar mkSIB ( Int shift, Int regindex, Int regbase )
   1868 {
   1869    return toUChar( ((shift & 3) << 6)
   1870                    | ((regindex & 7) << 3)
   1871                    | (regbase & 7) );
   1872 }
   1873 
   1874 static UChar* emit32 ( UChar* p, UInt w32 )
   1875 {
   1876    *p++ = toUChar( w32        & 0x000000FF);
   1877    *p++ = toUChar((w32 >>  8) & 0x000000FF);
   1878    *p++ = toUChar((w32 >> 16) & 0x000000FF);
   1879    *p++ = toUChar((w32 >> 24) & 0x000000FF);
   1880    return p;
   1881 }
   1882 
   1883 /* Does a sign-extend of the lowest 8 bits give
   1884    the original number? */
   1885 static Bool fits8bits ( UInt w32 )
   1886 {
   1887    Int i32 = (Int)w32;
   1888    return toBool(i32 == ((i32 << 24) >> 24));
   1889 }
   1890 
   1891 
   1892 /* Forming mod-reg-rm bytes and scale-index-base bytes.
   1893 
   1894      greg,  0(ereg)    |  ereg != ESP && ereg != EBP
   1895                        =  00 greg ereg
   1896 
   1897      greg,  d8(ereg)   |  ereg != ESP
   1898                        =  01 greg ereg, d8
   1899 
   1900      greg,  d32(ereg)  |  ereg != ESP
   1901                        =  10 greg ereg, d32
   1902 
   1903      greg,  d8(%esp)   =  01 greg 100, 0x24, d8
   1904 
   1905      -----------------------------------------------
   1906 
   1907      greg,  d8(base,index,scale)
   1908                |  index != ESP
   1909                =  01 greg 100, scale index base, d8
   1910 
   1911      greg,  d32(base,index,scale)
   1912                |  index != ESP
   1913                =  10 greg 100, scale index base, d32
   1914 */
   1915 static UChar* doAMode_M ( UChar* p, HReg greg, X86AMode* am )
   1916 {
   1917    if (am->tag == Xam_IR) {
   1918       if (am->Xam.IR.imm == 0
   1919           && am->Xam.IR.reg != hregX86_ESP()
   1920           && am->Xam.IR.reg != hregX86_EBP() ) {
   1921          *p++ = mkModRegRM(0, iregNo(greg), iregNo(am->Xam.IR.reg));
   1922          return p;
   1923       }
   1924       if (fits8bits(am->Xam.IR.imm)
   1925           && am->Xam.IR.reg != hregX86_ESP()) {
   1926          *p++ = mkModRegRM(1, iregNo(greg), iregNo(am->Xam.IR.reg));
   1927          *p++ = toUChar(am->Xam.IR.imm & 0xFF);
   1928          return p;
   1929       }
   1930       if (am->Xam.IR.reg != hregX86_ESP()) {
   1931          *p++ = mkModRegRM(2, iregNo(greg), iregNo(am->Xam.IR.reg));
   1932          p = emit32(p, am->Xam.IR.imm);
   1933          return p;
   1934       }
   1935       if (am->Xam.IR.reg == hregX86_ESP()
   1936           && fits8bits(am->Xam.IR.imm)) {
   1937  	 *p++ = mkModRegRM(1, iregNo(greg), 4);
   1938          *p++ = 0x24;
   1939          *p++ = toUChar(am->Xam.IR.imm & 0xFF);
   1940          return p;
   1941       }
   1942       ppX86AMode(am);
   1943       vpanic("doAMode_M: can't emit amode IR");
   1944       /*NOTREACHED*/
   1945    }
   1946    if (am->tag == Xam_IRRS) {
   1947       if (fits8bits(am->Xam.IRRS.imm)
   1948           && am->Xam.IRRS.index != hregX86_ESP()) {
   1949          *p++ = mkModRegRM(1, iregNo(greg), 4);
   1950          *p++ = mkSIB(am->Xam.IRRS.shift, am->Xam.IRRS.index,
   1951                                           am->Xam.IRRS.base);
   1952          *p++ = toUChar(am->Xam.IRRS.imm & 0xFF);
   1953          return p;
   1954       }
   1955       if (am->Xam.IRRS.index != hregX86_ESP()) {
   1956          *p++ = mkModRegRM(2, iregNo(greg), 4);
   1957          *p++ = mkSIB(am->Xam.IRRS.shift, am->Xam.IRRS.index,
   1958                                           am->Xam.IRRS.base);
   1959          p = emit32(p, am->Xam.IRRS.imm);
   1960          return p;
   1961       }
   1962       ppX86AMode(am);
   1963       vpanic("doAMode_M: can't emit amode IRRS");
   1964       /*NOTREACHED*/
   1965    }
   1966    vpanic("doAMode_M: unknown amode");
   1967    /*NOTREACHED*/
   1968 }
   1969 
   1970 
   1971 /* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
   1972 static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
   1973 {
   1974    *p++ = mkModRegRM(3, iregNo(greg), iregNo(ereg));
   1975    return p;
   1976 }
   1977 
   1978 
   1979 /* Emit ffree %st(7) */
   1980 static UChar* do_ffree_st7 ( UChar* p )
   1981 {
   1982    *p++ = 0xDD;
   1983    *p++ = 0xC7;
   1984    return p;
   1985 }
   1986 
   1987 /* Emit fstp %st(i), 1 <= i <= 7 */
   1988 static UChar* do_fstp_st ( UChar* p, Int i )
   1989 {
   1990    vassert(1 <= i && i <= 7);
   1991    *p++ = 0xDD;
   1992    *p++ = toUChar(0xD8+i);
   1993    return p;
   1994 }
   1995 
   1996 /* Emit fld %st(i), 0 <= i <= 6 */
   1997 static UChar* do_fld_st ( UChar* p, Int i )
   1998 {
   1999    vassert(0 <= i && i <= 6);
   2000    *p++ = 0xD9;
   2001    *p++ = toUChar(0xC0+i);
   2002    return p;
   2003 }
   2004 
   2005 /* Emit f<op> %st(0) */
   2006 static UChar* do_fop1_st ( UChar* p, X86FpOp op )
   2007 {
   2008    switch (op) {
   2009       case Xfp_NEG:    *p++ = 0xD9; *p++ = 0xE0; break;
   2010       case Xfp_ABS:    *p++ = 0xD9; *p++ = 0xE1; break;
   2011       case Xfp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
   2012       case Xfp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
   2013       case Xfp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
   2014       case Xfp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
   2015       case Xfp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
   2016       case Xfp_MOV:    break;
   2017       case Xfp_TAN:    p = do_ffree_st7(p); /* since fptan pushes 1.0 */
   2018                        *p++ = 0xD9; *p++ = 0xF2; /* fptan */
   2019                        *p++ = 0xD9; *p++ = 0xF7; /* fincstp */
   2020                        break;
   2021       default: vpanic("do_fop1_st: unknown op");
   2022    }
   2023    return p;
   2024 }
   2025 
   2026 /* Emit f<op> %st(i), 1 <= i <= 5 */
   2027 static UChar* do_fop2_st ( UChar* p, X86FpOp op, Int i )
   2028 {
   2029 #  define fake(_n) mkHReg((_n), HRcInt32, False)
   2030    Int subopc;
   2031    switch (op) {
   2032       case Xfp_ADD: subopc = 0; break;
   2033       case Xfp_SUB: subopc = 4; break;
   2034       case Xfp_MUL: subopc = 1; break;
   2035       case Xfp_DIV: subopc = 6; break;
   2036       default: vpanic("do_fop2_st: unknown op");
   2037    }
   2038    *p++ = 0xD8;
   2039    p    = doAMode_R(p, fake(subopc), fake(i));
   2040    return p;
   2041 #  undef fake
   2042 }
   2043 
   2044 /* Push a 32-bit word on the stack.  The word depends on tags[3:0];
   2045 each byte is either 0x00 or 0xFF depending on the corresponding bit in tags[].
   2046 */
   2047 static UChar* push_word_from_tags ( UChar* p, UShort tags )
   2048 {
   2049    UInt w;
   2050    vassert(0 == (tags & ~0xF));
   2051    if (tags == 0) {
   2052       /* pushl $0x00000000 */
   2053       *p++ = 0x6A;
   2054       *p++ = 0x00;
   2055    }
   2056    else
   2057    /* pushl $0xFFFFFFFF */
   2058    if (tags == 0xF) {
   2059       *p++ = 0x6A;
   2060       *p++ = 0xFF;
   2061    } else {
   2062       vassert(0); /* awaiting test case */
   2063       w = 0;
   2064       if (tags & 1) w |= 0x000000FF;
   2065       if (tags & 2) w |= 0x0000FF00;
   2066       if (tags & 4) w |= 0x00FF0000;
   2067       if (tags & 8) w |= 0xFF000000;
   2068       *p++ = 0x68;
   2069       p = emit32(p, w);
   2070    }
   2071    return p;
   2072 }
   2073 
   2074 /* Emit an instruction into buf and return the number of bytes used.
   2075    Note that buf is not the insn's final place, and therefore it is
   2076    imperative to emit position-independent code.  If the emitted
   2077    instruction was a profiler inc, set *is_profInc to True, else
   2078    leave it unchanged. */
   2079 
   2080 Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc,
   2081                     UChar* buf, Int nbuf, X86Instr* i,
   2082                     Bool mode64,
   2083                     void* disp_cp_chain_me_to_slowEP,
   2084                     void* disp_cp_chain_me_to_fastEP,
   2085                     void* disp_cp_xindir,
   2086                     void* disp_cp_xassisted )
   2087 {
   2088    UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
   2089 
   2090    UInt   xtra;
   2091    UChar* p = &buf[0];
   2092    UChar* ptmp;
   2093    vassert(nbuf >= 32);
   2094    vassert(mode64 == False);
   2095 
   2096    /* Wrap an integer as a int register, for use assembling
   2097       GrpN insns, in which the greg field is used as a sub-opcode
   2098       and does not really contain a register. */
   2099 #  define fake(_n) mkHReg((_n), HRcInt32, False)
   2100 
   2101    /* vex_printf("asm  ");ppX86Instr(i, mode64); vex_printf("\n"); */
   2102 
   2103    switch (i->tag) {
   2104 
   2105    case Xin_Alu32R:
   2106       /* Deal specially with MOV */
   2107       if (i->Xin.Alu32R.op == Xalu_MOV) {
   2108          switch (i->Xin.Alu32R.src->tag) {
   2109             case Xrmi_Imm:
   2110                *p++ = toUChar(0xB8 + iregNo(i->Xin.Alu32R.dst));
   2111                p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
   2112                goto done;
   2113             case Xrmi_Reg:
   2114                *p++ = 0x89;
   2115                p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
   2116                                 i->Xin.Alu32R.dst);
   2117                goto done;
   2118             case Xrmi_Mem:
   2119                *p++ = 0x8B;
   2120                p = doAMode_M(p, i->Xin.Alu32R.dst,
   2121                                 i->Xin.Alu32R.src->Xrmi.Mem.am);
   2122                goto done;
   2123             default:
   2124                goto bad;
   2125          }
   2126       }
   2127       /* MUL */
   2128       if (i->Xin.Alu32R.op == Xalu_MUL) {
   2129          switch (i->Xin.Alu32R.src->tag) {
   2130             case Xrmi_Reg:
   2131                *p++ = 0x0F;
   2132                *p++ = 0xAF;
   2133                p = doAMode_R(p, i->Xin.Alu32R.dst,
   2134                                 i->Xin.Alu32R.src->Xrmi.Reg.reg);
   2135                goto done;
   2136             case Xrmi_Mem:
   2137                *p++ = 0x0F;
   2138                *p++ = 0xAF;
   2139                p = doAMode_M(p, i->Xin.Alu32R.dst,
   2140                                 i->Xin.Alu32R.src->Xrmi.Mem.am);
   2141                goto done;
   2142             case Xrmi_Imm:
   2143                if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
   2144                   *p++ = 0x6B;
   2145                   p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
   2146                   *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
   2147                } else {
   2148                   *p++ = 0x69;
   2149                   p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
   2150                   p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
   2151                }
   2152                goto done;
   2153             default:
   2154                goto bad;
   2155          }
   2156       }
   2157       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
   2158       opc = opc_rr = subopc_imm = opc_imma = 0;
   2159       switch (i->Xin.Alu32R.op) {
   2160          case Xalu_ADC: opc = 0x13; opc_rr = 0x11;
   2161                         subopc_imm = 2; opc_imma = 0x15; break;
   2162          case Xalu_ADD: opc = 0x03; opc_rr = 0x01;
   2163                         subopc_imm = 0; opc_imma = 0x05; break;
   2164          case Xalu_SUB: opc = 0x2B; opc_rr = 0x29;
   2165                         subopc_imm = 5; opc_imma = 0x2D; break;
   2166          case Xalu_SBB: opc = 0x1B; opc_rr = 0x19;
   2167                         subopc_imm = 3; opc_imma = 0x1D; break;
   2168          case Xalu_AND: opc = 0x23; opc_rr = 0x21;
   2169                         subopc_imm = 4; opc_imma = 0x25; break;
   2170          case Xalu_XOR: opc = 0x33; opc_rr = 0x31;
   2171                         subopc_imm = 6; opc_imma = 0x35; break;
   2172          case Xalu_OR:  opc = 0x0B; opc_rr = 0x09;
   2173                         subopc_imm = 1; opc_imma = 0x0D; break;
   2174          case Xalu_CMP: opc = 0x3B; opc_rr = 0x39;
   2175                         subopc_imm = 7; opc_imma = 0x3D; break;
   2176          default: goto bad;
   2177       }
   2178       switch (i->Xin.Alu32R.src->tag) {
   2179          case Xrmi_Imm:
   2180             if (i->Xin.Alu32R.dst == hregX86_EAX()
   2181                 && !fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
   2182                *p++ = toUChar(opc_imma);
   2183                p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
   2184             } else
   2185             if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
   2186                *p++ = 0x83;
   2187                p    = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
   2188                *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
   2189             } else {
   2190                *p++ = 0x81;
   2191                p    = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
   2192                p    = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
   2193             }
   2194             goto done;
   2195          case Xrmi_Reg:
   2196             *p++ = toUChar(opc_rr);
   2197             p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
   2198                              i->Xin.Alu32R.dst);
   2199             goto done;
   2200          case Xrmi_Mem:
   2201             *p++ = toUChar(opc);
   2202             p = doAMode_M(p, i->Xin.Alu32R.dst,
   2203                              i->Xin.Alu32R.src->Xrmi.Mem.am);
   2204             goto done;
   2205          default:
   2206             goto bad;
   2207       }
   2208       break;
   2209 
   2210    case Xin_Alu32M:
   2211       /* Deal specially with MOV */
   2212       if (i->Xin.Alu32M.op == Xalu_MOV) {
   2213          switch (i->Xin.Alu32M.src->tag) {
   2214             case Xri_Reg:
   2215                *p++ = 0x89;
   2216                p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
   2217                                 i->Xin.Alu32M.dst);
   2218                goto done;
   2219             case Xri_Imm:
   2220                *p++ = 0xC7;
   2221                p = doAMode_M(p, fake(0), i->Xin.Alu32M.dst);
   2222                p = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
   2223                goto done;
   2224             default:
   2225                goto bad;
   2226          }
   2227       }
   2228       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not
   2229          allowed here. */
   2230       opc = subopc_imm = opc_imma = 0;
   2231       switch (i->Xin.Alu32M.op) {
   2232          case Xalu_ADD: opc = 0x01; subopc_imm = 0; break;
   2233          case Xalu_SUB: opc = 0x29; subopc_imm = 5; break;
   2234          case Xalu_CMP: opc = 0x39; subopc_imm = 7; break;
   2235          default: goto bad;
   2236       }
   2237       switch (i->Xin.Alu32M.src->tag) {
   2238          case Xri_Reg:
   2239             *p++ = toUChar(opc);
   2240             p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
   2241                              i->Xin.Alu32M.dst);
   2242             goto done;
   2243          case Xri_Imm:
   2244             if (fits8bits(i->Xin.Alu32M.src->Xri.Imm.imm32)) {
   2245                *p++ = 0x83;
   2246                p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
   2247                *p++ = toUChar(0xFF & i->Xin.Alu32M.src->Xri.Imm.imm32);
   2248                goto done;
   2249             } else {
   2250                *p++ = 0x81;
   2251                p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
   2252                p    = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
   2253                goto done;
   2254             }
   2255          default:
   2256             goto bad;
   2257       }
   2258       break;
   2259 
   2260    case Xin_Sh32:
   2261       opc_cl = opc_imm = subopc = 0;
   2262       switch (i->Xin.Sh32.op) {
   2263          case Xsh_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
   2264          case Xsh_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
   2265          case Xsh_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
   2266          default: goto bad;
   2267       }
   2268       if (i->Xin.Sh32.src == 0) {
   2269          *p++ = toUChar(opc_cl);
   2270          p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
   2271       } else {
   2272          *p++ = toUChar(opc_imm);
   2273          p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
   2274          *p++ = (UChar)(i->Xin.Sh32.src);
   2275       }
   2276       goto done;
   2277 
   2278    case Xin_Test32:
   2279       if (i->Xin.Test32.dst->tag == Xrm_Reg) {
   2280          /* testl $imm32, %reg */
   2281          *p++ = 0xF7;
   2282          p = doAMode_R(p, fake(0), i->Xin.Test32.dst->Xrm.Reg.reg);
   2283          p = emit32(p, i->Xin.Test32.imm32);
   2284          goto done;
   2285       } else {
   2286          /* testl $imm32, amode */
   2287          *p++ = 0xF7;
   2288          p = doAMode_M(p, fake(0), i->Xin.Test32.dst->Xrm.Mem.am);
   2289          p = emit32(p, i->Xin.Test32.imm32);
   2290          goto done;
   2291       }
   2292 
   2293    case Xin_Unary32:
   2294       if (i->Xin.Unary32.op == Xun_NOT) {
   2295          *p++ = 0xF7;
   2296          p = doAMode_R(p, fake(2), i->Xin.Unary32.dst);
   2297          goto done;
   2298       }
   2299       if (i->Xin.Unary32.op == Xun_NEG) {
   2300          *p++ = 0xF7;
   2301          p = doAMode_R(p, fake(3), i->Xin.Unary32.dst);
   2302          goto done;
   2303       }
   2304       break;
   2305 
   2306    case Xin_Lea32:
   2307       *p++ = 0x8D;
   2308       p = doAMode_M(p, i->Xin.Lea32.dst, i->Xin.Lea32.am);
   2309       goto done;
   2310 
   2311    case Xin_MulL:
   2312       subopc = i->Xin.MulL.syned ? 5 : 4;
   2313       *p++ = 0xF7;
   2314       switch (i->Xin.MulL.src->tag)  {
   2315          case Xrm_Mem:
   2316             p = doAMode_M(p, fake(subopc),
   2317                              i->Xin.MulL.src->Xrm.Mem.am);
   2318             goto done;
   2319          case Xrm_Reg:
   2320             p = doAMode_R(p, fake(subopc),
   2321                              i->Xin.MulL.src->Xrm.Reg.reg);
   2322             goto done;
   2323          default:
   2324             goto bad;
   2325       }
   2326       break;
   2327 
   2328    case Xin_Div:
   2329       subopc = i->Xin.Div.syned ? 7 : 6;
   2330       *p++ = 0xF7;
   2331       switch (i->Xin.Div.src->tag)  {
   2332          case Xrm_Mem:
   2333             p = doAMode_M(p, fake(subopc),
   2334                              i->Xin.Div.src->Xrm.Mem.am);
   2335             goto done;
   2336          case Xrm_Reg:
   2337             p = doAMode_R(p, fake(subopc),
   2338                              i->Xin.Div.src->Xrm.Reg.reg);
   2339             goto done;
   2340          default:
   2341             goto bad;
   2342       }
   2343       break;
   2344 
   2345    case Xin_Sh3232:
   2346       vassert(i->Xin.Sh3232.op == Xsh_SHL || i->Xin.Sh3232.op == Xsh_SHR);
   2347       if (i->Xin.Sh3232.amt == 0) {
   2348          /* shldl/shrdl by %cl */
   2349          *p++ = 0x0F;
   2350          if (i->Xin.Sh3232.op == Xsh_SHL) {
   2351             *p++ = 0xA5;
   2352          } else {
   2353             *p++ = 0xAD;
   2354          }
   2355          p = doAMode_R(p, i->Xin.Sh3232.src, i->Xin.Sh3232.dst);
   2356          goto done;
   2357       }
   2358       break;
   2359 
   2360    case Xin_Push:
   2361       switch (i->Xin.Push.src->tag) {
   2362          case Xrmi_Mem:
   2363             *p++ = 0xFF;
   2364             p = doAMode_M(p, fake(6), i->Xin.Push.src->Xrmi.Mem.am);
   2365             goto done;
   2366          case Xrmi_Imm:
   2367             *p++ = 0x68;
   2368             p = emit32(p, i->Xin.Push.src->Xrmi.Imm.imm32);
   2369             goto done;
   2370          case Xrmi_Reg:
   2371             *p++ = toUChar(0x50 + iregNo(i->Xin.Push.src->Xrmi.Reg.reg));
   2372             goto done;
   2373         default:
   2374             goto bad;
   2375       }
   2376 
   2377    case Xin_Call:
   2378       /* See detailed comment for Xin_Call in getRegUsage_X86Instr above
   2379          for explanation of this. */
   2380       switch (i->Xin.Call.regparms) {
   2381          case 0: irno = iregNo(hregX86_EAX()); break;
   2382          case 1: irno = iregNo(hregX86_EDX()); break;
   2383          case 2: irno = iregNo(hregX86_ECX()); break;
   2384          case 3: irno = iregNo(hregX86_EDI()); break;
   2385          default: vpanic(" emit_X86Instr:call:regparms");
   2386       }
   2387       /* jump over the following two insns if the condition does not
   2388          hold */
   2389       if (i->Xin.Call.cond != Xcc_ALWAYS) {
   2390          *p++ = toUChar(0x70 + (0xF & (i->Xin.Call.cond ^ 1)));
   2391          *p++ = 0x07; /* 7 bytes in the next two insns */
   2392       }
   2393       /* movl $target, %tmp */
   2394       *p++ = toUChar(0xB8 + irno);
   2395       p = emit32(p, i->Xin.Call.target);
   2396       /* call *%tmp */
   2397       *p++ = 0xFF;
   2398       *p++ = toUChar(0xD0 + irno);
   2399       goto done;
   2400 
   2401    case Xin_XDirect: {
   2402       /* NB: what goes on here has to be very closely coordinated with the
   2403          chainXDirect_X86 and unchainXDirect_X86 below. */
   2404       /* We're generating chain-me requests here, so we need to be
   2405          sure this is actually allowed -- no-redir translations can't
   2406          use chain-me's.  Hence: */
   2407       vassert(disp_cp_chain_me_to_slowEP != NULL);
   2408       vassert(disp_cp_chain_me_to_fastEP != NULL);
   2409 
   2410       /* Use ptmp for backpatching conditional jumps. */
   2411       ptmp = NULL;
   2412 
   2413       /* First off, if this is conditional, create a conditional
   2414          jump over the rest of it. */
   2415       if (i->Xin.XDirect.cond != Xcc_ALWAYS) {
   2416          /* jmp fwds if !condition */
   2417          *p++ = toUChar(0x70 + (0xF & (i->Xin.XDirect.cond ^ 1)));
   2418          ptmp = p; /* fill in this bit later */
   2419          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2420       }
   2421 
   2422       /* Update the guest EIP. */
   2423       /* movl $dstGA, amEIP */
   2424       *p++ = 0xC7;
   2425       p    = doAMode_M(p, fake(0), i->Xin.XDirect.amEIP);
   2426       p    = emit32(p, i->Xin.XDirect.dstGA);
   2427 
   2428       /* --- FIRST PATCHABLE BYTE follows --- */
   2429       /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
   2430          to) backs up the return address, so as to find the address of
   2431          the first patchable byte.  So: don't change the length of the
   2432          two instructions below. */
   2433       /* movl $disp_cp_chain_me_to_{slow,fast}EP,%edx; */
   2434       *p++ = 0xBA;
   2435       void* disp_cp_chain_me
   2436                = i->Xin.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
   2437                                          : disp_cp_chain_me_to_slowEP;
   2438       p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_chain_me));
   2439       /* call *%edx */
   2440       *p++ = 0xFF;
   2441       *p++ = 0xD2;
   2442       /* --- END of PATCHABLE BYTES --- */
   2443 
   2444       /* Fix up the conditional jump, if there was one. */
   2445       if (i->Xin.XDirect.cond != Xcc_ALWAYS) {
   2446          Int delta = p - ptmp;
   2447          vassert(delta > 0 && delta < 40);
   2448          *ptmp = toUChar(delta-1);
   2449       }
   2450       goto done;
   2451    }
   2452 
   2453    case Xin_XIndir: {
   2454       /* We're generating transfers that could lead indirectly to a
   2455          chain-me, so we need to be sure this is actually allowed --
   2456          no-redir translations are not allowed to reach normal
   2457          translations without going through the scheduler.  That means
   2458          no XDirects or XIndirs out from no-redir translations.
   2459          Hence: */
   2460       vassert(disp_cp_xindir != NULL);
   2461 
   2462       /* Use ptmp for backpatching conditional jumps. */
   2463       ptmp = NULL;
   2464 
   2465       /* First off, if this is conditional, create a conditional
   2466          jump over the rest of it. */
   2467       if (i->Xin.XIndir.cond != Xcc_ALWAYS) {
   2468          /* jmp fwds if !condition */
   2469          *p++ = toUChar(0x70 + (0xF & (i->Xin.XIndir.cond ^ 1)));
   2470          ptmp = p; /* fill in this bit later */
   2471          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2472       }
   2473 
   2474       /* movl dstGA(a reg), amEIP -- copied from Alu32M MOV case */
   2475       *p++ = 0x89;
   2476       p = doAMode_M(p, i->Xin.XIndir.dstGA, i->Xin.XIndir.amEIP);
   2477 
   2478       /* movl $disp_indir, %edx */
   2479       *p++ = 0xBA;
   2480       p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xindir));
   2481       /* jmp *%edx */
   2482       *p++ = 0xFF;
   2483       *p++ = 0xE2;
   2484 
   2485       /* Fix up the conditional jump, if there was one. */
   2486       if (i->Xin.XIndir.cond != Xcc_ALWAYS) {
   2487          Int delta = p - ptmp;
   2488          vassert(delta > 0 && delta < 40);
   2489          *ptmp = toUChar(delta-1);
   2490       }
   2491       goto done;
   2492    }
   2493 
   2494    case Xin_XAssisted: {
   2495       /* Use ptmp for backpatching conditional jumps. */
   2496       ptmp = NULL;
   2497 
   2498       /* First off, if this is conditional, create a conditional
   2499          jump over the rest of it. */
   2500       if (i->Xin.XAssisted.cond != Xcc_ALWAYS) {
   2501          /* jmp fwds if !condition */
   2502          *p++ = toUChar(0x70 + (0xF & (i->Xin.XAssisted.cond ^ 1)));
   2503          ptmp = p; /* fill in this bit later */
   2504          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2505       }
   2506 
   2507       /* movl dstGA(a reg), amEIP -- copied from Alu32M MOV case */
   2508       *p++ = 0x89;
   2509       p = doAMode_M(p, i->Xin.XIndir.dstGA, i->Xin.XIndir.amEIP);
   2510       /* movl $magic_number, %ebp. */
   2511       UInt trcval = 0;
   2512       switch (i->Xin.XAssisted.jk) {
   2513          case Ijk_ClientReq:    trcval = VEX_TRC_JMP_CLIENTREQ;    break;
   2514          case Ijk_Sys_syscall:  trcval = VEX_TRC_JMP_SYS_SYSCALL;  break;
   2515          case Ijk_Sys_int128:   trcval = VEX_TRC_JMP_SYS_INT128;   break;
   2516          case Ijk_Sys_int129:   trcval = VEX_TRC_JMP_SYS_INT129;   break;
   2517          case Ijk_Sys_int130:   trcval = VEX_TRC_JMP_SYS_INT130;   break;
   2518          case Ijk_Sys_sysenter: trcval = VEX_TRC_JMP_SYS_SYSENTER; break;
   2519          case Ijk_Yield:        trcval = VEX_TRC_JMP_YIELD;        break;
   2520          case Ijk_EmWarn:       trcval = VEX_TRC_JMP_EMWARN;       break;
   2521          case Ijk_MapFail:      trcval = VEX_TRC_JMP_MAPFAIL;      break;
   2522          case Ijk_NoDecode:     trcval = VEX_TRC_JMP_NODECODE;     break;
   2523          case Ijk_TInval:       trcval = VEX_TRC_JMP_TINVAL;       break;
   2524          case Ijk_NoRedir:      trcval = VEX_TRC_JMP_NOREDIR;      break;
   2525          case Ijk_SigTRAP:      trcval = VEX_TRC_JMP_SIGTRAP;      break;
   2526          case Ijk_SigSEGV:      trcval = VEX_TRC_JMP_SIGSEGV;      break;
   2527          case Ijk_Boring:       trcval = VEX_TRC_JMP_BORING;       break;
   2528          /* We don't expect to see the following being assisted. */
   2529          case Ijk_Ret:
   2530          case Ijk_Call:
   2531          /* fallthrough */
   2532          default:
   2533             ppIRJumpKind(i->Xin.XAssisted.jk);
   2534             vpanic("emit_X86Instr.Xin_XAssisted: unexpected jump kind");
   2535       }
   2536       vassert(trcval != 0);
   2537       *p++ = 0xBD;
   2538       p = emit32(p, trcval);
   2539 
   2540       /* movl $disp_cp_xassisted, %edx */
   2541       *p++ = 0xBA;
   2542       p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xassisted));
   2543       /* jmp *%edx */
   2544       *p++ = 0xFF;
   2545       *p++ = 0xE2;
   2546 
   2547       /* Fix up the conditional jump, if there was one. */
   2548       if (i->Xin.XAssisted.cond != Xcc_ALWAYS) {
   2549          Int delta = p - ptmp;
   2550          vassert(delta > 0 && delta < 40);
   2551          *ptmp = toUChar(delta-1);
   2552       }
   2553       goto done;
   2554    }
   2555 
   2556    case Xin_CMov32:
   2557       vassert(i->Xin.CMov32.cond != Xcc_ALWAYS);
   2558 
   2559       /* This generates cmov, which is illegal on P54/P55. */
   2560       /*
   2561       *p++ = 0x0F;
   2562       *p++ = toUChar(0x40 + (0xF & i->Xin.CMov32.cond));
   2563       if (i->Xin.CMov32.src->tag == Xrm_Reg) {
   2564          p = doAMode_R(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Reg.reg);
   2565          goto done;
   2566       }
   2567       if (i->Xin.CMov32.src->tag == Xrm_Mem) {
   2568          p = doAMode_M(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Mem.am);
   2569          goto done;
   2570       }
   2571       */
   2572 
   2573       /* Alternative version which works on any x86 variant. */
   2574       /* jmp fwds if !condition */
   2575       *p++ = toUChar(0x70 + (i->Xin.CMov32.cond ^ 1));
   2576       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
   2577       ptmp = p;
   2578 
   2579       switch (i->Xin.CMov32.src->tag) {
   2580          case Xrm_Reg:
   2581             /* Big sigh.  This is movl E -> G ... */
   2582             *p++ = 0x89;
   2583             p = doAMode_R(p, i->Xin.CMov32.src->Xrm.Reg.reg,
   2584                              i->Xin.CMov32.dst);
   2585 
   2586             break;
   2587          case Xrm_Mem:
   2588             /* ... whereas this is movl G -> E.  That's why the args
   2589                to doAMode_R appear to be the wrong way round in the
   2590                Xrm_Reg case. */
   2591             *p++ = 0x8B;
   2592             p = doAMode_M(p, i->Xin.CMov32.dst,
   2593                              i->Xin.CMov32.src->Xrm.Mem.am);
   2594             break;
   2595          default:
   2596             goto bad;
   2597       }
   2598       /* Fill in the jump offset. */
   2599       *(ptmp-1) = toUChar(p - ptmp);
   2600       goto done;
   2601 
   2602       break;
   2603 
   2604    case Xin_LoadEX:
   2605       if (i->Xin.LoadEX.szSmall == 1 && !i->Xin.LoadEX.syned) {
   2606          /* movzbl */
   2607          *p++ = 0x0F;
   2608          *p++ = 0xB6;
   2609          p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
   2610          goto done;
   2611       }
   2612       if (i->Xin.LoadEX.szSmall == 2 && !i->Xin.LoadEX.syned) {
   2613          /* movzwl */
   2614          *p++ = 0x0F;
   2615          *p++ = 0xB7;
   2616          p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
   2617          goto done;
   2618       }
   2619       if (i->Xin.LoadEX.szSmall == 1 && i->Xin.LoadEX.syned) {
   2620          /* movsbl */
   2621          *p++ = 0x0F;
   2622          *p++ = 0xBE;
   2623          p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
   2624          goto done;
   2625       }
   2626       break;
   2627 
   2628    case Xin_Set32:
   2629       /* Make the destination register be 1 or 0, depending on whether
   2630          the relevant condition holds.  We have to dodge and weave
   2631          when the destination is %esi or %edi as we cannot directly
   2632          emit the native 'setb %reg' for those.  Further complication:
   2633          the top 24 bits of the destination should be forced to zero,
   2634          but doing 'xor %r,%r' kills the flag(s) we are about to read.
   2635          Sigh.  So start off by moving $0 into the dest. */
   2636 
   2637       /* Do we need to swap in %eax? */
   2638       if (iregNo(i->Xin.Set32.dst) >= 4) {
   2639          /* xchg %eax, %dst */
   2640          *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
   2641          /* movl $0, %eax */
   2642          *p++ =toUChar(0xB8 + iregNo(hregX86_EAX()));
   2643          p = emit32(p, 0);
   2644          /* setb lo8(%eax) */
   2645          *p++ = 0x0F;
   2646          *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
   2647          p = doAMode_R(p, fake(0), hregX86_EAX());
   2648          /* xchg %eax, %dst */
   2649          *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
   2650       } else {
   2651          /* movl $0, %dst */
   2652          *p++ = toUChar(0xB8 + iregNo(i->Xin.Set32.dst));
   2653          p = emit32(p, 0);
   2654          /* setb lo8(%dst) */
   2655          *p++ = 0x0F;
   2656          *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
   2657          p = doAMode_R(p, fake(0), i->Xin.Set32.dst);
   2658       }
   2659       goto done;
   2660 
   2661    case Xin_Bsfr32:
   2662       *p++ = 0x0F;
   2663       if (i->Xin.Bsfr32.isFwds) {
   2664          *p++ = 0xBC;
   2665       } else {
   2666          *p++ = 0xBD;
   2667       }
   2668       p = doAMode_R(p, i->Xin.Bsfr32.dst, i->Xin.Bsfr32.src);
   2669       goto done;
   2670 
   2671    case Xin_MFence:
   2672       /* see comment in hdefs.h re this insn */
   2673       if (0) vex_printf("EMIT FENCE\n");
   2674       if (i->Xin.MFence.hwcaps & (VEX_HWCAPS_X86_SSE3
   2675                                   |VEX_HWCAPS_X86_SSE2)) {
   2676          /* mfence */
   2677          *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
   2678          goto done;
   2679       }
   2680       if (i->Xin.MFence.hwcaps & VEX_HWCAPS_X86_SSE1) {
   2681          /* sfence */
   2682          *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF8;
   2683          /* lock addl $0,0(%esp) */
   2684          *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
   2685          *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
   2686          goto done;
   2687       }
   2688       if (i->Xin.MFence.hwcaps == 0/*baseline, no SSE*/) {
   2689          /* lock addl $0,0(%esp) */
   2690          *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
   2691          *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
   2692          goto done;
   2693       }
   2694       vpanic("emit_X86Instr:mfence:hwcaps");
   2695       /*NOTREACHED*/
   2696       break;
   2697 
   2698    case Xin_ACAS:
   2699       /* lock */
   2700       *p++ = 0xF0;
   2701       /* cmpxchg{b,w,l} %ebx,mem.  Expected-value in %eax, new value
   2702          in %ebx.  The new-value register is hardwired to be %ebx
   2703          since letting it be any integer register gives the problem
   2704          that %sil and %dil are unaddressable on x86 and hence we
   2705          would have to resort to the same kind of trickery as with
   2706          byte-sized Xin.Store, just below.  Given that this isn't
   2707          performance critical, it is simpler just to force the
   2708          register operand to %ebx (could equally be %ecx or %edx).
   2709          (Although %ebx is more consistent with cmpxchg8b.) */
   2710       if (i->Xin.ACAS.sz == 2) *p++ = 0x66;
   2711       *p++ = 0x0F;
   2712       if (i->Xin.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
   2713       p = doAMode_M(p, hregX86_EBX(), i->Xin.ACAS.addr);
   2714       goto done;
   2715 
   2716    case Xin_DACAS:
   2717       /* lock */
   2718       *p++ = 0xF0;
   2719       /* cmpxchg8b m64.  Expected-value in %edx:%eax, new value
   2720          in %ecx:%ebx.  All 4 regs are hardwired in the ISA, so
   2721          aren't encoded in the insn. */
   2722       *p++ = 0x0F;
   2723       *p++ = 0xC7;
   2724       p = doAMode_M(p, fake(1), i->Xin.DACAS.addr);
   2725       goto done;
   2726 
   2727    case Xin_Store:
   2728       if (i->Xin.Store.sz == 2) {
   2729          /* This case, at least, is simple, given that we can
   2730             reference the low 16 bits of any integer register. */
   2731          *p++ = 0x66;
   2732          *p++ = 0x89;
   2733          p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
   2734          goto done;
   2735       }
   2736 
   2737       if (i->Xin.Store.sz == 1) {
   2738          /* We have to do complex dodging and weaving if src is not
   2739             the low 8 bits of %eax/%ebx/%ecx/%edx. */
   2740          if (iregNo(i->Xin.Store.src) < 4) {
   2741             /* we're OK, can do it directly */
   2742             *p++ = 0x88;
   2743             p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
   2744            goto done;
   2745          } else {
   2746             /* Bleh.  This means the source is %edi or %esi.  Since
   2747                the address mode can only mention three registers, at
   2748                least one of %eax/%ebx/%ecx/%edx must be available to
   2749                temporarily swap the source into, so the store can
   2750                happen.  So we have to look at the regs mentioned
   2751                in the amode. */
   2752             HReg swap = INVALID_HREG;
   2753             HReg  eax = hregX86_EAX(), ebx = hregX86_EBX(),
   2754                   ecx = hregX86_ECX(), edx = hregX86_EDX();
   2755             Bool a_ok = True, b_ok = True, c_ok = True, d_ok = True;
   2756             HRegUsage u;
   2757             Int j;
   2758             initHRegUsage(&u);
   2759             addRegUsage_X86AMode(&u,  i->Xin.Store.dst);
   2760             for (j = 0; j < u.n_used; j++) {
   2761                HReg r = u.hreg[j];
   2762                if (r == eax) a_ok = False;
   2763                if (r == ebx) b_ok = False;
   2764                if (r == ecx) c_ok = False;
   2765                if (r == edx) d_ok = False;
   2766             }
   2767             if (a_ok) swap = eax;
   2768             if (b_ok) swap = ebx;
   2769             if (c_ok) swap = ecx;
   2770             if (d_ok) swap = edx;
   2771             vassert(swap != INVALID_HREG);
   2772             /* xchgl %source, %swap. Could do better if swap is %eax. */
   2773             *p++ = 0x87;
   2774             p = doAMode_R(p, i->Xin.Store.src, swap);
   2775             /* movb lo8{%swap}, (dst) */
   2776             *p++ = 0x88;
   2777             p = doAMode_M(p, swap, i->Xin.Store.dst);
   2778             /* xchgl %source, %swap. Could do better if swap is %eax. */
   2779             *p++ = 0x87;
   2780             p = doAMode_R(p, i->Xin.Store.src, swap);
   2781             goto done;
   2782          }
   2783       } /* if (i->Xin.Store.sz == 1) */
   2784       break;
   2785 
   2786    case Xin_FpUnary:
   2787       /* gop %src, %dst
   2788          --> ffree %st7 ; fld %st(src) ; fop %st(0) ; fstp %st(1+dst)
   2789       */
   2790       p = do_ffree_st7(p);
   2791       p = do_fld_st(p, 0+hregNumber(i->Xin.FpUnary.src));
   2792       p = do_fop1_st(p, i->Xin.FpUnary.op);
   2793       p = do_fstp_st(p, 1+hregNumber(i->Xin.FpUnary.dst));
   2794       goto done;
   2795 
   2796    case Xin_FpBinary:
   2797       if (i->Xin.FpBinary.op == Xfp_YL2X
   2798           || i->Xin.FpBinary.op == Xfp_YL2XP1) {
   2799          /* Have to do this specially. */
   2800          /* ffree %st7 ; fld %st(srcL) ;
   2801             ffree %st7 ; fld %st(srcR+1) ; fyl2x{p1} ; fstp(1+dst) */
   2802          p = do_ffree_st7(p);
   2803          p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
   2804          p = do_ffree_st7(p);
   2805          p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
   2806          *p++ = 0xD9;
   2807          *p++ = toUChar(i->Xin.FpBinary.op==Xfp_YL2X ? 0xF1 : 0xF9);
   2808          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
   2809          goto done;
   2810       }
   2811       if (i->Xin.FpBinary.op == Xfp_ATAN) {
   2812          /* Have to do this specially. */
   2813          /* ffree %st7 ; fld %st(srcL) ;
   2814             ffree %st7 ; fld %st(srcR+1) ; fpatan ; fstp(1+dst) */
   2815          p = do_ffree_st7(p);
   2816          p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
   2817          p = do_ffree_st7(p);
   2818          p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
   2819          *p++ = 0xD9; *p++ = 0xF3;
   2820          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
   2821          goto done;
   2822       }
   2823       if (i->Xin.FpBinary.op == Xfp_PREM
   2824           || i->Xin.FpBinary.op == Xfp_PREM1
   2825           || i->Xin.FpBinary.op == Xfp_SCALE) {
   2826          /* Have to do this specially. */
   2827          /* ffree %st7 ; fld %st(srcR) ;
   2828             ffree %st7 ; fld %st(srcL+1) ; fprem/fprem1/fscale ; fstp(2+dst) ;
   2829             fincstp ; ffree %st7 */
   2830          p = do_ffree_st7(p);
   2831          p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcR));
   2832          p = do_ffree_st7(p);
   2833          p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcL));
   2834          *p++ = 0xD9;
   2835          switch (i->Xin.FpBinary.op) {
   2836             case Xfp_PREM: *p++ = 0xF8; break;
   2837             case Xfp_PREM1: *p++ = 0xF5; break;
   2838             case Xfp_SCALE: *p++ =  0xFD; break;
   2839             default: vpanic("emitX86Instr(FpBinary,PREM/PREM1/SCALE)");
   2840          }
   2841          p = do_fstp_st(p, 2+hregNumber(i->Xin.FpBinary.dst));
   2842          *p++ = 0xD9; *p++ = 0xF7;
   2843          p = do_ffree_st7(p);
   2844          goto done;
   2845       }
   2846       /* General case */
   2847       /* gop %srcL, %srcR, %dst
   2848          --> ffree %st7 ; fld %st(srcL) ; fop %st(1+srcR) ; fstp %st(1+dst)
   2849       */
   2850       p = do_ffree_st7(p);
   2851       p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
   2852       p = do_fop2_st(p, i->Xin.FpBinary.op,
   2853                         1+hregNumber(i->Xin.FpBinary.srcR));
   2854       p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
   2855       goto done;
   2856 
   2857    case Xin_FpLdSt:
   2858       if (i->Xin.FpLdSt.isLoad) {
   2859          /* Load from memory into %fakeN.
   2860             --> ffree %st(7) ; fld{s/l/t} amode ; fstp st(N+1)
   2861          */
   2862          p = do_ffree_st7(p);
   2863          switch (i->Xin.FpLdSt.sz) {
   2864             case 4:
   2865                *p++ = 0xD9;
   2866                p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
   2867                break;
   2868             case 8:
   2869                *p++ = 0xDD;
   2870                p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
   2871                break;
   2872             case 10:
   2873                *p++ = 0xDB;
   2874                p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdSt.addr);
   2875                break;
   2876             default:
   2877                vpanic("emitX86Instr(FpLdSt,load)");
   2878          }
   2879          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
   2880          goto done;
   2881       } else {
   2882          /* Store from %fakeN into memory.
   2883             --> ffree %st(7) ; fld st(N) ; fstp{s/l/t} amode
   2884 	 */
   2885          p = do_ffree_st7(p);
   2886          p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
   2887          switch (i->Xin.FpLdSt.sz) {
   2888             case 4:
   2889                *p++ = 0xD9;
   2890                p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
   2891                break;
   2892             case 8:
   2893                *p++ = 0xDD;
   2894                p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
   2895                break;
   2896             case 10:
   2897                *p++ = 0xDB;
   2898                p = doAMode_M(p, fake(7)/*subopcode*/, i->Xin.FpLdSt.addr);
   2899                break;
   2900             default:
   2901                vpanic("emitX86Instr(FpLdSt,store)");
   2902          }
   2903          goto done;
   2904       }
   2905       break;
   2906 
   2907    case Xin_FpLdStI:
   2908       if (i->Xin.FpLdStI.isLoad) {
   2909          /* Load from memory into %fakeN, converting from an int.
   2910             --> ffree %st(7) ; fild{w/l/ll} amode ; fstp st(N+1)
   2911          */
   2912          switch (i->Xin.FpLdStI.sz) {
   2913             case 8:  opc = 0xDF; subopc_imm = 5; break;
   2914             case 4:  opc = 0xDB; subopc_imm = 0; break;
   2915             case 2:  vassert(0); opc = 0xDF; subopc_imm = 0; break;
   2916             default: vpanic("emitX86Instr(Xin_FpLdStI-load)");
   2917          }
   2918          p = do_ffree_st7(p);
   2919          *p++ = toUChar(opc);
   2920          p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
   2921          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdStI.reg));
   2922          goto done;
   2923       } else {
   2924          /* Store from %fakeN into memory, converting to an int.
   2925             --> ffree %st(7) ; fld st(N) ; fistp{w/l/ll} amode
   2926 	 */
   2927          switch (i->Xin.FpLdStI.sz) {
   2928             case 8:  opc = 0xDF; subopc_imm = 7; break;
   2929             case 4:  opc = 0xDB; subopc_imm = 3; break;
   2930             case 2:  opc = 0xDF; subopc_imm = 3; break;
   2931             default: vpanic("emitX86Instr(Xin_FpLdStI-store)");
   2932          }
   2933          p = do_ffree_st7(p);
   2934          p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdStI.reg));
   2935          *p++ = toUChar(opc);
   2936          p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
   2937          goto done;
   2938       }
   2939       break;
   2940 
   2941    case Xin_Fp64to32:
   2942       /* ffree %st7 ; fld %st(src) */
   2943       p = do_ffree_st7(p);
   2944       p = do_fld_st(p, 0+fregNo(i->Xin.Fp64to32.src));
   2945       /* subl $4, %esp */
   2946       *p++ = 0x83; *p++ = 0xEC; *p++ = 0x04;
   2947       /* fstps (%esp) */
   2948       *p++ = 0xD9; *p++ = 0x1C; *p++ = 0x24;
   2949       /* flds (%esp) */
   2950       *p++ = 0xD9; *p++ = 0x04; *p++ = 0x24;
   2951       /* addl $4, %esp */
   2952       *p++ = 0x83; *p++ = 0xC4; *p++ = 0x04;
   2953       /* fstp %st(1+dst) */
   2954       p = do_fstp_st(p, 1+fregNo(i->Xin.Fp64to32.dst));
   2955       goto done;
   2956 
   2957    case Xin_FpCMov:
   2958       /* jmp fwds if !condition */
   2959       *p++ = toUChar(0x70 + (i->Xin.FpCMov.cond ^ 1));
   2960       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
   2961       ptmp = p;
   2962 
   2963       /* ffree %st7 ; fld %st(src) ; fstp %st(1+dst) */
   2964       p = do_ffree_st7(p);
   2965       p = do_fld_st(p, 0+fregNo(i->Xin.FpCMov.src));
   2966       p = do_fstp_st(p, 1+fregNo(i->Xin.FpCMov.dst));
   2967 
   2968       /* Fill in the jump offset. */
   2969       *(ptmp-1) = toUChar(p - ptmp);
   2970       goto done;
   2971 
   2972    case Xin_FpLdCW:
   2973       *p++ = 0xD9;
   2974       p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdCW.addr);
   2975       goto done;
   2976 
   2977    case Xin_FpStSW_AX:
   2978       /* note, this emits fnstsw %ax, not fstsw %ax */
   2979       *p++ = 0xDF;
   2980       *p++ = 0xE0;
   2981       goto done;
   2982 
   2983    case Xin_FpCmp:
   2984       /* gcmp %fL, %fR, %dst
   2985          -> ffree %st7; fpush %fL ; fucomp %(fR+1) ;
   2986             fnstsw %ax ; movl %eax, %dst
   2987       */
   2988       /* ffree %st7 */
   2989       p = do_ffree_st7(p);
   2990       /* fpush %fL */
   2991       p = do_fld_st(p, 0+fregNo(i->Xin.FpCmp.srcL));
   2992       /* fucomp %(fR+1) */
   2993       *p++ = 0xDD;
   2994       *p++ = toUChar(0xE8 + (7 & (1+fregNo(i->Xin.FpCmp.srcR))));
   2995       /* fnstsw %ax */
   2996       *p++ = 0xDF;
   2997       *p++ = 0xE0;
   2998       /*  movl %eax, %dst */
   2999       *p++ = 0x89;
   3000       p = doAMode_R(p, hregX86_EAX(), i->Xin.FpCmp.dst);
   3001       goto done;
   3002 
   3003    case Xin_SseConst: {
   3004       UShort con = i->Xin.SseConst.con;
   3005       p = push_word_from_tags(p, toUShort((con >> 12) & 0xF));
   3006       p = push_word_from_tags(p, toUShort((con >> 8) & 0xF));
   3007       p = push_word_from_tags(p, toUShort((con >> 4) & 0xF));
   3008       p = push_word_from_tags(p, toUShort(con & 0xF));
   3009       /* movups (%esp), %xmm-dst */
   3010       *p++ = 0x0F;
   3011       *p++ = 0x10;
   3012       *p++ = toUChar(0x04 + 8 * (7 & vregNo(i->Xin.SseConst.dst)));
   3013       *p++ = 0x24;
   3014       /* addl $16, %esp */
   3015       *p++ = 0x83;
   3016       *p++ = 0xC4;
   3017       *p++ = 0x10;
   3018       goto done;
   3019    }
   3020 
   3021    case Xin_SseLdSt:
   3022       *p++ = 0x0F;
   3023       *p++ = toUChar(i->Xin.SseLdSt.isLoad ? 0x10 : 0x11);
   3024       p = doAMode_M(p, fake(vregNo(i->Xin.SseLdSt.reg)), i->Xin.SseLdSt.addr);
   3025       goto done;
   3026 
   3027    case Xin_SseLdzLO:
   3028       vassert(i->Xin.SseLdzLO.sz == 4 || i->Xin.SseLdzLO.sz == 8);
   3029       /* movs[sd] amode, %xmm-dst */
   3030       *p++ = toUChar(i->Xin.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
   3031       *p++ = 0x0F;
   3032       *p++ = 0x10;
   3033       p = doAMode_M(p, fake(vregNo(i->Xin.SseLdzLO.reg)),
   3034                        i->Xin.SseLdzLO.addr);
   3035       goto done;
   3036 
   3037    case Xin_Sse32Fx4:
   3038       xtra = 0;
   3039       *p++ = 0x0F;
   3040       switch (i->Xin.Sse32Fx4.op) {
   3041          case Xsse_ADDF:   *p++ = 0x58; break;
   3042          case Xsse_DIVF:   *p++ = 0x5E; break;
   3043          case Xsse_MAXF:   *p++ = 0x5F; break;
   3044          case Xsse_MINF:   *p++ = 0x5D; break;
   3045          case Xsse_MULF:   *p++ = 0x59; break;
   3046          case Xsse_RCPF:   *p++ = 0x53; break;
   3047          case Xsse_RSQRTF: *p++ = 0x52; break;
   3048          case Xsse_SQRTF:  *p++ = 0x51; break;
   3049          case Xsse_SUBF:   *p++ = 0x5C; break;
   3050          case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3051          case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3052          case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3053          case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3054          default: goto bad;
   3055       }
   3056       p = doAMode_R(p, fake(vregNo(i->Xin.Sse32Fx4.dst)),
   3057                        fake(vregNo(i->Xin.Sse32Fx4.src)) );
   3058       if (xtra & 0x100)
   3059          *p++ = toUChar(xtra & 0xFF);
   3060       goto done;
   3061 
   3062    case Xin_Sse64Fx2:
   3063       xtra = 0;
   3064       *p++ = 0x66;
   3065       *p++ = 0x0F;
   3066       switch (i->Xin.Sse64Fx2.op) {
   3067          case Xsse_ADDF:   *p++ = 0x58; break;
   3068          case Xsse_DIVF:   *p++ = 0x5E; break;
   3069          case Xsse_MAXF:   *p++ = 0x5F; break;
   3070          case Xsse_MINF:   *p++ = 0x5D; break;
   3071          case Xsse_MULF:   *p++ = 0x59; break;
   3072          case Xsse_RCPF:   *p++ = 0x53; break;
   3073          case Xsse_RSQRTF: *p++ = 0x52; break;
   3074          case Xsse_SQRTF:  *p++ = 0x51; break;
   3075          case Xsse_SUBF:   *p++ = 0x5C; break;
   3076          case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3077          case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3078          case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3079          case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3080          default: goto bad;
   3081       }
   3082       p = doAMode_R(p, fake(vregNo(i->Xin.Sse64Fx2.dst)),
   3083                        fake(vregNo(i->Xin.Sse64Fx2.src)) );
   3084       if (xtra & 0x100)
   3085          *p++ = toUChar(xtra & 0xFF);
   3086       goto done;
   3087 
   3088    case Xin_Sse32FLo:
   3089       xtra = 0;
   3090       *p++ = 0xF3;
   3091       *p++ = 0x0F;
   3092       switch (i->Xin.Sse32FLo.op) {
   3093          case Xsse_ADDF:   *p++ = 0x58; break;
   3094          case Xsse_DIVF:   *p++ = 0x5E; break;
   3095          case Xsse_MAXF:   *p++ = 0x5F; break;
   3096          case Xsse_MINF:   *p++ = 0x5D; break;
   3097          case Xsse_MULF:   *p++ = 0x59; break;
   3098          case Xsse_RCPF:   *p++ = 0x53; break;
   3099          case Xsse_RSQRTF: *p++ = 0x52; break;
   3100          case Xsse_SQRTF:  *p++ = 0x51; break;
   3101          case Xsse_SUBF:   *p++ = 0x5C; break;
   3102          case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3103          case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3104          case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3105          case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3106          default: goto bad;
   3107       }
   3108       p = doAMode_R(p, fake(vregNo(i->Xin.Sse32FLo.dst)),
   3109                        fake(vregNo(i->Xin.Sse32FLo.src)) );
   3110       if (xtra & 0x100)
   3111          *p++ = toUChar(xtra & 0xFF);
   3112       goto done;
   3113 
   3114    case Xin_Sse64FLo:
   3115       xtra = 0;
   3116       *p++ = 0xF2;
   3117       *p++ = 0x0F;
   3118       switch (i->Xin.Sse64FLo.op) {
   3119          case Xsse_ADDF:   *p++ = 0x58; break;
   3120          case Xsse_DIVF:   *p++ = 0x5E; break;
   3121          case Xsse_MAXF:   *p++ = 0x5F; break;
   3122          case Xsse_MINF:   *p++ = 0x5D; break;
   3123          case Xsse_MULF:   *p++ = 0x59; break;
   3124          case Xsse_RCPF:   *p++ = 0x53; break;
   3125          case Xsse_RSQRTF: *p++ = 0x52; break;
   3126          case Xsse_SQRTF:  *p++ = 0x51; break;
   3127          case Xsse_SUBF:   *p++ = 0x5C; break;
   3128          case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3129          case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3130          case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3131          case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3132          default: goto bad;
   3133       }
   3134       p = doAMode_R(p, fake(vregNo(i->Xin.Sse64FLo.dst)),
   3135                        fake(vregNo(i->Xin.Sse64FLo.src)) );
   3136       if (xtra & 0x100)
   3137          *p++ = toUChar(xtra & 0xFF);
   3138       goto done;
   3139 
   3140    case Xin_SseReRg:
   3141 #     define XX(_n) *p++ = (_n)
   3142       switch (i->Xin.SseReRg.op) {
   3143          case Xsse_MOV:     /*movups*/ XX(0x0F); XX(0x10); break;
   3144          case Xsse_OR:                 XX(0x0F); XX(0x56); break;
   3145          case Xsse_XOR:                XX(0x0F); XX(0x57); break;
   3146          case Xsse_AND:                XX(0x0F); XX(0x54); break;
   3147          case Xsse_PACKSSD:  XX(0x66); XX(0x0F); XX(0x6B); break;
   3148          case Xsse_PACKSSW:  XX(0x66); XX(0x0F); XX(0x63); break;
   3149          case Xsse_PACKUSW:  XX(0x66); XX(0x0F); XX(0x67); break;
   3150          case Xsse_ADD8:     XX(0x66); XX(0x0F); XX(0xFC); break;
   3151          case Xsse_ADD16:    XX(0x66); XX(0x0F); XX(0xFD); break;
   3152          case Xsse_ADD32:    XX(0x66); XX(0x0F); XX(0xFE); break;
   3153          case Xsse_ADD64:    XX(0x66); XX(0x0F); XX(0xD4); break;
   3154          case Xsse_QADD8S:   XX(0x66); XX(0x0F); XX(0xEC); break;
   3155          case Xsse_QADD16S:  XX(0x66); XX(0x0F); XX(0xED); break;
   3156          case Xsse_QADD8U:   XX(0x66); XX(0x0F); XX(0xDC); break;
   3157          case Xsse_QADD16U:  XX(0x66); XX(0x0F); XX(0xDD); break;
   3158          case Xsse_AVG8U:    XX(0x66); XX(0x0F); XX(0xE0); break;
   3159          case Xsse_AVG16U:   XX(0x66); XX(0x0F); XX(0xE3); break;
   3160          case Xsse_CMPEQ8:   XX(0x66); XX(0x0F); XX(0x74); break;
   3161          case Xsse_CMPEQ16:  XX(0x66); XX(0x0F); XX(0x75); break;
   3162          case Xsse_CMPEQ32:  XX(0x66); XX(0x0F); XX(0x76); break;
   3163          case Xsse_CMPGT8S:  XX(0x66); XX(0x0F); XX(0x64); break;
   3164          case Xsse_CMPGT16S: XX(0x66); XX(0x0F); XX(0x65); break;
   3165          case Xsse_CMPGT32S: XX(0x66); XX(0x0F); XX(0x66); break;
   3166          case Xsse_MAX16S:   XX(0x66); XX(0x0F); XX(0xEE); break;
   3167          case Xsse_MAX8U:    XX(0x66); XX(0x0F); XX(0xDE); break;
   3168          case Xsse_MIN16S:   XX(0x66); XX(0x0F); XX(0xEA); break;
   3169          case Xsse_MIN8U:    XX(0x66); XX(0x0F); XX(0xDA); break;
   3170          case Xsse_MULHI16U: XX(0x66); XX(0x0F); XX(0xE4); break;
   3171          case Xsse_MULHI16S: XX(0x66); XX(0x0F); XX(0xE5); break;
   3172          case Xsse_MUL16:    XX(0x66); XX(0x0F); XX(0xD5); break;
   3173          case Xsse_SHL16:    XX(0x66); XX(0x0F); XX(0xF1); break;
   3174          case Xsse_SHL32:    XX(0x66); XX(0x0F); XX(0xF2); break;
   3175          case Xsse_SHL64:    XX(0x66); XX(0x0F); XX(0xF3); break;
   3176          case Xsse_SAR16:    XX(0x66); XX(0x0F); XX(0xE1); break;
   3177          case Xsse_SAR32:    XX(0x66); XX(0x0F); XX(0xE2); break;
   3178          case Xsse_SHR16:    XX(0x66); XX(0x0F); XX(0xD1); break;
   3179          case Xsse_SHR32:    XX(0x66); XX(0x0F); XX(0xD2); break;
   3180          case Xsse_SHR64:    XX(0x66); XX(0x0F); XX(0xD3); break;
   3181          case Xsse_SUB8:     XX(0x66); XX(0x0F); XX(0xF8); break;
   3182          case Xsse_SUB16:    XX(0x66); XX(0x0F); XX(0xF9); break;
   3183          case Xsse_SUB32:    XX(0x66); XX(0x0F); XX(0xFA); break;
   3184          case Xsse_SUB64:    XX(0x66); XX(0x0F); XX(0xFB); break;
   3185          case Xsse_QSUB8S:   XX(0x66); XX(0x0F); XX(0xE8); break;
   3186          case Xsse_QSUB16S:  XX(0x66); XX(0x0F); XX(0xE9); break;
   3187          case Xsse_QSUB8U:   XX(0x66); XX(0x0F); XX(0xD8); break;
   3188          case Xsse_QSUB16U:  XX(0x66); XX(0x0F); XX(0xD9); break;
   3189          case Xsse_UNPCKHB:  XX(0x66); XX(0x0F); XX(0x68); break;
   3190          case Xsse_UNPCKHW:  XX(0x66); XX(0x0F); XX(0x69); break;
   3191          case Xsse_UNPCKHD:  XX(0x66); XX(0x0F); XX(0x6A); break;
   3192          case Xsse_UNPCKHQ:  XX(0x66); XX(0x0F); XX(0x6D); break;
   3193          case Xsse_UNPCKLB:  XX(0x66); XX(0x0F); XX(0x60); break;
   3194          case Xsse_UNPCKLW:  XX(0x66); XX(0x0F); XX(0x61); break;
   3195          case Xsse_UNPCKLD:  XX(0x66); XX(0x0F); XX(0x62); break;
   3196          case Xsse_UNPCKLQ:  XX(0x66); XX(0x0F); XX(0x6C); break;
   3197          default: goto bad;
   3198       }
   3199       p = doAMode_R(p, fake(vregNo(i->Xin.SseReRg.dst)),
   3200                        fake(vregNo(i->Xin.SseReRg.src)) );
   3201 #     undef XX
   3202       goto done;
   3203 
   3204    case Xin_SseCMov:
   3205       /* jmp fwds if !condition */
   3206       *p++ = toUChar(0x70 + (i->Xin.SseCMov.cond ^ 1));
   3207       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
   3208       ptmp = p;
   3209 
   3210       /* movaps %src, %dst */
   3211       *p++ = 0x0F;
   3212       *p++ = 0x28;
   3213       p = doAMode_R(p, fake(vregNo(i->Xin.SseCMov.dst)),
   3214                        fake(vregNo(i->Xin.SseCMov.src)) );
   3215 
   3216       /* Fill in the jump offset. */
   3217       *(ptmp-1) = toUChar(p - ptmp);
   3218       goto done;
   3219 
   3220    case Xin_SseShuf:
   3221       *p++ = 0x66;
   3222       *p++ = 0x0F;
   3223       *p++ = 0x70;
   3224       p = doAMode_R(p, fake(vregNo(i->Xin.SseShuf.dst)),
   3225                        fake(vregNo(i->Xin.SseShuf.src)) );
   3226       *p++ = (UChar)(i->Xin.SseShuf.order);
   3227       goto done;
   3228 
   3229    case Xin_EvCheck: {
   3230       /* We generate:
   3231             (3 bytes)  decl 4(%ebp)    4 == offsetof(host_EvC_COUNTER)
   3232             (2 bytes)  jns  nofail     expected taken
   3233             (3 bytes)  jmp* 0(%ebp)    0 == offsetof(host_EvC_FAILADDR)
   3234             nofail:
   3235       */
   3236       /* This is heavily asserted re instruction lengths.  It needs to
   3237          be.  If we get given unexpected forms of .amCounter or
   3238          .amFailAddr -- basically, anything that's not of the form
   3239          uimm7(%ebp) -- they are likely to fail. */
   3240       /* Note also that after the decl we must be very careful not to
   3241          read the carry flag, else we get a partial flags stall.
   3242          js/jns avoids that, though. */
   3243       UChar* p0 = p;
   3244       /* ---  decl 8(%ebp) --- */
   3245       /* "fake(1)" because + there's no register in this encoding;
   3246          instead the register + field is used as a sub opcode.  The
   3247          encoding for "decl r/m32" + is FF /1, hence the fake(1). */
   3248       *p++ = 0xFF;
   3249       p = doAMode_M(p, fake(1), i->Xin.EvCheck.amCounter);
   3250       vassert(p - p0 == 3);
   3251       /* --- jns nofail --- */
   3252       *p++ = 0x79;
   3253       *p++ = 0x03; /* need to check this 0x03 after the next insn */
   3254       vassert(p - p0 == 5);
   3255       /* --- jmp* 0(%ebp) --- */
   3256       /* The encoding is FF /4. */
   3257       *p++ = 0xFF;
   3258       p = doAMode_M(p, fake(4), i->Xin.EvCheck.amFailAddr);
   3259       vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
   3260       /* And crosscheck .. */
   3261       vassert(evCheckSzB_X86() == 8);
   3262       goto done;
   3263    }
   3264 
   3265    case Xin_ProfInc: {
   3266       /* We generate   addl $1,NotKnownYet
   3267                        adcl $0,NotKnownYet+4
   3268          in the expectation that a later call to LibVEX_patchProfCtr
   3269          will be used to fill in the immediate fields once the right
   3270          value is known.
   3271            83 05  00 00 00 00  01
   3272            83 15  00 00 00 00  00
   3273       */
   3274       *p++ = 0x83; *p++ = 0x05;
   3275       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
   3276       *p++ = 0x01;
   3277       *p++ = 0x83; *p++ = 0x15;
   3278       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
   3279       *p++ = 0x00;
   3280       /* Tell the caller .. */
   3281       vassert(!(*is_profInc));
   3282       *is_profInc = True;
   3283       goto done;
   3284    }
   3285 
   3286    default:
   3287       goto bad;
   3288    }
   3289 
   3290   bad:
   3291    ppX86Instr(i, mode64);
   3292    vpanic("emit_X86Instr");
   3293    /*NOTREACHED*/
   3294 
   3295   done:
   3296    vassert(p - &buf[0] <= 32);
   3297    return p - &buf[0];
   3298 
   3299 #  undef fake
   3300 }
   3301 
   3302 
   3303 /* How big is an event check?  See case for Xin_EvCheck in
   3304    emit_X86Instr just above.  That crosschecks what this returns, so
   3305    we can tell if we're inconsistent. */
   3306 Int evCheckSzB_X86 ( void )
   3307 {
   3308    return 8;
   3309 }
   3310 
   3311 
   3312 /* NB: what goes on here has to be very closely coordinated with the
   3313    emitInstr case for XDirect, above. */
   3314 VexInvalRange chainXDirect_X86 ( void* place_to_chain,
   3315                                  void* disp_cp_chain_me_EXPECTED,
   3316                                  void* place_to_jump_to )
   3317 {
   3318    /* What we're expecting to see is:
   3319         movl $disp_cp_chain_me_EXPECTED, %edx
   3320         call *%edx
   3321       viz
   3322         BA <4 bytes value == disp_cp_chain_me_EXPECTED>
   3323         FF D2
   3324    */
   3325    UChar* p = (UChar*)place_to_chain;
   3326    vassert(p[0] == 0xBA);
   3327    vassert(*(UInt*)(&p[1]) == (UInt)Ptr_to_ULong(disp_cp_chain_me_EXPECTED));
   3328    vassert(p[5] == 0xFF);
   3329    vassert(p[6] == 0xD2);
   3330    /* And what we want to change it to is:
   3331           jmp disp32   where disp32 is relative to the next insn
   3332           ud2;
   3333         viz
   3334           E9 <4 bytes == disp32>
   3335           0F 0B
   3336       The replacement has the same length as the original.
   3337    */
   3338    /* This is the delta we need to put into a JMP d32 insn.  It's
   3339       relative to the start of the next insn, hence the -5.  */
   3340    Long delta = (Long)((UChar*)place_to_jump_to - (UChar*)p) - (Long)5;
   3341 
   3342    /* And make the modifications. */
   3343    p[0] = 0xE9;
   3344    p[1] = (delta >> 0) & 0xFF;
   3345    p[2] = (delta >> 8) & 0xFF;
   3346    p[3] = (delta >> 16) & 0xFF;
   3347    p[4] = (delta >> 24) & 0xFF;
   3348    p[5] = 0x0F; p[6]  = 0x0B;
   3349    /* sanity check on the delta -- top 32 are all 0 or all 1 */
   3350    delta >>= 32;
   3351    vassert(delta == 0LL || delta == -1LL);
   3352    VexInvalRange vir = {0, 0};
   3353    return vir;
   3354 }
   3355 
   3356 
   3357 /* NB: what goes on here has to be very closely coordinated with the
   3358    emitInstr case for XDirect, above. */
   3359 VexInvalRange unchainXDirect_X86 ( void* place_to_unchain,
   3360                                    void* place_to_jump_to_EXPECTED,
   3361                                    void* disp_cp_chain_me )
   3362 {
   3363    /* What we're expecting to see is:
   3364           jmp d32
   3365           ud2;
   3366        viz
   3367           E9 <4 bytes == disp32>
   3368           0F 0B
   3369    */
   3370    UChar* p     = (UChar*)place_to_unchain;
   3371    Bool   valid = False;
   3372    if (p[0] == 0xE9
   3373        && p[5]  == 0x0F && p[6]  == 0x0B) {
   3374       /* Check the offset is right. */
   3375       Int s32 = *(Int*)(&p[1]);
   3376       if ((UChar*)p + 5 + s32 == (UChar*)place_to_jump_to_EXPECTED) {
   3377          valid = True;
   3378          if (0)
   3379             vex_printf("QQQ unchainXDirect_X86: found valid\n");
   3380       }
   3381    }
   3382    vassert(valid);
   3383    /* And what we want to change it to is:
   3384          movl $disp_cp_chain_me, %edx
   3385          call *%edx
   3386       viz
   3387          BA <4 bytes value == disp_cp_chain_me_EXPECTED>
   3388          FF D2
   3389       So it's the same length (convenient, huh).
   3390    */
   3391    p[0] = 0xBA;
   3392    *(UInt*)(&p[1]) = (UInt)Ptr_to_ULong(disp_cp_chain_me);
   3393    p[5] = 0xFF;
   3394    p[6] = 0xD2;
   3395    VexInvalRange vir = {0, 0};
   3396    return vir;
   3397 }
   3398 
   3399 
   3400 /* Patch the counter address into a profile inc point, as previously
   3401    created by the Xin_ProfInc case for emit_X86Instr. */
   3402 VexInvalRange patchProfInc_X86 ( void*  place_to_patch,
   3403                                  ULong* location_of_counter )
   3404 {
   3405    vassert(sizeof(ULong*) == 4);
   3406    UChar* p = (UChar*)place_to_patch;
   3407    vassert(p[0] == 0x83);
   3408    vassert(p[1] == 0x05);
   3409    vassert(p[2] == 0x00);
   3410    vassert(p[3] == 0x00);
   3411    vassert(p[4] == 0x00);
   3412    vassert(p[5] == 0x00);
   3413    vassert(p[6] == 0x01);
   3414    vassert(p[7] == 0x83);
   3415    vassert(p[8] == 0x15);
   3416    vassert(p[9] == 0x00);
   3417    vassert(p[10] == 0x00);
   3418    vassert(p[11] == 0x00);
   3419    vassert(p[12] == 0x00);
   3420    vassert(p[13] == 0x00);
   3421    UInt imm32 = (UInt)Ptr_to_ULong(location_of_counter);
   3422    p[2] = imm32 & 0xFF; imm32 >>= 8;
   3423    p[3] = imm32 & 0xFF; imm32 >>= 8;
   3424    p[4] = imm32 & 0xFF; imm32 >>= 8;
   3425    p[5] = imm32 & 0xFF; imm32 >>= 8;
   3426    imm32 = 4 + (UInt)Ptr_to_ULong(location_of_counter);
   3427    p[9]  = imm32 & 0xFF; imm32 >>= 8;
   3428    p[10] = imm32 & 0xFF; imm32 >>= 8;
   3429    p[11] = imm32 & 0xFF; imm32 >>= 8;
   3430    p[12] = imm32 & 0xFF; imm32 >>= 8;
   3431    VexInvalRange vir = {0, 0};
   3432    return vir;
   3433 }
   3434 
   3435 
   3436 /*---------------------------------------------------------------*/
   3437 /*--- end                                     host_x86_defs.c ---*/
   3438 /*---------------------------------------------------------------*/
   3439