      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                                 host_amd64_defs.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2010 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "libvex.h"
     38 #include "libvex_trc_values.h"
     39 
     40 #include "main_util.h"
     41 #include "host_generic_regs.h"
     42 #include "host_amd64_defs.h"
     43 
     44 
     45 /* --------- Registers. --------- */
     46 
     47 void ppHRegAMD64 ( HReg reg )
     48 {
     49    Int r;
     50    static HChar* ireg64_names[16]
     51      = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
     52          "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
     53    /* Be generic for all virtual regs. */
     54    if (hregIsVirtual(reg)) {
     55       ppHReg(reg);
     56       return;
     57    }
     58    /* But specific for real regs. */
     59    switch (hregClass(reg)) {
     60       case HRcInt64:
     61          r = hregNumber(reg);
     62          vassert(r >= 0 && r < 16);
     63          vex_printf("%s", ireg64_names[r]);
     64          return;
     65       case HRcFlt64:
     66          r = hregNumber(reg);
     67          vassert(r >= 0 && r < 6);
     68          vex_printf("%%fake%d", r);
     69          return;
     70       case HRcVec128:
     71          r = hregNumber(reg);
     72          vassert(r >= 0 && r < 16);
     73          vex_printf("%%xmm%d", r);
     74          return;
     75       default:
     76          vpanic("ppHRegAMD64");
     77    }
     78 }
     79 
     80 static void ppHRegAMD64_lo32 ( HReg reg )
     81 {
     82    Int r;
     83    static HChar* ireg32_names[16]
     84      = { "%eax",  "%ecx",  "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
     85          "%r8d",  "%r9d",  "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
     86    /* Be generic for all virtual regs. */
     87    if (hregIsVirtual(reg)) {
     88       ppHReg(reg);
     89       vex_printf("d");
     90       return;
     91    }
     92    /* But specific for real regs. */
     93    switch (hregClass(reg)) {
     94       case HRcInt64:
     95          r = hregNumber(reg);
     96          vassert(r >= 0 && r < 16);
     97          vex_printf("%s", ireg32_names[r]);
     98          return;
     99       default:
    100          vpanic("ppHRegAMD64_lo32: invalid regclass");
    101    }
    102 }
    103 
    104 HReg hregAMD64_RAX ( void ) { return mkHReg( 0, HRcInt64, False); }
    105 HReg hregAMD64_RCX ( void ) { return mkHReg( 1, HRcInt64, False); }
    106 HReg hregAMD64_RDX ( void ) { return mkHReg( 2, HRcInt64, False); }
    107 HReg hregAMD64_RBX ( void ) { return mkHReg( 3, HRcInt64, False); }
    108 HReg hregAMD64_RSP ( void ) { return mkHReg( 4, HRcInt64, False); }
    109 HReg hregAMD64_RBP ( void ) { return mkHReg( 5, HRcInt64, False); }
    110 HReg hregAMD64_RSI ( void ) { return mkHReg( 6, HRcInt64, False); }
    111 HReg hregAMD64_RDI ( void ) { return mkHReg( 7, HRcInt64, False); }
    112 HReg hregAMD64_R8  ( void ) { return mkHReg( 8, HRcInt64, False); }
    113 HReg hregAMD64_R9  ( void ) { return mkHReg( 9, HRcInt64, False); }
    114 HReg hregAMD64_R10 ( void ) { return mkHReg(10, HRcInt64, False); }
    115 HReg hregAMD64_R11 ( void ) { return mkHReg(11, HRcInt64, False); }
    116 HReg hregAMD64_R12 ( void ) { return mkHReg(12, HRcInt64, False); }
    117 HReg hregAMD64_R13 ( void ) { return mkHReg(13, HRcInt64, False); }
    118 HReg hregAMD64_R14 ( void ) { return mkHReg(14, HRcInt64, False); }
    119 HReg hregAMD64_R15 ( void ) { return mkHReg(15, HRcInt64, False); }
    120 
    121 //.. HReg hregAMD64_FAKE0 ( void ) { return mkHReg(0, HRcFlt64, False); }
    122 //.. HReg hregAMD64_FAKE1 ( void ) { return mkHReg(1, HRcFlt64, False); }
    123 //.. HReg hregAMD64_FAKE2 ( void ) { return mkHReg(2, HRcFlt64, False); }
    124 //.. HReg hregAMD64_FAKE3 ( void ) { return mkHReg(3, HRcFlt64, False); }
    125 //.. HReg hregAMD64_FAKE4 ( void ) { return mkHReg(4, HRcFlt64, False); }
    126 //.. HReg hregAMD64_FAKE5 ( void ) { return mkHReg(5, HRcFlt64, False); }
    127 //..
    128 HReg hregAMD64_XMM0  ( void ) { return mkHReg( 0, HRcVec128, False); }
    129 HReg hregAMD64_XMM1  ( void ) { return mkHReg( 1, HRcVec128, False); }
    130 HReg hregAMD64_XMM2  ( void ) { return mkHReg( 2, HRcVec128, False); }
    131 HReg hregAMD64_XMM3  ( void ) { return mkHReg( 3, HRcVec128, False); }
    132 HReg hregAMD64_XMM4  ( void ) { return mkHReg( 4, HRcVec128, False); }
    133 HReg hregAMD64_XMM5  ( void ) { return mkHReg( 5, HRcVec128, False); }
    134 HReg hregAMD64_XMM6  ( void ) { return mkHReg( 6, HRcVec128, False); }
    135 HReg hregAMD64_XMM7  ( void ) { return mkHReg( 7, HRcVec128, False); }
    136 HReg hregAMD64_XMM8  ( void ) { return mkHReg( 8, HRcVec128, False); }
    137 HReg hregAMD64_XMM9  ( void ) { return mkHReg( 9, HRcVec128, False); }
    138 HReg hregAMD64_XMM10 ( void ) { return mkHReg(10, HRcVec128, False); }
    139 HReg hregAMD64_XMM11 ( void ) { return mkHReg(11, HRcVec128, False); }
    140 HReg hregAMD64_XMM12 ( void ) { return mkHReg(12, HRcVec128, False); }
    141 HReg hregAMD64_XMM13 ( void ) { return mkHReg(13, HRcVec128, False); }
    142 HReg hregAMD64_XMM14 ( void ) { return mkHReg(14, HRcVec128, False); }
    143 HReg hregAMD64_XMM15 ( void ) { return mkHReg(15, HRcVec128, False); }
    144 
    145 
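         /* The allocable-register list below deliberately omits several real
            registers.  %rsp and %rbp are excluded because instructions in this
            backend use them with fixed meanings (Ain_Push adjusts %rsp, and
            Ain_Goto's dispatch sequence writes the jump kind to %ebp and
            clobbers %rax and %rdx); %rax, %rdx and %rcx are excluded because
            Ain_MulL, Ain_Div and Ain_Sh64 use them implicitly (see
            getRegUsage_AMD64Instr below).  The remaining omissions (%r11,
            %xmm0-%xmm2, %xmm13-%xmm15) are presumably reserved for fixed uses
            elsewhere in the backend. */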
    146 void getAllocableRegs_AMD64 ( Int* nregs, HReg** arr )
    147 {
    148 #if 0
    149    *nregs = 6;
    150    *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
    151    (*arr)[ 0] = hregAMD64_RSI();
    152    (*arr)[ 1] = hregAMD64_RDI();
    153    (*arr)[ 2] = hregAMD64_RBX();
    154 
    155    (*arr)[ 3] = hregAMD64_XMM7();
    156    (*arr)[ 4] = hregAMD64_XMM8();
    157    (*arr)[ 5] = hregAMD64_XMM9();
    158 #endif
    159 #if 1
    160    *nregs = 20;
    161    *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
    162    (*arr)[ 0] = hregAMD64_RSI();
    163    (*arr)[ 1] = hregAMD64_RDI();
    164    (*arr)[ 2] = hregAMD64_R8();
    165    (*arr)[ 3] = hregAMD64_R9();
    166    (*arr)[ 4] = hregAMD64_R12();
    167    (*arr)[ 5] = hregAMD64_R13();
    168    (*arr)[ 6] = hregAMD64_R14();
    169    (*arr)[ 7] = hregAMD64_R15();
    170    (*arr)[ 8] = hregAMD64_RBX();
    171 
    172    (*arr)[ 9] = hregAMD64_XMM3();
    173    (*arr)[10] = hregAMD64_XMM4();
    174    (*arr)[11] = hregAMD64_XMM5();
    175    (*arr)[12] = hregAMD64_XMM6();
    176    (*arr)[13] = hregAMD64_XMM7();
    177    (*arr)[14] = hregAMD64_XMM8();
    178    (*arr)[15] = hregAMD64_XMM9();
    179    (*arr)[16] = hregAMD64_XMM10();
    180    (*arr)[17] = hregAMD64_XMM11();
    181    (*arr)[18] = hregAMD64_XMM12();
    182    (*arr)[19] = hregAMD64_R10();
    183 #endif
    184 }
    185 
    186 
    187 /* --------- Condition codes, Intel encoding. --------- */
    188 
    189 HChar* showAMD64CondCode ( AMD64CondCode cond )
    190 {
    191    switch (cond) {
    192       case Acc_O:      return "o";
    193       case Acc_NO:     return "no";
    194       case Acc_B:      return "b";
    195       case Acc_NB:     return "nb";
    196       case Acc_Z:      return "z";
    197       case Acc_NZ:     return "nz";
    198       case Acc_BE:     return "be";
    199       case Acc_NBE:    return "nbe";
    200       case Acc_S:      return "s";
    201       case Acc_NS:     return "ns";
    202       case Acc_P:      return "p";
    203       case Acc_NP:     return "np";
    204       case Acc_L:      return "l";
    205       case Acc_NL:     return "nl";
    206       case Acc_LE:     return "le";
    207       case Acc_NLE:    return "nle";
    208       case Acc_ALWAYS: return "ALWAYS";
     209       default: vpanic("showAMD64CondCode");
    210    }
    211 }
    212 
    213 
    214 /* --------- AMD64AMode: memory address expressions. --------- */
    215 
    216 AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
    217    AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
    218    am->tag        = Aam_IR;
    219    am->Aam.IR.imm = imm32;
    220    am->Aam.IR.reg = reg;
    221    return am;
    222 }
    223 AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
    224    AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
    225    am->tag = Aam_IRRS;
    226    am->Aam.IRRS.imm   = imm32;
    227    am->Aam.IRRS.base  = base;
    228    am->Aam.IRRS.index = indEx;
    229    am->Aam.IRRS.shift = shift;
    230    vassert(shift >= 0 && shift <= 3);
    231    return am;
    232 }
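         /* Illustrative examples (not part of the original source): with the
            printing rules of ppAMD64AMode below,
               AMD64AMode_IR(0x18, hregAMD64_RBP())
            denotes the address %rbp + 0x18 and is shown as "0x18(%rbp)", while
               AMD64AMode_IRRS(0x10, hregAMD64_RSI(), hregAMD64_RDI(), 3)
            denotes %rsi + %rdi*8 + 0x10 and is shown as "0x10(%rsi,%rdi,8)",
            the printed scale being 1 << shift. */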
    233 
    234 //.. AMD64AMode* dopyAMD64AMode ( AMD64AMode* am ) {
    235 //..    switch (am->tag) {
    236 //..       case Xam_IR:
    237 //..          return AMD64AMode_IR( am->Xam.IR.imm, am->Xam.IR.reg );
    238 //..       case Xam_IRRS:
    239 //..          return AMD64AMode_IRRS( am->Xam.IRRS.imm, am->Xam.IRRS.base,
    240 //..                                am->Xam.IRRS.index, am->Xam.IRRS.shift );
    241 //..       default:
    242 //..          vpanic("dopyAMD64AMode");
    243 //..    }
    244 //.. }
    245 
    246 void ppAMD64AMode ( AMD64AMode* am ) {
    247    switch (am->tag) {
    248       case Aam_IR:
    249          if (am->Aam.IR.imm == 0)
    250             vex_printf("(");
    251          else
    252             vex_printf("0x%x(", am->Aam.IR.imm);
    253          ppHRegAMD64(am->Aam.IR.reg);
    254          vex_printf(")");
    255          return;
    256       case Aam_IRRS:
    257          vex_printf("0x%x(", am->Aam.IRRS.imm);
    258          ppHRegAMD64(am->Aam.IRRS.base);
    259          vex_printf(",");
    260          ppHRegAMD64(am->Aam.IRRS.index);
    261          vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
    262          return;
    263       default:
    264          vpanic("ppAMD64AMode");
    265    }
    266 }
    267 
    268 static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
    269    switch (am->tag) {
    270       case Aam_IR:
    271          addHRegUse(u, HRmRead, am->Aam.IR.reg);
    272          return;
    273       case Aam_IRRS:
    274          addHRegUse(u, HRmRead, am->Aam.IRRS.base);
    275          addHRegUse(u, HRmRead, am->Aam.IRRS.index);
    276          return;
    277       default:
    278          vpanic("addRegUsage_AMD64AMode");
    279    }
    280 }
    281 
    282 static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
    283    switch (am->tag) {
    284       case Aam_IR:
    285          am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
    286          return;
    287       case Aam_IRRS:
    288          am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
    289          am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
    290          return;
    291       default:
    292          vpanic("mapRegs_AMD64AMode");
    293    }
    294 }
    295 
    296 /* --------- Operand, which can be reg, immediate or memory. --------- */
    297 
    298 AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
    299    AMD64RMI* op       = LibVEX_Alloc(sizeof(AMD64RMI));
    300    op->tag            = Armi_Imm;
    301    op->Armi.Imm.imm32 = imm32;
    302    return op;
    303 }
    304 AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
    305    AMD64RMI* op     = LibVEX_Alloc(sizeof(AMD64RMI));
    306    op->tag          = Armi_Reg;
    307    op->Armi.Reg.reg = reg;
    308    return op;
    309 }
    310 AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
    311    AMD64RMI* op    = LibVEX_Alloc(sizeof(AMD64RMI));
    312    op->tag         = Armi_Mem;
    313    op->Armi.Mem.am = am;
    314    return op;
    315 }
    316 
    317 void ppAMD64RMI ( AMD64RMI* op ) {
    318    switch (op->tag) {
    319       case Armi_Imm:
    320          vex_printf("$0x%x", op->Armi.Imm.imm32);
    321          return;
    322       case Armi_Reg:
    323          ppHRegAMD64(op->Armi.Reg.reg);
    324          return;
    325       case Armi_Mem:
    326          ppAMD64AMode(op->Armi.Mem.am);
    327          return;
    328      default:
    329          vpanic("ppAMD64RMI");
    330    }
    331 }
    332 
    333 /* An AMD64RMI can only be used in a "read" context (what would it mean
    334    to write or modify a literal?) and so we enumerate its registers
    335    accordingly. */
    336 static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
    337    switch (op->tag) {
    338       case Armi_Imm:
    339          return;
    340       case Armi_Reg:
    341          addHRegUse(u, HRmRead, op->Armi.Reg.reg);
    342          return;
    343       case Armi_Mem:
    344          addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
    345          return;
    346       default:
    347          vpanic("addRegUsage_AMD64RMI");
    348    }
    349 }
    350 
    351 static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
    352    switch (op->tag) {
    353       case Armi_Imm:
    354          return;
    355       case Armi_Reg:
    356          op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
    357          return;
    358       case Armi_Mem:
    359          mapRegs_AMD64AMode(m, op->Armi.Mem.am);
    360          return;
    361       default:
    362          vpanic("mapRegs_AMD64RMI");
    363    }
    364 }
    365 
    366 
    367 /* --------- Operand, which can be reg or immediate only. --------- */
    368 
    369 AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
    370    AMD64RI* op       = LibVEX_Alloc(sizeof(AMD64RI));
    371    op->tag           = Ari_Imm;
    372    op->Ari.Imm.imm32 = imm32;
    373    return op;
    374 }
    375 AMD64RI* AMD64RI_Reg ( HReg reg ) {
    376    AMD64RI* op     = LibVEX_Alloc(sizeof(AMD64RI));
    377    op->tag         = Ari_Reg;
    378    op->Ari.Reg.reg = reg;
    379    return op;
    380 }
    381 
    382 void ppAMD64RI ( AMD64RI* op ) {
    383    switch (op->tag) {
    384       case Ari_Imm:
    385          vex_printf("$0x%x", op->Ari.Imm.imm32);
    386          return;
    387       case Ari_Reg:
    388          ppHRegAMD64(op->Ari.Reg.reg);
    389          return;
    390      default:
    391          vpanic("ppAMD64RI");
    392    }
    393 }
    394 
    395 /* An AMD64RI can only be used in a "read" context (what would it mean
    396    to write or modify a literal?) and so we enumerate its registers
    397    accordingly. */
    398 static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
    399    switch (op->tag) {
    400       case Ari_Imm:
    401          return;
    402       case Ari_Reg:
    403          addHRegUse(u, HRmRead, op->Ari.Reg.reg);
    404          return;
    405       default:
    406          vpanic("addRegUsage_AMD64RI");
    407    }
    408 }
    409 
    410 static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
    411    switch (op->tag) {
    412       case Ari_Imm:
    413          return;
    414       case Ari_Reg:
    415          op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
    416          return;
    417       default:
    418          vpanic("mapRegs_AMD64RI");
    419    }
    420 }
    421 
    422 
    423 /* --------- Operand, which can be reg or memory only. --------- */
    424 
    425 AMD64RM* AMD64RM_Reg ( HReg reg ) {
     426    AMD64RM* op     = LibVEX_Alloc(sizeof(AMD64RM));
    427    op->tag         = Arm_Reg;
    428    op->Arm.Reg.reg = reg;
    429    return op;
    430 }
    431 AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
    432    AMD64RM* op    = LibVEX_Alloc(sizeof(AMD64RM));
    433    op->tag        = Arm_Mem;
    434    op->Arm.Mem.am = am;
    435    return op;
    436 }
    437 
    438 void ppAMD64RM ( AMD64RM* op ) {
    439    switch (op->tag) {
    440       case Arm_Mem:
    441          ppAMD64AMode(op->Arm.Mem.am);
    442          return;
    443       case Arm_Reg:
    444          ppHRegAMD64(op->Arm.Reg.reg);
    445          return;
    446      default:
    447          vpanic("ppAMD64RM");
    448    }
    449 }
    450 
     451 /* Because an AMD64RM can be used as either a source or a destination
     452    operand, we have to supply a mode -- pertaining to the operand as a
     453    whole -- indicating how it's being used. */
    454 static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
    455    switch (op->tag) {
    456       case Arm_Mem:
    457          /* Memory is read, written or modified.  So we just want to
    458             know the regs read by the amode. */
    459          addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
    460          return;
    461       case Arm_Reg:
    462          /* reg is read, written or modified.  Add it in the
    463             appropriate way. */
    464          addHRegUse(u, mode, op->Arm.Reg.reg);
    465          return;
    466      default:
    467          vpanic("addRegUsage_AMD64RM");
    468    }
    469 }
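         /* Hypothetical illustration: a caller writing through an AMD64RM
            would invoke addRegUsage_AMD64RM(u, op, HRmWrite).  If op is
            Arm_Reg, that register is recorded as written; if op is Arm_Mem,
            only HRmRead uses are recorded, for the registers appearing in the
            amode.  The callers in this file (Ain_MulL and Ain_Div in
            getRegUsage_AMD64Instr) pass HRmRead. */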
    470 
    471 static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
    472 {
    473    switch (op->tag) {
    474       case Arm_Mem:
    475          mapRegs_AMD64AMode(m, op->Arm.Mem.am);
    476          return;
    477       case Arm_Reg:
    478          op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
    479          return;
    480      default:
    481          vpanic("mapRegs_AMD64RM");
    482    }
    483 }
    484 
    485 
    486 /* --------- Instructions. --------- */
    487 
    488 static HChar* showAMD64ScalarSz ( Int sz ) {
    489    switch (sz) {
    490       case 2: return "w";
    491       case 4: return "l";
    492       case 8: return "q";
    493       default: vpanic("showAMD64ScalarSz");
    494    }
    495 }
    496 
    497 HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
    498    switch (op) {
    499       case Aun_NOT: return "not";
    500       case Aun_NEG: return "neg";
    501       default: vpanic("showAMD64UnaryOp");
    502    }
    503 }
    504 
    505 HChar* showAMD64AluOp ( AMD64AluOp op ) {
    506    switch (op) {
    507       case Aalu_MOV:  return "mov";
    508       case Aalu_CMP:  return "cmp";
    509       case Aalu_ADD:  return "add";
    510       case Aalu_SUB:  return "sub";
    511       case Aalu_ADC:  return "adc";
    512       case Aalu_SBB:  return "sbb";
    513       case Aalu_AND:  return "and";
    514       case Aalu_OR:   return "or";
    515       case Aalu_XOR:  return "xor";
    516       case Aalu_MUL:  return "imul";
    517       default: vpanic("showAMD64AluOp");
    518    }
    519 }
    520 
    521 HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
    522    switch (op) {
    523       case Ash_SHL: return "shl";
    524       case Ash_SHR: return "shr";
    525       case Ash_SAR: return "sar";
    526       default: vpanic("showAMD64ShiftOp");
    527    }
    528 }
    529 
    530 HChar* showA87FpOp ( A87FpOp op ) {
    531    switch (op) {
    532 //..       case Xfp_ADD:    return "add";
    533 //..       case Xfp_SUB:    return "sub";
    534 //..       case Xfp_MUL:    return "mul";
    535 //..       case Xfp_DIV:    return "div";
    536       case Afp_SCALE:  return "scale";
    537       case Afp_ATAN:   return "atan";
    538       case Afp_YL2X:   return "yl2x";
    539       case Afp_YL2XP1: return "yl2xp1";
    540       case Afp_PREM:   return "prem";
    541       case Afp_PREM1:  return "prem1";
    542       case Afp_SQRT:   return "sqrt";
    543 //..       case Xfp_ABS:    return "abs";
    544 //..       case Xfp_NEG:    return "chs";
    545 //..       case Xfp_MOV:    return "mov";
    546       case Afp_SIN:    return "sin";
    547       case Afp_COS:    return "cos";
    548       case Afp_TAN:    return "tan";
    549       case Afp_ROUND:  return "round";
    550       case Afp_2XM1:   return "2xm1";
    551       default: vpanic("showA87FpOp");
    552    }
    553 }
    554 
    555 HChar* showAMD64SseOp ( AMD64SseOp op ) {
    556    switch (op) {
    557       case Asse_MOV:      return "movups";
    558       case Asse_ADDF:     return "add";
    559       case Asse_SUBF:     return "sub";
    560       case Asse_MULF:     return "mul";
    561       case Asse_DIVF:     return "div";
    562       case Asse_MAXF:     return "max";
    563       case Asse_MINF:     return "min";
    564       case Asse_CMPEQF:   return "cmpFeq";
    565       case Asse_CMPLTF:   return "cmpFlt";
    566       case Asse_CMPLEF:   return "cmpFle";
    567       case Asse_CMPUNF:   return "cmpFun";
    568       case Asse_RCPF:     return "rcp";
    569       case Asse_RSQRTF:   return "rsqrt";
    570       case Asse_SQRTF:    return "sqrt";
    571       case Asse_AND:      return "and";
    572       case Asse_OR:       return "or";
    573       case Asse_XOR:      return "xor";
    574       case Asse_ANDN:     return "andn";
    575       case Asse_ADD8:     return "paddb";
    576       case Asse_ADD16:    return "paddw";
    577       case Asse_ADD32:    return "paddd";
    578       case Asse_ADD64:    return "paddq";
    579       case Asse_QADD8U:   return "paddusb";
    580       case Asse_QADD16U:  return "paddusw";
    581       case Asse_QADD8S:   return "paddsb";
    582       case Asse_QADD16S:  return "paddsw";
    583       case Asse_SUB8:     return "psubb";
    584       case Asse_SUB16:    return "psubw";
    585       case Asse_SUB32:    return "psubd";
    586       case Asse_SUB64:    return "psubq";
    587       case Asse_QSUB8U:   return "psubusb";
    588       case Asse_QSUB16U:  return "psubusw";
    589       case Asse_QSUB8S:   return "psubsb";
    590       case Asse_QSUB16S:  return "psubsw";
    591       case Asse_MUL16:    return "pmullw";
    592       case Asse_MULHI16U: return "pmulhuw";
    593       case Asse_MULHI16S: return "pmulhw";
    594       case Asse_AVG8U:    return "pavgb";
    595       case Asse_AVG16U:   return "pavgw";
    596       case Asse_MAX16S:   return "pmaxw";
    597       case Asse_MAX8U:    return "pmaxub";
    598       case Asse_MIN16S:   return "pminw";
    599       case Asse_MIN8U:    return "pminub";
    600       case Asse_CMPEQ8:   return "pcmpeqb";
    601       case Asse_CMPEQ16:  return "pcmpeqw";
    602       case Asse_CMPEQ32:  return "pcmpeqd";
    603       case Asse_CMPGT8S:  return "pcmpgtb";
    604       case Asse_CMPGT16S: return "pcmpgtw";
    605       case Asse_CMPGT32S: return "pcmpgtd";
    606       case Asse_SHL16:    return "psllw";
    607       case Asse_SHL32:    return "pslld";
    608       case Asse_SHL64:    return "psllq";
    609       case Asse_SHR16:    return "psrlw";
    610       case Asse_SHR32:    return "psrld";
    611       case Asse_SHR64:    return "psrlq";
    612       case Asse_SAR16:    return "psraw";
    613       case Asse_SAR32:    return "psrad";
    614       case Asse_PACKSSD:  return "packssdw";
    615       case Asse_PACKSSW:  return "packsswb";
    616       case Asse_PACKUSW:  return "packuswb";
    617       case Asse_UNPCKHB:  return "punpckhb";
    618       case Asse_UNPCKHW:  return "punpckhw";
    619       case Asse_UNPCKHD:  return "punpckhd";
    620       case Asse_UNPCKHQ:  return "punpckhq";
    621       case Asse_UNPCKLB:  return "punpcklb";
    622       case Asse_UNPCKLW:  return "punpcklw";
    623       case Asse_UNPCKLD:  return "punpckld";
    624       case Asse_UNPCKLQ:  return "punpcklq";
    625       default: vpanic("showAMD64SseOp");
    626    }
    627 }
    628 
    629 AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
    630    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    631    i->tag             = Ain_Imm64;
    632    i->Ain.Imm64.imm64 = imm64;
    633    i->Ain.Imm64.dst   = dst;
    634    return i;
    635 }
    636 AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
    637    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    638    i->tag            = Ain_Alu64R;
    639    i->Ain.Alu64R.op  = op;
    640    i->Ain.Alu64R.src = src;
    641    i->Ain.Alu64R.dst = dst;
    642    return i;
    643 }
    644 AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
    645    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    646    i->tag            = Ain_Alu64M;
    647    i->Ain.Alu64M.op  = op;
    648    i->Ain.Alu64M.src = src;
    649    i->Ain.Alu64M.dst = dst;
    650    vassert(op != Aalu_MUL);
    651    return i;
    652 }
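         /* A shift amount of zero in Ain.Sh64.src does not mean "shift by 0";
            it is the encoding for a variable shift by %cl.  Both ppAMD64Instr
            and getRegUsage_AMD64Instr below special-case src == 0
            accordingly. */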
    653 AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
    654    AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
    655    i->tag          = Ain_Sh64;
    656    i->Ain.Sh64.op  = op;
    657    i->Ain.Sh64.src = src;
    658    i->Ain.Sh64.dst = dst;
    659    return i;
    660 }
    661 AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
    662    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    663    i->tag              = Ain_Test64;
    664    i->Ain.Test64.imm32 = imm32;
    665    i->Ain.Test64.dst   = dst;
    666    return i;
    667 }
    668 AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
    669    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    670    i->tag             = Ain_Unary64;
    671    i->Ain.Unary64.op  = op;
    672    i->Ain.Unary64.dst = dst;
    673    return i;
    674 }
    675 AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
    676    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    677    i->tag             = Ain_Lea64;
    678    i->Ain.Lea64.am    = am;
    679    i->Ain.Lea64.dst   = dst;
    680    return i;
    681 }
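         /* MulL is a widening multiply: the single operand is multiplied by
            %rax and the 128-bit product is left in %rdx:%rax, which is why
            getRegUsage_AMD64Instr below marks %rax as modified and %rdx as
            written. */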
    682 AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
    683    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    684    i->tag            = Ain_MulL;
    685    i->Ain.MulL.syned = syned;
    686    i->Ain.MulL.src   = src;
    687    return i;
    688 }
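         /* Div divides %rdx:%rax (sz == 8) or %edx:%eax (sz == 4) by the
            operand, leaving the quotient in %rax/%eax and the remainder in
            %rdx/%edx; both registers are therefore marked as modified in
            getRegUsage_AMD64Instr below. */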
    689 AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
    690    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    691    i->tag            = Ain_Div;
    692    i->Ain.Div.syned  = syned;
    693    i->Ain.Div.sz     = sz;
    694    i->Ain.Div.src    = src;
    695    vassert(sz == 4 || sz == 8);
    696    return i;
    697 }
    698 //.. AMD64Instr* AMD64Instr_Sh3232  ( AMD64ShiftOp op, UInt amt, HReg src, HReg dst ) {
    699 //..    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    700 //..    i->tag            = Xin_Sh3232;
    701 //..    i->Xin.Sh3232.op  = op;
    702 //..    i->Xin.Sh3232.amt = amt;
    703 //..    i->Xin.Sh3232.src = src;
    704 //..    i->Xin.Sh3232.dst = dst;
    705 //..    vassert(op == Xsh_SHL || op == Xsh_SHR);
    706 //..    return i;
    707 //.. }
    708 AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
    709    AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
    710    i->tag          = Ain_Push;
    711    i->Ain.Push.src = src;
    712    return i;
    713 }
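         /* The regparms field of Call records how many integer argument
            registers the call uses; the upper bound of 6 presumably
            corresponds to the six integer argument registers of the SysV
            AMD64 calling convention (%rdi, %rsi, %rdx, %rcx, %r8, %r9). */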
    714 AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms ) {
    715    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
    716    i->tag               = Ain_Call;
    717    i->Ain.Call.cond     = cond;
    718    i->Ain.Call.target   = target;
    719    i->Ain.Call.regparms = regparms;
    720    vassert(regparms >= 0 && regparms <= 6);
    721    return i;
    722 }
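         /* As the Ain_Goto case of ppAMD64Instr below illustrates, a Goto is
            rendered as an (optionally conditional) sequence that moves the
            destination into %rax, loads the dispatcher address into %rdx and
            jumps through it, first writing the jump kind to %ebp for kinds
            other than Boring, Call and Ret. */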
    723 AMD64Instr* AMD64Instr_Goto ( IRJumpKind jk, AMD64CondCode cond, AMD64RI* dst ) {
    724    AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
    725    i->tag           = Ain_Goto;
    726    i->Ain.Goto.cond = cond;
    727    i->Ain.Goto.dst  = dst;
    728    i->Ain.Goto.jk   = jk;
    729    return i;
    730 }
    731 AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, AMD64RM* src, HReg dst ) {
    732    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    733    i->tag             = Ain_CMov64;
    734    i->Ain.CMov64.cond = cond;
    735    i->Ain.CMov64.src  = src;
    736    i->Ain.CMov64.dst  = dst;
    737    vassert(cond != Acc_ALWAYS);
    738    return i;
    739 }
    740 AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
    741    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    742    i->tag              = Ain_MovxLQ;
    743    i->Ain.MovxLQ.syned = syned;
    744    i->Ain.MovxLQ.src   = src;
    745    i->Ain.MovxLQ.dst   = dst;
    746    return i;
    747 }
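         /* A 4-byte unsigned LoadEX needs no explicit extension: writing a
            32-bit register already zeroes bits 63:32, so ppAMD64Instr below
            simply prints a plain movl for that case. */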
    748 AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
    749                                 AMD64AMode* src, HReg dst ) {
    750    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    751    i->tag                = Ain_LoadEX;
    752    i->Ain.LoadEX.szSmall = szSmall;
    753    i->Ain.LoadEX.syned   = syned;
    754    i->Ain.LoadEX.src     = src;
    755    i->Ain.LoadEX.dst     = dst;
    756    vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
    757    return i;
    758 }
    759 AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
    760    AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
    761    i->tag           = Ain_Store;
    762    i->Ain.Store.sz  = sz;
    763    i->Ain.Store.src = src;
    764    i->Ain.Store.dst = dst;
    765    vassert(sz == 1 || sz == 2 || sz == 4);
    766    return i;
    767 }
    768 AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
    769    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    770    i->tag            = Ain_Set64;
    771    i->Ain.Set64.cond = cond;
    772    i->Ain.Set64.dst  = dst;
    773    return i;
    774 }
    775 AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
    776    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
    777    i->tag               = Ain_Bsfr64;
    778    i->Ain.Bsfr64.isFwds = isFwds;
    779    i->Ain.Bsfr64.src    = src;
    780    i->Ain.Bsfr64.dst    = dst;
    781    return i;
    782 }
    783 AMD64Instr* AMD64Instr_MFence ( void ) {
    784    AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
    785    i->tag        = Ain_MFence;
    786    return i;
    787 }
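         /* ACAS is an atomic compare-and-swap on the addressed location,
            printed below as a lock cmpxchg taking the expected old value in
            %rax and the new value in %rbx.  DACAS is the double-width variant
            (lock cmpxchg8b/16b), using %rdx:%rax as the expected value and
            %rcx:%rbx as the new value. */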
    788 AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
    789    AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
    790    i->tag           = Ain_ACAS;
    791    i->Ain.ACAS.addr = addr;
    792    i->Ain.ACAS.sz   = sz;
    793    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
    794    return i;
    795 }
    796 AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
    797    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    798    i->tag            = Ain_DACAS;
    799    i->Ain.DACAS.addr = addr;
    800    i->Ain.DACAS.sz   = sz;
    801    vassert(sz == 8 || sz == 4);
    802    return i;
    803 }
    804 
    805 AMD64Instr* AMD64Instr_A87Free ( Int nregs )
    806 {
    807    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
    808    i->tag               = Ain_A87Free;
    809    i->Ain.A87Free.nregs = nregs;
    810    vassert(nregs >= 1 && nregs <= 7);
    811    return i;
    812 }
    813 AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
    814 {
    815    AMD64Instr* i            = LibVEX_Alloc(sizeof(AMD64Instr));
    816    i->tag                   = Ain_A87PushPop;
    817    i->Ain.A87PushPop.addr   = addr;
    818    i->Ain.A87PushPop.isPush = isPush;
    819    i->Ain.A87PushPop.szB    = szB;
    820    vassert(szB == 8 || szB == 4);
    821    return i;
    822 }
    823 AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
    824 {
    825    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    826    i->tag            = Ain_A87FpOp;
    827    i->Ain.A87FpOp.op = op;
    828    return i;
    829 }
    830 AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
    831 {
    832    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    833    i->tag              = Ain_A87LdCW;
    834    i->Ain.A87LdCW.addr = addr;
    835    return i;
    836 }
    837 AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
    838 {
    839    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    840    i->tag              = Ain_A87StSW;
    841    i->Ain.A87StSW.addr = addr;
    842    return i;
    843 }
    844 
    845 //.. AMD64Instr* AMD64Instr_FpUnary ( AMD64FpOp op, HReg src, HReg dst ) {
    846 //..    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
    847 //..    i->tag             = Xin_FpUnary;
    848 //..    i->Xin.FpUnary.op  = op;
    849 //..    i->Xin.FpUnary.src = src;
    850 //..    i->Xin.FpUnary.dst = dst;
    851 //..    return i;
    852 //.. }
    853 //.. AMD64Instr* AMD64Instr_FpBinary ( AMD64FpOp op, HReg srcL, HReg srcR, HReg dst ) {
    854 //..    AMD64Instr* i          = LibVEX_Alloc(sizeof(AMD64Instr));
    855 //..    i->tag               = Xin_FpBinary;
    856 //..    i->Xin.FpBinary.op   = op;
    857 //..    i->Xin.FpBinary.srcL = srcL;
    858 //..    i->Xin.FpBinary.srcR = srcR;
    859 //..    i->Xin.FpBinary.dst  = dst;
    860 //..    return i;
    861 //.. }
    862 //.. AMD64Instr* AMD64Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, AMD64AMode* addr ) {
    863 //..    AMD64Instr* i          = LibVEX_Alloc(sizeof(AMD64Instr));
    864 //..    i->tag               = Xin_FpLdSt;
    865 //..    i->Xin.FpLdSt.isLoad = isLoad;
    866 //..    i->Xin.FpLdSt.sz     = sz;
    867 //..    i->Xin.FpLdSt.reg    = reg;
    868 //..    i->Xin.FpLdSt.addr   = addr;
    869 //..    vassert(sz == 4 || sz == 8);
    870 //..    return i;
    871 //.. }
    872 //.. AMD64Instr* AMD64Instr_FpLdStI ( Bool isLoad, UChar sz,
    873 //..                              HReg reg, AMD64AMode* addr ) {
    874 //..    AMD64Instr* i           = LibVEX_Alloc(sizeof(AMD64Instr));
    875 //..    i->tag                = Xin_FpLdStI;
    876 //..    i->Xin.FpLdStI.isLoad = isLoad;
    877 //..    i->Xin.FpLdStI.sz     = sz;
    878 //..    i->Xin.FpLdStI.reg    = reg;
    879 //..    i->Xin.FpLdStI.addr   = addr;
    880 //..    vassert(sz == 2 || sz == 4 || sz == 8);
    881 //..    return i;
    882 //.. }
    883 //.. AMD64Instr* AMD64Instr_Fp64to32 ( HReg src, HReg dst ) {
    884 //..    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    885 //..    i->tag              = Xin_Fp64to32;
    886 //..    i->Xin.Fp64to32.src = src;
    887 //..    i->Xin.Fp64to32.dst = dst;
    888 //..    return i;
    889 //.. }
    890 //.. AMD64Instr* AMD64Instr_FpCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
    891 //..    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
    892 //..    i->tag             = Xin_FpCMov;
    893 //..    i->Xin.FpCMov.cond = cond;
    894 //..    i->Xin.FpCMov.src  = src;
    895 //..    i->Xin.FpCMov.dst  = dst;
    896 //..    vassert(cond != Xcc_ALWAYS);
    897 //..    return i;
    898 //.. }
    899 AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
    900    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    901    i->tag                = Ain_LdMXCSR;
    902    i->Ain.LdMXCSR.addr   = addr;
    903    return i;
    904 }
    905 //.. AMD64Instr* AMD64Instr_FpStSW_AX ( void ) {
    906 //..    AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
    907 //..    i->tag      = Xin_FpStSW_AX;
    908 //..    return i;
    909 //.. }
    910 AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
    911    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    912    i->tag                = Ain_SseUComIS;
    913    i->Ain.SseUComIS.sz   = toUChar(sz);
    914    i->Ain.SseUComIS.srcL = srcL;
    915    i->Ain.SseUComIS.srcR = srcR;
    916    i->Ain.SseUComIS.dst  = dst;
    917    vassert(sz == 4 || sz == 8);
    918    return i;
    919 }
    920 AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
    921    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    922    i->tag              = Ain_SseSI2SF;
    923    i->Ain.SseSI2SF.szS = toUChar(szS);
    924    i->Ain.SseSI2SF.szD = toUChar(szD);
    925    i->Ain.SseSI2SF.src = src;
    926    i->Ain.SseSI2SF.dst = dst;
    927    vassert(szS == 4 || szS == 8);
    928    vassert(szD == 4 || szD == 8);
    929    return i;
    930 }
    931 AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
    932    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    933    i->tag              = Ain_SseSF2SI;
    934    i->Ain.SseSF2SI.szS = toUChar(szS);
    935    i->Ain.SseSF2SI.szD = toUChar(szD);
    936    i->Ain.SseSF2SI.src = src;
    937    i->Ain.SseSF2SI.dst = dst;
    938    vassert(szS == 4 || szS == 8);
    939    vassert(szD == 4 || szD == 8);
    940    return i;
    941 }
    942 AMD64Instr* AMD64Instr_SseSDSS   ( Bool from64, HReg src, HReg dst )
    943 {
    944    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    945    i->tag                = Ain_SseSDSS;
    946    i->Ain.SseSDSS.from64 = from64;
    947    i->Ain.SseSDSS.src    = src;
    948    i->Ain.SseSDSS.dst    = dst;
    949    return i;
    950 }
    951 
    952 //.. AMD64Instr* AMD64Instr_SseConst ( UShort con, HReg dst ) {
    953 //..    AMD64Instr* i            = LibVEX_Alloc(sizeof(AMD64Instr));
    954 //..    i->tag                 = Xin_SseConst;
    955 //..    i->Xin.SseConst.con    = con;
    956 //..    i->Xin.SseConst.dst    = dst;
    957 //..    vassert(hregClass(dst) == HRcVec128);
    958 //..    return i;
    959 //.. }
    960 AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
    961                                  HReg reg, AMD64AMode* addr ) {
    962    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    963    i->tag                = Ain_SseLdSt;
    964    i->Ain.SseLdSt.isLoad = isLoad;
    965    i->Ain.SseLdSt.sz     = toUChar(sz);
    966    i->Ain.SseLdSt.reg    = reg;
    967    i->Ain.SseLdSt.addr   = addr;
    968    vassert(sz == 4 || sz == 8 || sz == 16);
    969    return i;
    970 }
    971 AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
    972 {
    973    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    974    i->tag                = Ain_SseLdzLO;
    975    i->Ain.SseLdzLO.sz    = sz;
    976    i->Ain.SseLdzLO.reg   = reg;
    977    i->Ain.SseLdzLO.addr  = addr;
    978    vassert(sz == 4 || sz == 8);
    979    return i;
    980 }
    981 AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
    982    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    983    i->tag              = Ain_Sse32Fx4;
    984    i->Ain.Sse32Fx4.op  = op;
    985    i->Ain.Sse32Fx4.src = src;
    986    i->Ain.Sse32Fx4.dst = dst;
    987    vassert(op != Asse_MOV);
    988    return i;
    989 }
    990 AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
    991    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    992    i->tag              = Ain_Sse32FLo;
    993    i->Ain.Sse32FLo.op  = op;
    994    i->Ain.Sse32FLo.src = src;
    995    i->Ain.Sse32FLo.dst = dst;
    996    vassert(op != Asse_MOV);
    997    return i;
    998 }
    999 AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
   1000    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   1001    i->tag              = Ain_Sse64Fx2;
   1002    i->Ain.Sse64Fx2.op  = op;
   1003    i->Ain.Sse64Fx2.src = src;
   1004    i->Ain.Sse64Fx2.dst = dst;
   1005    vassert(op != Asse_MOV);
   1006    return i;
   1007 }
   1008 AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
   1009    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   1010    i->tag              = Ain_Sse64FLo;
   1011    i->Ain.Sse64FLo.op  = op;
   1012    i->Ain.Sse64FLo.src = src;
   1013    i->Ain.Sse64FLo.dst = dst;
   1014    vassert(op != Asse_MOV);
   1015    return i;
   1016 }
   1017 AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
   1018    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
   1019    i->tag             = Ain_SseReRg;
   1020    i->Ain.SseReRg.op  = op;
   1021    i->Ain.SseReRg.src = re;
   1022    i->Ain.SseReRg.dst = rg;
   1023    return i;
   1024 }
   1025 AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
   1026    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   1027    i->tag              = Ain_SseCMov;
   1028    i->Ain.SseCMov.cond = cond;
   1029    i->Ain.SseCMov.src  = src;
   1030    i->Ain.SseCMov.dst  = dst;
   1031    vassert(cond != Acc_ALWAYS);
   1032    return i;
   1033 }
   1034 AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
   1035    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
   1036    i->tag               = Ain_SseShuf;
   1037    i->Ain.SseShuf.order = order;
   1038    i->Ain.SseShuf.src   = src;
   1039    i->Ain.SseShuf.dst   = dst;
   1040    vassert(order >= 0 && order <= 0xFF);
   1041    return i;
   1042 }
   1043 
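         /* Worked example (illustrative only): the instruction built by
            AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(0x2A), hregAMD64_RAX())
            is printed by ppAMD64Instr below as "addq $0x2a,%rax". */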
   1044 void ppAMD64Instr ( AMD64Instr* i, Bool mode64 )
   1045 {
   1046    vassert(mode64 == True);
   1047    switch (i->tag) {
   1048       case Ain_Imm64:
   1049          vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
   1050          ppHRegAMD64(i->Ain.Imm64.dst);
   1051          return;
   1052       case Ain_Alu64R:
   1053          vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
   1054          ppAMD64RMI(i->Ain.Alu64R.src);
   1055          vex_printf(",");
   1056          ppHRegAMD64(i->Ain.Alu64R.dst);
   1057          return;
   1058       case Ain_Alu64M:
   1059          vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
   1060          ppAMD64RI(i->Ain.Alu64M.src);
   1061          vex_printf(",");
   1062          ppAMD64AMode(i->Ain.Alu64M.dst);
   1063          return;
   1064       case Ain_Sh64:
   1065          vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
   1066          if (i->Ain.Sh64.src == 0)
   1067             vex_printf("%%cl,");
   1068          else
   1069             vex_printf("$%d,", (Int)i->Ain.Sh64.src);
   1070          ppHRegAMD64(i->Ain.Sh64.dst);
   1071          return;
   1072       case Ain_Test64:
   1073          vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
   1074          ppHRegAMD64(i->Ain.Test64.dst);
   1075          return;
   1076       case Ain_Unary64:
   1077          vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
   1078          ppHRegAMD64(i->Ain.Unary64.dst);
   1079          return;
   1080       case Ain_Lea64:
   1081          vex_printf("leaq ");
   1082          ppAMD64AMode(i->Ain.Lea64.am);
   1083          vex_printf(",");
   1084          ppHRegAMD64(i->Ain.Lea64.dst);
   1085          return;
   1086       case Ain_MulL:
   1087          vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
   1088          ppAMD64RM(i->Ain.MulL.src);
   1089          return;
   1090       case Ain_Div:
   1091          vex_printf("%cdiv%s ",
   1092                     i->Ain.Div.syned ? 's' : 'u',
   1093                     showAMD64ScalarSz(i->Ain.Div.sz));
   1094          ppAMD64RM(i->Ain.Div.src);
   1095          return;
   1096 //..       case Xin_Sh3232:
   1097 //..          vex_printf("%sdl ", showAMD64ShiftOp(i->Xin.Sh3232.op));
   1098 //..          if (i->Xin.Sh3232.amt == 0)
   1099 //..            vex_printf(" %%cl,");
   1100 //..          else
   1101 //..             vex_printf(" $%d,", i->Xin.Sh3232.amt);
   1102 //..          ppHRegAMD64(i->Xin.Sh3232.src);
   1103 //..          vex_printf(",");
   1104 //..          ppHRegAMD64(i->Xin.Sh3232.dst);
   1105 //..          return;
   1106       case Ain_Push:
   1107          vex_printf("pushq ");
   1108          ppAMD64RMI(i->Ain.Push.src);
   1109          return;
   1110       case Ain_Call:
   1111          vex_printf("call%s[%d] ",
   1112                     i->Ain.Call.cond==Acc_ALWAYS
   1113                        ? "" : showAMD64CondCode(i->Ain.Call.cond),
   1114                     i->Ain.Call.regparms );
   1115          vex_printf("0x%llx", i->Ain.Call.target);
   1116          break;
   1117       case Ain_Goto:
   1118          if (i->Ain.Goto.cond != Acc_ALWAYS) {
   1119             vex_printf("if (%%rflags.%s) { ",
   1120                        showAMD64CondCode(i->Ain.Goto.cond));
   1121          }
   1122          if (i->Ain.Goto.jk != Ijk_Boring
   1123              && i->Ain.Goto.jk != Ijk_Call
   1124              && i->Ain.Goto.jk != Ijk_Ret) {
   1125             vex_printf("movl $");
   1126             ppIRJumpKind(i->Ain.Goto.jk);
   1127             vex_printf(",%%ebp ; ");
   1128          }
   1129          vex_printf("movq ");
   1130          ppAMD64RI(i->Ain.Goto.dst);
   1131          vex_printf(",%%rax ; movabsq $dispatcher_addr,%%rdx ; jmp *%%rdx");
   1132          if (i->Ain.Goto.cond != Acc_ALWAYS) {
   1133             vex_printf(" }");
   1134          }
   1135          return;
   1136       case Ain_CMov64:
   1137          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
   1138          ppAMD64RM(i->Ain.CMov64.src);
   1139          vex_printf(",");
   1140          ppHRegAMD64(i->Ain.CMov64.dst);
   1141          return;
   1142       case Ain_MovxLQ:
   1143          vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
   1144          ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
   1145          vex_printf(",");
   1146          ppHRegAMD64(i->Ain.MovxLQ.dst);
   1147          return;
   1148       case Ain_LoadEX:
   1149          if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
   1150             vex_printf("movl ");
   1151             ppAMD64AMode(i->Ain.LoadEX.src);
   1152             vex_printf(",");
   1153             ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
   1154          } else {
   1155             vex_printf("mov%c%cq ",
   1156                        i->Ain.LoadEX.syned ? 's' : 'z',
   1157                        i->Ain.LoadEX.szSmall==1
   1158                           ? 'b'
   1159                           : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
   1160             ppAMD64AMode(i->Ain.LoadEX.src);
   1161             vex_printf(",");
   1162             ppHRegAMD64(i->Ain.LoadEX.dst);
   1163          }
   1164          return;
   1165       case Ain_Store:
   1166          vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
   1167                               : (i->Ain.Store.sz==2 ? 'w' : 'l'));
   1168          ppHRegAMD64(i->Ain.Store.src);
   1169          vex_printf(",");
   1170          ppAMD64AMode(i->Ain.Store.dst);
   1171          return;
   1172       case Ain_Set64:
   1173          vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
   1174          ppHRegAMD64(i->Ain.Set64.dst);
   1175          return;
   1176       case Ain_Bsfr64:
   1177          vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
   1178          ppHRegAMD64(i->Ain.Bsfr64.src);
   1179          vex_printf(",");
   1180          ppHRegAMD64(i->Ain.Bsfr64.dst);
   1181          return;
   1182       case Ain_MFence:
   1183          vex_printf("mfence" );
   1184          return;
   1185       case Ain_ACAS:
   1186          vex_printf("lock cmpxchg%c ",
   1187                      i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
   1188                      : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
   1189          vex_printf("{%%rax->%%rbx},");
   1190          ppAMD64AMode(i->Ain.ACAS.addr);
   1191          return;
   1192       case Ain_DACAS:
   1193          vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
   1194                     (Int)(2 * i->Ain.DACAS.sz));
   1195          ppAMD64AMode(i->Ain.DACAS.addr);
   1196          return;
   1197       case Ain_A87Free:
   1198          vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
   1199          break;
   1200       case Ain_A87PushPop:
   1201          vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
   1202                     i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
   1203          ppAMD64AMode(i->Ain.A87PushPop.addr);
   1204          break;
   1205       case Ain_A87FpOp:
   1206          vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
   1207          break;
   1208       case Ain_A87LdCW:
   1209          vex_printf("fldcw ");
   1210          ppAMD64AMode(i->Ain.A87LdCW.addr);
   1211          break;
   1212       case Ain_A87StSW:
   1213          vex_printf("fstsw ");
   1214          ppAMD64AMode(i->Ain.A87StSW.addr);
   1215          break;
   1216 //..       case Xin_FpUnary:
   1217 //..          vex_printf("g%sD ", showAMD64FpOp(i->Xin.FpUnary.op));
   1218 //..          ppHRegAMD64(i->Xin.FpUnary.src);
   1219 //..          vex_printf(",");
   1220 //..          ppHRegAMD64(i->Xin.FpUnary.dst);
   1221 //..          break;
   1222 //..       case Xin_FpBinary:
   1223 //..          vex_printf("g%sD ", showAMD64FpOp(i->Xin.FpBinary.op));
   1224 //..          ppHRegAMD64(i->Xin.FpBinary.srcL);
   1225 //..          vex_printf(",");
   1226 //..          ppHRegAMD64(i->Xin.FpBinary.srcR);
   1227 //..          vex_printf(",");
   1228 //..          ppHRegAMD64(i->Xin.FpBinary.dst);
   1229 //..          break;
   1230 //..       case Xin_FpLdSt:
   1231 //..          if (i->Xin.FpLdSt.isLoad) {
   1232 //..             vex_printf("gld%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
   1233 //..             ppAMD64AMode(i->Xin.FpLdSt.addr);
   1234 //..             vex_printf(", ");
   1235 //..             ppHRegAMD64(i->Xin.FpLdSt.reg);
   1236 //..          } else {
   1237 //..             vex_printf("gst%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
   1238 //..             ppHRegAMD64(i->Xin.FpLdSt.reg);
   1239 //..             vex_printf(", ");
   1240 //..             ppAMD64AMode(i->Xin.FpLdSt.addr);
   1241 //..          }
   1242 //..          return;
   1243 //..       case Xin_FpLdStI:
   1244 //..          if (i->Xin.FpLdStI.isLoad) {
   1245 //..             vex_printf("gild%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
   1246 //..                                   i->Xin.FpLdStI.sz==4 ? "l" : "w");
   1247 //..             ppAMD64AMode(i->Xin.FpLdStI.addr);
   1248 //..             vex_printf(", ");
   1249 //..             ppHRegAMD64(i->Xin.FpLdStI.reg);
   1250 //..          } else {
   1251 //..             vex_printf("gist%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
   1252 //..                                   i->Xin.FpLdStI.sz==4 ? "l" : "w");
   1253 //..             ppHRegAMD64(i->Xin.FpLdStI.reg);
   1254 //..             vex_printf(", ");
   1255 //..             ppAMD64AMode(i->Xin.FpLdStI.addr);
   1256 //..          }
   1257 //..          return;
   1258 //..       case Xin_Fp64to32:
   1259 //..          vex_printf("gdtof ");
   1260 //..          ppHRegAMD64(i->Xin.Fp64to32.src);
   1261 //..          vex_printf(",");
   1262 //..          ppHRegAMD64(i->Xin.Fp64to32.dst);
   1263 //..          return;
   1264 //..       case Xin_FpCMov:
   1265 //..          vex_printf("gcmov%s ", showAMD64CondCode(i->Xin.FpCMov.cond));
   1266 //..          ppHRegAMD64(i->Xin.FpCMov.src);
   1267 //..          vex_printf(",");
   1268 //..          ppHRegAMD64(i->Xin.FpCMov.dst);
   1269 //..          return;
   1270 //..       case Xin_FpLdStCW:
   1271 //..          vex_printf(i->Xin.FpLdStCW.isLoad ? "fldcw " : "fstcw ");
   1272 //..          ppAMD64AMode(i->Xin.FpLdStCW.addr);
   1273 //..          return;
   1274 //..       case Xin_FpStSW_AX:
   1275 //..          vex_printf("fstsw %%ax");
   1276 //..          return;
   1277       case Ain_LdMXCSR:
   1278          vex_printf("ldmxcsr ");
   1279          ppAMD64AMode(i->Ain.LdMXCSR.addr);
   1280          break;
   1281       case Ain_SseUComIS:
   1282          vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
   1283          ppHRegAMD64(i->Ain.SseUComIS.srcL);
   1284          vex_printf(",");
   1285          ppHRegAMD64(i->Ain.SseUComIS.srcR);
   1286          vex_printf(" ; pushfq ; popq ");
   1287          ppHRegAMD64(i->Ain.SseUComIS.dst);
   1288          break;
   1289       case Ain_SseSI2SF:
   1290          vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
   1291          (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1292             (i->Ain.SseSI2SF.src);
   1293          vex_printf(",");
   1294          ppHRegAMD64(i->Ain.SseSI2SF.dst);
   1295          break;
   1296       case Ain_SseSF2SI:
   1297          vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
   1298          ppHRegAMD64(i->Ain.SseSF2SI.src);
   1299          vex_printf(",");
   1300          (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1301             (i->Ain.SseSF2SI.dst);
   1302          break;
   1303       case Ain_SseSDSS:
   1304          vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
   1305          ppHRegAMD64(i->Ain.SseSDSS.src);
   1306          vex_printf(",");
   1307          ppHRegAMD64(i->Ain.SseSDSS.dst);
   1308          break;
   1309 //..       case Xin_SseConst:
   1310 //..          vex_printf("const $0x%04x,", (Int)i->Xin.SseConst.con);
   1311 //..          ppHRegAMD64(i->Xin.SseConst.dst);
   1312 //..          break;
   1313       case Ain_SseLdSt:
   1314          switch (i->Ain.SseLdSt.sz) {
   1315             case 4:  vex_printf("movss "); break;
   1316             case 8:  vex_printf("movsd "); break;
   1317             case 16: vex_printf("movups "); break;
   1318             default: vassert(0);
   1319          }
   1320          if (i->Ain.SseLdSt.isLoad) {
   1321             ppAMD64AMode(i->Ain.SseLdSt.addr);
   1322             vex_printf(",");
   1323             ppHRegAMD64(i->Ain.SseLdSt.reg);
   1324          } else {
   1325             ppHRegAMD64(i->Ain.SseLdSt.reg);
   1326             vex_printf(",");
   1327             ppAMD64AMode(i->Ain.SseLdSt.addr);
   1328          }
   1329          return;
   1330       case Ain_SseLdzLO:
   1331          vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
   1332          ppAMD64AMode(i->Ain.SseLdzLO.addr);
   1333          vex_printf(",");
   1334          ppHRegAMD64(i->Ain.SseLdzLO.reg);
   1335          return;
   1336       case Ain_Sse32Fx4:
   1337          vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
   1338          ppHRegAMD64(i->Ain.Sse32Fx4.src);
   1339          vex_printf(",");
   1340          ppHRegAMD64(i->Ain.Sse32Fx4.dst);
   1341          return;
   1342       case Ain_Sse32FLo:
   1343          vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
   1344          ppHRegAMD64(i->Ain.Sse32FLo.src);
   1345          vex_printf(",");
   1346          ppHRegAMD64(i->Ain.Sse32FLo.dst);
   1347          return;
   1348       case Ain_Sse64Fx2:
   1349          vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
   1350          ppHRegAMD64(i->Ain.Sse64Fx2.src);
   1351          vex_printf(",");
   1352          ppHRegAMD64(i->Ain.Sse64Fx2.dst);
   1353          return;
   1354       case Ain_Sse64FLo:
   1355          vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
   1356          ppHRegAMD64(i->Ain.Sse64FLo.src);
   1357          vex_printf(",");
   1358          ppHRegAMD64(i->Ain.Sse64FLo.dst);
   1359          return;
   1360       case Ain_SseReRg:
   1361          vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
   1362          ppHRegAMD64(i->Ain.SseReRg.src);
   1363          vex_printf(",");
   1364          ppHRegAMD64(i->Ain.SseReRg.dst);
   1365          return;
   1366       case Ain_SseCMov:
   1367          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
   1368          ppHRegAMD64(i->Ain.SseCMov.src);
   1369          vex_printf(",");
   1370          ppHRegAMD64(i->Ain.SseCMov.dst);
   1371          return;
   1372       case Ain_SseShuf:
   1373          vex_printf("pshufd $0x%x,", i->Ain.SseShuf.order);
   1374          ppHRegAMD64(i->Ain.SseShuf.src);
   1375          vex_printf(",");
   1376          ppHRegAMD64(i->Ain.SseShuf.dst);
   1377          return;
   1378 
   1379       default:
   1380          vpanic("ppAMD64Instr");
   1381    }
   1382 }
   1383 
   1384 /* --------- Helpers for register allocation. --------- */
   1385 
   1386 void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 )
   1387 {
   1388    Bool unary;
   1389    vassert(mode64 == True);
   1390    initHRegUsage(u);
   1391    switch (i->tag) {
   1392       case Ain_Imm64:
   1393          addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
   1394          return;
   1395       case Ain_Alu64R:
   1396          addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
   1397          if (i->Ain.Alu64R.op == Aalu_MOV) {
   1398             addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
   1399             return;
   1400          }
   1401          if (i->Ain.Alu64R.op == Aalu_CMP) {
   1402             addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
   1403             return;
   1404          }
   1405          addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
   1406          return;
   1407       case Ain_Alu64M:
   1408          addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
   1409          addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
   1410          return;
   1411       case Ain_Sh64:
   1412          addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
   1413          if (i->Ain.Sh64.src == 0)
   1414             addHRegUse(u, HRmRead, hregAMD64_RCX());
   1415          return;
   1416       case Ain_Test64:
   1417          addHRegUse(u, HRmRead, i->Ain.Test64.dst);
   1418          return;
   1419       case Ain_Unary64:
   1420          addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
   1421          return;
   1422       case Ain_Lea64:
   1423          addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
   1424          addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
   1425          return;
   1426       case Ain_MulL:
   1427          addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
   1428          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1429          addHRegUse(u, HRmWrite, hregAMD64_RDX());
   1430          return;
   1431       case Ain_Div:
   1432          addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
   1433          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1434          addHRegUse(u, HRmModify, hregAMD64_RDX());
   1435          return;
   1436 //..       case Xin_Sh3232:
   1437 //..          addHRegUse(u, HRmRead, i->Xin.Sh3232.src);
   1438 //..          addHRegUse(u, HRmModify, i->Xin.Sh3232.dst);
   1439 //..          if (i->Xin.Sh3232.amt == 0)
   1440 //..             addHRegUse(u, HRmRead, hregAMD64_ECX());
   1441 //..          return;
   1442       case Ain_Push:
   1443          addRegUsage_AMD64RMI(u, i->Ain.Push.src);
   1444          addHRegUse(u, HRmModify, hregAMD64_RSP());
   1445          return;
   1446       case Ain_Call:
   1447          /* This is a bit subtle. */
   1448          /* First off, claim it trashes all the caller-saved regs
   1449             which fall within the register allocator's jurisdiction.
   1450             These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
   1451             and all the xmm registers.
   1452          */
   1453          addHRegUse(u, HRmWrite, hregAMD64_RAX());
   1454          addHRegUse(u, HRmWrite, hregAMD64_RCX());
   1455          addHRegUse(u, HRmWrite, hregAMD64_RDX());
   1456          addHRegUse(u, HRmWrite, hregAMD64_RSI());
   1457          addHRegUse(u, HRmWrite, hregAMD64_RDI());
   1458          addHRegUse(u, HRmWrite, hregAMD64_R8());
   1459          addHRegUse(u, HRmWrite, hregAMD64_R9());
   1460          addHRegUse(u, HRmWrite, hregAMD64_R10());
   1461          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1462          addHRegUse(u, HRmWrite, hregAMD64_XMM0());
   1463          addHRegUse(u, HRmWrite, hregAMD64_XMM1());
   1464          addHRegUse(u, HRmWrite, hregAMD64_XMM2());
   1465          addHRegUse(u, HRmWrite, hregAMD64_XMM3());
   1466          addHRegUse(u, HRmWrite, hregAMD64_XMM4());
   1467          addHRegUse(u, HRmWrite, hregAMD64_XMM5());
   1468          addHRegUse(u, HRmWrite, hregAMD64_XMM6());
   1469          addHRegUse(u, HRmWrite, hregAMD64_XMM7());
   1470          addHRegUse(u, HRmWrite, hregAMD64_XMM8());
   1471          addHRegUse(u, HRmWrite, hregAMD64_XMM9());
   1472          addHRegUse(u, HRmWrite, hregAMD64_XMM10());
   1473          addHRegUse(u, HRmWrite, hregAMD64_XMM11());
   1474          addHRegUse(u, HRmWrite, hregAMD64_XMM12());
   1475          addHRegUse(u, HRmWrite, hregAMD64_XMM13());
   1476          addHRegUse(u, HRmWrite, hregAMD64_XMM14());
   1477          addHRegUse(u, HRmWrite, hregAMD64_XMM15());
   1478 
   1479          /* Now we have to state any parameter-carrying registers
   1480             which might be read.  This depends on the regparmness. */
   1481          switch (i->Ain.Call.regparms) {
   1482             case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
   1483             case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
   1484             case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
   1485             case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
   1486             case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
   1487             case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
   1488             case 0: break;
   1489             default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
   1490          }
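                 /* For example, a call with regparms == 3 falls through the
                    cases above so that %rdx, %rsi and %rdi all get marked as
                    read, matching the System V AMD64 argument order
                    rdi, rsi, rdx, rcx, r8, r9. */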
   1491          /* Finally, there is the issue that the insn trashes a
   1492             register because the literal target address has to be
   1493             loaded into a register.  Fortunately, r11 is stated in the
   1494             ABI as a scratch register, and so seems a suitable victim.  */
   1495          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1496          /* Upshot of this is that the assembler really must use r11,
   1497             and no other, as a destination temporary. */
   1498          return;
   1499       case Ain_Goto:
   1500          addRegUsage_AMD64RI(u, i->Ain.Goto.dst);
   1501          addHRegUse(u, HRmWrite, hregAMD64_RAX()); /* used for next guest addr */
   1502          addHRegUse(u, HRmWrite, hregAMD64_RDX()); /* used for dispatcher addr */
   1503          if (i->Ain.Goto.jk != Ijk_Boring
   1504              && i->Ain.Goto.jk != Ijk_Call
   1505              && i->Ain.Goto.jk != Ijk_Ret)
   1506             /* note, this is irrelevant since rbp is not actually
   1507                available to the allocator.  But still .. */
   1508             addHRegUse(u, HRmWrite, hregAMD64_RBP());
   1509          return;
   1510       case Ain_CMov64:
   1511          addRegUsage_AMD64RM(u, i->Ain.CMov64.src, HRmRead);
   1512          addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
   1513          return;
   1514       case Ain_MovxLQ:
   1515          addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
   1516          addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
   1517          return;
   1518       case Ain_LoadEX:
   1519          addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
   1520          addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
   1521          return;
   1522       case Ain_Store:
   1523          addHRegUse(u, HRmRead, i->Ain.Store.src);
   1524          addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
   1525          return;
   1526       case Ain_Set64:
   1527          addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
   1528          return;
   1529       case Ain_Bsfr64:
   1530          addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
   1531          addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
   1532          return;
   1533       case Ain_MFence:
   1534          return;
   1535       case Ain_ACAS:
   1536          addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
   1537          addHRegUse(u, HRmRead, hregAMD64_RBX());
   1538          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1539          return;
   1540       case Ain_DACAS:
   1541          addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
   1542          addHRegUse(u, HRmRead, hregAMD64_RCX());
   1543          addHRegUse(u, HRmRead, hregAMD64_RBX());
   1544          addHRegUse(u, HRmModify, hregAMD64_RDX());
   1545          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1546          return;
   1547       case Ain_A87Free:
   1548          return;
   1549       case Ain_A87PushPop:
   1550          addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
   1551          return;
   1552       case Ain_A87FpOp:
   1553          return;
   1554       case Ain_A87LdCW:
   1555          addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
   1556          return;
   1557       case Ain_A87StSW:
   1558          addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
   1559          return;
   1560 //..       case Xin_FpUnary:
   1561 //..          addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
   1562 //..          addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
   1563 //..          return;
   1564 //..       case Xin_FpBinary:
   1565 //..          addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL);
   1566 //..          addHRegUse(u, HRmRead, i->Xin.FpBinary.srcR);
   1567 //..          addHRegUse(u, HRmWrite, i->Xin.FpBinary.dst);
   1568 //..          return;
   1569 //..       case Xin_FpLdSt:
   1570 //..          addRegUsage_AMD64AMode(u, i->Xin.FpLdSt.addr);
   1571 //..          addHRegUse(u, i->Xin.FpLdSt.isLoad ? HRmWrite : HRmRead,
   1572 //..                        i->Xin.FpLdSt.reg);
   1573 //..          return;
   1574 //..       case Xin_FpLdStI:
   1575 //..          addRegUsage_AMD64AMode(u, i->Xin.FpLdStI.addr);
   1576 //..          addHRegUse(u, i->Xin.FpLdStI.isLoad ? HRmWrite : HRmRead,
   1577 //..                        i->Xin.FpLdStI.reg);
   1578 //..          return;
   1579 //..       case Xin_Fp64to32:
   1580 //..          addHRegUse(u, HRmRead,  i->Xin.Fp64to32.src);
   1581 //..          addHRegUse(u, HRmWrite, i->Xin.Fp64to32.dst);
   1582 //..          return;
   1583 //..       case Xin_FpCMov:
   1584 //..          addHRegUse(u, HRmRead,   i->Xin.FpCMov.src);
   1585 //..          addHRegUse(u, HRmModify, i->Xin.FpCMov.dst);
   1586 //..          return;
   1587       case Ain_LdMXCSR:
   1588          addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
   1589          return;
   1590 //..       case Xin_FpStSW_AX:
   1591 //..          addHRegUse(u, HRmWrite, hregAMD64_EAX());
   1592 //..          return;
   1593       case Ain_SseUComIS:
   1594          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
   1595          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
   1596          addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
   1597          return;
   1598       case Ain_SseSI2SF:
   1599          addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
   1600          addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
   1601          return;
   1602       case Ain_SseSF2SI:
   1603          addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
   1604          addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
   1605          return;
   1606       case Ain_SseSDSS:
   1607          addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
   1608          addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
   1609          return;
   1610       case Ain_SseLdSt:
   1611          addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
   1612          addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
   1613                        i->Ain.SseLdSt.reg);
   1614          return;
   1615       case Ain_SseLdzLO:
   1616          addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
   1617          addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
   1618          return;
   1619 //..       case Xin_SseConst:
   1620 //..          addHRegUse(u, HRmWrite, i->Xin.SseConst.dst);
   1621 //..          return;
   1622       case Ain_Sse32Fx4:
   1623          vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
   1624          unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
   1625                          || i->Ain.Sse32Fx4.op == Asse_RSQRTF
   1626                          || i->Ain.Sse32Fx4.op == Asse_SQRTF );
   1627          addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
   1628          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1629                        i->Ain.Sse32Fx4.dst);
   1630          return;
   1631       case Ain_Sse32FLo:
   1632          vassert(i->Ain.Sse32FLo.op != Asse_MOV);
   1633          unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
   1634                          || i->Ain.Sse32FLo.op == Asse_RSQRTF
   1635                          || i->Ain.Sse32FLo.op == Asse_SQRTF );
   1636          addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
   1637          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1638                        i->Ain.Sse32FLo.dst);
   1639          return;
   1640       case Ain_Sse64Fx2:
   1641          vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
   1642          unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
   1643                          || i->Ain.Sse64Fx2.op == Asse_RSQRTF
   1644                          || i->Ain.Sse64Fx2.op == Asse_SQRTF );
   1645          addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
   1646          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1647                        i->Ain.Sse64Fx2.dst);
   1648          return;
   1649       case Ain_Sse64FLo:
   1650          vassert(i->Ain.Sse64FLo.op != Asse_MOV);
   1651          unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
   1652                          || i->Ain.Sse64FLo.op == Asse_RSQRTF
   1653                          || i->Ain.Sse64FLo.op == Asse_SQRTF );
   1654          addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
   1655          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1656                        i->Ain.Sse64FLo.dst);
   1657          return;
   1658       case Ain_SseReRg:
   1659          if ( (i->Ain.SseReRg.op == Asse_XOR
   1660                || i->Ain.SseReRg.op == Asse_CMPEQ32)
   1661               && i->Ain.SseReRg.src == i->Ain.SseReRg.dst) {
   1662             /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
   1663                r,r' as a write of a value to r, independent of any
   1664                previous value in r */
   1665             /* (as opposed to a rite of passage :-) */
   1666             addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
   1667          } else {
   1668             addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
   1669             addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
   1670                              ? HRmWrite : HRmModify,
   1671                           i->Ain.SseReRg.dst);
   1672          }
   1673          return;
   1674       case Ain_SseCMov:
   1675          addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
   1676          addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
   1677          return;
   1678       case Ain_SseShuf:
   1679          addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
   1680          addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
   1681          return;
   1682       default:
   1683          ppAMD64Instr(i, mode64);
   1684          vpanic("getRegUsage_AMD64Instr");
   1685    }
   1686 }
   1687 
   1688 /* local helper */
   1689 static inline void mapReg(HRegRemap* m, HReg* r)
   1690 {
   1691    *r = lookupHRegRemap(m, *r);
   1692 }
   1693 
   1694 void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
   1695 {
   1696    vassert(mode64 == True);
   1697    switch (i->tag) {
   1698       case Ain_Imm64:
   1699          mapReg(m, &i->Ain.Imm64.dst);
   1700          return;
   1701       case Ain_Alu64R:
   1702          mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
   1703          mapReg(m, &i->Ain.Alu64R.dst);
   1704          return;
   1705       case Ain_Alu64M:
   1706          mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
   1707          mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
   1708          return;
   1709       case Ain_Sh64:
   1710          mapReg(m, &i->Ain.Sh64.dst);
   1711          return;
   1712       case Ain_Test64:
   1713          mapReg(m, &i->Ain.Test64.dst);
   1714          return;
   1715       case Ain_Unary64:
   1716          mapReg(m, &i->Ain.Unary64.dst);
   1717          return;
   1718       case Ain_Lea64:
   1719          mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
   1720          mapReg(m, &i->Ain.Lea64.dst);
   1721          return;
   1722       case Ain_MulL:
   1723          mapRegs_AMD64RM(m, i->Ain.MulL.src);
   1724          return;
   1725       case Ain_Div:
   1726          mapRegs_AMD64RM(m, i->Ain.Div.src);
   1727          return;
   1728 //..       case Xin_Sh3232:
   1729 //..          mapReg(m, &i->Xin.Sh3232.src);
   1730 //..          mapReg(m, &i->Xin.Sh3232.dst);
   1731 //..          return;
   1732       case Ain_Push:
   1733          mapRegs_AMD64RMI(m, i->Ain.Push.src);
   1734          return;
   1735       case Ain_Call:
   1736          return;
   1737       case Ain_Goto:
   1738          mapRegs_AMD64RI(m, i->Ain.Goto.dst);
   1739          return;
   1740       case Ain_CMov64:
   1741          mapRegs_AMD64RM(m, i->Ain.CMov64.src);
   1742          mapReg(m, &i->Ain.CMov64.dst);
   1743          return;
   1744       case Ain_MovxLQ:
   1745          mapReg(m, &i->Ain.MovxLQ.src);
   1746          mapReg(m, &i->Ain.MovxLQ.dst);
   1747          return;
   1748       case Ain_LoadEX:
   1749          mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
   1750          mapReg(m, &i->Ain.LoadEX.dst);
   1751          return;
   1752       case Ain_Store:
   1753          mapReg(m, &i->Ain.Store.src);
   1754          mapRegs_AMD64AMode(m, i->Ain.Store.dst);
   1755          return;
   1756       case Ain_Set64:
   1757          mapReg(m, &i->Ain.Set64.dst);
   1758          return;
   1759       case Ain_Bsfr64:
   1760          mapReg(m, &i->Ain.Bsfr64.src);
   1761          mapReg(m, &i->Ain.Bsfr64.dst);
   1762          return;
   1763       case Ain_MFence:
   1764          return;
   1765       case Ain_ACAS:
   1766          mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
   1767          return;
   1768       case Ain_DACAS:
   1769          mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
   1770          return;
   1771       case Ain_A87Free:
   1772          return;
   1773       case Ain_A87PushPop:
   1774          mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
   1775          return;
   1776       case Ain_A87FpOp:
   1777          return;
   1778       case Ain_A87LdCW:
   1779          mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
   1780          return;
   1781       case Ain_A87StSW:
   1782          mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
   1783          return;
   1784 //..       case Xin_FpUnary:
   1785 //..          mapReg(m, &i->Xin.FpUnary.src);
   1786 //..          mapReg(m, &i->Xin.FpUnary.dst);
   1787 //..          return;
   1788 //..       case Xin_FpBinary:
   1789 //..          mapReg(m, &i->Xin.FpBinary.srcL);
   1790 //..          mapReg(m, &i->Xin.FpBinary.srcR);
   1791 //..          mapReg(m, &i->Xin.FpBinary.dst);
   1792 //..          return;
   1793 //..       case Xin_FpLdSt:
   1794 //..          mapRegs_AMD64AMode(m, i->Xin.FpLdSt.addr);
   1795 //..          mapReg(m, &i->Xin.FpLdSt.reg);
   1796 //..          return;
   1797 //..       case Xin_FpLdStI:
   1798 //..          mapRegs_AMD64AMode(m, i->Xin.FpLdStI.addr);
   1799 //..          mapReg(m, &i->Xin.FpLdStI.reg);
   1800 //..          return;
   1801 //..       case Xin_Fp64to32:
   1802 //..          mapReg(m, &i->Xin.Fp64to32.src);
   1803 //..          mapReg(m, &i->Xin.Fp64to32.dst);
   1804 //..          return;
   1805 //..       case Xin_FpCMov:
   1806 //..          mapReg(m, &i->Xin.FpCMov.src);
   1807 //..          mapReg(m, &i->Xin.FpCMov.dst);
   1808 //..          return;
   1809       case Ain_LdMXCSR:
   1810          mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
   1811          return;
   1812 //..       case Xin_FpStSW_AX:
   1813 //..          return;
   1814       case Ain_SseUComIS:
   1815          mapReg(m, &i->Ain.SseUComIS.srcL);
   1816          mapReg(m, &i->Ain.SseUComIS.srcR);
   1817          mapReg(m, &i->Ain.SseUComIS.dst);
   1818          return;
   1819       case Ain_SseSI2SF:
   1820          mapReg(m, &i->Ain.SseSI2SF.src);
   1821          mapReg(m, &i->Ain.SseSI2SF.dst);
   1822          return;
   1823       case Ain_SseSF2SI:
   1824          mapReg(m, &i->Ain.SseSF2SI.src);
   1825          mapReg(m, &i->Ain.SseSF2SI.dst);
   1826          return;
   1827       case Ain_SseSDSS:
   1828          mapReg(m, &i->Ain.SseSDSS.src);
   1829          mapReg(m, &i->Ain.SseSDSS.dst);
   1830          return;
   1831 //..       case Xin_SseConst:
   1832 //..          mapReg(m, &i->Xin.SseConst.dst);
   1833 //..          return;
   1834       case Ain_SseLdSt:
   1835          mapReg(m, &i->Ain.SseLdSt.reg);
   1836          mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
   1837          break;
   1838       case Ain_SseLdzLO:
   1839          mapReg(m, &i->Ain.SseLdzLO.reg);
   1840          mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
   1841          break;
   1842       case Ain_Sse32Fx4:
   1843          mapReg(m, &i->Ain.Sse32Fx4.src);
   1844          mapReg(m, &i->Ain.Sse32Fx4.dst);
   1845          return;
   1846       case Ain_Sse32FLo:
   1847          mapReg(m, &i->Ain.Sse32FLo.src);
   1848          mapReg(m, &i->Ain.Sse32FLo.dst);
   1849          return;
   1850       case Ain_Sse64Fx2:
   1851          mapReg(m, &i->Ain.Sse64Fx2.src);
   1852          mapReg(m, &i->Ain.Sse64Fx2.dst);
   1853          return;
   1854       case Ain_Sse64FLo:
   1855          mapReg(m, &i->Ain.Sse64FLo.src);
   1856          mapReg(m, &i->Ain.Sse64FLo.dst);
   1857          return;
   1858       case Ain_SseReRg:
   1859          mapReg(m, &i->Ain.SseReRg.src);
   1860          mapReg(m, &i->Ain.SseReRg.dst);
   1861          return;
   1862       case Ain_SseCMov:
   1863          mapReg(m, &i->Ain.SseCMov.src);
   1864          mapReg(m, &i->Ain.SseCMov.dst);
   1865          return;
   1866       case Ain_SseShuf:
   1867          mapReg(m, &i->Ain.SseShuf.src);
   1868          mapReg(m, &i->Ain.SseShuf.dst);
   1869          return;
   1870       default:
   1871          ppAMD64Instr(i, mode64);
   1872          vpanic("mapRegs_AMD64Instr");
   1873    }
   1874 }
   1875 
   1876 /* Figure out if i represents a reg-reg move, and if so assign the
   1877    source and destination to *src and *dst.  If in doubt say No.  Used
   1878    by the register allocator to do move coalescing.
   1879 */
   1880 Bool isMove_AMD64Instr ( AMD64Instr* i, HReg* src, HReg* dst )
   1881 {
   1882    /* Moves between integer regs */
   1883    if (i->tag == Ain_Alu64R) {
   1884       if (i->Ain.Alu64R.op != Aalu_MOV)
   1885          return False;
   1886       if (i->Ain.Alu64R.src->tag != Armi_Reg)
   1887          return False;
   1888       *src = i->Ain.Alu64R.src->Armi.Reg.reg;
   1889       *dst = i->Ain.Alu64R.dst;
   1890       return True;
   1891    }
   1892    /* Moves between vector regs */
   1893    if (i->tag == Ain_SseReRg) {
   1894       if (i->Ain.SseReRg.op != Asse_MOV)
   1895          return False;
   1896       *src = i->Ain.SseReRg.src;
   1897       *dst = i->Ain.SseReRg.dst;
   1898       return True;
   1899    }
   1900    return False;
   1901 }
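        /* For instance, "movq %rdx, %rcx" turns up here as an Alu64R with
           op Aalu_MOV and an Armi_Reg source, so the tests above would
           report src = %rdx, dst = %rcx; the allocator can then try to
           coalesce the two and drop the move altogether. */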
   1902 
   1903 
   1904 /* Generate amd64 spill/reload instructions under the direction of the
   1905    register allocator.  Note it's critical these don't write the
   1906    condition codes. */
   1907 
   1908 void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1909                       HReg rreg, Int offsetB, Bool mode64 )
   1910 {
   1911    AMD64AMode* am;
   1912    vassert(offsetB >= 0);
   1913    vassert(!hregIsVirtual(rreg));
   1914    vassert(mode64 == True);
   1915    *i1 = *i2 = NULL;
   1916    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   1917    switch (hregClass(rreg)) {
   1918       case HRcInt64:
   1919          *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
   1920          return;
   1921       case HRcVec128:
   1922          *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
   1923          return;
   1924       default:
   1925          ppHRegClass(hregClass(rreg));
   1926          vpanic("genSpill_AMD64: unimplemented regclass");
   1927    }
   1928 }
   1929 
   1930 void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1931                        HReg rreg, Int offsetB, Bool mode64 )
   1932 {
   1933    AMD64AMode* am;
   1934    vassert(offsetB >= 0);
   1935    vassert(!hregIsVirtual(rreg));
   1936    vassert(mode64 == True);
   1937    *i1 = *i2 = NULL;
   1938    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   1939    switch (hregClass(rreg)) {
   1940       case HRcInt64:
   1941          *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
   1942          return;
   1943       case HRcVec128:
   1944          *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
   1945          return;
   1946       default:
   1947          ppHRegClass(hregClass(rreg));
   1948          vpanic("genReload_AMD64: unimplemented regclass");
   1949    }
   1950 }
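        /* Concretely, for an HRcInt64 rreg and an offsetB of (say) 48, the
           spill generated above is "movq %rreg, 48(%rbp)" and the reload is
           "movq 48(%rbp), %rreg"; for HRcVec128 the pair is the movups
           equivalent.  None of these forms write the condition codes, which
           is what the comment above demands. */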
   1951 
   1952 
   1953 /* --------- The amd64 assembler (bleh.) --------- */
   1954 
   1955 /* Produce the low three bits of an integer register number. */
   1956 static UChar iregBits210 ( HReg r )
   1957 {
   1958    UInt n;
   1959    vassert(hregClass(r) == HRcInt64);
   1960    vassert(!hregIsVirtual(r));
   1961    n = hregNumber(r);
   1962    vassert(n <= 15);
   1963    return toUChar(n & 7);
   1964 }
   1965 
   1966 /* Produce bit 3 of an integer register number. */
   1967 static UChar iregBit3 ( HReg r )
   1968 {
   1969    UInt n;
   1970    vassert(hregClass(r) == HRcInt64);
   1971    vassert(!hregIsVirtual(r));
   1972    n = hregNumber(r);
   1973    vassert(n <= 15);
   1974    return toUChar((n >> 3) & 1);
   1975 }
   1976 
   1977 /* Produce a complete 4-bit integer register number. */
   1978 static UChar iregBits3210 ( HReg r )
   1979 {
   1980    UInt n;
   1981    vassert(hregClass(r) == HRcInt64);
   1982    vassert(!hregIsVirtual(r));
   1983    n = hregNumber(r);
   1984    vassert(n <= 15);
   1985    return toUChar(n);
   1986 }
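        /* As an example of the split: %r9 has encoding number 9 (binary 1001),
           so iregBits210 gives 1, iregBit3 gives 1 and iregBits3210 gives 9.
           %rcx (number 1) also gives 1 from iregBits210 but 0 from iregBit3;
           it is the bit-3 value, placed in the REX prefix, that tells the
           two apart. */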
   1987 
   1988 /* Given an xmm (128bit V-class) register number, produce the
   1989    equivalent numbered register in 64-bit I-class.  This is a bit of
   1990    fakery which allows functions that work on integer
   1991    register numbers to be used when assembling SSE instructions
   1992    too. */
   1993 static HReg vreg2ireg ( HReg r )
   1994 {
   1995    UInt n;
   1996    vassert(hregClass(r) == HRcVec128);
   1997    vassert(!hregIsVirtual(r));
   1998    n = hregNumber(r);
   1999    vassert(n <= 15);
   2000    return mkHReg(n, HRcInt64, False);
   2001 }
   2002 
   2003 static UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem )
   2004 {
   2005    return toUChar( ((mod & 3) << 6)
   2006                    | ((reg & 7) << 3)
   2007                    | (regmem & 7) );
   2008 }
   2009 
   2010 static UChar mkSIB ( Int shift, Int regindex, Int regbase )
   2011 {
   2012    return toUChar( ((shift & 3) << 6)
   2013                    | ((regindex & 7) << 3)
   2014                    | (regbase & 7) );
   2015 }
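        /* Sanity check of the packing: mkModRegRM(3,1,2) is
           (3<<6)|(1<<3)|2 = 0xCA, and mkSIB(2,1,0) is (2<<6)|(1<<3)|0 = 0x88,
           which, taking %rax as the base and %rcx as the index, is the SIB
           byte for a (%rax,%rcx,4) address. */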
   2016 
   2017 static UChar* emit32 ( UChar* p, UInt w32 )
   2018 {
   2019    *p++ = toUChar((w32)       & 0x000000FF);
   2020    *p++ = toUChar((w32 >>  8) & 0x000000FF);
   2021    *p++ = toUChar((w32 >> 16) & 0x000000FF);
   2022    *p++ = toUChar((w32 >> 24) & 0x000000FF);
   2023    return p;
   2024 }
   2025 
   2026 static UChar* emit64 ( UChar* p, ULong w64 )
   2027 {
   2028    p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
   2029    p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
   2030    return p;
   2031 }
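        /* Both emit little-endian: emit32(p, 0x12345678) writes the bytes
           78 56 34 12 in increasing address order, and emit64 writes the
           low half first, so 0x1122334455667788 comes out as
           88 77 66 55 44 33 22 11. */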
   2032 
   2033 /* Does a sign-extend of the lowest 8 bits give
   2034    the original number? */
   2035 static Bool fits8bits ( UInt w32 )
   2036 {
   2037    Int i32 = (Int)w32;
   2038    return toBool(i32 == ((i32 << 24) >> 24));
   2039 }
   2040 /* Can the lower 32 bits be signedly widened to produce the whole
   2041    64-bit value?  In other words, are the top 33 bits either all 0 or
   2042    all 1 ? */
   2043 static Bool fitsIn32Bits ( ULong x )
   2044 {
   2045    Long y0 = (Long)x;
   2046    Long y1 = y0;
   2047    y1 <<= 32;
   2048    y1 >>=/*s*/ 32;
   2049    return toBool(x == y1);
   2050 }
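        /* For example fits8bits(0x7F) and fits8bits(0xFFFFFF80) hold (127 and
           -128 respectively), but fits8bits(0x80) does not, since 128 is not
           the sign-extension of any 8-bit value.  Likewise
           fitsIn32Bits(0xFFFFFFFF80000000ULL) holds whereas
           fitsIn32Bits(0x80000000ULL) does not. */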
   2051 
   2052 
   2053 /* Forming mod-reg-rm bytes and scale-index-base bytes.
   2054 
   2055      greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
   2056                        =  00 greg ereg
   2057 
   2058      greg,  d8(ereg)   |  ereg is neither of: RSP R12
   2059                        =  01 greg ereg, d8
   2060 
   2061      greg,  d32(ereg)  |  ereg is neither of: RSP R12
   2062                        =  10 greg ereg, d32
   2063 
   2064      greg,  d8(ereg)   |  ereg is either: RSP R12
   2065                        =  01 greg 100, 0x24, d8
   2066                        (lowest bit of rex distinguishes R12/RSP)
   2067 
   2068      greg,  d32(ereg)  |  ereg is either: RSP R12
   2069                        =  10 greg 100, 0x24, d32
   2070                        (lowest bit of rex distinguishes R12/RSP)
   2071 
   2072      -----------------------------------------------
   2073 
   2074      greg,  d8(base,index,scale)
   2075                |  index != RSP
   2076                =  01 greg 100, scale index base, d8
   2077 
   2078      greg,  d32(base,index,scale)
   2079                |  index != RSP
   2080                =  10 greg 100, scale index base, d32
   2081 */
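        /* As a worked instance of the RSP/R12 escape above: with greg = %rax
           and am = 8(%rsp), doAMode_M emits 44 24 08, that is mod=01 reg=000
           rm=100, the 0x24 SIB, then the 8-bit displacement; this is exactly
           the mod-reg-rm tail of "movq %rax, 8(%rsp)". */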
   2082 static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
   2083 {
   2084    if (am->tag == Aam_IR) {
   2085       if (am->Aam.IR.imm == 0
   2086           && am->Aam.IR.reg != hregAMD64_RSP()
   2087           && am->Aam.IR.reg != hregAMD64_RBP()
   2088           && am->Aam.IR.reg != hregAMD64_R12()
   2089           && am->Aam.IR.reg != hregAMD64_R13()
   2090          ) {
   2091          *p++ = mkModRegRM(0, iregBits210(greg),
   2092                               iregBits210(am->Aam.IR.reg));
   2093          return p;
   2094       }
   2095       if (fits8bits(am->Aam.IR.imm)
   2096           && am->Aam.IR.reg != hregAMD64_RSP()
   2097           && am->Aam.IR.reg != hregAMD64_R12()
   2098          ) {
   2099          *p++ = mkModRegRM(1, iregBits210(greg),
   2100                               iregBits210(am->Aam.IR.reg));
   2101          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
   2102          return p;
   2103       }
   2104       if (am->Aam.IR.reg != hregAMD64_RSP()
   2105           && am->Aam.IR.reg != hregAMD64_R12()
   2106          ) {
   2107          *p++ = mkModRegRM(2, iregBits210(greg),
   2108                               iregBits210(am->Aam.IR.reg));
   2109          p = emit32(p, am->Aam.IR.imm);
   2110          return p;
   2111       }
   2112       if ((am->Aam.IR.reg == hregAMD64_RSP()
   2113            || am->Aam.IR.reg == hregAMD64_R12())
   2114           && fits8bits(am->Aam.IR.imm)) {
   2115          *p++ = mkModRegRM(1, iregBits210(greg), 4);
   2116          *p++ = 0x24;
   2117          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
   2118          return p;
   2119       }
   2120       if (/* (am->Aam.IR.reg == hregAMD64_RSP()
   2121              || wait for test case for RSP case */
   2122           am->Aam.IR.reg == hregAMD64_R12()) {
   2123          *p++ = mkModRegRM(2, iregBits210(greg), 4);
   2124          *p++ = 0x24;
   2125          p = emit32(p, am->Aam.IR.imm);
   2126          return p;
   2127       }
   2128       ppAMD64AMode(am);
   2129       vpanic("doAMode_M: can't emit amode IR");
   2130       /*NOTREACHED*/
   2131    }
   2132    if (am->tag == Aam_IRRS) {
   2133       if (fits8bits(am->Aam.IRRS.imm)
   2134           && am->Aam.IRRS.index != hregAMD64_RSP()) {
   2135          *p++ = mkModRegRM(1, iregBits210(greg), 4);
   2136          *p++ = mkSIB(am->Aam.IRRS.shift, am->Aam.IRRS.index,
   2137                                           am->Aam.IRRS.base);
   2138          *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
   2139          return p;
   2140       }
   2141       if (am->Aam.IRRS.index != hregAMD64_RSP()) {
   2142          *p++ = mkModRegRM(2, iregBits210(greg), 4);
   2143          *p++ = mkSIB(am->Aam.IRRS.shift, am->Aam.IRRS.index,
   2144                                           am->Aam.IRRS.base);
   2145          p = emit32(p, am->Aam.IRRS.imm);
   2146          return p;
   2147       }
   2148       ppAMD64AMode(am);
   2149       vpanic("doAMode_M: can't emit amode IRRS");
   2150       /*NOTREACHED*/
   2151    }
   2152    vpanic("doAMode_M: unknown amode");
   2153    /*NOTREACHED*/
   2154 }
   2155 
   2156 
   2157 /* Emit a mod-reg-rm byte when the rm field denotes a register. */
   2158 static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
   2159 {
   2160    *p++ = mkModRegRM(3, iregBits210(greg), iregBits210(ereg));
   2161    return p;
   2162 }
   2163 
   2164 
   2165 /* Clear the W bit on a REX byte, thereby changing the operand size
   2166    back to whatever that instruction's default operand size is. */
   2167 static inline UChar clearWBit ( UChar rex )
   2168 {
   2169    return toUChar(rex & ~(1<<3));
   2170 }
   2171 
   2172 
   2173 /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
   2174 static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
   2175 {
   2176    if (am->tag == Aam_IR) {
   2177       UChar W = 1;  /* we want 64-bit mode */
   2178       UChar R = iregBit3(greg);
   2179       UChar X = 0; /* not relevant */
   2180       UChar B = iregBit3(am->Aam.IR.reg);
   2181       return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
   2182    }
   2183    if (am->tag == Aam_IRRS) {
   2184       UChar W = 1;  /* we want 64-bit mode */
   2185       UChar R = iregBit3(greg);
   2186       UChar X = iregBit3(am->Aam.IRRS.index);
   2187       UChar B = iregBit3(am->Aam.IRRS.base);
   2188       return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
   2189    }
   2190    vassert(0);
   2191    return 0; /*NOTREACHED*/
   2192 }
   2193 
   2194 /* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
   2195 static UChar rexAMode_R ( HReg greg, HReg ereg )
   2196 {
   2197    UChar W = 1;  /* we want 64-bit mode */
   2198    UChar R = iregBit3(greg);
   2199    UChar X = 0; /* not relevant */
   2200    UChar B = iregBit3(ereg);
   2201    return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
   2202 }
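        /* A couple of examples: rexAMode_R(%rax, %r12) gives
           0x40 + (1<<3 | 0<<2 | 0<<1 | 1) = 0x49 (REX.W + REX.B), and
           rexAMode_R(%r8, %rcx) gives 0x4C (REX.W + REX.R).  Only bit 3 of
           each register number lands in the prefix; the low three bits go
           into the mod-reg-rm byte instead. */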
   2203 
   2204 
   2205 /* Emit ffree %st(N) */
   2206 static UChar* do_ffree_st ( UChar* p, Int n )
   2207 {
   2208    vassert(n >= 0 && n <= 7);
   2209    *p++ = 0xDD;
   2210    *p++ = toUChar(0xC0 + n);
   2211    return p;
   2212 }
   2213 
   2214 //.. /* Emit fstp %st(i), 1 <= i <= 7 */
   2215 //.. static UChar* do_fstp_st ( UChar* p, Int i )
   2216 //.. {
   2217 //..    vassert(1 <= i && i <= 7);
   2218 //..    *p++ = 0xDD;
   2219 //..    *p++ = 0xD8+i;
   2220 //..    return p;
   2221 //.. }
   2222 //..
   2223 //.. /* Emit fld %st(i), 0 <= i <= 6 */
   2224 //.. static UChar* do_fld_st ( UChar* p, Int i )
   2225 //.. {
   2226 //..    vassert(0 <= i && i <= 6);
   2227 //..    *p++ = 0xD9;
   2228 //..    *p++ = 0xC0+i;
   2229 //..    return p;
   2230 //.. }
   2231 //..
   2232 //.. /* Emit f<op> %st(0) */
   2233 //.. static UChar* do_fop1_st ( UChar* p, AMD64FpOp op )
   2234 //.. {
   2235 //..    switch (op) {
   2236 //..       case Xfp_NEG:    *p++ = 0xD9; *p++ = 0xE0; break;
   2237 //..       case Xfp_ABS:    *p++ = 0xD9; *p++ = 0xE1; break;
   2238 //..       case Xfp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
   2239 //..       case Xfp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
   2240 //..       case Xfp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
   2241 //..       case Xfp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
   2242 //..       case Xfp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
   2243 //..       case Xfp_MOV:    break;
   2244 //..       case Xfp_TAN:    p = do_ffree_st7(p); /* since fptan pushes 1.0 */
   2245 //..                        *p++ = 0xD9; *p++ = 0xF2; /* fptan */
   2246 //..                        *p++ = 0xD9; *p++ = 0xF7; /* fincstp */
   2247 //..                        break;
   2248 //..       default: vpanic("do_fop1_st: unknown op");
   2249 //..    }
   2250 //..    return p;
   2251 //.. }
   2252 //..
   2253 //.. /* Emit f<op> %st(i), 1 <= i <= 5 */
   2254 //.. static UChar* do_fop2_st ( UChar* p, AMD64FpOp op, Int i )
   2255 //.. {
   2256 //.. #  define fake(_n) mkHReg((_n), HRcInt32, False)
   2257 //..    Int subopc;
   2258 //..    switch (op) {
   2259 //..       case Xfp_ADD: subopc = 0; break;
   2260 //..       case Xfp_SUB: subopc = 4; break;
   2261 //..       case Xfp_MUL: subopc = 1; break;
   2262 //..       case Xfp_DIV: subopc = 6; break;
   2263 //..       default: vpanic("do_fop2_st: unknown op");
   2264 //..    }
   2265 //..    *p++ = 0xD8;
   2266 //..    p    = doAMode_R(p, fake(subopc), fake(i));
   2267 //..    return p;
   2268 //.. #  undef fake
   2269 //.. }
   2270 //..
   2271 //.. /* Push a 32-bit word on the stack.  The word depends on tags[3:0];
   2272 //.. each byte is either 0x00 or 0xFF depending on the corresponding bit in tags[].
   2273 //.. */
   2274 //.. static UChar* push_word_from_tags ( UChar* p, UShort tags )
   2275 //.. {
   2276 //..    UInt w;
   2277 //..    vassert(0 == (tags & ~0xF));
   2278 //..    if (tags == 0) {
   2279 //..       /* pushl $0x00000000 */
   2280 //..       *p++ = 0x6A;
   2281 //..       *p++ = 0x00;
   2282 //..    }
   2283 //..    else
   2284 //..    /* pushl $0xFFFFFFFF */
   2285 //..    if (tags == 0xF) {
   2286 //..       *p++ = 0x6A;
   2287 //..       *p++ = 0xFF;
   2288 //..    } else {
   2289 //..       vassert(0); /* awaiting test case */
   2290 //..       w = 0;
   2291 //..       if (tags & 1) w |= 0x000000FF;
   2292 //..       if (tags & 2) w |= 0x0000FF00;
   2293 //..       if (tags & 4) w |= 0x00FF0000;
   2294 //..       if (tags & 8) w |= 0xFF000000;
   2295 //..       *p++ = 0x68;
   2296 //..       p = emit32(p, w);
   2297 //..    }
   2298 //..    return p;
   2299 //.. }
   2300 
   2301 /* Emit an instruction into buf and return the number of bytes used.
   2302    Note that buf is not the insn's final place, and therefore it is
   2303    imperative to emit position-independent code. */
   2304 
   2305 Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i,
   2306                       Bool mode64, void* dispatch )
   2307 {
   2308    UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
   2309    UInt   xtra;
   2310    UInt   reg;
   2311    UChar  rex;
   2312    UChar* p = &buf[0];
   2313    UChar* ptmp;
   2314    Int    j;
   2315    vassert(nbuf >= 32);
   2316    vassert(mode64 == True);
   2317 
   2318    /* Wrap an integer as an int register, for use assembling
   2319       GrpN insns, in which the greg field is used as a sub-opcode
   2320       and does not really contain a register. */
   2321 #  define fake(_n) mkHReg((_n), HRcInt64, False)
   2322 
   2323    /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
   2324 
   2325    switch (i->tag) {
   2326 
   2327    case Ain_Imm64:
   2328       *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Imm64.dst)));
   2329       *p++ = toUChar(0xB8 + iregBits210(i->Ain.Imm64.dst));
   2330       p = emit64(p, i->Ain.Imm64.imm64);
   2331       goto done;
   2332 
   2333    case Ain_Alu64R:
   2334       /* Deal specially with MOV */
   2335       if (i->Ain.Alu64R.op == Aalu_MOV) {
   2336          switch (i->Ain.Alu64R.src->tag) {
   2337             case Armi_Imm:
   2338                if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFF)) {
   2339                   /* Actually we could use this form for constants in
   2340                      the range 0 through 0x7FFFFFFF inclusive, but
   2341                      limit it to a small range for verifiability
   2342                      purposes. */
   2343                   /* Generate "movl $imm32, 32-bit-register" and let
   2344                      the default zero-extend rule cause the upper half
   2345                      of the dst to be zeroed out too.  This saves 1
   2346                      and sometimes 2 bytes compared to the more
   2347                      obvious encoding in the 'else' branch. */
   2348                   if (1 & iregBit3(i->Ain.Alu64R.dst))
   2349                      *p++ = 0x41;
   2350                   *p++ = 0xB8 + iregBits210(i->Ain.Alu64R.dst);
   2351                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2352                } else {
   2353                   *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Alu64R.dst)));
   2354                   *p++ = 0xC7;
   2355                   *p++ = toUChar(0xC0 + iregBits210(i->Ain.Alu64R.dst));
   2356                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2357                }
   2358                goto done;
   2359             case Armi_Reg:
   2360                *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
   2361                                   i->Ain.Alu64R.dst );
   2362                *p++ = 0x89;
   2363                p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
   2364                                 i->Ain.Alu64R.dst);
   2365                goto done;
   2366             case Armi_Mem:
   2367                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
   2368                                  i->Ain.Alu64R.src->Armi.Mem.am);
   2369                *p++ = 0x8B;
   2370                p = doAMode_M(p, i->Ain.Alu64R.dst,
   2371                                 i->Ain.Alu64R.src->Armi.Mem.am);
   2372                goto done;
   2373             default:
   2374                goto bad;
   2375          }
   2376       }
   2377       /* MUL */
   2378       if (i->Ain.Alu64R.op == Aalu_MUL) {
   2379          switch (i->Ain.Alu64R.src->tag) {
   2380             case Armi_Reg:
   2381                *p++ = rexAMode_R( i->Ain.Alu64R.dst,
   2382                                   i->Ain.Alu64R.src->Armi.Reg.reg);
   2383                *p++ = 0x0F;
   2384                *p++ = 0xAF;
   2385                p = doAMode_R(p, i->Ain.Alu64R.dst,
   2386                                 i->Ain.Alu64R.src->Armi.Reg.reg);
   2387                goto done;
   2388             case Armi_Mem:
   2389                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
   2390                                  i->Ain.Alu64R.src->Armi.Mem.am);
   2391                *p++ = 0x0F;
   2392                *p++ = 0xAF;
   2393                p = doAMode_M(p, i->Ain.Alu64R.dst,
   2394                                 i->Ain.Alu64R.src->Armi.Mem.am);
   2395                goto done;
   2396             case Armi_Imm:
   2397                if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2398                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2399                   *p++ = 0x6B;
   2400                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2401                   *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
   2402                } else {
   2403                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2404                   *p++ = 0x69;
   2405                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2406                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2407                }
   2408                goto done;
   2409             default:
   2410                goto bad;
   2411          }
   2412       }
   2413       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
   2414       opc = opc_rr = subopc_imm = opc_imma = 0;
   2415       switch (i->Ain.Alu64R.op) {
   2416          case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
   2417                         subopc_imm = 2; opc_imma = 0x15; break;
   2418          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
   2419                         subopc_imm = 0; opc_imma = 0x05; break;
   2420          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
   2421                         subopc_imm = 5; opc_imma = 0x2D; break;
   2422          case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
   2423                         subopc_imm = 3; opc_imma = 0x1D; break;
   2424          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
   2425                         subopc_imm = 4; opc_imma = 0x25; break;
   2426          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
   2427                         subopc_imm = 6; opc_imma = 0x35; break;
   2428          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
   2429                         subopc_imm = 1; opc_imma = 0x0D; break;
   2430          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
   2431                         subopc_imm = 7; opc_imma = 0x3D; break;
   2432          default: goto bad;
   2433       }
   2434       switch (i->Ain.Alu64R.src->tag) {
   2435          case Armi_Imm:
   2436             if (i->Ain.Alu64R.dst == hregAMD64_RAX()
   2437                 && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2438                goto bad; /* FIXME: awaiting test case */
   2439                *p++ = toUChar(opc_imma);
   2440                p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2441             } else
   2442             if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2443                *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst );
   2444                *p++ = 0x83;
   2445                p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
   2446                *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
   2447             } else {
   2448                *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst);
   2449                *p++ = 0x81;
   2450                p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
   2451                p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2452             }
   2453             goto done;
   2454          case Armi_Reg:
   2455             *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
   2456                                i->Ain.Alu64R.dst);
   2457             *p++ = toUChar(opc_rr);
   2458             p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
   2459                              i->Ain.Alu64R.dst);
   2460             goto done;
   2461          case Armi_Mem:
   2462             *p++ = rexAMode_M( i->Ain.Alu64R.dst,
   2463                                i->Ain.Alu64R.src->Armi.Mem.am);
   2464             *p++ = toUChar(opc);
   2465             p = doAMode_M(p, i->Ain.Alu64R.dst,
   2466                              i->Ain.Alu64R.src->Armi.Mem.am);
   2467             goto done;
   2468          default:
   2469             goto bad;
   2470       }
   2471       break;
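              /* To make the imm8 path concrete: Aalu_ADD with dst %rbx and an
                 Armi_Imm of 5 comes out above as 48 83 C3 05, i.e. a REX.W
                 prefix, the group-1 imm8 opcode 0x83, a mod-reg-rm byte
                 carrying subopc_imm 0 and %rbx, then the immediate:
                 "addq $5, %rbx". */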
   2472 
   2473    case Ain_Alu64M:
   2474       /* Deal specially with MOV */
   2475       if (i->Ain.Alu64M.op == Aalu_MOV) {
   2476          switch (i->Ain.Alu64M.src->tag) {
   2477             case Ari_Reg:
   2478                *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
   2479                                  i->Ain.Alu64M.dst);
   2480                *p++ = 0x89;
   2481                p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
   2482                                 i->Ain.Alu64M.dst);
   2483                goto done;
   2484             case Ari_Imm:
   2485                *p++ = rexAMode_M(fake(0), i->Ain.Alu64M.dst);
   2486                *p++ = 0xC7;
   2487                p = doAMode_M(p, fake(0), i->Ain.Alu64M.dst);
   2488                p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
   2489                goto done;
   2490             default:
   2491                goto bad;
   2492          }
   2493       }
   2494 //..       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not
   2495 //..          allowed here. */
   2496 //..       opc = subopc_imm = opc_imma = 0;
   2497 //..       switch (i->Xin.Alu32M.op) {
   2498 //..          case Xalu_ADD: opc = 0x01; subopc_imm = 0; break;
   2499 //..          case Xalu_SUB: opc = 0x29; subopc_imm = 5; break;
   2500 //..          default: goto bad;
   2501 //..       }
   2502 //..       switch (i->Xin.Alu32M.src->tag) {
   2503 //..          case Xri_Reg:
   2504 //..             *p++ = opc;
   2505 //..             p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
   2506 //..                              i->Xin.Alu32M.dst);
   2507 //..             goto done;
   2508 //..          case Xri_Imm:
   2509 //..             if (fits8bits(i->Xin.Alu32M.src->Xri.Imm.imm32)) {
   2510 //..                *p++ = 0x83;
   2511 //..                p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
   2512 //..                *p++ = 0xFF & i->Xin.Alu32M.src->Xri.Imm.imm32;
   2513 //..                goto done;
   2514 //..             } else {
   2515 //..                *p++ = 0x81;
   2516 //..                p    = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
   2517 //..                p    = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
   2518 //..                goto done;
   2519 //..             }
   2520 //..          default:
   2521 //..             goto bad;
   2522 //..       }
   2523       break;
   2524 
   2525    case Ain_Sh64:
   2526       opc_cl = opc_imm = subopc = 0;
   2527       switch (i->Ain.Sh64.op) {
   2528          case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
   2529          case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
   2530          case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
   2531          default: goto bad;
   2532       }
   2533       if (i->Ain.Sh64.src == 0) {
   2534          *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
   2535          *p++ = toUChar(opc_cl);
   2536          p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
   2537          goto done;
   2538       } else {
   2539          *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
   2540          *p++ = toUChar(opc_imm);
   2541          p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
   2542          *p++ = (UChar)(i->Ain.Sh64.src);
   2543          goto done;
   2544       }
   2545       break;
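              /* For instance an Ash_SHL of %rax by the immediate 2 takes the
                 second path above and comes out as 48 C1 E0 02, i.e.
                 "shlq $2, %rax". */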
   2546 
   2547    case Ain_Test64:
   2548       /* testq sign-extend($imm32), %reg */
   2549       *p++ = rexAMode_R(fake(0), i->Ain.Test64.dst);
   2550       *p++ = 0xF7;
   2551       p = doAMode_R(p, fake(0), i->Ain.Test64.dst);
   2552       p = emit32(p, i->Ain.Test64.imm32);
   2553       goto done;
   2554 
   2555    case Ain_Unary64:
   2556       if (i->Ain.Unary64.op == Aun_NOT) {
   2557          *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
   2558          *p++ = 0xF7;
   2559          p = doAMode_R(p, fake(2), i->Ain.Unary64.dst);
   2560          goto done;
   2561       }
   2562       if (i->Ain.Unary64.op == Aun_NEG) {
   2563          *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
   2564          *p++ = 0xF7;
   2565          p = doAMode_R(p, fake(3), i->Ain.Unary64.dst);
   2566          goto done;
   2567       }
   2568       break;
   2569 
   2570    case Ain_Lea64:
   2571       *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
   2572       *p++ = 0x8D;
   2573       p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
   2574       goto done;
   2575 
   2576    case Ain_MulL:
   2577       subopc = i->Ain.MulL.syned ? 5 : 4;
   2578       switch (i->Ain.MulL.src->tag)  {
   2579          case Arm_Mem:
   2580             *p++ = rexAMode_M( fake(0),
   2581                                i->Ain.MulL.src->Arm.Mem.am);
   2582             *p++ = 0xF7;
   2583             p = doAMode_M(p, fake(subopc),
   2584                              i->Ain.MulL.src->Arm.Mem.am);
   2585             goto done;
   2586          case Arm_Reg:
   2587             *p++ = rexAMode_R(fake(0),
   2588                               i->Ain.MulL.src->Arm.Reg.reg);
   2589             *p++ = 0xF7;
   2590             p = doAMode_R(p, fake(subopc),
   2591                              i->Ain.MulL.src->Arm.Reg.reg);
   2592             goto done;
   2593          default:
   2594             goto bad;
   2595       }
   2596       break;
   2597 
   2598    case Ain_Div:
   2599       subopc = i->Ain.Div.syned ? 7 : 6;
   2600       if (i->Ain.Div.sz == 4) {
   2601          switch (i->Ain.Div.src->tag)  {
   2602             case Arm_Mem:
   2603                goto bad;
   2604                /*FIXME*/
   2605                *p++ = 0xF7;
   2606                p = doAMode_M(p, fake(subopc),
   2607                                 i->Ain.Div.src->Arm.Mem.am);
   2608                goto done;
   2609             case Arm_Reg:
   2610                *p++ = clearWBit(
   2611                       rexAMode_R( fake(0), i->Ain.Div.src->Arm.Reg.reg));
   2612                *p++ = 0xF7;
   2613                p = doAMode_R(p, fake(subopc),
   2614                                 i->Ain.Div.src->Arm.Reg.reg);
   2615                goto done;
   2616             default:
   2617                goto bad;
   2618          }
   2619       }
   2620       if (i->Ain.Div.sz == 8) {
   2621          switch (i->Ain.Div.src->tag)  {
   2622             case Arm_Mem:
   2623                *p++ = rexAMode_M( fake(0),
   2624                                   i->Ain.Div.src->Arm.Mem.am);
   2625                *p++ = 0xF7;
   2626                p = doAMode_M(p, fake(subopc),
   2627                                 i->Ain.Div.src->Arm.Mem.am);
   2628                goto done;
   2629             case Arm_Reg:
   2630                *p++ = rexAMode_R( fake(0),
   2631                                   i->Ain.Div.src->Arm.Reg.reg);
   2632                *p++ = 0xF7;
   2633                p = doAMode_R(p, fake(subopc),
   2634                                 i->Ain.Div.src->Arm.Reg.reg);
   2635                goto done;
   2636             default:
   2637                goto bad;
   2638          }
   2639       }
   2640       break;
   2641 
   2642 //..    case Xin_Sh3232:
   2643 //..       vassert(i->Xin.Sh3232.op == Xsh_SHL || i->Xin.Sh3232.op == Xsh_SHR);
   2644 //..       if (i->Xin.Sh3232.amt == 0) {
   2645 //..          /* shldl/shrdl by %cl */
   2646 //..          *p++ = 0x0F;
   2647 //..          if (i->Xin.Sh3232.op == Xsh_SHL) {
   2648 //..             *p++ = 0xA5;
   2649 //..          } else {
   2650 //..             *p++ = 0xAD;
   2651 //..          }
   2652 //..          p = doAMode_R(p, i->Xin.Sh3232.src, i->Xin.Sh3232.dst);
   2653 //..          goto done;
   2654 //..       }
   2655 //..       break;
   2656 
   2657    case Ain_Push:
   2658       switch (i->Ain.Push.src->tag) {
   2659          case Armi_Mem:
   2660             *p++ = clearWBit(
   2661                    rexAMode_M(fake(0), i->Ain.Push.src->Armi.Mem.am));
   2662             *p++ = 0xFF;
   2663             p = doAMode_M(p, fake(6), i->Ain.Push.src->Armi.Mem.am);
   2664             goto done;
   2665          case Armi_Imm:
   2666             *p++ = 0x68;
   2667             p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
   2668             goto done;
   2669          case Armi_Reg:
   2670             *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.Push.src->Armi.Reg.reg)));
   2671             *p++ = toUChar(0x50 + iregBits210(i->Ain.Push.src->Armi.Reg.reg));
   2672             goto done;
   2673          default:
   2674             goto bad;
   2675       }
   2676 
   2677    case Ain_Call: {
   2678       /* As per detailed comment for Ain_Call in
   2679          getRegUsage_AMD64Instr above, %r11 is used as an address
   2680          temporary. */
   2681       /* jump over the following two insns if the condition does not
   2682          hold */
   2683       Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
   2684       if (i->Ain.Call.cond != Acc_ALWAYS) {
   2685          *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
   2686          *p++ = shortImm ? 10 : 13;
   2687          /* 10 or 13 bytes in the next two insns */
   2688       }
   2689       if (shortImm) {
   2690          /* 7 bytes: movl sign-extend(imm32), %r11 */
   2691          *p++ = 0x49;
   2692          *p++ = 0xC7;
   2693          *p++ = 0xC3;
   2694          p = emit32(p, (UInt)i->Ain.Call.target);
   2695       } else {
   2696          /* 10 bytes: movabsq $target, %r11 */
   2697          *p++ = 0x49;
   2698          *p++ = 0xBB;
   2699          p = emit64(p, i->Ain.Call.target);
   2700       }
   2701       /* 3 bytes: call *%r11 */
   2702       *p++ = 0x41;
   2703       *p++ = 0xFF;
   2704       *p++ = 0xD3;
   2705       goto done;
   2706    }
   2707 
   2708    case Ain_Goto:
   2709       /* Use ptmp for backpatching conditional jumps. */
   2710       ptmp = NULL;
   2711 
   2712       /* First off, if this is conditional, create a conditional
   2713          jump over the rest of it. */
   2714       if (i->Ain.Goto.cond != Acc_ALWAYS) {
   2715          /* jmp fwds if !condition */
   2716          *p++ = toUChar(0x70 + (i->Ain.Goto.cond ^ 1));
   2717          ptmp = p; /* fill in this bit later */
   2718          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2719       }
   2720 
   2721       /* If this is a non-boring jump kind, set %rbp (the guest
   2722          state pointer) appropriately.  Since these numbers are
   2723          all small positive integers, we can get away with
   2724          "movl $N, %ebp" rather than the longer "movq $N, %rbp". */
   2725       /* movl $magic_number, %ebp */
   2726       switch (i->Ain.Goto.jk) {
   2727          case Ijk_ClientReq:
   2728             *p++ = 0xBD;
   2729             p = emit32(p, VEX_TRC_JMP_CLIENTREQ); break;
   2730          case Ijk_Sys_syscall:
   2731             *p++ = 0xBD;
   2732             p = emit32(p, VEX_TRC_JMP_SYS_SYSCALL); break;
   2733          case Ijk_Sys_int32:
   2734             *p++ = 0xBD;
   2735             p = emit32(p, VEX_TRC_JMP_SYS_INT32); break;
   2736          case Ijk_Yield:
   2737             *p++ = 0xBD;
   2738             p = emit32(p, VEX_TRC_JMP_YIELD); break;
   2739          case Ijk_YieldNoRedir:
   2740             *p++ = 0xBD;
   2741             p = emit32(p, VEX_TRC_JMP_YIELD_NOREDIR); break;
   2742          case Ijk_EmWarn:
   2743             *p++ = 0xBD;
   2744             p = emit32(p, VEX_TRC_JMP_EMWARN); break;
   2745          case Ijk_MapFail:
   2746             *p++ = 0xBD;
   2747             p = emit32(p, VEX_TRC_JMP_MAPFAIL); break;
   2748          case Ijk_NoDecode:
   2749             *p++ = 0xBD;
   2750             p = emit32(p, VEX_TRC_JMP_NODECODE); break;
   2751          case Ijk_TInval:
   2752             *p++ = 0xBD;
   2753             p = emit32(p, VEX_TRC_JMP_TINVAL); break;
   2754          case Ijk_NoRedir:
   2755             *p++ = 0xBD;
   2756             p = emit32(p, VEX_TRC_JMP_NOREDIR); break;
   2757          case Ijk_SigTRAP:
   2758             *p++ = 0xBD;
   2759             p = emit32(p, VEX_TRC_JMP_SIGTRAP); break;
   2760          case Ijk_SigSEGV:
   2761             *p++ = 0xBD;
   2762             p = emit32(p, VEX_TRC_JMP_SIGSEGV); break;
   2763          case Ijk_Ret:
   2764          case Ijk_Call:
   2765          case Ijk_Boring:
   2766             break;
   2767          default:
   2768             ppIRJumpKind(i->Ain.Goto.jk);
   2769             vpanic("emit_AMD64Instr.Ain_Goto: unknown jump kind");
   2770       }
   2771 
   2772       /* Get the destination address into %rax */
   2773       if (i->Ain.Goto.dst->tag == Ari_Imm) {
   2774          /* movl sign-ext($immediate), %rax */
   2775          *p++ = 0x48;
   2776          *p++ = 0xC7;
   2777          *p++ = 0xC0;
   2778          p = emit32(p, i->Ain.Goto.dst->Ari.Imm.imm32);
   2779       } else {
   2780          vassert(i->Ain.Goto.dst->tag == Ari_Reg);
   2781          /* movq %reg, %rax */
   2782          if (i->Ain.Goto.dst->Ari.Reg.reg != hregAMD64_RAX()) {
   2783             *p++ = rexAMode_R(i->Ain.Goto.dst->Ari.Reg.reg, hregAMD64_RAX());
   2784             *p++ = 0x89;
   2785             p = doAMode_R(p, i->Ain.Goto.dst->Ari.Reg.reg, hregAMD64_RAX());
   2786          }
   2787       }
   2788 
   2789       /* Get the dispatcher address into %rdx.  This has to happen
   2790          after the load of %rax since %rdx might be carrying the value
   2791          destined for %rax immediately prior to this Ain_Goto. */
   2792       vassert(sizeof(ULong) == sizeof(void*));
   2793       vassert(dispatch != NULL);
   2794 
   2795       if (fitsIn32Bits(Ptr_to_ULong(dispatch))) {
   2796          /* movl sign-extend(imm32), %rdx */
   2797          *p++ = 0x48;
   2798          *p++ = 0xC7;
   2799          *p++ = 0xC2;
   2800          p = emit32(p, (UInt)Ptr_to_ULong(dispatch));
   2801       } else {
   2802          /* movabsq $imm64, %rdx */
   2803          *p++ = 0x48;
   2804          *p++ = 0xBA;
   2805          p = emit64(p, Ptr_to_ULong(dispatch));
   2806       }
   2807       /* jmp *%rdx */
   2808       *p++ = 0xFF;
   2809       *p++ = 0xE2;
   2810 
   2811       /* Fix up the conditional jump, if there was one. */
   2812       if (i->Ain.Goto.cond != Acc_ALWAYS) {
   2813          Int delta = p - ptmp;
   2814          vassert(delta > 0 && delta < 30);
   2815          *ptmp = toUChar(delta-1);
   2816       }
   2817       goto done;
   2818 
   2819    case Ain_CMov64:
   2820       vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
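              /* cmovcc is 0F 40+cc /r; the condition code is folded into
                 the second opcode byte, and the source may be a register
                 or a memory operand. */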
   2821       if (i->Ain.CMov64.src->tag == Arm_Reg) {
   2822          *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
   2823          *p++ = 0x0F;
   2824          *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
   2825          p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
   2826          goto done;
   2827       }
   2828       if (i->Ain.CMov64.src->tag == Arm_Mem) {
   2829          *p++ = rexAMode_M(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
   2830          *p++ = 0x0F;
   2831          *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
   2832          p = doAMode_M(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
   2833          goto done;
   2834       }
   2835       break;
   2836 
   2837    case Ain_MovxLQ:
   2838       /* No, _don't_ ask me why the sense of the args has to be
   2839          different in the S vs Z case.  I don't know. */
   2840       if (i->Ain.MovxLQ.syned) {
   2841          /* Need REX.W = 1 here, but rexAMode_R does that for us. */
   2842          *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
   2843          *p++ = 0x63;
   2844          p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
   2845       } else {
   2846          /* Produce a 32-bit reg-reg move, since the implicit
   2847             zero-extend does what we want. */
   2848          *p++ = clearWBit (
   2849                    rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
   2850          *p++ = 0x89;
   2851          p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
   2852       }
   2853       goto done;
   2854 
   2855    case Ain_LoadEX:
   2856       if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
   2857          /* movzbq */
   2858          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2859          *p++ = 0x0F;
   2860          *p++ = 0xB6;
   2861          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2862          goto done;
   2863       }
   2864       if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
   2865          /* movzwq */
   2866          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2867          *p++ = 0x0F;
   2868          *p++ = 0xB7;
   2869          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2870          goto done;
   2871       }
   2872       if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
   2873          /* movzlq */
   2874          /* This isn't really an existing AMD64 instruction per se.
   2875             Rather, we have to do a 32-bit load.  Because a 32-bit
   2876             write implicitly clears the upper 32 bits of the target
   2877             register, we get what we want. */
   2878          *p++ = clearWBit(
   2879                 rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
   2880          *p++ = 0x8B;
   2881          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2882          goto done;
   2883       }
   2884       break;
   2885 
   2886    case Ain_Set64:
   2887       /* Make the destination register be 1 or 0, depending on whether
   2888          the relevant condition holds.  Complication: the top 56 bits
   2889          of the destination should be forced to zero, but doing 'xorq
   2890          %r,%r' kills the flag(s) we are about to read.  Sigh.  So
   2891          start off by moving $0 into the dest. */
   2892       reg = iregBits3210(i->Ain.Set64.dst);
   2893       vassert(reg < 16);
   2894 
   2895       /* movq $0, %dst */
   2896       *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
   2897       *p++ = 0xC7;
   2898       *p++ = toUChar(0xC0 + (reg & 7));
   2899       p = emit32(p, 0);
   2900 
   2901       /* setb lo8(%dst) */
   2902       /* Note: 8-bit register REX trickiness.  Be careful here. */
   2903       *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
   2904       *p++ = 0x0F;
   2905       *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
   2906       *p++ = toUChar(0xC0 + (reg & 7));
   2907       goto done;
   2908 
   2909    case Ain_Bsfr64:
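              /* Bit scan: 0F BC is bsf (scan forwards, from bit 0); 0F BD
                 is bsr (scan backwards, from bit 63). */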
   2910       *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
   2911       *p++ = 0x0F;
   2912       if (i->Ain.Bsfr64.isFwds) {
   2913          *p++ = 0xBC;
   2914       } else {
   2915          *p++ = 0xBD;
   2916       }
   2917       p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
   2918       goto done;
   2919 
   2920    case Ain_MFence:
   2921       /* mfence */
   2922       *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
   2923       goto done;
   2924 
   2925    case Ain_ACAS:
   2926       /* lock */
   2927       *p++ = 0xF0;
   2928       if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
   2929       /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
   2930          in %rbx.  The new-value register is hardwired to be %rbx
   2931          since dealing with byte integer registers is too much hassle,
   2932          so we force the register operand to %rbx (could equally be
   2933          %rcx or %rdx). */
   2934       rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
   2935       if (i->Ain.ACAS.sz != 8)
   2936          rex = clearWBit(rex);
   2937 
   2938       *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
   2939       *p++ = 0x0F;
   2940       if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
   2941       p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
   2942       goto done;
   2943 
   2944    case Ain_DACAS:
   2945       /* lock */
   2946       *p++ = 0xF0;
   2947       /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
   2948          value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
   2949          aren't encoded in the insn. */
   2950       rex = rexAMode_M( fake(1), i->Ain.DACAS.addr );
   2951       if (i->Ain.DACAS.sz != 8)
   2952          rex = clearWBit(rex);
   2953       *p++ = rex;
   2954       *p++ = 0x0F;
   2955       *p++ = 0xC7;
   2956       p = doAMode_M(p, fake(1), i->Ain.DACAS.addr);
   2957       goto done;
   2958 
   2959    case Ain_A87Free:
   2960       vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
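              /* Emits ffree %st(7), %st(6), ... for the requested number
                 of registers, marking them empty so that later fld pushes
                 do not hit an in-use x87 register. */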
   2961       for (j = 0; j < i->Ain.A87Free.nregs; j++) {
   2962          p = do_ffree_st(p, 7-j);
   2963       }
   2964       goto done;
   2965 
   2966    case Ain_A87PushPop:
   2967       vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
   2968       if (i->Ain.A87PushPop.isPush) {
   2969          /* Load from memory into %st(0): flds/fldl amode */
   2970          *p++ = clearWBit(
   2971                    rexAMode_M(fake(0), i->Ain.A87PushPop.addr) );
   2972          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
   2973          p = doAMode_M(p, fake(0)/*subopcode*/, i->Ain.A87PushPop.addr);
   2974       } else {
   2975          /* Dump %st(0) to memory: fstps/fstpl amode */
   2976          *p++ = clearWBit(
   2977                    rexAMode_M(fake(3), i->Ain.A87PushPop.addr) );
   2978          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
   2979          p = doAMode_M(p, fake(3)/*subopcode*/, i->Ain.A87PushPop.addr);
   2980          goto done;
   2981       }
   2982       goto done;
   2983 
   2984    case Ain_A87FpOp:
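              /* Each of these is a two-byte D9-escape opcode operating
                 implicitly on %st(0) (and also %st(1) for the binary ops
                 such as fscale, fpatan, fyl2x and fprem). */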
   2985       switch (i->Ain.A87FpOp.op) {
   2986          case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
   2987          case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
   2988          case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
   2989          case Afp_TAN:    *p++ = 0xD9; *p++ = 0xF2; break;
   2990          case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
   2991          case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
   2992          case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
   2993          case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
   2994          case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
   2995          case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
   2996          case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
   2997          case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
   2998          default: goto bad;
   2999       }
   3000       goto done;
   3001 
   3002    case Ain_A87LdCW:
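              /* fldcw amode: opcode D9 with sub-opcode /5 in the ModRM
                 reg field, hence fake(5). */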
   3003       *p++ = clearWBit(
   3004                 rexAMode_M(fake(5), i->Ain.A87LdCW.addr) );
   3005       *p++ = 0xD9;
   3006       p = doAMode_M(p, fake(5)/*subopcode*/, i->Ain.A87LdCW.addr);
   3007       goto done;
   3008 
   3009    case Ain_A87StSW:
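              /* fnstsw amode: opcode DD with sub-opcode /7 in the ModRM
                 reg field, hence fake(7). */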
   3010       *p++ = clearWBit(
   3011                 rexAMode_M(fake(7), i->Ain.A87StSW.addr) );
   3012       *p++ = 0xDD;
   3013       p = doAMode_M(p, fake(7)/*subopcode*/, i->Ain.A87StSW.addr);
   3014       goto done;
   3015 
   3016    case Ain_Store:
   3017       if (i->Ain.Store.sz == 2) {
   3018          /* This just goes to show the craziness of the instruction
   3019             set encoding.  We have to insert two prefix bytes, but must
   3020             be careful to avoid a conflict over the operand size, by
   3021             ensuring that REX.W = 0. */
   3022          *p++ = 0x66; /* override to 16-bits */
   3023          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3024          *p++ = 0x89;
   3025          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3026          goto done;
   3027       }
   3028       if (i->Ain.Store.sz == 4) {
   3029          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3030          *p++ = 0x89;
   3031          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3032          goto done;
   3033       }
   3034       if (i->Ain.Store.sz == 1) {
   3035          /* This is one place where it would be wrong to skip emitting
   3036             a rex byte of 0x40, since the mere presence of rex changes
   3037             the meaning of the byte register access.  Be careful. */
   3038          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3039          *p++ = 0x88;
   3040          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3041          goto done;
   3042       }
   3043       break;
   3044 
   3045 //..    case Xin_FpUnary:
   3046 //..       /* gop %src, %dst
   3047 //..          --> ffree %st7 ; fld %st(src) ; fop %st(0) ; fstp %st(1+dst)
   3048 //..       */
   3049 //..       p = do_ffree_st7(p);
   3050 //..       p = do_fld_st(p, 0+hregNumber(i->Xin.FpUnary.src));
   3051 //..       p = do_fop1_st(p, i->Xin.FpUnary.op);
   3052 //..       p = do_fstp_st(p, 1+hregNumber(i->Xin.FpUnary.dst));
   3053 //..       goto done;
   3054 //..
   3055 //..    case Xin_FpBinary:
   3056 //..       if (i->Xin.FpBinary.op == Xfp_YL2X
   3057 //..           || i->Xin.FpBinary.op == Xfp_YL2XP1) {
   3058 //..          /* Have to do this specially. */
   3059 //..          /* ffree %st7 ; fld %st(srcL) ;
   3060 //..             ffree %st7 ; fld %st(srcR+1) ; fyl2x{p1} ; fstp(1+dst) */
   3061 //..          p = do_ffree_st7(p);
   3062 //..          p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
   3063 //..          p = do_ffree_st7(p);
   3064 //..          p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
   3065 //..          *p++ = 0xD9;
   3066 //..          *p++ = i->Xin.FpBinary.op==Xfp_YL2X ? 0xF1 : 0xF9;
   3067 //..          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
   3068 //..          goto done;
   3069 //..       }
   3070 //..       if (i->Xin.FpBinary.op == Xfp_ATAN) {
   3071 //..          /* Have to do this specially. */
   3072 //..          /* ffree %st7 ; fld %st(srcL) ;
   3073 //..             ffree %st7 ; fld %st(srcR+1) ; fpatan ; fstp(1+dst) */
   3074 //..          p = do_ffree_st7(p);
   3075 //..          p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
   3076 //..          p = do_ffree_st7(p);
   3077 //..          p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
   3078 //..          *p++ = 0xD9; *p++ = 0xF3;
   3079 //..          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
   3080 //..          goto done;
   3081 //..       }
   3082 //..       if (i->Xin.FpBinary.op == Xfp_PREM
   3083 //..           || i->Xin.FpBinary.op == Xfp_PREM1
   3084 //..           || i->Xin.FpBinary.op == Xfp_SCALE) {
   3085 //..          /* Have to do this specially. */
   3086 //..          /* ffree %st7 ; fld %st(srcR) ;
   3087 //..             ffree %st7 ; fld %st(srcL+1) ; fprem/fprem1/fscale ; fstp(2+dst) ;
   3088 //..             fincstp ; ffree %st7 */
   3089 //..          p = do_ffree_st7(p);
   3090 //..          p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcR));
   3091 //..          p = do_ffree_st7(p);
   3092 //..          p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcL));
   3093 //..          *p++ = 0xD9;
   3094 //..          switch (i->Xin.FpBinary.op) {
   3095 //..             case Xfp_PREM: *p++ = 0xF8; break;
   3096 //..             case Xfp_PREM1: *p++ = 0xF5; break;
   3097 //..             case Xfp_SCALE: *p++ =  0xFD; break;
   3098 //..             default: vpanic("emitAMD64Instr(FpBinary,PREM/PREM1/SCALE)");
   3099 //..          }
   3100 //..          p = do_fstp_st(p, 2+hregNumber(i->Xin.FpBinary.dst));
   3101 //..          *p++ = 0xD9; *p++ = 0xF7;
   3102 //..          p = do_ffree_st7(p);
   3103 //..          goto done;
   3104 //..       }
   3105 //..       /* General case */
   3106 //..       /* gop %srcL, %srcR, %dst
   3107 //..          --> ffree %st7 ; fld %st(srcL) ; fop %st(1+srcR) ; fstp %st(1+dst)
   3108 //..       */
   3109 //..       p = do_ffree_st7(p);
   3110 //..       p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
   3111 //..       p = do_fop2_st(p, i->Xin.FpBinary.op,
   3112 //..                         1+hregNumber(i->Xin.FpBinary.srcR));
   3113 //..       p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
   3114 //..       goto done;
   3115 //..
   3116 //..    case Xin_FpLdSt:
   3117 //..       vassert(i->Xin.FpLdSt.sz == 4 || i->Xin.FpLdSt.sz == 8);
   3118 //..       if (i->Xin.FpLdSt.isLoad) {
   3119 //..          /* Load from memory into %fakeN.
   3120 //..             --> ffree %st(7) ; fld{s/l} amode ; fstp st(N+1)
   3121 //..          */
   3122 //..          p = do_ffree_st7(p);
   3123 //..          *p++ = i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD;
   3124 //.. 	 p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
   3125 //..          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
   3126 //..          goto done;
   3127 //..       } else {
   3128 //..          /* Store from %fakeN into memory.
   3129 //..             --> ffree %st(7) ; fld st(N) ; fstp{l|s} amode
   3130 //.. 	 */
   3131 //..          p = do_ffree_st7(p);
   3132 //..          p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
   3133 //..          *p++ = i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD;
   3134 //..          p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
   3135 //..          goto done;
   3136 //..       }
   3137 //..       break;
   3138 //..
   3139 //..    case Xin_FpLdStI:
   3140 //..       if (i->Xin.FpLdStI.isLoad) {
   3141 //..          /* Load from memory into %fakeN, converting from an int.
   3142 //..             --> ffree %st(7) ; fild{w/l/ll} amode ; fstp st(N+1)
   3143 //..          */
   3144 //..          switch (i->Xin.FpLdStI.sz) {
   3145 //..             case 8:  opc = 0xDF; subopc_imm = 5; break;
   3146 //..             case 4:  opc = 0xDB; subopc_imm = 0; break;
   3147 //..             case 2:  vassert(0); opc = 0xDF; subopc_imm = 0; break;
   3148 //..             default: vpanic("emitAMD64Instr(Xin_FpLdStI-load)");
   3149 //..          }
   3150 //..          p = do_ffree_st7(p);
   3151 //..          *p++ = opc;
   3152 //..          p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
   3153 //..          p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdStI.reg));
   3154 //..          goto done;
   3155 //..       } else {
   3156 //..          /* Store from %fakeN into memory, converting to an int.
   3157 //..             --> ffree %st(7) ; fld st(N) ; fistp{w/l/ll} amode
   3158 //.. 	 */
   3159 //..          switch (i->Xin.FpLdStI.sz) {
   3160 //..             case 8:  opc = 0xDF; subopc_imm = 7; break;
   3161 //..             case 4:  opc = 0xDB; subopc_imm = 3; break;
   3162 //..             case 2:  opc = 0xDF; subopc_imm = 3; break;
   3163 //..             default: vpanic("emitAMD64Instr(Xin_FpLdStI-store)");
   3164 //..          }
   3165 //..          p = do_ffree_st7(p);
   3166 //..          p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdStI.reg));
   3167 //..          *p++ = opc;
   3168 //..          p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
   3169 //..          goto done;
   3170 //..       }
   3171 //..       break;
   3172 //..
   3173 //..    case Xin_Fp64to32:
   3174 //..       /* ffree %st7 ; fld %st(src) */
   3175 //..       p = do_ffree_st7(p);
   3176 //..       p = do_fld_st(p, 0+fregNo(i->Xin.Fp64to32.src));
   3177 //..       /* subl $4, %esp */
   3178 //..       *p++ = 0x83; *p++ = 0xEC; *p++ = 0x04;
   3179 //..       /* fstps (%esp) */
   3180 //..       *p++ = 0xD9; *p++ = 0x1C; *p++ = 0x24;
   3181 //..       /* flds (%esp) */
   3182 //..       *p++ = 0xD9; *p++ = 0x04; *p++ = 0x24;
   3183 //..       /* addl $4, %esp */
   3184 //..       *p++ = 0x83; *p++ = 0xC4; *p++ = 0x04;
   3185 //..       /* fstp %st(1+dst) */
   3186 //..       p = do_fstp_st(p, 1+fregNo(i->Xin.Fp64to32.dst));
   3187 //..       goto done;
   3188 //..
   3189 //..    case Xin_FpCMov:
   3190 //..       /* jmp fwds if !condition */
   3191 //..       *p++ = 0x70 + (i->Xin.FpCMov.cond ^ 1);
   3192 //..       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
   3193 //..       ptmp = p;
   3194 //..
   3195 //..       /* ffree %st7 ; fld %st(src) ; fstp %st(1+dst) */
   3196 //..       p = do_ffree_st7(p);
   3197 //..       p = do_fld_st(p, 0+fregNo(i->Xin.FpCMov.src));
   3198 //..       p = do_fstp_st(p, 1+fregNo(i->Xin.FpCMov.dst));
   3199 //..
   3200 //..       /* Fill in the jump offset. */
   3201 //..       *(ptmp-1) = p - ptmp;
   3202 //..       goto done;
   3203 
   3204    case Ain_LdMXCSR:
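              /* ldmxcsr amode: 0F AE with sub-opcode /2 in the ModRM reg
                 field, hence fake(2) below. */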
   3205       *p++ = clearWBit(rexAMode_M( fake(0), i->Ain.LdMXCSR.addr));
   3206       *p++ = 0x0F;
   3207       *p++ = 0xAE;
   3208       p = doAMode_M(p, fake(2)/*subopcode*/, i->Ain.LdMXCSR.addr);
   3209       goto done;
   3210 
   3211 //..    case Xin_FpStSW_AX:
   3212 //..       /* note, this emits fnstsw %ax, not fstsw %ax */
   3213 //..       *p++ = 0xDF;
   3214 //..       *p++ = 0xE0;
   3215 //..       goto done;
   3216 
   3217    case Ain_SseUComIS:
   3218       /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
   3219       /* ucomi[sd] %srcL, %srcR */
   3220       if (i->Ain.SseUComIS.sz == 8) {
   3221          *p++ = 0x66;
   3222       } else {
   3223          goto bad; /* the 4-byte (ucomiss) case is not handled yet */
   3224          vassert(i->Ain.SseUComIS.sz == 4);
   3225       }
   3226       *p++ = clearWBit (
   3227              rexAMode_R( vreg2ireg(i->Ain.SseUComIS.srcL),
   3228                          vreg2ireg(i->Ain.SseUComIS.srcR) ));
   3229       *p++ = 0x0F;
   3230       *p++ = 0x2E;
   3231       p = doAMode_R(p, vreg2ireg(i->Ain.SseUComIS.srcL),
   3232                        vreg2ireg(i->Ain.SseUComIS.srcR) );
   3233       /* pushfq */
   3234       *p++ = 0x9C;
   3235       /* popq %dst */
   3236       *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.SseUComIS.dst)));
   3237       *p++ = toUChar(0x58 + iregBits210(i->Ain.SseUComIS.dst));
   3238       goto done;
   3239 
   3240    case Ain_SseSI2SF:
   3241       /* cvtsi2s[sd] %src, %dst */
   3242       rex = rexAMode_R( vreg2ireg(i->Ain.SseSI2SF.dst),
   3243                         i->Ain.SseSI2SF.src );
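              /* The F3/F2 prefix selects cvtsi2ss vs cvtsi2sd (destination
                 width), while REX.W marks a 64-bit integer source; it is
                 cleared when szS == 4. */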
   3244       *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
   3245       *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
   3246       *p++ = 0x0F;
   3247       *p++ = 0x2A;
   3248       p = doAMode_R( p, vreg2ireg(i->Ain.SseSI2SF.dst),
   3249                         i->Ain.SseSI2SF.src );
   3250       goto done;
   3251 
   3252    case Ain_SseSF2SI:
   3253       /* cvts[sd]2si %src, %dst */
   3254       rex = rexAMode_R( i->Ain.SseSF2SI.dst,
   3255                         vreg2ireg(i->Ain.SseSF2SI.src) );
   3256       *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
   3257       *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
   3258       *p++ = 0x0F;
   3259       *p++ = 0x2D;
   3260       p = doAMode_R( p, i->Ain.SseSF2SI.dst,
   3261                         vreg2ireg(i->Ain.SseSF2SI.src) );
   3262       goto done;
   3263 
   3264    case Ain_SseSDSS:
   3265       /* cvtsd2ss/cvtss2sd %src, %dst */
   3266       *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
   3267       *p++ = clearWBit(
   3268               rexAMode_R( vreg2ireg(i->Ain.SseSDSS.dst),
   3269                           vreg2ireg(i->Ain.SseSDSS.src) ));
   3270       *p++ = 0x0F;
   3271       *p++ = 0x5A;
   3272       p = doAMode_R( p, vreg2ireg(i->Ain.SseSDSS.dst),
   3273                         vreg2ireg(i->Ain.SseSDSS.src) );
   3274       goto done;
   3275 
   3276 //..
   3277 //..    case Xin_FpCmp:
   3278 //..       /* gcmp %fL, %fR, %dst
   3279 //..          -> ffree %st7; fpush %fL ; fucomp %(fR+1) ;
   3280 //..             fnstsw %ax ; movl %eax, %dst
   3281 //..       */
   3282 //..       /* ffree %st7 */
   3283 //..       p = do_ffree_st7(p);
   3284 //..       /* fpush %fL */
   3285 //..       p = do_fld_st(p, 0+fregNo(i->Xin.FpCmp.srcL));
   3286 //..       /* fucomp %(fR+1) */
   3287 //..       *p++ = 0xDD;
   3288 //..       *p++ = 0xE8 + (7 & (1+fregNo(i->Xin.FpCmp.srcR)));
   3289 //..       /* fnstsw %ax */
   3290 //..       *p++ = 0xDF;
   3291 //..       *p++ = 0xE0;
   3292 //..       /*  movl %eax, %dst */
   3293 //..       *p++ = 0x89;
   3294 //..       p = doAMode_R(p, hregAMD64_EAX(), i->Xin.FpCmp.dst);
   3295 //..       goto done;
   3296 //..
   3297 //..    case Xin_SseConst: {
   3298 //..       UShort con = i->Xin.SseConst.con;
   3299 //..       p = push_word_from_tags(p, (con >> 12) & 0xF);
   3300 //..       p = push_word_from_tags(p, (con >> 8) & 0xF);
   3301 //..       p = push_word_from_tags(p, (con >> 4) & 0xF);
   3302 //..       p = push_word_from_tags(p, con & 0xF);
   3303 //..       /* movl (%esp), %xmm-dst */
   3304 //..       *p++ = 0x0F;
   3305 //..       *p++ = 0x10;
   3306 //..       *p++ = 0x04 + 8 * (7 & vregNo(i->Xin.SseConst.dst));
   3307 //..       *p++ = 0x24;
   3308 //..       /* addl $16, %esp */
   3309 //..       *p++ = 0x83;
   3310 //..       *p++ = 0xC4;
   3311 //..       *p++ = 0x10;
   3312 //..       goto done;
   3313 //..    }
   3314 
   3315    case Ain_SseLdSt:
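              /* The size selects the prefix: F2 for an 8-byte movsd, F3
                 for a 4-byte movss, no prefix for a 16-byte movups.
                 Opcode 0F 10 loads, 0F 11 stores. */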
   3316       if (i->Ain.SseLdSt.sz == 8) {
   3317          *p++ = 0xF2;
   3318       } else
   3319       if (i->Ain.SseLdSt.sz == 4) {
   3320          *p++ = 0xF3;
   3321       } else
   3322       if (i->Ain.SseLdSt.sz != 16) {
   3323          vassert(0);
   3324       }
   3325       *p++ = clearWBit(
   3326              rexAMode_M( vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr));
   3327       *p++ = 0x0F;
   3328       *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
   3329       p = doAMode_M(p, vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr);
   3330       goto done;
   3331 
   3332    case Ain_SseLdzLO:
   3333       vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
   3334       /* movs[sd] amode, %xmm-dst */
   3335       *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
   3336       *p++ = clearWBit(
   3337              rexAMode_M(vreg2ireg(i->Ain.SseLdzLO.reg),
   3338                         i->Ain.SseLdzLO.addr));
   3339       *p++ = 0x0F;
   3340       *p++ = 0x10;
   3341       p = doAMode_M(p, vreg2ireg(i->Ain.SseLdzLO.reg),
   3342                        i->Ain.SseLdzLO.addr);
   3343       goto done;
   3344 
   3345    case Ain_Sse32Fx4:
   3346       xtra = 0;
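              /* Packed single-precision ops use the unprefixed 0F opcode
                 map.  For the cmpps variants the comparison predicate is
                 an immediate byte after the ModRM; 'xtra' carries it, with
                 bit 0x100 flagging that the extra byte must be emitted. */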
   3347       *p++ = clearWBit(
   3348              rexAMode_R( vreg2ireg(i->Ain.Sse32Fx4.dst),
   3349                          vreg2ireg(i->Ain.Sse32Fx4.src) ));
   3350       *p++ = 0x0F;
   3351       switch (i->Ain.Sse32Fx4.op) {
   3352          case Asse_ADDF:   *p++ = 0x58; break;
   3353          case Asse_DIVF:   *p++ = 0x5E; break;
   3354          case Asse_MAXF:   *p++ = 0x5F; break;
   3355          case Asse_MINF:   *p++ = 0x5D; break;
   3356          case Asse_MULF:   *p++ = 0x59; break;
   3357          case Asse_RCPF:   *p++ = 0x53; break;
   3358          case Asse_RSQRTF: *p++ = 0x52; break;
   3359          case Asse_SQRTF:  *p++ = 0x51; break;
   3360          case Asse_SUBF:   *p++ = 0x5C; break;
   3361          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3362          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3363          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3364          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3365          default: goto bad;
   3366       }
   3367       p = doAMode_R(p, vreg2ireg(i->Ain.Sse32Fx4.dst),
   3368                        vreg2ireg(i->Ain.Sse32Fx4.src) );
   3369       if (xtra & 0x100)
   3370          *p++ = toUChar(xtra & 0xFF);
   3371       goto done;
   3372 
   3373    case Ain_Sse64Fx2:
   3374       xtra = 0;
   3375       *p++ = 0x66;
   3376       *p++ = clearWBit(
   3377              rexAMode_R( vreg2ireg(i->Ain.Sse64Fx2.dst),
   3378                          vreg2ireg(i->Ain.Sse64Fx2.src) ));
   3379       *p++ = 0x0F;
   3380       switch (i->Ain.Sse64Fx2.op) {
   3381          case Asse_ADDF:   *p++ = 0x58; break;
   3382          case Asse_DIVF:   *p++ = 0x5E; break;
   3383          case Asse_MAXF:   *p++ = 0x5F; break;
   3384          case Asse_MINF:   *p++ = 0x5D; break;
   3385          case Asse_MULF:   *p++ = 0x59; break;
   3386 //..          case Xsse_RCPF:   *p++ = 0x53; break;
   3387 //..          case Xsse_RSQRTF: *p++ = 0x52; break;
   3388          case Asse_SQRTF:  *p++ = 0x51; break;
   3389          case Asse_SUBF:   *p++ = 0x5C; break;
   3390          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3391          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3392          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3393          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3394          default: goto bad;
   3395       }
   3396       p = doAMode_R(p, vreg2ireg(i->Ain.Sse64Fx2.dst),
   3397                        vreg2ireg(i->Ain.Sse64Fx2.src) );
   3398       if (xtra & 0x100)
   3399          *p++ = toUChar(xtra & 0xFF);
   3400       goto done;
   3401 
   3402    case Ain_Sse32FLo:
   3403       xtra = 0;
   3404       *p++ = 0xF3;
   3405       *p++ = clearWBit(
   3406              rexAMode_R( vreg2ireg(i->Ain.Sse32FLo.dst),
   3407                          vreg2ireg(i->Ain.Sse32FLo.src) ));
   3408       *p++ = 0x0F;
   3409       switch (i->Ain.Sse32FLo.op) {
   3410          case Asse_ADDF:   *p++ = 0x58; break;
   3411          case Asse_DIVF:   *p++ = 0x5E; break;
   3412          case Asse_MAXF:   *p++ = 0x5F; break;
   3413          case Asse_MINF:   *p++ = 0x5D; break;
   3414          case Asse_MULF:   *p++ = 0x59; break;
   3415          case Asse_RCPF:   *p++ = 0x53; break;
   3416          case Asse_RSQRTF: *p++ = 0x52; break;
   3417          case Asse_SQRTF:  *p++ = 0x51; break;
   3418          case Asse_SUBF:   *p++ = 0x5C; break;
   3419          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3420          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3421          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3422          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3423          default: goto bad;
   3424       }
   3425       p = doAMode_R(p, vreg2ireg(i->Ain.Sse32FLo.dst),
   3426                        vreg2ireg(i->Ain.Sse32FLo.src) );
   3427       if (xtra & 0x100)
   3428          *p++ = toUChar(xtra & 0xFF);
   3429       goto done;
   3430 
   3431    case Ain_Sse64FLo:
   3432       xtra = 0;
   3433       *p++ = 0xF2;
   3434       *p++ = clearWBit(
   3435              rexAMode_R( vreg2ireg(i->Ain.Sse64FLo.dst),
   3436                          vreg2ireg(i->Ain.Sse64FLo.src) ));
   3437       *p++ = 0x0F;
   3438       switch (i->Ain.Sse64FLo.op) {
   3439          case Asse_ADDF:   *p++ = 0x58; break;
   3440          case Asse_DIVF:   *p++ = 0x5E; break;
   3441          case Asse_MAXF:   *p++ = 0x5F; break;
   3442          case Asse_MINF:   *p++ = 0x5D; break;
   3443          case Asse_MULF:   *p++ = 0x59; break;
   3444 //..          case Xsse_RCPF:   *p++ = 0x53; break;
   3445 //..          case Xsse_RSQRTF: *p++ = 0x52; break;
   3446          case Asse_SQRTF:  *p++ = 0x51; break;
   3447          case Asse_SUBF:   *p++ = 0x5C; break;
   3448          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3449          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3450          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3451          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3452          default: goto bad;
   3453       }
   3454       p = doAMode_R(p, vreg2ireg(i->Ain.Sse64FLo.dst),
   3455                        vreg2ireg(i->Ain.Sse64FLo.src) );
   3456       if (xtra & 0x100)
   3457          *p++ = toUChar(xtra & 0xFF);
   3458       goto done;
   3459 
   3460    case Ain_SseReRg:
   3461 #     define XX(_n) *p++ = (_n)
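              /* XX emits a single byte.  Note the ordering constraint: a
                 0x66 legacy prefix (where present) must come before the
                 REX byte, which must immediately precede the 0F escape. */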
   3462 
   3463       rex = clearWBit(
   3464             rexAMode_R( vreg2ireg(i->Ain.SseReRg.dst),
   3465                         vreg2ireg(i->Ain.SseReRg.src) ));
   3466 
   3467       switch (i->Ain.SseReRg.op) {
   3468          case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
   3469          case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
   3470          case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
   3471          case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
   3472          case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
   3473          case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
   3474          case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
   3475          case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
   3476          case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
   3477          case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
   3478          case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
   3479          case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
   3480          case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
   3481          case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
   3482          case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
   3483          case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
   3484          case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
   3485          case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
   3486          case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
   3487          case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
   3488          case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
   3489          case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
   3490          case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
   3491          case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
   3492          case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
   3493          case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
   3494          case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
   3495          case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
   3496          case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
   3497          case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
   3498          case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
   3499          case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
   3500          case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
   3501          case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
   3502          case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
   3503          case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
   3504          case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
   3505          case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
   3506          case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
   3507          case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
   3508          case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
   3509          case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
   3510          case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
   3511          case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
   3512          case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
   3513          case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
   3514          case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
   3515          case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
   3516          case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
   3517          case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
   3518          case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
   3519          case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
   3520          case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
   3521          case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
   3522          case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
   3523          default: goto bad;
   3524       }
   3525       p = doAMode_R(p, vreg2ireg(i->Ain.SseReRg.dst),
   3526                        vreg2ireg(i->Ain.SseReRg.src) );
   3527 #     undef XX
   3528       goto done;
   3529 
   3530    case Ain_SseCMov:
   3531       /* jmp fwds if !condition */
   3532       *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
   3533       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
   3534       ptmp = p;
   3535 
   3536       /* movaps %src, %dst */
   3537       *p++ = clearWBit(
   3538              rexAMode_R( vreg2ireg(i->Ain.SseCMov.dst),
   3539                          vreg2ireg(i->Ain.SseCMov.src) ));
   3540       *p++ = 0x0F;
   3541       *p++ = 0x28;
   3542       p = doAMode_R(p, vreg2ireg(i->Ain.SseCMov.dst),
   3543                        vreg2ireg(i->Ain.SseCMov.src) );
   3544 
   3545       /* Fill in the jump offset. */
   3546       *(ptmp-1) = toUChar(p - ptmp);
   3547       goto done;
   3548 
   3549    case Ain_SseShuf:
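              /* pshufd $order, %src, %dst: 66 0F 70 /r, followed by the
                 immediate order byte. */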
   3550       *p++ = 0x66;
   3551       *p++ = clearWBit(
   3552              rexAMode_R( vreg2ireg(i->Ain.SseShuf.dst),
   3553                          vreg2ireg(i->Ain.SseShuf.src) ));
   3554       *p++ = 0x0F;
   3555       *p++ = 0x70;
   3556       p = doAMode_R(p, vreg2ireg(i->Ain.SseShuf.dst),
   3557                        vreg2ireg(i->Ain.SseShuf.src) );
   3558       *p++ = (UChar)(i->Ain.SseShuf.order);
   3559       goto done;
   3560 
   3561    default:
   3562       goto bad;
   3563    }
   3564 
   3565   bad:
   3566    ppAMD64Instr(i, mode64);
   3567    vpanic("emit_AMD64Instr");
   3568    /*NOTREACHED*/
   3569 
   3570   done:
   3571    vassert(p - &buf[0] <= 32);
   3572    return p - &buf[0];
   3573 
   3574 #  undef fake
   3575 }
   3576 
   3577 /*---------------------------------------------------------------*/
   3578 /*--- end                                   host_amd64_defs.c ---*/
   3579 /*---------------------------------------------------------------*/
   3580