
/*---------------------------------------------------------------*/
/*--- begin                                 host_amd64_defs.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2012 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex.h"
#include "libvex_trc_values.h"

#include "main_util.h"
#include "host_generic_regs.h"
#include "host_amd64_defs.h"


/* --------- Registers. --------- */

void ppHRegAMD64 ( HReg reg )
{
   Int r;
   static HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      ppHReg(reg);
      return;
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt64:
         r = hregNumber(reg);
         vassert(r >= 0 && r < 16);
         vex_printf("%s", ireg64_names[r]);
         return;
      case HRcFlt64:
         r = hregNumber(reg);
         vassert(r >= 0 && r < 6);
         vex_printf("%%fake%d", r);
         return;
      case HRcVec128:
         r = hregNumber(reg);
         vassert(r >= 0 && r < 16);
         vex_printf("%%xmm%d", r);
         return;
      default:
         vpanic("ppHRegAMD64");
   }
}

static void ppHRegAMD64_lo32 ( HReg reg )
{
   Int r;
   static HChar* ireg32_names[16]
     = { "%eax",  "%ecx",  "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
         "%r8d",  "%r9d",  "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      ppHReg(reg);
      vex_printf("d");
      return;
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt64:
         r = hregNumber(reg);
         vassert(r >= 0 && r < 16);
         vex_printf("%s", ireg32_names[r]);
         return;
      default:
         vpanic("ppHRegAMD64_lo32: invalid regclass");
   }
}

HReg hregAMD64_RAX ( void ) { return mkHReg( 0, HRcInt64, False); }
HReg hregAMD64_RCX ( void ) { return mkHReg( 1, HRcInt64, False); }
HReg hregAMD64_RDX ( void ) { return mkHReg( 2, HRcInt64, False); }
HReg hregAMD64_RBX ( void ) { return mkHReg( 3, HRcInt64, False); }
HReg hregAMD64_RSP ( void ) { return mkHReg( 4, HRcInt64, False); }
HReg hregAMD64_RBP ( void ) { return mkHReg( 5, HRcInt64, False); }
HReg hregAMD64_RSI ( void ) { return mkHReg( 6, HRcInt64, False); }
HReg hregAMD64_RDI ( void ) { return mkHReg( 7, HRcInt64, False); }
HReg hregAMD64_R8  ( void ) { return mkHReg( 8, HRcInt64, False); }
HReg hregAMD64_R9  ( void ) { return mkHReg( 9, HRcInt64, False); }
HReg hregAMD64_R10 ( void ) { return mkHReg(10, HRcInt64, False); }
HReg hregAMD64_R11 ( void ) { return mkHReg(11, HRcInt64, False); }
HReg hregAMD64_R12 ( void ) { return mkHReg(12, HRcInt64, False); }
HReg hregAMD64_R13 ( void ) { return mkHReg(13, HRcInt64, False); }
HReg hregAMD64_R14 ( void ) { return mkHReg(14, HRcInt64, False); }
HReg hregAMD64_R15 ( void ) { return mkHReg(15, HRcInt64, False); }

HReg hregAMD64_XMM0  ( void ) { return mkHReg( 0, HRcVec128, False); }
HReg hregAMD64_XMM1  ( void ) { return mkHReg( 1, HRcVec128, False); }
HReg hregAMD64_XMM3  ( void ) { return mkHReg( 3, HRcVec128, False); }
HReg hregAMD64_XMM4  ( void ) { return mkHReg( 4, HRcVec128, False); }
HReg hregAMD64_XMM5  ( void ) { return mkHReg( 5, HRcVec128, False); }
HReg hregAMD64_XMM6  ( void ) { return mkHReg( 6, HRcVec128, False); }
HReg hregAMD64_XMM7  ( void ) { return mkHReg( 7, HRcVec128, False); }
HReg hregAMD64_XMM8  ( void ) { return mkHReg( 8, HRcVec128, False); }
HReg hregAMD64_XMM9  ( void ) { return mkHReg( 9, HRcVec128, False); }
HReg hregAMD64_XMM10 ( void ) { return mkHReg(10, HRcVec128, False); }
HReg hregAMD64_XMM11 ( void ) { return mkHReg(11, HRcVec128, False); }
HReg hregAMD64_XMM12 ( void ) { return mkHReg(12, HRcVec128, False); }


void getAllocableRegs_AMD64 ( Int* nregs, HReg** arr )
{
#if 0
   *nregs = 6;
   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
   (*arr)[ 0] = hregAMD64_RSI();
   (*arr)[ 1] = hregAMD64_RDI();
   (*arr)[ 2] = hregAMD64_RBX();

   (*arr)[ 3] = hregAMD64_XMM7();
   (*arr)[ 4] = hregAMD64_XMM8();
   (*arr)[ 5] = hregAMD64_XMM9();
#endif
#if 1
   *nregs = 20;
   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
   (*arr)[ 0] = hregAMD64_RSI();
   (*arr)[ 1] = hregAMD64_RDI();
   (*arr)[ 2] = hregAMD64_R8();
   (*arr)[ 3] = hregAMD64_R9();
   (*arr)[ 4] = hregAMD64_R12();
   (*arr)[ 5] = hregAMD64_R13();
   (*arr)[ 6] = hregAMD64_R14();
   (*arr)[ 7] = hregAMD64_R15();
   (*arr)[ 8] = hregAMD64_RBX();

   (*arr)[ 9] = hregAMD64_XMM3();
   (*arr)[10] = hregAMD64_XMM4();
   (*arr)[11] = hregAMD64_XMM5();
   (*arr)[12] = hregAMD64_XMM6();
   (*arr)[13] = hregAMD64_XMM7();
   (*arr)[14] = hregAMD64_XMM8();
   (*arr)[15] = hregAMD64_XMM9();
   (*arr)[16] = hregAMD64_XMM10();
   (*arr)[17] = hregAMD64_XMM11();
   (*arr)[18] = hregAMD64_XMM12();
   (*arr)[19] = hregAMD64_R10();
#endif
}
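
/* Note on the above (an observation, not a spec): the allocatable set
   deliberately omits %rax, %rcx and %rdx, which are implicitly read or
   trashed by MulL, Div and Sh64 (see getRegUsage_AMD64Instr below);
   %rsp and %rbp, which have fixed duties (%rbp is written by XAssisted
   exits, see the pretty-printer); and %r11, which is the scratch
   register for calls and exits.  %xmm0 and %xmm1 appear to be held
   back as fixed temporaries. */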


/* --------- Condition codes, Intel encoding. --------- */

HChar* showAMD64CondCode ( AMD64CondCode cond )
{
   switch (cond) {
      case Acc_O:      return "o";
      case Acc_NO:     return "no";
      case Acc_B:      return "b";
      case Acc_NB:     return "nb";
      case Acc_Z:      return "z";
      case Acc_NZ:     return "nz";
      case Acc_BE:     return "be";
      case Acc_NBE:    return "nbe";
      case Acc_S:      return "s";
      case Acc_NS:     return "ns";
      case Acc_P:      return "p";
      case Acc_NP:     return "np";
      case Acc_L:      return "l";
      case Acc_NL:     return "nl";
      case Acc_LE:     return "le";
      case Acc_NLE:    return "nle";
      case Acc_ALWAYS: return "ALWAYS";
      default: vpanic("showAMD64CondCode");
   }
}


/* --------- AMD64AMode: memory address expressions. --------- */

AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
   AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
   am->tag        = Aam_IR;
   am->Aam.IR.imm = imm32;
   am->Aam.IR.reg = reg;
   return am;
}
AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
   AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
   am->tag = Aam_IRRS;
   am->Aam.IRRS.imm   = imm32;
   am->Aam.IRRS.base  = base;
   am->Aam.IRRS.index = indEx;
   am->Aam.IRRS.shift = shift;
   vassert(shift >= 0 && shift <= 3);
   return am;
}
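
/* A minimal illustrative sketch (not part of the backend proper): the
   amode 0x10(%rbp,%rcx,8) -- base %rbp, index %rcx, scale 8,
   displacement 16 -- would be built as below.  The shift field holds
   log2 of the scale, hence the 0..3 assertion above and the
   "1 << shift" in the printer below. */
#if 0
static AMD64AMode* example_amode_IRRS ( void ) {
   return AMD64AMode_IRRS(0x10, hregAMD64_RBP(), hregAMD64_RCX(),
                          3/*scale = 1 << 3 = 8*/);
}
#endif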

void ppAMD64AMode ( AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         if (am->Aam.IR.imm == 0)
            vex_printf("(");
         else
            vex_printf("0x%x(", am->Aam.IR.imm);
         ppHRegAMD64(am->Aam.IR.reg);
         vex_printf(")");
         return;
      case Aam_IRRS:
         vex_printf("0x%x(", am->Aam.IRRS.imm);
         ppHRegAMD64(am->Aam.IRRS.base);
         vex_printf(",");
         ppHRegAMD64(am->Aam.IRRS.index);
         vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
         return;
      default:
         vpanic("ppAMD64AMode");
   }
}

static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         addHRegUse(u, HRmRead, am->Aam.IR.reg);
         return;
      case Aam_IRRS:
         addHRegUse(u, HRmRead, am->Aam.IRRS.base);
         addHRegUse(u, HRmRead, am->Aam.IRRS.index);
         return;
      default:
         vpanic("addRegUsage_AMD64AMode");
   }
}

static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
         return;
      case Aam_IRRS:
         am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
         am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
         return;
      default:
         vpanic("mapRegs_AMD64AMode");
   }
}

/* --------- Operand, which can be reg, immediate or memory. --------- */

AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
   AMD64RMI* op       = LibVEX_Alloc(sizeof(AMD64RMI));
   op->tag            = Armi_Imm;
   op->Armi.Imm.imm32 = imm32;
   return op;
}
AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
   AMD64RMI* op     = LibVEX_Alloc(sizeof(AMD64RMI));
   op->tag          = Armi_Reg;
   op->Armi.Reg.reg = reg;
   return op;
}
AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
   AMD64RMI* op    = LibVEX_Alloc(sizeof(AMD64RMI));
   op->tag         = Armi_Mem;
   op->Armi.Mem.am = am;
   return op;
}
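
/* Illustrative only: the three flavours of operand an AMD64RMI can
   carry, e.g. as the source of an Alu64R instruction. */
#if 0
static void example_RMI ( void ) {
   AMD64RMI* ri = AMD64RMI_Imm(42);                                /* $42 */
   AMD64RMI* rr = AMD64RMI_Reg(hregAMD64_RBX());                   /* %rbx */
   AMD64RMI* rm = AMD64RMI_Mem(AMD64AMode_IR(8, hregAMD64_RSP())); /* 8(%rsp) */
}
#endif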

static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
   switch (op->tag) {
      case Armi_Imm:
         vex_printf("$0x%x", op->Armi.Imm.imm32);
         return;
      case Armi_Reg:
         if (lo32)
            ppHRegAMD64_lo32(op->Armi.Reg.reg);
         else
            ppHRegAMD64(op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         ppAMD64AMode(op->Armi.Mem.am);
         return;
      default:
         vpanic("ppAMD64RMI");
   }
}
void ppAMD64RMI ( AMD64RMI* op ) {
   ppAMD64RMI_wrk(op, False/*!lo32*/);
}
void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
   ppAMD64RMI_wrk(op, True/*lo32*/);
}

/* An AMD64RMI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
   switch (op->tag) {
      case Armi_Imm:
         return;
      case Armi_Reg:
         addHRegUse(u, HRmRead, op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
         return;
      default:
         vpanic("addRegUsage_AMD64RMI");
   }
}

static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
   switch (op->tag) {
      case Armi_Imm:
         return;
      case Armi_Reg:
         op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         mapRegs_AMD64AMode(m, op->Armi.Mem.am);
         return;
      default:
         vpanic("mapRegs_AMD64RMI");
   }
}


/* --------- Operand, which can be reg or immediate only. --------- */

AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
   AMD64RI* op       = LibVEX_Alloc(sizeof(AMD64RI));
   op->tag           = Ari_Imm;
   op->Ari.Imm.imm32 = imm32;
   return op;
}
AMD64RI* AMD64RI_Reg ( HReg reg ) {
   AMD64RI* op     = LibVEX_Alloc(sizeof(AMD64RI));
   op->tag         = Ari_Reg;
   op->Ari.Reg.reg = reg;
   return op;
}

void ppAMD64RI ( AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         vex_printf("$0x%x", op->Ari.Imm.imm32);
         return;
      case Ari_Reg:
         ppHRegAMD64(op->Ari.Reg.reg);
         return;
      default:
         vpanic("ppAMD64RI");
   }
}

/* An AMD64RI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         return;
      case Ari_Reg:
         addHRegUse(u, HRmRead, op->Ari.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_AMD64RI");
   }
}

static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         return;
      case Ari_Reg:
         op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
         return;
      default:
         vpanic("mapRegs_AMD64RI");
   }
}


/* --------- Operand, which can be reg or memory only. --------- */

AMD64RM* AMD64RM_Reg ( HReg reg ) {
   AMD64RM* op     = LibVEX_Alloc(sizeof(AMD64RM));
   op->tag         = Arm_Reg;
   op->Arm.Reg.reg = reg;
   return op;
}
AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
   AMD64RM* op    = LibVEX_Alloc(sizeof(AMD64RM));
   op->tag        = Arm_Mem;
   op->Arm.Mem.am = am;
   return op;
}

void ppAMD64RM ( AMD64RM* op ) {
   switch (op->tag) {
      case Arm_Mem:
         ppAMD64AMode(op->Arm.Mem.am);
         return;
      case Arm_Reg:
         ppHRegAMD64(op->Arm.Reg.reg);
         return;
      default:
         vpanic("ppAMD64RM");
   }
}

/* Because an AMD64RM can be used as either a source or a destination
   operand, we have to supply a mode -- pertaining to the operand as a
   whole -- indicating how it's being used. */
static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
   switch (op->tag) {
      case Arm_Mem:
         /* Memory is read, written or modified.  So we just want to
            know the regs read by the amode. */
         addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
         return;
      case Arm_Reg:
         /* reg is read, written or modified.  Add it in the
            appropriate way. */
         addHRegUse(u, mode, op->Arm.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_AMD64RM");
   }
}
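
/* For example, for "negq 0(%rax)" (an Arm_Mem operand in an HRmModify
   context) only %rax is recorded, and only as read: the allocator
   tracks register usage, and the amode registers are merely read in
   order to form the address. */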

static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
{
   switch (op->tag) {
      case Arm_Mem:
         mapRegs_AMD64AMode(m, op->Arm.Mem.am);
         return;
      case Arm_Reg:
         op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
         return;
      default:
         vpanic("mapRegs_AMD64RM");
   }
}


/* --------- Instructions. --------- */

static HChar* showAMD64ScalarSz ( Int sz ) {
   switch (sz) {
      case 2: return "w";
      case 4: return "l";
      case 8: return "q";
      default: vpanic("showAMD64ScalarSz");
   }
}

HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
   switch (op) {
      case Aun_NOT: return "not";
      case Aun_NEG: return "neg";
      default: vpanic("showAMD64UnaryOp");
   }
}

HChar* showAMD64AluOp ( AMD64AluOp op ) {
   switch (op) {
      case Aalu_MOV:  return "mov";
      case Aalu_CMP:  return "cmp";
      case Aalu_ADD:  return "add";
      case Aalu_SUB:  return "sub";
      case Aalu_ADC:  return "adc";
      case Aalu_SBB:  return "sbb";
      case Aalu_AND:  return "and";
      case Aalu_OR:   return "or";
      case Aalu_XOR:  return "xor";
      case Aalu_MUL:  return "imul";
      default: vpanic("showAMD64AluOp");
   }
}

HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
   switch (op) {
      case Ash_SHL: return "shl";
      case Ash_SHR: return "shr";
      case Ash_SAR: return "sar";
      default: vpanic("showAMD64ShiftOp");
   }
}

HChar* showA87FpOp ( A87FpOp op ) {
   switch (op) {
      case Afp_SCALE:  return "scale";
      case Afp_ATAN:   return "atan";
      case Afp_YL2X:   return "yl2x";
      case Afp_YL2XP1: return "yl2xp1";
      case Afp_PREM:   return "prem";
      case Afp_PREM1:  return "prem1";
      case Afp_SQRT:   return "sqrt";
      case Afp_SIN:    return "sin";
      case Afp_COS:    return "cos";
      case Afp_TAN:    return "tan";
      case Afp_ROUND:  return "round";
      case Afp_2XM1:   return "2xm1";
      default: vpanic("showA87FpOp");
   }
}

HChar* showAMD64SseOp ( AMD64SseOp op ) {
   switch (op) {
      case Asse_MOV:      return "movups";
      case Asse_ADDF:     return "add";
      case Asse_SUBF:     return "sub";
      case Asse_MULF:     return "mul";
      case Asse_DIVF:     return "div";
      case Asse_MAXF:     return "max";
      case Asse_MINF:     return "min";
      case Asse_CMPEQF:   return "cmpFeq";
      case Asse_CMPLTF:   return "cmpFlt";
      case Asse_CMPLEF:   return "cmpFle";
      case Asse_CMPUNF:   return "cmpFun";
      case Asse_RCPF:     return "rcp";
      case Asse_RSQRTF:   return "rsqrt";
      case Asse_SQRTF:    return "sqrt";
      case Asse_AND:      return "and";
      case Asse_OR:       return "or";
      case Asse_XOR:      return "xor";
      case Asse_ANDN:     return "andn";
      case Asse_ADD8:     return "paddb";
      case Asse_ADD16:    return "paddw";
      case Asse_ADD32:    return "paddd";
      case Asse_ADD64:    return "paddq";
      case Asse_QADD8U:   return "paddusb";
      case Asse_QADD16U:  return "paddusw";
      case Asse_QADD8S:   return "paddsb";
      case Asse_QADD16S:  return "paddsw";
      case Asse_SUB8:     return "psubb";
      case Asse_SUB16:    return "psubw";
      case Asse_SUB32:    return "psubd";
      case Asse_SUB64:    return "psubq";
      case Asse_QSUB8U:   return "psubusb";
      case Asse_QSUB16U:  return "psubusw";
      case Asse_QSUB8S:   return "psubsb";
      case Asse_QSUB16S:  return "psubsw";
      case Asse_MUL16:    return "pmullw";
      case Asse_MULHI16U: return "pmulhuw";
      case Asse_MULHI16S: return "pmulhw";
      case Asse_AVG8U:    return "pavgb";
      case Asse_AVG16U:   return "pavgw";
      case Asse_MAX16S:   return "pmaxw";
      case Asse_MAX8U:    return "pmaxub";
      case Asse_MIN16S:   return "pminw";
      case Asse_MIN8U:    return "pminub";
      case Asse_CMPEQ8:   return "pcmpeqb";
      case Asse_CMPEQ16:  return "pcmpeqw";
      case Asse_CMPEQ32:  return "pcmpeqd";
      case Asse_CMPGT8S:  return "pcmpgtb";
      case Asse_CMPGT16S: return "pcmpgtw";
      case Asse_CMPGT32S: return "pcmpgtd";
      case Asse_SHL16:    return "psllw";
      case Asse_SHL32:    return "pslld";
      case Asse_SHL64:    return "psllq";
      case Asse_SHR16:    return "psrlw";
      case Asse_SHR32:    return "psrld";
      case Asse_SHR64:    return "psrlq";
      case Asse_SAR16:    return "psraw";
      case Asse_SAR32:    return "psrad";
      case Asse_PACKSSD:  return "packssdw";
      case Asse_PACKSSW:  return "packsswb";
      case Asse_PACKUSW:  return "packuswb";
      case Asse_UNPCKHB:  return "punpckhb";
      case Asse_UNPCKHW:  return "punpckhw";
      case Asse_UNPCKHD:  return "punpckhd";
      case Asse_UNPCKHQ:  return "punpckhq";
      case Asse_UNPCKLB:  return "punpcklb";
      case Asse_UNPCKLW:  return "punpcklw";
      case Asse_UNPCKLD:  return "punpckld";
      case Asse_UNPCKLQ:  return "punpcklq";
      default: vpanic("showAMD64SseOp");
   }
}

AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag             = Ain_Imm64;
   i->Ain.Imm64.imm64 = imm64;
   i->Ain.Imm64.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag            = Ain_Alu64R;
   i->Ain.Alu64R.op  = op;
   i->Ain.Alu64R.src = src;
   i->Ain.Alu64R.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag            = Ain_Alu64M;
   i->Ain.Alu64M.op  = op;
   i->Ain.Alu64M.src = src;
   i->Ain.Alu64M.dst = dst;
   vassert(op != Aalu_MUL);
   return i;
}
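/* Illustrative only: "addq $1,%rax" and "movq %rbx,0(%rdi)" would be
   represented as below. */
#if 0
static void example_Alu64 ( void ) {
   AMD64Instr* i1 = AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(1),
                                      hregAMD64_RAX());
   AMD64Instr* i2 = AMD64Instr_Alu64M(Aalu_MOV,
                                      AMD64RI_Reg(hregAMD64_RBX()),
                                      AMD64AMode_IR(0, hregAMD64_RDI()));
}
#endif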
AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
   AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag          = Ain_Sh64;
   i->Ain.Sh64.op  = op;
   i->Ain.Sh64.src = src;
   i->Ain.Sh64.dst = dst;
   return i;
}
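/* By convention a Sh64 src value of 0 means "shift by %cl"; see the
   Ain_Sh64 cases in the pretty-printer and in getRegUsage_AMD64Instr
   below. */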
AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_Test64;
   i->Ain.Test64.imm32 = imm32;
   i->Ain.Test64.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag             = Ain_Unary64;
   i->Ain.Unary64.op  = op;
   i->Ain.Unary64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag             = Ain_Lea64;
   i->Ain.Lea64.am    = am;
   i->Ain.Lea64.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag            = Ain_Alu32R;
   i->Ain.Alu32R.op  = op;
   i->Ain.Alu32R.src = src;
   i->Ain.Alu32R.dst = dst;
   switch (op) {
      case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
      case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
      default: vassert(0);
   }
   return i;
}
AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag            = Ain_MulL;
   i->Ain.MulL.syned = syned;
   i->Ain.MulL.src   = src;
   return i;
}
AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag            = Ain_Div;
   i->Ain.Div.syned  = syned;
   i->Ain.Div.sz     = sz;
   i->Ain.Div.src    = src;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
   AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag          = Ain_Push;
   i->Ain.Push.src = src;
   return i;
}
AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms ) {
   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag               = Ain_Call;
   i->Ain.Call.cond     = cond;
   i->Ain.Call.target   = target;
   i->Ain.Call.regparms = regparms;
   vassert(regparms >= 0 && regparms <= 6);
   return i;
}

AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
                                 AMD64CondCode cond, Bool toFastEP ) {
   AMD64Instr* i           = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag                  = Ain_XDirect;
   i->Ain.XDirect.dstGA    = dstGA;
   i->Ain.XDirect.amRIP    = amRIP;
   i->Ain.XDirect.cond     = cond;
   i->Ain.XDirect.toFastEP = toFastEP;
   return i;
}
AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
                                AMD64CondCode cond ) {
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_XIndir;
   i->Ain.XIndir.dstGA = dstGA;
   i->Ain.XIndir.amRIP = amRIP;
   i->Ain.XIndir.cond  = cond;
   return i;
}
AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
                                   AMD64CondCode cond, IRJumpKind jk ) {
   AMD64Instr* i          = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag                 = Ain_XAssisted;
   i->Ain.XAssisted.dstGA = dstGA;
   i->Ain.XAssisted.amRIP = amRIP;
   i->Ain.XAssisted.cond  = cond;
   i->Ain.XAssisted.jk    = jk;
   return i;
}
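
/* XDirect, XIndir and XAssisted are the three ways of (conditionally)
   exiting a translation: XDirect jumps to a known guest address via
   the chain-me stubs, XIndir jumps to a guest address held in a
   register, and XAssisted returns control to the run-time with an
   IRJumpKind saying why.  The pretty-printer below shows the code
   each one expands to. */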

AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, AMD64RM* src, HReg dst ) {
   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag             = Ain_CMov64;
   i->Ain.CMov64.cond = cond;
   i->Ain.CMov64.src  = src;
   i->Ain.CMov64.dst  = dst;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_MovxLQ;
   i->Ain.MovxLQ.syned = syned;
   i->Ain.MovxLQ.src   = src;
   i->Ain.MovxLQ.dst   = dst;
   return i;
}
AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
                                AMD64AMode* src, HReg dst ) {
   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag                = Ain_LoadEX;
   i->Ain.LoadEX.szSmall = szSmall;
   i->Ain.LoadEX.syned   = syned;
   i->Ain.LoadEX.src     = src;
   i->Ain.LoadEX.dst     = dst;
   vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
   return i;
}
AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
   AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag           = Ain_Store;
   i->Ain.Store.sz  = sz;
   i->Ain.Store.src = src;
   i->Ain.Store.dst = dst;
   vassert(sz == 1 || sz == 2 || sz == 4);
   return i;
}
AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag            = Ain_Set64;
   i->Ain.Set64.cond = cond;
   i->Ain.Set64.dst  = dst;
   return i;
}
AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag               = Ain_Bsfr64;
   i->Ain.Bsfr64.isFwds = isFwds;
   i->Ain.Bsfr64.src    = src;
   i->Ain.Bsfr64.dst    = dst;
   return i;
}
AMD64Instr* AMD64Instr_MFence ( void ) {
   AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag        = Ain_MFence;
   return i;
}
AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
   AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag           = Ain_ACAS;
   i->Ain.ACAS.addr = addr;
   i->Ain.ACAS.sz   = sz;
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return i;
}
AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag            = Ain_DACAS;
   i->Ain.DACAS.addr = addr;
   i->Ain.DACAS.sz   = sz;
   vassert(sz == 8 || sz == 4);
   return i;
}
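
/* ACAS is an atomic compare-and-swap ("lock cmpxchg") with the
   expected value in %rax and the new value in %rbx; DACAS is the
   double-width variant ("lock cmpxchg8b/16b") using %rdx:%rax and
   %rcx:%rbx.  See the Ain_ACAS and Ain_DACAS cases of the
   pretty-printer below. */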

AMD64Instr* AMD64Instr_A87Free ( Int nregs )
{
   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag               = Ain_A87Free;
   i->Ain.A87Free.nregs = nregs;
   vassert(nregs >= 1 && nregs <= 7);
   return i;
}
AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
{
   AMD64Instr* i            = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag                   = Ain_A87PushPop;
   i->Ain.A87PushPop.addr   = addr;
   i->Ain.A87PushPop.isPush = isPush;
   i->Ain.A87PushPop.szB    = szB;
   vassert(szB == 8 || szB == 4);
   return i;
}
AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
{
   AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag            = Ain_A87FpOp;
   i->Ain.A87FpOp.op = op;
   return i;
}
AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
{
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_A87LdCW;
   i->Ain.A87LdCW.addr = addr;
   return i;
}
AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
{
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_A87StSW;
   i->Ain.A87StSW.addr = addr;
   return i;
}
AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag                = Ain_LdMXCSR;
   i->Ain.LdMXCSR.addr   = addr;
   return i;
}
AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag                = Ain_SseUComIS;
   i->Ain.SseUComIS.sz   = toUChar(sz);
   i->Ain.SseUComIS.srcL = srcL;
   i->Ain.SseUComIS.srcR = srcR;
   i->Ain.SseUComIS.dst  = dst;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_SseSI2SF;
   i->Ain.SseSI2SF.szS = toUChar(szS);
   i->Ain.SseSI2SF.szD = toUChar(szD);
   i->Ain.SseSI2SF.src = src;
   i->Ain.SseSI2SF.dst = dst;
   vassert(szS == 4 || szS == 8);
   vassert(szD == 4 || szD == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_SseSF2SI;
   i->Ain.SseSF2SI.szS = toUChar(szS);
   i->Ain.SseSF2SI.szD = toUChar(szD);
   i->Ain.SseSF2SI.src = src;
   i->Ain.SseSF2SI.dst = dst;
   vassert(szS == 4 || szS == 8);
   vassert(szD == 4 || szD == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSDSS ( Bool from64, HReg src, HReg dst )
{
   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag                = Ain_SseSDSS;
   i->Ain.SseSDSS.from64 = from64;
   i->Ain.SseSDSS.src    = src;
   i->Ain.SseSDSS.dst    = dst;
   return i;
}
AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
                                 HReg reg, AMD64AMode* addr ) {
   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag                = Ain_SseLdSt;
   i->Ain.SseLdSt.isLoad = isLoad;
   i->Ain.SseLdSt.sz     = toUChar(sz);
   i->Ain.SseLdSt.reg    = reg;
   i->Ain.SseLdSt.addr   = addr;
   vassert(sz == 4 || sz == 8 || sz == 16);
   return i;
}
AMD64Instr* AMD64Instr_SseLdzLO ( Int sz, HReg reg, AMD64AMode* addr )
{
   AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag                = Ain_SseLdzLO;
   i->Ain.SseLdzLO.sz    = sz;
   i->Ain.SseLdzLO.reg   = reg;
   i->Ain.SseLdzLO.addr  = addr;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_Sse32Fx4;
   i->Ain.Sse32Fx4.op  = op;
   i->Ain.Sse32Fx4.src = src;
   i->Ain.Sse32Fx4.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_Sse32FLo;
   i->Ain.Sse32FLo.op  = op;
   i->Ain.Sse32FLo.src = src;
   i->Ain.Sse32FLo.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_Sse64Fx2;
   i->Ain.Sse64Fx2.op  = op;
   i->Ain.Sse64Fx2.src = src;
   i->Ain.Sse64Fx2.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_Sse64FLo;
   i->Ain.Sse64FLo.op  = op;
   i->Ain.Sse64FLo.src = src;
   i->Ain.Sse64FLo.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
   AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag             = Ain_SseReRg;
   i->Ain.SseReRg.op  = op;
   i->Ain.SseReRg.src = re;
   i->Ain.SseReRg.dst = rg;
   return i;
}
AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
   AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag              = Ain_SseCMov;
   i->Ain.SseCMov.cond = cond;
   i->Ain.SseCMov.src  = src;
   i->Ain.SseCMov.dst  = dst;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
   AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag               = Ain_SseShuf;
   i->Ain.SseShuf.order = order;
   i->Ain.SseShuf.src   = src;
   i->Ain.SseShuf.dst   = dst;
   vassert(order >= 0 && order <= 0xFF);
   return i;
}
//uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
//uu                                  HReg reg, AMD64AMode* addr ) {
//uu    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
//uu    i->tag                = Ain_AvxLdSt;
//uu    i->Ain.AvxLdSt.isLoad = isLoad;
//uu    i->Ain.AvxLdSt.reg    = reg;
//uu    i->Ain.AvxLdSt.addr   = addr;
//uu    return i;
//uu }
//uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
//uu    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
//uu    i->tag             = Ain_AvxReRg;
//uu    i->Ain.AvxReRg.op  = op;
//uu    i->Ain.AvxReRg.src = re;
//uu    i->Ain.AvxReRg.dst = rg;
//uu    return i;
//uu }
AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
                                 AMD64AMode* amFailAddr ) {
   AMD64Instr* i             = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag                    = Ain_EvCheck;
   i->Ain.EvCheck.amCounter  = amCounter;
   i->Ain.EvCheck.amFailAddr = amFailAddr;
   return i;
}
AMD64Instr* AMD64Instr_ProfInc ( void ) {
   AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
   i->tag        = Ain_ProfInc;
   return i;
}

void ppAMD64Instr ( AMD64Instr* i, Bool mode64 )
{
   vassert(mode64 == True);
   switch (i->tag) {
      case Ain_Imm64:
         vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
         ppHRegAMD64(i->Ain.Imm64.dst);
         return;
      case Ain_Alu64R:
         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
         ppAMD64RMI(i->Ain.Alu64R.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Alu64R.dst);
         return;
      case Ain_Alu64M:
         vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
         ppAMD64RI(i->Ain.Alu64M.src);
         vex_printf(",");
         ppAMD64AMode(i->Ain.Alu64M.dst);
         return;
      case Ain_Sh64:
         vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
         if (i->Ain.Sh64.src == 0)
            vex_printf("%%cl,");
         else
            vex_printf("$%d,", (Int)i->Ain.Sh64.src);
         ppHRegAMD64(i->Ain.Sh64.dst);
         return;
      case Ain_Test64:
         vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
         ppHRegAMD64(i->Ain.Test64.dst);
         return;
      case Ain_Unary64:
         vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
         ppHRegAMD64(i->Ain.Unary64.dst);
         return;
      case Ain_Lea64:
         vex_printf("leaq ");
         ppAMD64AMode(i->Ain.Lea64.am);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Lea64.dst);
         return;
      case Ain_Alu32R:
         vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
         ppAMD64RMI_lo32(i->Ain.Alu32R.src);
         vex_printf(",");
         ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
         return;
      case Ain_MulL:
         vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
         ppAMD64RM(i->Ain.MulL.src);
         return;
      case Ain_Div:
         vex_printf("%cdiv%s ",
                    i->Ain.Div.syned ? 's' : 'u',
                    showAMD64ScalarSz(i->Ain.Div.sz));
         ppAMD64RM(i->Ain.Div.src);
         return;
      case Ain_Push:
         vex_printf("pushq ");
         ppAMD64RMI(i->Ain.Push.src);
         return;
      case Ain_Call:
         vex_printf("call%s[%d] ",
                    i->Ain.Call.cond==Acc_ALWAYS
                       ? "" : showAMD64CondCode(i->Ain.Call.cond),
                    i->Ain.Call.regparms );
         vex_printf("0x%llx", i->Ain.Call.target);
         break;

      case Ain_XDirect:
         vex_printf("(xDirect) ");
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.XDirect.cond));
         vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
         vex_printf("movq %%r11,");
         ppAMD64AMode(i->Ain.XDirect.amRIP);
         vex_printf("; ");
         vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
                    i->Ain.XDirect.toFastEP ? "fast" : "slow");
         return;
      case Ain_XIndir:
         vex_printf("(xIndir) ");
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.XIndir.cond));
         vex_printf("movq ");
         ppHRegAMD64(i->Ain.XIndir.dstGA);
         vex_printf(",");
         ppAMD64AMode(i->Ain.XIndir.amRIP);
         vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
         return;
      case Ain_XAssisted:
         vex_printf("(xAssisted) ");
         vex_printf("if (%%rflags.%s) { ",
                    showAMD64CondCode(i->Ain.XAssisted.cond));
         vex_printf("movq ");
         ppHRegAMD64(i->Ain.XAssisted.dstGA);
         vex_printf(",");
         ppAMD64AMode(i->Ain.XAssisted.amRIP);
         vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
                    (Int)i->Ain.XAssisted.jk);
         vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
         return;

      case Ain_CMov64:
         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
         ppAMD64RM(i->Ain.CMov64.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.CMov64.dst);
         return;
      case Ain_MovxLQ:
         vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
         ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.MovxLQ.dst);
         return;
      case Ain_LoadEX:
         if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
            vex_printf("movl ");
            ppAMD64AMode(i->Ain.LoadEX.src);
            vex_printf(",");
            ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
         } else {
            vex_printf("mov%c%cq ",
                       i->Ain.LoadEX.syned ? 's' : 'z',
                       i->Ain.LoadEX.szSmall==1
                          ? 'b'
                          : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
            ppAMD64AMode(i->Ain.LoadEX.src);
            vex_printf(",");
            ppHRegAMD64(i->Ain.LoadEX.dst);
         }
         return;
      case Ain_Store:
         vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
                              : (i->Ain.Store.sz==2 ? 'w' : 'l'));
         ppHRegAMD64(i->Ain.Store.src);
         vex_printf(",");
         ppAMD64AMode(i->Ain.Store.dst);
         return;
      case Ain_Set64:
         vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
         ppHRegAMD64(i->Ain.Set64.dst);
         return;
      case Ain_Bsfr64:
         vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
         ppHRegAMD64(i->Ain.Bsfr64.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Bsfr64.dst);
         return;
      case Ain_MFence:
         vex_printf("mfence" );
         return;
      case Ain_ACAS:
         vex_printf("lock cmpxchg%c ",
                     i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
                     : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
         vex_printf("{%%rax->%%rbx},");
         ppAMD64AMode(i->Ain.ACAS.addr);
         return;
      case Ain_DACAS:
         vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
                    (Int)(2 * i->Ain.DACAS.sz));
         ppAMD64AMode(i->Ain.DACAS.addr);
         return;
      case Ain_A87Free:
         vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
         break;
      case Ain_A87PushPop:
         vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
                    i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
         ppAMD64AMode(i->Ain.A87PushPop.addr);
         break;
      case Ain_A87FpOp:
         vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
         break;
      case Ain_A87LdCW:
         vex_printf("fldcw ");
         ppAMD64AMode(i->Ain.A87LdCW.addr);
         break;
      case Ain_A87StSW:
         vex_printf("fstsw ");
         ppAMD64AMode(i->Ain.A87StSW.addr);
         break;
      case Ain_LdMXCSR:
         vex_printf("ldmxcsr ");
         ppAMD64AMode(i->Ain.LdMXCSR.addr);
         break;
      case Ain_SseUComIS:
         vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
         ppHRegAMD64(i->Ain.SseUComIS.srcL);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseUComIS.srcR);
         vex_printf(" ; pushfq ; popq ");
         ppHRegAMD64(i->Ain.SseUComIS.dst);
         break;
      case Ain_SseSI2SF:
         vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
         (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.SseSI2SF.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseSI2SF.dst);
         break;
      case Ain_SseSF2SI:
         vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
         ppHRegAMD64(i->Ain.SseSF2SI.src);
         vex_printf(",");
         (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
            (i->Ain.SseSF2SI.dst);
         break;
      case Ain_SseSDSS:
         vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
         ppHRegAMD64(i->Ain.SseSDSS.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseSDSS.dst);
         break;
      case Ain_SseLdSt:
         switch (i->Ain.SseLdSt.sz) {
            case 4:  vex_printf("movss "); break;
            case 8:  vex_printf("movsd "); break;
            case 16: vex_printf("movups "); break;
            default: vassert(0);
         }
         if (i->Ain.SseLdSt.isLoad) {
            ppAMD64AMode(i->Ain.SseLdSt.addr);
            vex_printf(",");
            ppHRegAMD64(i->Ain.SseLdSt.reg);
         } else {
            ppHRegAMD64(i->Ain.SseLdSt.reg);
            vex_printf(",");
            ppAMD64AMode(i->Ain.SseLdSt.addr);
         }
         return;
      case Ain_SseLdzLO:
         vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
         ppAMD64AMode(i->Ain.SseLdzLO.addr);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseLdzLO.reg);
         return;
      case Ain_Sse32Fx4:
         vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
         ppHRegAMD64(i->Ain.Sse32Fx4.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse32Fx4.dst);
         return;
      case Ain_Sse32FLo:
         vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
         ppHRegAMD64(i->Ain.Sse32FLo.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse32FLo.dst);
         return;
      case Ain_Sse64Fx2:
         vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
         ppHRegAMD64(i->Ain.Sse64Fx2.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse64Fx2.dst);
         return;
      case Ain_Sse64FLo:
         vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
         ppHRegAMD64(i->Ain.Sse64FLo.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.Sse64FLo.dst);
         return;
      case Ain_SseReRg:
         vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
         ppHRegAMD64(i->Ain.SseReRg.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseReRg.dst);
         return;
      case Ain_SseCMov:
         vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
         ppHRegAMD64(i->Ain.SseCMov.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseCMov.dst);
         return;
      case Ain_SseShuf:
         vex_printf("pshufd $0x%x,", i->Ain.SseShuf.order);
         ppHRegAMD64(i->Ain.SseShuf.src);
         vex_printf(",");
         ppHRegAMD64(i->Ain.SseShuf.dst);
         return;
      //uu case Ain_AvxLdSt:
      //uu    vex_printf("vmovups ");
      //uu    if (i->Ain.AvxLdSt.isLoad) {
      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
      //uu       vex_printf(",");
      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
      //uu    } else {
      //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
      //uu       vex_printf(",");
      //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
      //uu    }
      //uu    return;
      //uu case Ain_AvxReRg:
      //uu    vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
      //uu    ppHRegAMD64(i->Ain.AvxReRg.src);
      //uu    vex_printf(",");
      //uu    ppHRegAMD64(i->Ain.AvxReRg.dst);
      //uu    return;
      case Ain_EvCheck:
         vex_printf("(evCheck) decl ");
         ppAMD64AMode(i->Ain.EvCheck.amCounter);
         vex_printf("; jns nofail; jmp *");
         ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
         vex_printf("; nofail:");
         return;
      case Ain_ProfInc:
         vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
         return;
      default:
         vpanic("ppAMD64Instr");
   }
}
   1323 
   1324 /* --------- Helpers for register allocation. --------- */
   1325 
   1326 void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 )
   1327 {
   1328    Bool unary;
   1329    vassert(mode64 == True);
   1330    initHRegUsage(u);
   1331    switch (i->tag) {
   1332       case Ain_Imm64:
   1333          addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
   1334          return;
   1335       case Ain_Alu64R:
   1336          addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
   1337          if (i->Ain.Alu64R.op == Aalu_MOV) {
   1338             addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
   1339             return;
   1340          }
   1341          if (i->Ain.Alu64R.op == Aalu_CMP) {
   1342             addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
   1343             return;
   1344          }
   1345          addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
   1346          return;
   1347       case Ain_Alu64M:
   1348          addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
   1349          addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
   1350          return;
   1351       case Ain_Sh64:
   1352          addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
   1353          if (i->Ain.Sh64.src == 0)
   1354             addHRegUse(u, HRmRead, hregAMD64_RCX());
   1355          return;
   1356       case Ain_Test64:
   1357          addHRegUse(u, HRmRead, i->Ain.Test64.dst);
   1358          return;
   1359       case Ain_Unary64:
   1360          addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
   1361          return;
   1362       case Ain_Lea64:
   1363          addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
   1364          addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
   1365          return;
   1366       case Ain_Alu32R:
   1367          vassert(i->Ain.Alu32R.op != Aalu_MOV);
   1368          addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
   1369          if (i->Ain.Alu32R.op == Aalu_CMP) {
   1370             addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
   1371             return;
   1372          }
   1373          addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
   1374          return;
   1375       case Ain_MulL:
   1376          addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
   1377          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1378          addHRegUse(u, HRmWrite, hregAMD64_RDX());
   1379          return;
   1380       case Ain_Div:
   1381          addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
   1382          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1383          addHRegUse(u, HRmModify, hregAMD64_RDX());
   1384          return;
   1385       case Ain_Push:
   1386          addRegUsage_AMD64RMI(u, i->Ain.Push.src);
   1387          addHRegUse(u, HRmModify, hregAMD64_RSP());
   1388          return;
   1389       case Ain_Call:
   1390          /* This is a bit subtle. */
   1391          /* First off, claim it trashes all the caller-saved regs
   1392             which fall within the register allocator's jurisdiction.
   1393             These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
   1394             and the xmm registers enumerated below.
   1395          */
   1396          addHRegUse(u, HRmWrite, hregAMD64_RAX());
   1397          addHRegUse(u, HRmWrite, hregAMD64_RCX());
   1398          addHRegUse(u, HRmWrite, hregAMD64_RDX());
   1399          addHRegUse(u, HRmWrite, hregAMD64_RSI());
   1400          addHRegUse(u, HRmWrite, hregAMD64_RDI());
   1401          addHRegUse(u, HRmWrite, hregAMD64_R8());
   1402          addHRegUse(u, HRmWrite, hregAMD64_R9());
   1403          addHRegUse(u, HRmWrite, hregAMD64_R10());
   1404          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1405          addHRegUse(u, HRmWrite, hregAMD64_XMM0());
   1406          addHRegUse(u, HRmWrite, hregAMD64_XMM1());
   1407          addHRegUse(u, HRmWrite, hregAMD64_XMM3());
   1408          addHRegUse(u, HRmWrite, hregAMD64_XMM4());
   1409          addHRegUse(u, HRmWrite, hregAMD64_XMM5());
   1410          addHRegUse(u, HRmWrite, hregAMD64_XMM6());
   1411          addHRegUse(u, HRmWrite, hregAMD64_XMM7());
   1412          addHRegUse(u, HRmWrite, hregAMD64_XMM8());
   1413          addHRegUse(u, HRmWrite, hregAMD64_XMM9());
   1414          addHRegUse(u, HRmWrite, hregAMD64_XMM10());
   1415          addHRegUse(u, HRmWrite, hregAMD64_XMM11());
   1416          addHRegUse(u, HRmWrite, hregAMD64_XMM12());
   1417 
   1418          /* Now we have to state any parameter-carrying registers
   1419             which might be read.  This depends on the regparmness. */
   1420          switch (i->Ain.Call.regparms) {
   1421             case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
   1422             case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
   1423             case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
   1424             case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
   1425             case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
   1426             case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
   1427             case 0: break;
   1428             default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
   1429          }
   1430          /* Finally, there is the issue that the insn trashes a
   1431             register because the literal target address has to be
   1432             loaded into a register.  Fortunately, r11 is stated in the
   1433             ABI as a scratch register, and so seems a suitable victim.  */
   1434          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1435          /* Upshot of this is that the assembler really must use r11,
   1436             and no other, as a destination temporary. */
   1437          return;
   1438       /* XDirect/XIndir/XAssisted are also a bit subtle.  They
   1439          conditionally exit the block.  Hence we only need to list (1)
   1440          the registers that they read, and (2) the registers that they
   1441          write in the case where the block is not exited.  (2) is
   1442          empty, hence only (1) is relevant here. */
   1443       case Ain_XDirect:
   1444          /* Don't bother to mention the write to %r11, since it is not
   1445             available to the allocator. */
   1446          addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
   1447          return;
   1448       case Ain_XIndir:
   1449          /* Ditto re %r11 */
   1450          addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
   1451          addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
   1452          return;
   1453       case Ain_XAssisted:
   1454          /* Ditto re %r11 and %rbp (the baseblock ptr) */
   1455          addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
   1456          addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
   1457          return;
   1458       case Ain_CMov64:
   1459          addRegUsage_AMD64RM(u, i->Ain.CMov64.src, HRmRead);
   1460          addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
   1461          return;
   1462       case Ain_MovxLQ:
   1463          addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
   1464          addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
   1465          return;
   1466       case Ain_LoadEX:
   1467          addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
   1468          addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
   1469          return;
   1470       case Ain_Store:
   1471          addHRegUse(u, HRmRead, i->Ain.Store.src);
   1472          addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
   1473          return;
   1474       case Ain_Set64:
   1475          addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
   1476          return;
   1477       case Ain_Bsfr64:
   1478          addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
   1479          addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
   1480          return;
   1481       case Ain_MFence:
   1482          return;
   1483       case Ain_ACAS:
   1484          addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
   1485          addHRegUse(u, HRmRead, hregAMD64_RBX());
   1486          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1487          return;
   1488       case Ain_DACAS:
   1489          addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
   1490          addHRegUse(u, HRmRead, hregAMD64_RCX());
   1491          addHRegUse(u, HRmRead, hregAMD64_RBX());
   1492          addHRegUse(u, HRmModify, hregAMD64_RDX());
   1493          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1494          return;
   1495       case Ain_A87Free:
   1496          return;
   1497       case Ain_A87PushPop:
   1498          addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
   1499          return;
   1500       case Ain_A87FpOp:
   1501          return;
   1502       case Ain_A87LdCW:
   1503          addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
   1504          return;
   1505       case Ain_A87StSW:
   1506          addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
   1507          return;
   1508       case Ain_LdMXCSR:
   1509          addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
   1510          return;
   1511       case Ain_SseUComIS:
   1512          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
   1513          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
   1514          addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
   1515          return;
   1516       case Ain_SseSI2SF:
   1517          addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
   1518          addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
   1519          return;
   1520       case Ain_SseSF2SI:
   1521          addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
   1522          addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
   1523          return;
   1524       case Ain_SseSDSS:
   1525          addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
   1526          addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
   1527          return;
   1528       case Ain_SseLdSt:
   1529          addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
   1530          addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
   1531                        i->Ain.SseLdSt.reg);
   1532          return;
   1533       case Ain_SseLdzLO:
   1534          addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
   1535          addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
   1536          return;
   1537       case Ain_Sse32Fx4:
   1538          vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
   1539          unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
   1540                          || i->Ain.Sse32Fx4.op == Asse_RSQRTF
   1541                          || i->Ain.Sse32Fx4.op == Asse_SQRTF );
   1542          addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
   1543          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1544                        i->Ain.Sse32Fx4.dst);
   1545          return;
   1546       case Ain_Sse32FLo:
   1547          vassert(i->Ain.Sse32FLo.op != Asse_MOV);
   1548          unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
   1549                          || i->Ain.Sse32FLo.op == Asse_RSQRTF
   1550                          || i->Ain.Sse32FLo.op == Asse_SQRTF );
   1551          addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
   1552          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1553                        i->Ain.Sse32FLo.dst);
   1554          return;
   1555       case Ain_Sse64Fx2:
   1556          vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
   1557          unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
   1558                          || i->Ain.Sse64Fx2.op == Asse_RSQRTF
   1559                          || i->Ain.Sse64Fx2.op == Asse_SQRTF );
   1560          addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
   1561          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1562                        i->Ain.Sse64Fx2.dst);
   1563          return;
   1564       case Ain_Sse64FLo:
   1565          vassert(i->Ain.Sse64FLo.op != Asse_MOV);
   1566          unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
   1567                          || i->Ain.Sse64FLo.op == Asse_RSQRTF
   1568                          || i->Ain.Sse64FLo.op == Asse_SQRTF );
   1569          addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
   1570          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1571                        i->Ain.Sse64FLo.dst);
   1572          return;
   1573       case Ain_SseReRg:
   1574          if ( (i->Ain.SseReRg.op == Asse_XOR
   1575                || i->Ain.SseReRg.op == Asse_CMPEQ32)
   1576               && i->Ain.SseReRg.src == i->Ain.SseReRg.dst) {
   1577             /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
   1578                r,r' as a write of a value to r, and independent of any
   1579                previous value in r */
   1580             /* (as opposed to a rite of passage :-) */
   1581             addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
   1582          } else {
   1583             addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
   1584             addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
   1585                              ? HRmWrite : HRmModify,
   1586                           i->Ain.SseReRg.dst);
   1587          }
   1588          return;
   1589       case Ain_SseCMov:
   1590          addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
   1591          addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
   1592          return;
   1593       case Ain_SseShuf:
   1594          addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
   1595          addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
   1596          return;
   1597       //uu case Ain_AvxLdSt:
   1598       //uu    addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
   1599       //uu    addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
   1600       //uu                  i->Ain.AvxLdSt.reg);
   1601       //uu    return;
   1602       //uu case Ain_AvxReRg:
   1603       //uu    if ( (i->Ain.AvxReRg.op == Asse_XOR
   1604       //uu          || i->Ain.AvxReRg.op == Asse_CMPEQ32)
   1605       //uu         && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
   1606       //uu       /* See comments on the case for Ain_SseReRg. */
   1607       //uu       addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
   1608       //uu    } else {
   1609       //uu       addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
   1610       //uu       addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
   1611       //uu                        ? HRmWrite : HRmModify,
   1612       //uu                     i->Ain.AvxReRg.dst);
   1613       //uu    }
   1614       //uu    return;
   1615       case Ain_EvCheck:
   1616          /* We expect both amodes only to mention %rbp, so this is in
   1617             fact pointless, since %rbp isn't allocatable, but anyway.. */
   1618          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
   1619          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
   1620          return;
   1621       case Ain_ProfInc:
   1622          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1623          return;
   1624       default:
   1625          ppAMD64Instr(i, mode64);
   1626          vpanic("getRegUsage_AMD64Instr");
   1627    }
   1628 }
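
        /* To make the read/write/modify distinction above concrete:
           for "addq %rax, %rbx", i.e.
           Alu64R(Aalu_ADD, RMI_Reg(%rax), %rbx), the Ain_Alu64R case
           records %rax as read and %rbx as modified, since ADD both
           reads and writes its destination.  Had the op been Aalu_MOV,
           %rbx would be recorded as written only, giving the allocator
           more freedom (and enabling the coalescing performed by
           isMove_AMD64Instr below). */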
   1629 
   1630 /* local helper */
   1631 static inline void mapReg(HRegRemap* m, HReg* r)
   1632 {
   1633    *r = lookupHRegRemap(m, *r);
   1634 }
   1635 
   1636 void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
   1637 {
   1638    vassert(mode64 == True);
   1639    switch (i->tag) {
   1640       case Ain_Imm64:
   1641          mapReg(m, &i->Ain.Imm64.dst);
   1642          return;
   1643       case Ain_Alu64R:
   1644          mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
   1645          mapReg(m, &i->Ain.Alu64R.dst);
   1646          return;
   1647       case Ain_Alu64M:
   1648          mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
   1649          mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
   1650          return;
   1651       case Ain_Sh64:
   1652          mapReg(m, &i->Ain.Sh64.dst);
   1653          return;
   1654       case Ain_Test64:
   1655          mapReg(m, &i->Ain.Test64.dst);
   1656          return;
   1657       case Ain_Unary64:
   1658          mapReg(m, &i->Ain.Unary64.dst);
   1659          return;
   1660       case Ain_Lea64:
   1661          mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
   1662          mapReg(m, &i->Ain.Lea64.dst);
   1663          return;
   1664       case Ain_Alu32R:
   1665          mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
   1666          mapReg(m, &i->Ain.Alu32R.dst);
   1667          return;
   1668       case Ain_MulL:
   1669          mapRegs_AMD64RM(m, i->Ain.MulL.src);
   1670          return;
   1671       case Ain_Div:
   1672          mapRegs_AMD64RM(m, i->Ain.Div.src);
   1673          return;
   1674       case Ain_Push:
   1675          mapRegs_AMD64RMI(m, i->Ain.Push.src);
   1676          return;
   1677       case Ain_Call:
   1678          return;
   1679       case Ain_XDirect:
   1680          mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
   1681          return;
   1682       case Ain_XIndir:
   1683          mapReg(m, &i->Ain.XIndir.dstGA);
   1684          mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
   1685          return;
   1686       case Ain_XAssisted:
   1687          mapReg(m, &i->Ain.XAssisted.dstGA);
   1688          mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
   1689          return;
   1690       case Ain_CMov64:
   1691          mapRegs_AMD64RM(m, i->Ain.CMov64.src);
   1692          mapReg(m, &i->Ain.CMov64.dst);
   1693          return;
   1694       case Ain_MovxLQ:
   1695          mapReg(m, &i->Ain.MovxLQ.src);
   1696          mapReg(m, &i->Ain.MovxLQ.dst);
   1697          return;
   1698       case Ain_LoadEX:
   1699          mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
   1700          mapReg(m, &i->Ain.LoadEX.dst);
   1701          return;
   1702       case Ain_Store:
   1703          mapReg(m, &i->Ain.Store.src);
   1704          mapRegs_AMD64AMode(m, i->Ain.Store.dst);
   1705          return;
   1706       case Ain_Set64:
   1707          mapReg(m, &i->Ain.Set64.dst);
   1708          return;
   1709       case Ain_Bsfr64:
   1710          mapReg(m, &i->Ain.Bsfr64.src);
   1711          mapReg(m, &i->Ain.Bsfr64.dst);
   1712          return;
   1713       case Ain_MFence:
   1714          return;
   1715       case Ain_ACAS:
   1716          mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
   1717          return;
   1718       case Ain_DACAS:
   1719          mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
   1720          return;
   1721       case Ain_A87Free:
   1722          return;
   1723       case Ain_A87PushPop:
   1724          mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
   1725          return;
   1726       case Ain_A87FpOp:
   1727          return;
   1728       case Ain_A87LdCW:
   1729          mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
   1730          return;
   1731       case Ain_A87StSW:
   1732          mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
   1733          return;
   1734       case Ain_LdMXCSR:
   1735          mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
   1736          return;
   1737       case Ain_SseUComIS:
   1738          mapReg(m, &i->Ain.SseUComIS.srcL);
   1739          mapReg(m, &i->Ain.SseUComIS.srcR);
   1740          mapReg(m, &i->Ain.SseUComIS.dst);
   1741          return;
   1742       case Ain_SseSI2SF:
   1743          mapReg(m, &i->Ain.SseSI2SF.src);
   1744          mapReg(m, &i->Ain.SseSI2SF.dst);
   1745          return;
   1746       case Ain_SseSF2SI:
   1747          mapReg(m, &i->Ain.SseSF2SI.src);
   1748          mapReg(m, &i->Ain.SseSF2SI.dst);
   1749          return;
   1750       case Ain_SseSDSS:
   1751          mapReg(m, &i->Ain.SseSDSS.src);
   1752          mapReg(m, &i->Ain.SseSDSS.dst);
   1753          return;
   1754       case Ain_SseLdSt:
   1755          mapReg(m, &i->Ain.SseLdSt.reg);
   1756          mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
   1757          break;
   1758       case Ain_SseLdzLO:
   1759          mapReg(m, &i->Ain.SseLdzLO.reg);
   1760          mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
   1761          break;
   1762       case Ain_Sse32Fx4:
   1763          mapReg(m, &i->Ain.Sse32Fx4.src);
   1764          mapReg(m, &i->Ain.Sse32Fx4.dst);
   1765          return;
   1766       case Ain_Sse32FLo:
   1767          mapReg(m, &i->Ain.Sse32FLo.src);
   1768          mapReg(m, &i->Ain.Sse32FLo.dst);
   1769          return;
   1770       case Ain_Sse64Fx2:
   1771          mapReg(m, &i->Ain.Sse64Fx2.src);
   1772          mapReg(m, &i->Ain.Sse64Fx2.dst);
   1773          return;
   1774       case Ain_Sse64FLo:
   1775          mapReg(m, &i->Ain.Sse64FLo.src);
   1776          mapReg(m, &i->Ain.Sse64FLo.dst);
   1777          return;
   1778       case Ain_SseReRg:
   1779          mapReg(m, &i->Ain.SseReRg.src);
   1780          mapReg(m, &i->Ain.SseReRg.dst);
   1781          return;
   1782       case Ain_SseCMov:
   1783          mapReg(m, &i->Ain.SseCMov.src);
   1784          mapReg(m, &i->Ain.SseCMov.dst);
   1785          return;
   1786       case Ain_SseShuf:
   1787          mapReg(m, &i->Ain.SseShuf.src);
   1788          mapReg(m, &i->Ain.SseShuf.dst);
   1789          return;
   1790       //uu case Ain_AvxLdSt:
   1791       //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
   1792       //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
   1793       //uu    break;
   1794       //uu case Ain_AvxReRg:
   1795       //uu    mapReg(m, &i->Ain.AvxReRg.src);
   1796       //uu    mapReg(m, &i->Ain.AvxReRg.dst);
   1797       //uu    return;
   1798       case Ain_EvCheck:
   1799          /* We expect both amodes only to mention %rbp, so this is in
   1800             fact pointless, since %rbp isn't allocatable, but anyway.. */
   1801          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
   1802          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
   1803          return;
   1804       case Ain_ProfInc:
   1805          /* hardwires r11 -- nothing to modify. */
   1806          return;
   1807       default:
   1808          ppAMD64Instr(i, mode64);
   1809          vpanic("mapRegs_AMD64Instr");
   1810    }
   1811 }
   1812 
   1813 /* Figure out if i represents a reg-reg move, and if so assign the
   1814    source and destination to *src and *dst.  If in doubt say No.  Used
   1815    by the register allocator to do move coalescing.
   1816 */
   1817 Bool isMove_AMD64Instr ( AMD64Instr* i, HReg* src, HReg* dst )
   1818 {
   1819    switch (i->tag) {
   1820       case Ain_Alu64R:
   1821          /* Moves between integer regs */
   1822          if (i->Ain.Alu64R.op != Aalu_MOV)
   1823             return False;
   1824          if (i->Ain.Alu64R.src->tag != Armi_Reg)
   1825             return False;
   1826          *src = i->Ain.Alu64R.src->Armi.Reg.reg;
   1827          *dst = i->Ain.Alu64R.dst;
   1828          return True;
   1829       case Ain_SseReRg:
   1830          /* Moves between SSE regs */
   1831          if (i->Ain.SseReRg.op != Asse_MOV)
   1832             return False;
   1833          *src = i->Ain.SseReRg.src;
   1834          *dst = i->Ain.SseReRg.dst;
   1835          return True;
   1836       //uu case Ain_AvxReRg:
   1837       //uu    /* Moves between AVX regs */
   1838       //uu    if (i->Ain.AvxReRg.op != Asse_MOV)
   1839       //uu       return False;
   1840       //uu    *src = i->Ain.AvxReRg.src;
   1841       //uu    *dst = i->Ain.AvxReRg.dst;
   1842       //uu    return True;
   1843       default:
   1844          return False;
   1845    }
   1846    /*NOTREACHED*/
   1847 }
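
        /* For instance, "movq %rax, %rbx" arrives here as
           Alu64R(Aalu_MOV, RMI_Reg(%rax), %rbx), so the first case
           returns True with *src = %rax and *dst = %rbx; if the
           allocator then assigns both registers to the same real
           register, the move disappears entirely. */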
   1848 
   1849 
   1850 /* Generate amd64 spill/reload instructions under the direction of the
   1851    register allocator.  Note it's critical these don't write the
   1852    condition codes. */
   1853 
   1854 void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1855                       HReg rreg, Int offsetB, Bool mode64 )
   1856 {
   1857    AMD64AMode* am;
   1858    vassert(offsetB >= 0);
   1859    vassert(!hregIsVirtual(rreg));
   1860    vassert(mode64 == True);
   1861    *i1 = *i2 = NULL;
   1862    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   1863    switch (hregClass(rreg)) {
   1864       case HRcInt64:
   1865          *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
   1866          return;
   1867       case HRcVec128:
   1868          *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
   1869          return;
   1870       default:
   1871          ppHRegClass(hregClass(rreg));
   1872          vpanic("genSpill_AMD64: unimplemented regclass");
   1873    }
   1874 }
   1875 
   1876 void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1877                        HReg rreg, Int offsetB, Bool mode64 )
   1878 {
   1879    AMD64AMode* am;
   1880    vassert(offsetB >= 0);
   1881    vassert(!hregIsVirtual(rreg));
   1882    vassert(mode64 == True);
   1883    *i1 = *i2 = NULL;
   1884    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   1885    switch (hregClass(rreg)) {
   1886       case HRcInt64:
   1887          *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
   1888          return;
   1889       case HRcVec128:
   1890          *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
   1891          return;
   1892       default:
   1893          ppHRegClass(hregClass(rreg));
   1894          vpanic("genReload_AMD64: unimplemented regclass");
   1895    }
   1896 }
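
        /* Example: for an Int64-class rreg and a spill slot at offset
           0x40 from the baseblock pointer, genSpill produces
           "movq %<rreg>, 0x40(%rbp)" and genReload the matching
           "movq 0x40(%rbp), %<rreg>".  Both are plain moves, which is
           what satisfies the requirement above that spill/reload code
           must not write the condition codes. */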
   1897 
   1898 
   1899 /* --------- The amd64 assembler (bleh.) --------- */
   1900 
   1901 /* Produce the low three bits of an integer register number. */
   1902 static UChar iregBits210 ( HReg r )
   1903 {
   1904    UInt n;
   1905    vassert(hregClass(r) == HRcInt64);
   1906    vassert(!hregIsVirtual(r));
   1907    n = hregNumber(r);
   1908    vassert(n <= 15);
   1909    return toUChar(n & 7);
   1910 }
   1911 
   1912 /* Produce bit 3 of an integer register number. */
   1913 static UChar iregBit3 ( HReg r )
   1914 {
   1915    UInt n;
   1916    vassert(hregClass(r) == HRcInt64);
   1917    vassert(!hregIsVirtual(r));
   1918    n = hregNumber(r);
   1919    vassert(n <= 15);
   1920    return toUChar((n >> 3) & 1);
   1921 }
   1922 
   1923 /* Produce a complete 4-bit integer register number. */
   1924 static UChar iregBits3210 ( HReg r )
   1925 {
   1926    UInt n;
   1927    vassert(hregClass(r) == HRcInt64);
   1928    vassert(!hregIsVirtual(r));
   1929    n = hregNumber(r);
   1930    vassert(n <= 15);
   1931    return toUChar(n);
   1932 }
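
        /* Example: %r13 has register number 13 = 0b1101, so
           iregBits210(%r13) == 5 (the low three bits, destined for a
           ModRM or SIB field) and iregBit3(%r13) == 1 (the high bit,
           which can only be expressed via a REX prefix). */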
   1933 
   1934 /* Given an xmm (128bit V-class) register number, produce the
   1935    equivalent numbered register in 64-bit I-class.  This is a bit
   1936    of fakery which allows functions that work on integer register
   1937    numbers to be used when assembling SSE instructions
   1938    too. */
   1939 static HReg vreg2ireg ( HReg r )
   1940 {
   1941    UInt n;
   1942    vassert(hregClass(r) == HRcVec128);
   1943    vassert(!hregIsVirtual(r));
   1944    n = hregNumber(r);
   1945    vassert(n <= 15);
   1946    return mkHReg(n, HRcInt64, False);
   1947 }
   1948 
   1949 //uu /* Ditto for ymm regs. */
   1950 //uu static HReg dvreg2ireg ( HReg r )
   1951 //uu {
   1952 //uu    UInt n;
   1953 //uu    vassert(hregClass(r) == HRcVec256);
   1954 //uu    vassert(!hregIsVirtual(r));
   1955 //uu    n = hregNumber(r);
   1956 //uu    vassert(n <= 15);
   1957 //uu    return mkHReg(n, HRcInt64, False);
   1958 //uu }
   1959 
   1960 static UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem )
   1961 {
   1962    return toUChar( ((mod & 3) << 6)
   1963                    | ((reg & 7) << 3)
   1964                    | (regmem & 7) );
   1965 }
   1966 
   1967 static UChar mkSIB ( Int shift, Int regindex, Int regbase )
   1968 {
   1969    return toUChar( ((shift & 3) << 6)
   1970                    | ((regindex & 7) << 3)
   1971                    | (regbase & 7) );
   1972 }
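
        /* Worked example: mkModRegRM(3, 0, 3), i.e. mod=11 with
           reg=%rax and rm=%rbx, yields 0xC3; with a REX.W prefix and
           opcode 0x01 that is exactly 48 01 C3, "addq %rax, %rbx".
           Likewise mkSIB(2, 1, 0) yields 0x88, the SIB byte for the
           amode (%rax,%rcx,4). */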
   1973 
   1974 static UChar* emit32 ( UChar* p, UInt w32 )
   1975 {
   1976    *p++ = toUChar((w32)       & 0x000000FF);
   1977    *p++ = toUChar((w32 >>  8) & 0x000000FF);
   1978    *p++ = toUChar((w32 >> 16) & 0x000000FF);
   1979    *p++ = toUChar((w32 >> 24) & 0x000000FF);
   1980    return p;
   1981 }
   1982 
   1983 static UChar* emit64 ( UChar* p, ULong w64 )
   1984 {
   1985    p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
   1986    p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
   1987    return p;
   1988 }
   1989 
   1990 /* Does a sign-extend of the lowest 8 bits give
   1991    the original number? */
   1992 static Bool fits8bits ( UInt w32 )
   1993 {
   1994    Int i32 = (Int)w32;
   1995    return toBool(i32 == ((i32 << 24) >> 24));
   1996 }

   1997 /* Can the lower 32 bits be signedly widened to produce the whole
   1998    64-bit value?  In other words, are the top 33 bits either all 0 or
   1999    all 1 ? */
   2000 static Bool fitsIn32Bits ( ULong x )
   2001 {
   2002    Long y0 = (Long)x;
   2003    Long y1 = y0;
   2004    y1 <<= 32;
   2005    y1 >>=/*s*/ 32;
   2006    return toBool(x == y1);
   2007 }
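
        /* Examples: fits8bits(0x7F) is True, but fits8bits(0x80) is
           False, since sign-extending 0x80 from 8 bits gives
           0xFFFFFF80.  Similarly fitsIn32Bits(0xFFFFFFFF80000000ULL)
           is True (top 33 bits all 1), whereas
           fitsIn32Bits(0x80000000ULL) is False (bit 31 is set but
           bits 63:32 are clear, so the top 33 bits disagree). */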
   2008 
   2009 
   2010 /* Forming mod-reg-rm bytes and scale-index-base bytes.
   2011 
   2012      greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
   2013                        =  00 greg ereg
   2014 
   2015      greg,  d8(ereg)   |  ereg is neither of: RSP R12
   2016                        =  01 greg ereg, d8
   2017 
   2018      greg,  d32(ereg)  |  ereg is neither of: RSP R12
   2019                        =  10 greg ereg, d32
   2020 
   2021      greg,  d8(ereg)   |  ereg is either: RSP R12
   2022                        =  01 greg 100, 0x24, d8
   2023                        (lowest bit of rex distinguishes R12/RSP)
   2024 
   2025      greg,  d32(ereg)  |  ereg is either: RSP R12
   2026                        =  10 greg 100, 0x24, d32
   2027                        (lowest bit of rex distinguishes R12/RSP)
   2028 
   2029      -----------------------------------------------
   2030 
   2031      greg,  d8(base,index,scale)
   2032                |  index != RSP
   2033                =  01 greg 100, scale index base, d8
   2034 
   2035      greg,  d32(base,index,scale)
   2036                |  index != RSP
   2037                =  10 greg 100, scale index base, d32
   2038 */
   2039 static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
   2040 {
   2041    if (am->tag == Aam_IR) {
   2042       if (am->Aam.IR.imm == 0
   2043           && am->Aam.IR.reg != hregAMD64_RSP()
   2044           && am->Aam.IR.reg != hregAMD64_RBP()
   2045           && am->Aam.IR.reg != hregAMD64_R12()
   2046           && am->Aam.IR.reg != hregAMD64_R13()
   2047          ) {
   2048          *p++ = mkModRegRM(0, iregBits210(greg),
   2049                               iregBits210(am->Aam.IR.reg));
   2050          return p;
   2051       }
   2052       if (fits8bits(am->Aam.IR.imm)
   2053           && am->Aam.IR.reg != hregAMD64_RSP()
   2054           && am->Aam.IR.reg != hregAMD64_R12()
   2055          ) {
   2056          *p++ = mkModRegRM(1, iregBits210(greg),
   2057                               iregBits210(am->Aam.IR.reg));
   2058          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
   2059          return p;
   2060       }
   2061       if (am->Aam.IR.reg != hregAMD64_RSP()
   2062           && am->Aam.IR.reg != hregAMD64_R12()
   2063          ) {
   2064          *p++ = mkModRegRM(2, iregBits210(greg),
   2065                               iregBits210(am->Aam.IR.reg));
   2066          p = emit32(p, am->Aam.IR.imm);
   2067          return p;
   2068       }
   2069       if ((am->Aam.IR.reg == hregAMD64_RSP()
   2070            || am->Aam.IR.reg == hregAMD64_R12())
   2071           && fits8bits(am->Aam.IR.imm)) {
   2072          *p++ = mkModRegRM(1, iregBits210(greg), 4);
   2073          *p++ = 0x24;
   2074          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
   2075          return p;
   2076       }
   2077       if (/* am->Aam.IR.reg == hregAMD64_RSP() ||  -- the RSP
   2078              case is disabled, awaiting a test case */
   2079           am->Aam.IR.reg == hregAMD64_R12()) {
   2080          *p++ = mkModRegRM(2, iregBits210(greg), 4);
   2081          *p++ = 0x24;
   2082          p = emit32(p, am->Aam.IR.imm);
   2083          return p;
   2084       }
   2085       ppAMD64AMode(am);
   2086       vpanic("doAMode_M: can't emit amode IR");
   2087       /*NOTREACHED*/
   2088    }
   2089    if (am->tag == Aam_IRRS) {
   2090       if (fits8bits(am->Aam.IRRS.imm)
   2091           && am->Aam.IRRS.index != hregAMD64_RSP()) {
   2092          *p++ = mkModRegRM(1, iregBits210(greg), 4);
   2093          *p++ = mkSIB(am->Aam.IRRS.shift, am->Aam.IRRS.index,
   2094                                           am->Aam.IRRS.base);
   2095          *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
   2096          return p;
   2097       }
   2098       if (am->Aam.IRRS.index != hregAMD64_RSP()) {
   2099          *p++ = mkModRegRM(2, iregBits210(greg), 4);
   2100          *p++ = mkSIB(am->Aam.IRRS.shift, am->Aam.IRRS.index,
   2101                                           am->Aam.IRRS.base);
   2102          p = emit32(p, am->Aam.IRRS.imm);
   2103          return p;
   2104       }
   2105       ppAMD64AMode(am);
   2106       vpanic("doAMode_M: can't emit amode IRRS");
   2107       /*NOTREACHED*/
   2108    }
   2109    vpanic("doAMode_M: unknown amode");
   2110    /*NOTREACHED*/
   2111 }
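
        /* To make the table above concrete: "movq %rax, 8(%rdi)" has
           greg=%rax and the Aam_IR amode 8(%rdi); the displacement
           fits in 8 bits and %rdi is none of RSP/R12, so the
           "01 greg ereg, d8" form applies, giving ModRM = 0x47 and
           d8 = 0x08 -- with its REX.W prefix and opcode, 48 89 47 08. */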
   2112 
   2113 
   2114 /* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
   2115 static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
   2116 {
   2117    *p++ = mkModRegRM(3, iregBits210(greg), iregBits210(ereg));
   2118    return p;
   2119 }
   2120 
   2121 
   2122 /* Clear the W bit on a REX byte, thereby changing the operand size
   2123    back to whatever that instruction's default operand size is. */
   2124 static inline UChar clearWBit ( UChar rex )
   2125 {
   2126    return toUChar(rex & ~(1<<3));
   2127 }
   2128 
   2129 
   2130 /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
   2131 static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
   2132 {
   2133    if (am->tag == Aam_IR) {
   2134       UChar W = 1;  /* we want 64-bit mode */
   2135       UChar R = iregBit3(greg);
   2136       UChar X = 0; /* not relevant */
   2137       UChar B = iregBit3(am->Aam.IR.reg);
   2138       return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
   2139    }
   2140    if (am->tag == Aam_IRRS) {
   2141       UChar W = 1;  /* we want 64-bit mode */
   2142       UChar R = iregBit3(greg);
   2143       UChar X = iregBit3(am->Aam.IRRS.index);
   2144       UChar B = iregBit3(am->Aam.IRRS.base);
   2145       return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
   2146    }
   2147    vassert(0);
   2148    return 0; /*NOTREACHED*/
   2149 }
   2150 
   2151 /* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
   2152 static UChar rexAMode_R ( HReg greg, HReg ereg )
   2153 {
   2154    UChar W = 1;  /* we want 64-bit mode */
   2155    UChar R = iregBit3(greg);
   2156    UChar X = 0; /* not relevant */
   2157    UChar B = iregBit3(ereg);
   2158    return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
   2159 }
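
        /* Example: rexAMode_R(%rax, %r12) == 0x49, i.e. W=1 (64-bit
           operand size), R=0 (%rax needs no extension), X=0, B=1
           (bit 3 of %r12's register number).  Combined with doAMode_R
           this encodes "movq %rax, %r12" as 49 89 C4. */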
   2160 
   2161 
   2162 //uu /* May 2012: this VEX prefix stuff is currently unused, but has
   2163 //uu    verified correct (I reckon).  Certainly it has been known to
   2164 //uu    produce correct VEX prefixes during testing. */
   2165 //uu
   2166 //uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
   2167 //uu    notVvvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
   2168 //uu    in verbatim.  There's no range checking on the bits. */
   2169 //uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
   2170 //uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
   2171 //uu                             UInt L, UInt pp )
   2172 //uu {
   2173 //uu    UChar byte0 = 0;
   2174 //uu    UChar byte1 = 0;
   2175 //uu    UChar byte2 = 0;
   2176 //uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
   2177 //uu       /* 2 byte encoding is possible. */
   2178 //uu       byte0 = 0xC5;
   2179 //uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
   2180 //uu               | (L << 2) | pp;
   2181 //uu    } else {
   2182 //uu       /* 3 byte encoding is needed. */
   2183 //uu       byte0 = 0xC4;
   2184 //uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
   2185 //uu               | ((rexB ^ 1) << 5) | mmmmm;
   2186 //uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
   2187 //uu    }
   2188 //uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
   2189 //uu }
   2190 //uu
   2191 //uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
   2192 //uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
   2193 //uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
   2194 //uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
   2195 //uu    vvvv=1111 (unused 3rd reg). */
   2196 //uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
   2197 //uu {
   2198 //uu    UChar L       = 1; /* size = 256 */
   2199 //uu    UChar pp      = 0; /* no SIMD prefix */
   2200 //uu    UChar mmmmm   = 1; /* 0F */
   2201 //uu    UChar notVvvv = 0; /* unused */
   2202 //uu    UChar rexW    = 0;
   2203 //uu    UChar rexR    = 0;
   2204 //uu    UChar rexX    = 0;
   2205 //uu    UChar rexB    = 0;
   2206 //uu    /* Same logic as in rexAMode_M. */
   2207 //uu    if (am->tag == Aam_IR) {
   2208 //uu       rexR = iregBit3(greg);
   2209 //uu       rexX = 0; /* not relevant */
   2210 //uu       rexB = iregBit3(am->Aam.IR.reg);
   2211 //uu    }
   2212 //uu    else if (am->tag == Aam_IRRS) {
   2213 //uu       rexR = iregBit3(greg);
   2214 //uu       rexX = iregBit3(am->Aam.IRRS.index);
   2215 //uu       rexB = iregBit3(am->Aam.IRRS.base);
   2216 //uu    } else {
   2217 //uu       vassert(0);
   2218 //uu    }
   2219 //uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
   2220 //uu }
   2221 //uu
   2222 //uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
   2223 //uu {
   2224 //uu    switch (vex & 0xFF) {
   2225 //uu       case 0xC5:
   2226 //uu          *p++ = 0xC5;
   2227 //uu          *p++ = (vex >> 8) & 0xFF;
   2228 //uu          vassert(0 == (vex >> 16));
   2229 //uu          break;
   2230 //uu       case 0xC4:
   2231 //uu          *p++ = 0xC4;
   2232 //uu          *p++ = (vex >> 8) & 0xFF;
   2233 //uu          *p++ = (vex >> 16) & 0xFF;
   2234 //uu          vassert(0 == (vex >> 24));
   2235 //uu          break;
   2236 //uu       default:
   2237 //uu          vassert(0);
   2238 //uu    }
   2239 //uu    return p;
   2240 //uu }
   2241 
   2242 
   2243 /* Emit ffree %st(N) */
   2244 static UChar* do_ffree_st ( UChar* p, Int n )
   2245 {
   2246    vassert(n >= 0 && n <= 7);
   2247    *p++ = 0xDD;
   2248    *p++ = toUChar(0xC0 + n);
   2249    return p;
   2250 }
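        /* E.g. do_ffree_st(p, 3) emits DD C3, "ffree %st(3)". */
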
   2251 
   2252 /* Emit an instruction into buf and return the number of bytes used.
   2253    Note that buf is not the insn's final place, and therefore it is
   2254    imperative to emit position-independent code.  If the emitted
   2255    instruction was a profiler inc, set *is_profInc to True, else
   2256    leave it unchanged. */
   2257 
   2258 Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
   2259                       UChar* buf, Int nbuf, AMD64Instr* i,
   2260                       Bool mode64,
   2261                       void* disp_cp_chain_me_to_slowEP,
   2262                       void* disp_cp_chain_me_to_fastEP,
   2263                       void* disp_cp_xindir,
   2264                       void* disp_cp_xassisted )
   2265 {
   2266    UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
   2267    UInt   xtra;
   2268    UInt   reg;
   2269    UChar  rex;
   2270    UChar* p = &buf[0];
   2271    UChar* ptmp;
   2272    Int    j;
   2273    vassert(nbuf >= 32);
   2274    vassert(mode64 == True);
   2275 
   2276    /* Wrap an integer as an int register, for use assembling
   2277       GrpN insns, in which the greg field is used as a sub-opcode
   2278       and does not really contain a register. */
   2279 #  define fake(_n) mkHReg((_n), HRcInt64, False)
   2280 
   2281    /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
   2282 
   2283    switch (i->tag) {
   2284 
   2285    case Ain_Imm64:
   2286       if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
   2287          /* Use the short form (load into 32 bit reg, + default
   2288             widening rule) for constants under 1 million.  We could
   2289             use this form for the range 0 to 0x7FFFFFFF inclusive, but
   2290             limit it to a smaller range for verifiability purposes. */
   2291          if (1 & iregBit3(i->Ain.Imm64.dst))
   2292             *p++ = 0x41;
   2293          *p++ = 0xB8 + iregBits210(i->Ain.Imm64.dst);
   2294          p = emit32(p, (UInt)i->Ain.Imm64.imm64);
   2295       } else {
   2296          *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Imm64.dst)));
   2297          *p++ = toUChar(0xB8 + iregBits210(i->Ain.Imm64.dst));
   2298          p = emit64(p, i->Ain.Imm64.imm64);
   2299       }
   2300       goto done;
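      /* E.g. Imm64($0x1234, %rcx) takes the short form
         B9 34 12 00 00 ("movl $0x1234, %ecx" -- the 32-bit write
         zeroes bits 63:32), whereas Imm64($0x123456789A, %rcx) needs
         the long form 48 B9 9A 78 56 34 12 00 00 00 ("movabsq"). */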
   2301 
   2302    case Ain_Alu64R:
   2303       /* Deal specially with MOV */
   2304       if (i->Ain.Alu64R.op == Aalu_MOV) {
   2305          switch (i->Ain.Alu64R.src->tag) {
   2306             case Armi_Imm:
   2307                if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
   2308                   /* Actually we could use this form for constants in
   2309                      the range 0 through 0x7FFFFFFF inclusive, but
   2310                      limit it to a small range for verifiability
   2311                      purposes. */
   2312                   /* Generate "movl $imm32, 32-bit-register" and let
   2313                      the default zero-extend rule cause the upper half
   2314                      of the dst to be zeroed out too.  This saves 1
   2315                      and sometimes 2 bytes compared to the more
   2316                      obvious encoding in the 'else' branch. */
   2317                   if (1 & iregBit3(i->Ain.Alu64R.dst))
   2318                      *p++ = 0x41;
   2319                   *p++ = 0xB8 + iregBits210(i->Ain.Alu64R.dst);
   2320                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2321                } else {
   2322                   *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Alu64R.dst)));
   2323                   *p++ = 0xC7;
   2324                   *p++ = toUChar(0xC0 + iregBits210(i->Ain.Alu64R.dst));
   2325                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2326                }
   2327                goto done;
   2328             case Armi_Reg:
   2329                *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
   2330                                   i->Ain.Alu64R.dst );
   2331                *p++ = 0x89;
   2332                p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
   2333                                 i->Ain.Alu64R.dst);
   2334                goto done;
   2335             case Armi_Mem:
   2336                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
   2337                                  i->Ain.Alu64R.src->Armi.Mem.am);
   2338                *p++ = 0x8B;
   2339                p = doAMode_M(p, i->Ain.Alu64R.dst,
   2340                                 i->Ain.Alu64R.src->Armi.Mem.am);
   2341                goto done;
   2342             default:
   2343                goto bad;
   2344          }
   2345       }
   2346       /* MUL */
   2347       if (i->Ain.Alu64R.op == Aalu_MUL) {
   2348          switch (i->Ain.Alu64R.src->tag) {
   2349             case Armi_Reg:
   2350                *p++ = rexAMode_R( i->Ain.Alu64R.dst,
   2351                                   i->Ain.Alu64R.src->Armi.Reg.reg);
   2352                *p++ = 0x0F;
   2353                *p++ = 0xAF;
   2354                p = doAMode_R(p, i->Ain.Alu64R.dst,
   2355                                 i->Ain.Alu64R.src->Armi.Reg.reg);
   2356                goto done;
   2357             case Armi_Mem:
   2358                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
   2359                                  i->Ain.Alu64R.src->Armi.Mem.am);
   2360                *p++ = 0x0F;
   2361                *p++ = 0xAF;
   2362                p = doAMode_M(p, i->Ain.Alu64R.dst,
   2363                                 i->Ain.Alu64R.src->Armi.Mem.am);
   2364                goto done;
   2365             case Armi_Imm:
   2366                if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2367                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2368                   *p++ = 0x6B;
   2369                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2370                   *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
   2371                } else {
   2372                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2373                   *p++ = 0x69;
   2374                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2375                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2376                }
   2377                goto done;
   2378             default:
   2379                goto bad;
   2380          }
   2381       }
   2382       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
   2383       opc = opc_rr = subopc_imm = opc_imma = 0;
   2384       switch (i->Ain.Alu64R.op) {
   2385          case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
   2386                         subopc_imm = 2; opc_imma = 0x15; break;
   2387          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
   2388                         subopc_imm = 0; opc_imma = 0x05; break;
   2389          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
   2390                         subopc_imm = 5; opc_imma = 0x2D; break;
   2391          case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
   2392                         subopc_imm = 3; opc_imma = 0x1D; break;
   2393          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
   2394                         subopc_imm = 4; opc_imma = 0x25; break;
   2395          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
   2396                         subopc_imm = 6; opc_imma = 0x35; break;
   2397          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
   2398                         subopc_imm = 1; opc_imma = 0x0D; break;
   2399          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
   2400                         subopc_imm = 7; opc_imma = 0x3D; break;
   2401          default: goto bad;
   2402       }
   2403       switch (i->Ain.Alu64R.src->tag) {
   2404          case Armi_Imm:
   2405             if (i->Ain.Alu64R.dst == hregAMD64_RAX()
   2406                 && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2407                goto bad; /* FIXME: awaiting test case */
   2408                *p++ = toUChar(opc_imma);
   2409                p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2410             } else
   2411             if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2412                *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst );
   2413                *p++ = 0x83;
   2414                p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
   2415                *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
   2416             } else {
   2417                *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst);
   2418                *p++ = 0x81;
   2419                p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
   2420                p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2421             }
   2422             goto done;
   2423          case Armi_Reg:
   2424             *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
   2425                                i->Ain.Alu64R.dst);
   2426             *p++ = toUChar(opc_rr);
   2427             p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
   2428                              i->Ain.Alu64R.dst);
   2429             goto done;
   2430          case Armi_Mem:
   2431             *p++ = rexAMode_M( i->Ain.Alu64R.dst,
   2432                                i->Ain.Alu64R.src->Armi.Mem.am);
   2433             *p++ = toUChar(opc);
   2434             p = doAMode_M(p, i->Ain.Alu64R.dst,
   2435                              i->Ain.Alu64R.src->Armi.Mem.am);
   2436             goto done;
   2437          default:
   2438             goto bad;
   2439       }
   2440       break;
   2441 
   2442    case Ain_Alu64M:
   2443       /* Deal specially with MOV */
   2444       if (i->Ain.Alu64M.op == Aalu_MOV) {
   2445          switch (i->Ain.Alu64M.src->tag) {
   2446             case Ari_Reg:
   2447                *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
   2448                                  i->Ain.Alu64M.dst);
   2449                *p++ = 0x89;
   2450                p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
   2451                                 i->Ain.Alu64M.dst);
   2452                goto done;
   2453             case Ari_Imm:
   2454                *p++ = rexAMode_M(fake(0), i->Ain.Alu64M.dst);
   2455                *p++ = 0xC7;
   2456                p = doAMode_M(p, fake(0), i->Ain.Alu64M.dst);
   2457                p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
   2458                goto done;
   2459             default:
   2460                goto bad;
   2461          }
   2462       }
   2463       break;
   2464 
   2465    case Ain_Sh64:
   2466       opc_cl = opc_imm = subopc = 0;
   2467       switch (i->Ain.Sh64.op) {
   2468          case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
   2469          case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
   2470          case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
   2471          default: goto bad;
   2472       }
   2473       if (i->Ain.Sh64.src == 0) {
   2474          *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
   2475          *p++ = toUChar(opc_cl);
   2476          p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
   2477          goto done;
   2478       } else {
   2479          *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
   2480          *p++ = toUChar(opc_imm);
   2481          p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
   2482          *p++ = (UChar)(i->Ain.Sh64.src);
   2483          goto done;
   2484       }
   2485       break;
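      /* E.g. Sh64(Ash_SHL, 3, %rdx) emits 48 C1 E2 03,
         "shlq $3, %rdx": opcode C1 with sub-opcode 4 smuggled through
         the reg field via fake(4), per the fake() comment above.  A
         src of 0 selects the %cl-counted D3 form instead. */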
   2486 
   2487    case Ain_Test64:
   2488       /* testq sign-extend($imm32), %reg */
   2489       *p++ = rexAMode_R(fake(0), i->Ain.Test64.dst);
   2490       *p++ = 0xF7;
   2491       p = doAMode_R(p, fake(0), i->Ain.Test64.dst);
   2492       p = emit32(p, i->Ain.Test64.imm32);
   2493       goto done;
   2494 
   2495    case Ain_Unary64:
   2496       if (i->Ain.Unary64.op == Aun_NOT) {
   2497          *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
   2498          *p++ = 0xF7;
   2499          p = doAMode_R(p, fake(2), i->Ain.Unary64.dst);
   2500          goto done;
   2501       }
   2502       if (i->Ain.Unary64.op == Aun_NEG) {
   2503          *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
   2504          *p++ = 0xF7;
   2505          p = doAMode_R(p, fake(3), i->Ain.Unary64.dst);
   2506          goto done;
   2507       }
   2508       break;
   2509 
   2510    case Ain_Lea64:
   2511       *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
   2512       *p++ = 0x8D;
   2513       p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
   2514       goto done;
   2515 
   2516    case Ain_Alu32R:
   2517       /* ADD/SUB/AND/OR/XOR/CMP */
   2518       opc = opc_rr = subopc_imm = opc_imma = 0;
   2519       switch (i->Ain.Alu32R.op) {
   2520          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
   2521                         subopc_imm = 0; opc_imma = 0x05; break;
   2522          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
   2523                         subopc_imm = 5; opc_imma = 0x2D; break;
   2524          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
   2525                         subopc_imm = 4; opc_imma = 0x25; break;
   2526          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
   2527                         subopc_imm = 6; opc_imma = 0x35; break;
   2528          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
   2529                         subopc_imm = 1; opc_imma = 0x0D; break;
   2530          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
   2531                         subopc_imm = 7; opc_imma = 0x3D; break;
   2532          default: goto bad;
   2533       }
   2534       switch (i->Ain.Alu32R.src->tag) {
   2535          case Armi_Imm:
   2536             if (i->Ain.Alu32R.dst == hregAMD64_RAX()
   2537                 && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
   2538                goto bad; /* FIXME: awaiting test case */
   2539                *p++ = toUChar(opc_imma);
   2540                p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
   2541             } else
   2542             if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
   2543                rex  = clearWBit( rexAMode_R( fake(0), i->Ain.Alu32R.dst ) );
   2544                if (rex != 0x40) *p++ = rex;
   2545                *p++ = 0x83;
   2546                p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu32R.dst);
   2547                *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
   2548             } else {
   2549                rex  = clearWBit( rexAMode_R( fake(0), i->Ain.Alu32R.dst) );
   2550                if (rex != 0x40) *p++ = rex;
   2551                *p++ = 0x81;
   2552                p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu32R.dst);
   2553                p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
   2554             }
   2555             goto done;
   2556          case Armi_Reg:
   2557             rex  = clearWBit(
   2558                    rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
   2559                                i->Ain.Alu32R.dst) );
   2560             if (rex != 0x40) *p++ = rex;
   2561             *p++ = toUChar(opc_rr);
   2562             p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
   2563                              i->Ain.Alu32R.dst);
   2564             goto done;
   2565          case Armi_Mem:
   2566             rex  = clearWBit(
   2567                    rexAMode_M( i->Ain.Alu32R.dst,
   2568                                i->Ain.Alu32R.src->Armi.Mem.am) );
   2569             if (rex != 0x40) *p++ = rex;
   2570             *p++ = toUChar(opc);
   2571             p = doAMode_M(p, i->Ain.Alu32R.dst,
   2572                              i->Ain.Alu32R.src->Armi.Mem.am);
   2573             goto done;
   2574          default:
   2575             goto bad;
   2576       }
   2577       break;
   2578 
   2579    case Ain_MulL:
   2580       subopc = i->Ain.MulL.syned ? 5 : 4;
   2581       switch (i->Ain.MulL.src->tag)  {
   2582          case Arm_Mem:
   2583             *p++ = rexAMode_M( fake(0),
   2584                                i->Ain.MulL.src->Arm.Mem.am);
   2585             *p++ = 0xF7;
   2586             p = doAMode_M(p, fake(subopc),
   2587                              i->Ain.MulL.src->Arm.Mem.am);
   2588             goto done;
   2589          case Arm_Reg:
   2590             *p++ = rexAMode_R(fake(0),
   2591                               i->Ain.MulL.src->Arm.Reg.reg);
   2592             *p++ = 0xF7;
   2593             p = doAMode_R(p, fake(subopc),
   2594                              i->Ain.MulL.src->Arm.Reg.reg);
   2595             goto done;
   2596          default:
   2597             goto bad;
   2598       }
   2599       break;
   2600 
   2601    case Ain_Div:
   2602       subopc = i->Ain.Div.syned ? 7 : 6;
   2603       if (i->Ain.Div.sz == 4) {
   2604          switch (i->Ain.Div.src->tag)  {
   2605             case Arm_Mem:
   2606                goto bad;
   2607                /*FIXME*/
   2608                *p++ = 0xF7;
   2609                p = doAMode_M(p, fake(subopc),
   2610                                 i->Ain.Div.src->Arm.Mem.am);
   2611                goto done;
   2612             case Arm_Reg:
   2613                *p++ = clearWBit(
   2614                       rexAMode_R( fake(0), i->Ain.Div.src->Arm.Reg.reg));
   2615                *p++ = 0xF7;
   2616                p = doAMode_R(p, fake(subopc),
   2617                                 i->Ain.Div.src->Arm.Reg.reg);
   2618                goto done;
   2619             default:
   2620                goto bad;
   2621          }
   2622       }
   2623       if (i->Ain.Div.sz == 8) {
   2624          switch (i->Ain.Div.src->tag)  {
   2625             case Arm_Mem:
   2626                *p++ = rexAMode_M( fake(0),
   2627                                   i->Ain.Div.src->Arm.Mem.am);
   2628                *p++ = 0xF7;
   2629                p = doAMode_M(p, fake(subopc),
   2630                                 i->Ain.Div.src->Arm.Mem.am);
   2631                goto done;
   2632             case Arm_Reg:
   2633                *p++ = rexAMode_R( fake(0),
   2634                                   i->Ain.Div.src->Arm.Reg.reg);
   2635                *p++ = 0xF7;
   2636                p = doAMode_R(p, fake(subopc),
   2637                                 i->Ain.Div.src->Arm.Reg.reg);
   2638                goto done;
   2639             default:
   2640                goto bad;
   2641          }
   2642       }
   2643       break;
   2644 
   2645    case Ain_Push:
   2646       switch (i->Ain.Push.src->tag) {
   2647          case Armi_Mem:
   2648             *p++ = clearWBit(
   2649                    rexAMode_M(fake(0), i->Ain.Push.src->Armi.Mem.am));
   2650             *p++ = 0xFF;
   2651             p = doAMode_M(p, fake(6), i->Ain.Push.src->Armi.Mem.am);
   2652             goto done;
   2653          case Armi_Imm:
   2654             *p++ = 0x68;
   2655             p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
   2656             goto done;
   2657          case Armi_Reg:
   2658             *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.Push.src->Armi.Reg.reg)));
   2659             *p++ = toUChar(0x50 + iregBits210(i->Ain.Push.src->Armi.Reg.reg));
   2660             goto done;
   2661          default:
   2662             goto bad;
   2663       }
   2664 
   2665    case Ain_Call: {
   2666       /* As per detailed comment for Ain_Call in
   2667          getRegUsage_AMD64Instr above, %r11 is used as an address
   2668          temporary. */
   2669       /* jump over the following two insns if the condition does not
   2670          hold */
   2671       Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
   2672       if (i->Ain.Call.cond != Acc_ALWAYS) {
   2673          *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
   2674          *p++ = shortImm ? 10 : 13;
   2675          /* 10 or 13 bytes in the next two insns */
   2676       }
   2677       if (shortImm) {
   2678          /* 7 bytes: movl sign-extend(imm32), %r11 */
   2679          *p++ = 0x49;
   2680          *p++ = 0xC7;
   2681          *p++ = 0xC3;
   2682          p = emit32(p, (UInt)i->Ain.Call.target);
   2683       } else {
   2684          /* 10 bytes: movabsq $target, %r11 */
   2685          *p++ = 0x49;
   2686          *p++ = 0xBB;
   2687          p = emit64(p, i->Ain.Call.target);
   2688       }
   2689       /* 3 bytes: call *%r11 */
   2690       *p++ = 0x41;
   2691       *p++ = 0xFF;
   2692       *p++ = 0xD3;
   2693       goto done;
   2694    }
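   /* A worked example: for cond == Acc_Z and a target that fits in 32
      bits, the sequence is
         75 0A              jnz  fwds by 10 bytes (hop over both insns)
         49 C7 C3 <imm32>   movq $sign-extend(target), %r11
         41 FF D3           call *%r11
      so the call is performed only when the condition holds. */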
   2695 
   2696    case Ain_XDirect: {
   2697       /* NB: what goes on here has to be very closely coordinated with the
   2698          chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
   2699       /* We're generating chain-me requests here, so we need to be
   2700          sure this is actually allowed -- no-redir translations can't
   2701          use chain-me's.  Hence: */
   2702       vassert(disp_cp_chain_me_to_slowEP != NULL);
   2703       vassert(disp_cp_chain_me_to_fastEP != NULL);
   2704 
   2705       HReg r11 = hregAMD64_R11();
   2706 
   2707       /* Use ptmp for backpatching conditional jumps. */
   2708       ptmp = NULL;
   2709 
   2710       /* First off, if this is conditional, create a conditional
   2711          jump over the rest of it. */
   2712       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
   2713          /* jmp fwds if !condition */
   2714          *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
   2715          ptmp = p; /* fill in this bit later */
   2716          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2717       }
   2718 
   2719       /* Update the guest RIP. */
   2720       if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
   2721          /* use a shorter encoding */
   2722          /* movl sign-extend(dstGA), %r11 */
   2723          *p++ = 0x49;
   2724          *p++ = 0xC7;
   2725          *p++ = 0xC3;
   2726          p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
   2727       } else {
   2728          /* movabsq $dstGA, %r11 */
   2729          *p++ = 0x49;
   2730          *p++ = 0xBB;
   2731          p = emit64(p, i->Ain.XDirect.dstGA);
   2732       }
   2733 
   2734       /* movq %r11, amRIP */
   2735       *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
   2736       *p++ = 0x89;
   2737       p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
   2738 
   2739       /* --- FIRST PATCHABLE BYTE follows --- */
   2740       /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
   2741          to) backs up the return address, so as to find the address of
   2742          the first patchable byte.  So: don't change the length of the
   2743          two instructions below. */
   2744       /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
   2745       *p++ = 0x49;
   2746       *p++ = 0xBB;
   2747       void* disp_cp_chain_me
   2748                = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
   2749                                          : disp_cp_chain_me_to_slowEP;
   2750       p = emit64(p, Ptr_to_ULong(disp_cp_chain_me));
   2751       /* call *%r11 */
   2752       *p++ = 0x41;
   2753       *p++ = 0xFF;
   2754       *p++ = 0xD3;
   2755       /* --- END of PATCHABLE BYTES --- */
   2756 
   2757       /* Fix up the conditional jump, if there was one. */
   2758       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
   2759          Int delta = p - ptmp;
   2760          vassert(delta > 0 && delta < 40);
   2761          *ptmp = toUChar(delta-1);
   2762       }
   2763       goto done;
   2764    }
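   /* Sizing note: with a 32-bit dstGA and an amRIP of the typical
      disp8(%rbp) form, the guarded sequence above is 7 + 4 + 10 + 3 ==
      24 bytes.  delta then comes out as 25, because ptmp points at the
      displacement byte itself; the "delta-1" therefore stores 24, the
      number of bytes the jcc has to skip. */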
   2765 
   2766    case Ain_XIndir: {
   2767       /* We're generating transfers that could lead indirectly to a
   2768          chain-me, so we need to be sure this is actually allowed --
   2769          no-redir translations are not allowed to reach normal
   2770          translations without going through the scheduler.  That means
   2771          no XDirects or XIndirs out from no-redir translations.
   2772          Hence: */
   2773       vassert(disp_cp_xindir != NULL);
   2774 
   2775       /* Use ptmp for backpatching conditional jumps. */
   2776       ptmp = NULL;
   2777 
   2778       /* First off, if this is conditional, create a conditional
   2779          jump over the rest of it. */
   2780       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
   2781          /* jmp fwds if !condition */
   2782          *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
   2783          ptmp = p; /* fill in this bit later */
   2784          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2785       }
   2786 
   2787       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
   2788       *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
   2789       *p++ = 0x89;
   2790       p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
   2791 
   2792       /* get $disp_cp_xindir into %r11 */
   2793       if (fitsIn32Bits(Ptr_to_ULong(disp_cp_xindir))) {
   2794          /* use a shorter encoding */
   2795          /* movl sign-extend(disp_cp_xindir), %r11 */
   2796          *p++ = 0x49;
   2797          *p++ = 0xC7;
   2798          *p++ = 0xC3;
   2799          p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xindir));
   2800       } else {
   2801          /* movabsq $disp_cp_xindir, %r11 */
   2802          *p++ = 0x49;
   2803          *p++ = 0xBB;
   2804          p = emit64(p, Ptr_to_ULong(disp_cp_xindir));
   2805       }
   2806 
   2807       /* jmp *%r11 */
   2808       *p++ = 0x41;
   2809       *p++ = 0xFF;
   2810       *p++ = 0xE3;
   2811 
   2812       /* Fix up the conditional jump, if there was one. */
   2813       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
   2814          Int delta = p - ptmp;
   2815          vassert(delta > 0 && delta < 40);
   2816          *ptmp = toUChar(delta-1);
   2817       }
   2818       goto done;
   2819    }
   2820 
   2821    case Ain_XAssisted: {
   2822       /* Use ptmp for backpatching conditional jumps. */
   2823       ptmp = NULL;
   2824 
   2825       /* First off, if this is conditional, create a conditional
   2826          jump over the rest of it. */
   2827       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
   2828          /* jmp fwds if !condition */
   2829          *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
   2830          ptmp = p; /* fill in this bit later */
   2831          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2832       }
   2833 
   2834       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
   2835       *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
   2836       *p++ = 0x89;
   2837       p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
   2838       /* movl $magic_number, %ebp.  Since these numbers are all small positive
   2839          integers, we can get away with "movl $N, %ebp" rather than
   2840          the longer "movq $N, %rbp". */
   2841       UInt trcval = 0;
   2842       switch (i->Ain.XAssisted.jk) {
   2843          case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
   2844          case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
   2845          case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
   2846          case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
   2847          case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
   2848          case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
   2849          case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
   2850          case Ijk_TInval:      trcval = VEX_TRC_JMP_TINVAL;      break;
   2851          case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
   2852          case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
   2853          case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
   2854          case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
   2855          /* We don't expect to see the following being assisted. */
   2856          case Ijk_Ret:
   2857          case Ijk_Call:
   2858          /* fallthrough */
   2859          default:
   2860             ppIRJumpKind(i->Ain.XAssisted.jk);
   2861             vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
   2862       }
   2863       vassert(trcval != 0);
   2864       *p++ = 0xBD;
   2865       p = emit32(p, trcval);
    2866       /* movabsq $disp_cp_xassisted, %r11 */
   2867       *p++ = 0x49;
   2868       *p++ = 0xBB;
   2869       p = emit64(p, Ptr_to_ULong(disp_cp_xassisted));
   2870       /* jmp *%r11 */
   2871       *p++ = 0x41;
   2872       *p++ = 0xFF;
   2873       *p++ = 0xE3;
   2874 
   2875       /* Fix up the conditional jump, if there was one. */
   2876       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
   2877          Int delta = p - ptmp;
   2878          vassert(delta > 0 && delta < 40);
   2879          *ptmp = toUChar(delta-1);
   2880       }
   2881       goto done;
   2882    }
   2883 
   2884    case Ain_CMov64:
   2885       vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
   2886       if (i->Ain.CMov64.src->tag == Arm_Reg) {
   2887          *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
   2888          *p++ = 0x0F;
   2889          *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
   2890          p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
   2891          goto done;
   2892       }
   2893       if (i->Ain.CMov64.src->tag == Arm_Mem) {
   2894          *p++ = rexAMode_M(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
   2895          *p++ = 0x0F;
   2896          *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
   2897          p = doAMode_M(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
   2898          goto done;
   2899       }
   2900       break;
   2901 
   2902    case Ain_MovxLQ:
   2903       /* No, _don't_ ask me why the sense of the args has to be
   2904          different in the S vs Z case.  I don't know. */
   2905       if (i->Ain.MovxLQ.syned) {
   2906          /* Need REX.W = 1 here, but rexAMode_R does that for us. */
   2907          *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
   2908          *p++ = 0x63;
   2909          p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
   2910       } else {
   2911          /* Produce a 32-bit reg-reg move, since the implicit
   2912             zero-extend does what we want. */
   2913          *p++ = clearWBit (
   2914                    rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
   2915          *p++ = 0x89;
   2916          p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
   2917       }
   2918       goto done;
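      /* Concretely, for src = %rcx and dst = %rdx: the unsigned case
         emits 40 89 CA (movl %ecx,%edx), relying on the hardware
         zeroing of bits 63:32 of %rdx, whereas the signed case emits
         48 63 D1 (movslq %ecx,%rdx). */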
   2919 
   2920    case Ain_LoadEX:
   2921       if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
   2922          /* movzbq */
   2923          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2924          *p++ = 0x0F;
   2925          *p++ = 0xB6;
   2926          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2927          goto done;
   2928       }
   2929       if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
   2930          /* movzwq */
   2931          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2932          *p++ = 0x0F;
   2933          *p++ = 0xB7;
   2934          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2935          goto done;
   2936       }
   2937       if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
   2938          /* movzlq */
   2939          /* This isn't really an existing AMD64 instruction per se.
   2940             Rather, we have to do a 32-bit load.  Because a 32-bit
   2941             write implicitly clears the upper 32 bits of the target
   2942             register, we get what we want. */
   2943          *p++ = clearWBit(
   2944                 rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
   2945          *p++ = 0x8B;
   2946          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2947          goto done;
   2948       }
   2949       break;
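      /* For example, "movzbq (%rdi),%rax" comes out as 48 0F B6 07,
         and the szSmall == 4 case for the same operands is just
         40 8B 07 (movl (%rdi),%eax), the implicit clear doing the
         zero-extension. */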
   2950 
   2951    case Ain_Set64:
   2952       /* Make the destination register be 1 or 0, depending on whether
   2953          the relevant condition holds.  Complication: the top 56 bits
   2954          of the destination should be forced to zero, but doing 'xorq
   2955          %r,%r' kills the flag(s) we are about to read.  Sigh.  So
    2956          start off by moving $0 into the dest. */
   2957       reg = iregBits3210(i->Ain.Set64.dst);
   2958       vassert(reg < 16);
   2959 
   2960       /* movq $0, %dst */
   2961       *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
   2962       *p++ = 0xC7;
   2963       *p++ = toUChar(0xC0 + (reg & 7));
   2964       p = emit32(p, 0);
   2965 
   2966       /* setb lo8(%dst) */
    2967       /* note, 8-bit register rex trickiness.  Be careful here. */
   2968       *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
   2969       *p++ = 0x0F;
   2970       *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
   2971       *p++ = toUChar(0xC0 + (reg & 7));
   2972       goto done;
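      /* The rex byte really does matter: for dst = %rsi (reg 6) and
         cond == Acc_Z, the tail is 40 0F 94 C6, i.e. setz %sil.
         Without the 0x40 prefix, 0F 94 C6 would mean setz %dh, which
         is not the low byte of %rsi at all. */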
   2973 
   2974    case Ain_Bsfr64:
   2975       *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
   2976       *p++ = 0x0F;
   2977       if (i->Ain.Bsfr64.isFwds) {
   2978          *p++ = 0xBC;
   2979       } else {
   2980          *p++ = 0xBD;
   2981       }
   2982       p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
   2983       goto done;
   2984 
   2985    case Ain_MFence:
   2986       /* mfence */
   2987       *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
   2988       goto done;
   2989 
   2990    case Ain_ACAS:
   2991       /* lock */
   2992       *p++ = 0xF0;
   2993       if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
   2994       /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
   2995          in %rbx.  The new-value register is hardwired to be %rbx
   2996          since dealing with byte integer registers is too much hassle,
   2997          so we force the register operand to %rbx (could equally be
   2998          %rcx or %rdx). */
   2999       rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
   3000       if (i->Ain.ACAS.sz != 8)
   3001          rex = clearWBit(rex);
   3002 
   3003       *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
   3004       *p++ = 0x0F;
   3005       if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
   3006       p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
   3007       goto done;
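      /* E.g. with addr = (%rdi): sz == 8 gives F0 48 0F B1 1F (lock
         cmpxchgq %rbx,(%rdi)); sz == 4 drops REX.W, giving
         F0 40 0F B1 1F; sz == 2 additionally gets the 66 prefix
         (F0 66 40 0F B1 1F); and sz == 1 uses opcode B0 instead. */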
   3008 
   3009    case Ain_DACAS:
   3010       /* lock */
   3011       *p++ = 0xF0;
   3012       /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
   3013          value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
   3014          aren't encoded in the insn. */
    3015       rex = rexAMode_M( fake(1), i->Ain.DACAS.addr );
    3016       if (i->Ain.DACAS.sz != 8)
   3017          rex = clearWBit(rex);
   3018       *p++ = rex;
   3019       *p++ = 0x0F;
   3020       *p++ = 0xC7;
   3021       p = doAMode_M(p, fake(1), i->Ain.DACAS.addr);
   3022       goto done;
   3023 
   3024    case Ain_A87Free:
   3025       vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
   3026       for (j = 0; j < i->Ain.A87Free.nregs; j++) {
   3027          p = do_ffree_st(p, 7-j);
   3028       }
   3029       goto done;
   3030 
   3031    case Ain_A87PushPop:
   3032       vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
   3033       if (i->Ain.A87PushPop.isPush) {
   3034          /* Load from memory into %st(0): flds/fldl amode */
   3035          *p++ = clearWBit(
   3036                    rexAMode_M(fake(0), i->Ain.A87PushPop.addr) );
   3037          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
    3038          p = doAMode_M(p, fake(0)/*subopcode*/, i->Ain.A87PushPop.addr);
   3039       } else {
   3040          /* Dump %st(0) to memory: fstps/fstpl amode */
   3041          *p++ = clearWBit(
   3042                    rexAMode_M(fake(3), i->Ain.A87PushPop.addr) );
   3043          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
    3044          p = doAMode_M(p, fake(3)/*subopcode*/, i->Ain.A87PushPop.addr);
    3046       }
   3047       goto done;
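      /* Concretely, with addr = (%rdi): a push with szB == 8 emits
         40 DD 07 (fldl (%rdi)), and a pop with szB == 4 emits
         40 D9 1F (fstps (%rdi)); the 0x40 REX byte is harmless. */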
   3048 
   3049    case Ain_A87FpOp:
   3050       switch (i->Ain.A87FpOp.op) {
   3051          case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
   3052          case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
   3053          case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
   3054          case Afp_TAN:    *p++ = 0xD9; *p++ = 0xF2; break;
   3055          case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
   3056          case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
   3057          case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
   3058          case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
   3059          case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
   3060          case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
   3061          case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
   3062          case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
   3063          default: goto bad;
   3064       }
   3065       goto done;
   3066 
   3067    case Ain_A87LdCW:
   3068       *p++ = clearWBit(
   3069                 rexAMode_M(fake(5), i->Ain.A87LdCW.addr) );
   3070       *p++ = 0xD9;
   3071       p = doAMode_M(p, fake(5)/*subopcode*/, i->Ain.A87LdCW.addr);
   3072       goto done;
   3073 
   3074    case Ain_A87StSW:
   3075       *p++ = clearWBit(
   3076                 rexAMode_M(fake(7), i->Ain.A87StSW.addr) );
   3077       *p++ = 0xDD;
   3078       p = doAMode_M(p, fake(7)/*subopcode*/, i->Ain.A87StSW.addr);
   3079       goto done;
   3080 
   3081    case Ain_Store:
   3082       if (i->Ain.Store.sz == 2) {
    3083          /* This just goes to show the craziness of the instruction
   3084             set encoding.  We have to insert two prefix bytes, but be
   3085             careful to avoid a conflict in what the size should be, by
   3086             ensuring that REX.W = 0. */
   3087          *p++ = 0x66; /* override to 16-bits */
    3088          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3089          *p++ = 0x89;
   3090          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3091          goto done;
   3092       }
   3093       if (i->Ain.Store.sz == 4) {
    3094          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3095          *p++ = 0x89;
   3096          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3097          goto done;
   3098       }
   3099       if (i->Ain.Store.sz == 1) {
   3100          /* This is one place where it would be wrong to skip emitting
   3101             a rex byte of 0x40, since the mere presence of rex changes
   3102             the meaning of the byte register access.  Be careful. */
    3103          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3104          *p++ = 0x88;
   3105          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3106          goto done;
   3107       }
   3108       break;
   3109 
   3110    case Ain_LdMXCSR:
    3111       *p++ = clearWBit(rexAMode_M( fake(2), i->Ain.LdMXCSR.addr));
   3112       *p++ = 0x0F;
   3113       *p++ = 0xAE;
   3114       p = doAMode_M(p, fake(2)/*subopcode*/, i->Ain.LdMXCSR.addr);
   3115       goto done;
   3116 
   3117    case Ain_SseUComIS:
   3118       /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
   3119       /* ucomi[sd] %srcL, %srcR */
   3120       if (i->Ain.SseUComIS.sz == 8) {
   3121          *p++ = 0x66;
   3122       } else {
    3123          vassert(i->Ain.SseUComIS.sz == 4);
    3124          goto bad; /* sz == 4 (ucomiss) not handled here */
   3125       }
   3126       *p++ = clearWBit (
   3127              rexAMode_R( vreg2ireg(i->Ain.SseUComIS.srcL),
   3128                          vreg2ireg(i->Ain.SseUComIS.srcR) ));
   3129       *p++ = 0x0F;
   3130       *p++ = 0x2E;
   3131       p = doAMode_R(p, vreg2ireg(i->Ain.SseUComIS.srcL),
   3132                        vreg2ireg(i->Ain.SseUComIS.srcR) );
   3133       /* pushfq */
   3134       *p++ = 0x9C;
   3135       /* popq %dst */
   3136       *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.SseUComIS.dst)));
   3137       *p++ = toUChar(0x58 + iregBits210(i->Ain.SseUComIS.dst));
   3138       goto done;
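      /* The pushfq/popq pair is simply a way of getting %rflags into
         an integer register.  E.g. with dst = %r9 the final two bytes
         are 41 59, i.e. popq %r9. */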
   3139 
   3140    case Ain_SseSI2SF:
    3141       /* cvtsi2s[sd] %src, %dst */
   3142       rex = rexAMode_R( vreg2ireg(i->Ain.SseSI2SF.dst),
   3143                         i->Ain.SseSI2SF.src );
   3144       *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
   3145       *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
   3146       *p++ = 0x0F;
   3147       *p++ = 0x2A;
   3148       p = doAMode_R( p, vreg2ireg(i->Ain.SseSI2SF.dst),
   3149                         i->Ain.SseSI2SF.src );
   3150       goto done;
   3151 
   3152    case Ain_SseSF2SI:
    3153       /* cvts[sd]2si %src, %dst */
   3154       rex = rexAMode_R( i->Ain.SseSF2SI.dst,
   3155                         vreg2ireg(i->Ain.SseSF2SI.src) );
   3156       *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
   3157       *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
   3158       *p++ = 0x0F;
   3159       *p++ = 0x2D;
   3160       p = doAMode_R( p, i->Ain.SseSF2SI.dst,
   3161                         vreg2ireg(i->Ain.SseSF2SI.src) );
   3162       goto done;
   3163 
   3164    case Ain_SseSDSS:
   3165       /* cvtsd2ss/cvtss2sd %src, %dst */
   3166       *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
   3167       *p++ = clearWBit(
   3168               rexAMode_R( vreg2ireg(i->Ain.SseSDSS.dst),
   3169                           vreg2ireg(i->Ain.SseSDSS.src) ));
   3170       *p++ = 0x0F;
   3171       *p++ = 0x5A;
   3172       p = doAMode_R( p, vreg2ireg(i->Ain.SseSDSS.dst),
   3173                         vreg2ireg(i->Ain.SseSDSS.src) );
   3174       goto done;
   3175 
   3176    case Ain_SseLdSt:
   3177       if (i->Ain.SseLdSt.sz == 8) {
   3178          *p++ = 0xF2;
   3179       } else
   3180       if (i->Ain.SseLdSt.sz == 4) {
   3181          *p++ = 0xF3;
   3182       } else
   3183       if (i->Ain.SseLdSt.sz != 16) {
   3184          vassert(0);
   3185       }
   3186       *p++ = clearWBit(
   3187              rexAMode_M( vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr));
   3188       *p++ = 0x0F;
   3189       *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
   3190       p = doAMode_M(p, vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr);
   3191       goto done;
   3192 
   3193    case Ain_SseLdzLO:
   3194       vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
   3195       /* movs[sd] amode, %xmm-dst */
   3196       *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
   3197       *p++ = clearWBit(
   3198              rexAMode_M(vreg2ireg(i->Ain.SseLdzLO.reg),
   3199                         i->Ain.SseLdzLO.addr));
   3200       *p++ = 0x0F;
   3201       *p++ = 0x10;
   3202       p = doAMode_M(p, vreg2ireg(i->Ain.SseLdzLO.reg),
   3203                        i->Ain.SseLdzLO.addr);
   3204       goto done;
   3205 
   3206    case Ain_Sse32Fx4:
   3207       xtra = 0;
   3208       *p++ = clearWBit(
   3209              rexAMode_R( vreg2ireg(i->Ain.Sse32Fx4.dst),
   3210                          vreg2ireg(i->Ain.Sse32Fx4.src) ));
   3211       *p++ = 0x0F;
   3212       switch (i->Ain.Sse32Fx4.op) {
   3213          case Asse_ADDF:   *p++ = 0x58; break;
   3214          case Asse_DIVF:   *p++ = 0x5E; break;
   3215          case Asse_MAXF:   *p++ = 0x5F; break;
   3216          case Asse_MINF:   *p++ = 0x5D; break;
   3217          case Asse_MULF:   *p++ = 0x59; break;
   3218          case Asse_RCPF:   *p++ = 0x53; break;
   3219          case Asse_RSQRTF: *p++ = 0x52; break;
   3220          case Asse_SQRTF:  *p++ = 0x51; break;
   3221          case Asse_SUBF:   *p++ = 0x5C; break;
   3222          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3223          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3224          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3225          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3226          default: goto bad;
   3227       }
   3228       p = doAMode_R(p, vreg2ireg(i->Ain.Sse32Fx4.dst),
   3229                        vreg2ireg(i->Ain.Sse32Fx4.src) );
   3230       if (xtra & 0x100)
   3231          *p++ = toUChar(xtra & 0xFF);
   3232       goto done;
   3233 
   3234    case Ain_Sse64Fx2:
   3235       xtra = 0;
   3236       *p++ = 0x66;
   3237       *p++ = clearWBit(
   3238              rexAMode_R( vreg2ireg(i->Ain.Sse64Fx2.dst),
   3239                          vreg2ireg(i->Ain.Sse64Fx2.src) ));
   3240       *p++ = 0x0F;
   3241       switch (i->Ain.Sse64Fx2.op) {
   3242          case Asse_ADDF:   *p++ = 0x58; break;
   3243          case Asse_DIVF:   *p++ = 0x5E; break;
   3244          case Asse_MAXF:   *p++ = 0x5F; break;
   3245          case Asse_MINF:   *p++ = 0x5D; break;
   3246          case Asse_MULF:   *p++ = 0x59; break;
   3247          case Asse_SQRTF:  *p++ = 0x51; break;
   3248          case Asse_SUBF:   *p++ = 0x5C; break;
   3249          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3250          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3251          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3252          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3253          default: goto bad;
   3254       }
   3255       p = doAMode_R(p, vreg2ireg(i->Ain.Sse64Fx2.dst),
   3256                        vreg2ireg(i->Ain.Sse64Fx2.src) );
   3257       if (xtra & 0x100)
   3258          *p++ = toUChar(xtra & 0xFF);
   3259       goto done;
   3260 
   3261    case Ain_Sse32FLo:
   3262       xtra = 0;
   3263       *p++ = 0xF3;
   3264       *p++ = clearWBit(
   3265              rexAMode_R( vreg2ireg(i->Ain.Sse32FLo.dst),
   3266                          vreg2ireg(i->Ain.Sse32FLo.src) ));
   3267       *p++ = 0x0F;
   3268       switch (i->Ain.Sse32FLo.op) {
   3269          case Asse_ADDF:   *p++ = 0x58; break;
   3270          case Asse_DIVF:   *p++ = 0x5E; break;
   3271          case Asse_MAXF:   *p++ = 0x5F; break;
   3272          case Asse_MINF:   *p++ = 0x5D; break;
   3273          case Asse_MULF:   *p++ = 0x59; break;
   3274          case Asse_RCPF:   *p++ = 0x53; break;
   3275          case Asse_RSQRTF: *p++ = 0x52; break;
   3276          case Asse_SQRTF:  *p++ = 0x51; break;
   3277          case Asse_SUBF:   *p++ = 0x5C; break;
   3278          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3279          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3280          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3281          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3282          default: goto bad;
   3283       }
   3284       p = doAMode_R(p, vreg2ireg(i->Ain.Sse32FLo.dst),
   3285                        vreg2ireg(i->Ain.Sse32FLo.src) );
   3286       if (xtra & 0x100)
   3287          *p++ = toUChar(xtra & 0xFF);
   3288       goto done;
   3289 
   3290    case Ain_Sse64FLo:
   3291       xtra = 0;
   3292       *p++ = 0xF2;
   3293       *p++ = clearWBit(
   3294              rexAMode_R( vreg2ireg(i->Ain.Sse64FLo.dst),
   3295                          vreg2ireg(i->Ain.Sse64FLo.src) ));
   3296       *p++ = 0x0F;
   3297       switch (i->Ain.Sse64FLo.op) {
   3298          case Asse_ADDF:   *p++ = 0x58; break;
   3299          case Asse_DIVF:   *p++ = 0x5E; break;
   3300          case Asse_MAXF:   *p++ = 0x5F; break;
   3301          case Asse_MINF:   *p++ = 0x5D; break;
   3302          case Asse_MULF:   *p++ = 0x59; break;
   3303          case Asse_SQRTF:  *p++ = 0x51; break;
   3304          case Asse_SUBF:   *p++ = 0x5C; break;
   3305          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3306          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3307          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3308          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3309          default: goto bad;
   3310       }
   3311       p = doAMode_R(p, vreg2ireg(i->Ain.Sse64FLo.dst),
   3312                        vreg2ireg(i->Ain.Sse64FLo.src) );
   3313       if (xtra & 0x100)
   3314          *p++ = toUChar(xtra & 0xFF);
   3315       goto done;
   3316 
   3317    case Ain_SseReRg:
   3318 #     define XX(_n) *p++ = (_n)
   3319 
   3320       rex = clearWBit(
   3321             rexAMode_R( vreg2ireg(i->Ain.SseReRg.dst),
   3322                         vreg2ireg(i->Ain.SseReRg.src) ));
   3323 
   3324       switch (i->Ain.SseReRg.op) {
   3325          case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
   3326          case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
   3327          case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
   3328          case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
   3329          case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
   3330          case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
   3331          case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
   3332          case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
   3333          case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
   3334          case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
   3335          case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
   3336          case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
   3337          case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
   3338          case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
   3339          case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
   3340          case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
   3341          case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
   3342          case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
   3343          case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
   3344          case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
   3345          case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
   3346          case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
   3347          case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
   3348          case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
   3349          case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
   3350          case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
   3351          case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
   3352          case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
   3353          case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
   3354          case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
   3355          case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
   3356          case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
   3357          case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
   3358          case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
   3359          case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
   3360          case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
   3361          case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
   3362          case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
   3363          case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
   3364          case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
   3365          case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
   3366          case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
   3367          case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
   3368          case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
   3369          case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
   3370          case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
   3371          case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
   3372          case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
   3373          case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
   3374          case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
   3375          case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
   3376          case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
   3377          case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
   3378          case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
   3379          case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
   3380          default: goto bad;
   3381       }
   3382       p = doAMode_R(p, vreg2ireg(i->Ain.SseReRg.dst),
   3383                        vreg2ireg(i->Ain.SseReRg.src) );
   3384 #     undef XX
   3385       goto done;
   3386 
   3387    case Ain_SseCMov:
   3388       /* jmp fwds if !condition */
    3389       *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCMov.cond ^ 1)));
   3390       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
   3391       ptmp = p;
   3392 
   3393       /* movaps %src, %dst */
   3394       *p++ = clearWBit(
   3395              rexAMode_R( vreg2ireg(i->Ain.SseCMov.dst),
   3396                          vreg2ireg(i->Ain.SseCMov.src) ));
   3397       *p++ = 0x0F;
   3398       *p++ = 0x28;
   3399       p = doAMode_R(p, vreg2ireg(i->Ain.SseCMov.dst),
   3400                        vreg2ireg(i->Ain.SseCMov.src) );
   3401 
   3402       /* Fill in the jump offset. */
   3403       *(ptmp-1) = toUChar(p - ptmp);
   3404       goto done;
   3405 
   3406    case Ain_SseShuf:
   3407       *p++ = 0x66;
   3408       *p++ = clearWBit(
   3409              rexAMode_R( vreg2ireg(i->Ain.SseShuf.dst),
   3410                          vreg2ireg(i->Ain.SseShuf.src) ));
   3411       *p++ = 0x0F;
   3412       *p++ = 0x70;
   3413       p = doAMode_R(p, vreg2ireg(i->Ain.SseShuf.dst),
   3414                        vreg2ireg(i->Ain.SseShuf.src) );
   3415       *p++ = (UChar)(i->Ain.SseShuf.order);
   3416       goto done;
   3417 
   3418    //uu case Ain_AvxLdSt: {
   3419    //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
   3420    //uu                           i->Ain.AvxLdSt.addr );
   3421    //uu    p = emitVexPrefix(p, vex);
   3422    //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
   3423    //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
   3424    //uu      goto done;
   3425    //uu }
   3426 
   3427    case Ain_EvCheck: {
   3428       /* We generate:
   3429             (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
   3430             (2 bytes)  jns  nofail     expected taken
   3431             (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
   3432             nofail:
   3433       */
   3434       /* This is heavily asserted re instruction lengths.  It needs to
   3435          be.  If we get given unexpected forms of .amCounter or
   3436          .amFailAddr -- basically, anything that's not of the form
    3437          uimm7(%rbp) -- these assertions are likely to fail. */
   3438       /* Note also that after the decl we must be very careful not to
   3439          read the carry flag, else we get a partial flags stall.
   3440          js/jns avoids that, though. */
   3441       UChar* p0 = p;
   3442       /* ---  decl 8(%rbp) --- */
   3443       /* Need to compute the REX byte for the decl in order to prove
    3444          that we don't need it, since this is a 32-bit dec and all
   3445          registers involved in the amode are < r8.  "fake(1)" because
   3446          there's no register in this encoding; instead the register
   3447          field is used as a sub opcode.  The encoding for "decl r/m32"
   3448          is FF /1, hence the fake(1). */
   3449       rex = clearWBit(rexAMode_M(fake(1), i->Ain.EvCheck.amCounter));
   3450       if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
   3451       *p++ = 0xFF;
   3452       p = doAMode_M(p, fake(1), i->Ain.EvCheck.amCounter);
   3453       vassert(p - p0 == 3);
   3454       /* --- jns nofail --- */
   3455       *p++ = 0x79;
   3456       *p++ = 0x03; /* need to check this 0x03 after the next insn */
   3457       vassert(p - p0 == 5);
   3458       /* --- jmp* 0(%rbp) --- */
   3459       /* Once again, verify we don't need REX.  The encoding is FF /4.
   3460          We don't need REX.W since by default FF /4 in 64-bit mode
   3461          implies a 64 bit load. */
   3462       rex = clearWBit(rexAMode_M(fake(4), i->Ain.EvCheck.amFailAddr));
   3463       if (rex != 0x40) goto bad;
   3464       *p++ = 0xFF;
   3465       p = doAMode_M(p, fake(4), i->Ain.EvCheck.amFailAddr);
   3466       vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
   3467       /* And crosscheck .. */
   3468       vassert(evCheckSzB_AMD64() == 8);
   3469       goto done;
   3470    }
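   /* For the expected uimm7(%rbp) amodes, the eight bytes are,
      concretely:  FF 4D 08 (decl 8(%rbp));  79 03 (jns fwds by 3);
      FF 65 00 (jmp* 0(%rbp)). */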
   3471 
   3472    case Ain_ProfInc: {
   3473       /* We generate   movabsq $0, %r11
   3474                        incq (%r11)
   3475          in the expectation that a later call to LibVEX_patchProfCtr
   3476          will be used to fill in the immediate field once the right
   3477          value is known.
   3478          49 BB 00 00 00 00 00 00 00 00
   3479          49 FF 03
   3480       */
   3481       *p++ = 0x49; *p++ = 0xBB;
   3482       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
   3483       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
   3484       *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
   3485       /* Tell the caller .. */
   3486       vassert(!(*is_profInc));
   3487       *is_profInc = True;
   3488       goto done;
   3489    }
   3490 
   3491    default:
   3492       goto bad;
   3493    }
   3494 
   3495   bad:
   3496    ppAMD64Instr(i, mode64);
   3497    vpanic("emit_AMD64Instr");
   3498    /*NOTREACHED*/
   3499 
   3500   done:
   3501    vassert(p - &buf[0] <= 32);
   3502    return p - &buf[0];
   3503 
   3504 #  undef fake
   3505 }
   3506 
   3507 
   3508 /* How big is an event check?  See case for Ain_EvCheck in
   3509    emit_AMD64Instr just above.  That crosschecks what this returns, so
   3510    we can tell if we're inconsistent. */
   3511 Int evCheckSzB_AMD64 ( void )
   3512 {
   3513    return 8;
   3514 }
   3515 
   3516 
   3517 /* NB: what goes on here has to be very closely coordinated with the
   3518    emitInstr case for XDirect, above. */
   3519 VexInvalRange chainXDirect_AMD64 ( void* place_to_chain,
   3520                                    void* disp_cp_chain_me_EXPECTED,
   3521                                    void* place_to_jump_to )
   3522 {
   3523    /* What we're expecting to see is:
   3524         movabsq $disp_cp_chain_me_EXPECTED, %r11
   3525         call *%r11
   3526       viz
   3527         49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
   3528         41 FF D3
   3529    */
   3530    UChar* p = (UChar*)place_to_chain;
   3531    vassert(p[0] == 0x49);
   3532    vassert(p[1] == 0xBB);
   3533    vassert(*(ULong*)(&p[2]) == Ptr_to_ULong(disp_cp_chain_me_EXPECTED));
   3534    vassert(p[10] == 0x41);
   3535    vassert(p[11] == 0xFF);
   3536    vassert(p[12] == 0xD3);
   3537    /* And what we want to change it to is either:
   3538         (general case):
   3539           movabsq $place_to_jump_to, %r11
   3540           jmpq *%r11
   3541         viz
   3542           49 BB <8 bytes value == place_to_jump_to>
   3543           41 FF E3
   3544         So it's the same length (convenient, huh) and we don't
   3545         need to change all the bits.
   3546       ---OR---
   3547         in the case where the displacement falls within 32 bits
   3548           jmpq disp32   where disp32 is relative to the next insn
   3549           ud2; ud2; ud2; ud2
   3550         viz
   3551           E9 <4 bytes == disp32>
   3552           0F 0B 0F 0B 0F 0B 0F 0B
   3553 
   3554       In both cases the replacement has the same length as the original.
   3555       To remain sane & verifiable,
   3556       (1) limit the displacement for the short form to
   3557           (say) +/- one billion, so as to avoid wraparound
   3558           off-by-ones
   3559       (2) even if the short form is applicable, once every (say)
   3560           1024 times use the long form anyway, so as to maintain
   3561           verifiability
   3562    */
   3563    /* This is the delta we need to put into a JMP d32 insn.  It's
   3564       relative to the start of the next insn, hence the -5.  */
   3565    Long delta   = (Long)((UChar*)place_to_jump_to - (UChar*)p) - (Long)5;
   3566    Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
   3567 
   3568    static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
   3569    if (shortOK) {
   3570       shortCTR++; // thread safety bleh
   3571       if (0 == (shortCTR & 0x3FF)) {
   3572          shortOK = False;
   3573          if (0)
   3574             vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
   3575                        "using long jmp\n", shortCTR);
   3576       }
   3577    }
   3578 
   3579    /* And make the modifications. */
   3580    if (shortOK) {
   3581       p[0]  = 0xE9;
   3582       p[1]  = (delta >> 0) & 0xFF;
   3583       p[2]  = (delta >> 8) & 0xFF;
   3584       p[3]  = (delta >> 16) & 0xFF;
   3585       p[4]  = (delta >> 24) & 0xFF;
   3586       p[5]  = 0x0F; p[6]  = 0x0B;
   3587       p[7]  = 0x0F; p[8]  = 0x0B;
   3588       p[9]  = 0x0F; p[10] = 0x0B;
   3589       p[11] = 0x0F; p[12] = 0x0B;
   3590       /* sanity check on the delta -- top 32 are all 0 or all 1 */
   3591       delta >>= 32;
   3592       vassert(delta == 0LL || delta == -1LL);
   3593    } else {
   3594       /* Minimal modifications from the starting sequence. */
   3595       *(ULong*)(&p[2]) = Ptr_to_ULong(place_to_jump_to);
   3596       p[12] = 0xE3;
   3597    }
   3598    VexInvalRange vir = {0, 0};
   3599    return vir;
   3600 }
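/* A worked example for the above: if place_to_chain is at 0x1000 and
   place_to_jump_to is 0x2345, then delta = 0x2345 - 0x1000 - 5 =
   0x1340, well within range, so (modulo the 1-in-1024 forced long
   form) the rewrite produces E9 40 13 00 00 followed by the four
   ud2s (0F 0B ...). */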
   3601 
   3602 
   3603 /* NB: what goes on here has to be very closely coordinated with the
   3604    emitInstr case for XDirect, above. */
   3605 VexInvalRange unchainXDirect_AMD64 ( void* place_to_unchain,
   3606                                      void* place_to_jump_to_EXPECTED,
   3607                                      void* disp_cp_chain_me )
   3608 {
   3609    /* What we're expecting to see is either:
   3610         (general case)
   3611           movabsq $place_to_jump_to_EXPECTED, %r11
   3612           jmpq *%r11
   3613         viz
   3614           49 BB <8 bytes value == place_to_jump_to_EXPECTED>
   3615           41 FF E3
   3616       ---OR---
   3617         in the case where the displacement falls within 32 bits
   3618           jmpq d32
   3619           ud2; ud2; ud2; ud2
   3620         viz
   3621           E9 <4 bytes == disp32>
   3622           0F 0B 0F 0B 0F 0B 0F 0B
   3623    */
   3624    UChar* p     = (UChar*)place_to_unchain;
   3625    Bool   valid = False;
   3626    if (p[0] == 0x49 && p[1] == 0xBB
   3627        && *(ULong*)(&p[2]) == Ptr_to_ULong(place_to_jump_to_EXPECTED)
   3628        && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
   3629       /* it's the long form */
   3630       valid = True;
   3631    }
   3632    else
   3633    if (p[0] == 0xE9
   3634        && p[5]  == 0x0F && p[6]  == 0x0B
   3635        && p[7]  == 0x0F && p[8]  == 0x0B
   3636        && p[9]  == 0x0F && p[10] == 0x0B
   3637        && p[11] == 0x0F && p[12] == 0x0B) {
   3638       /* It's the short form.  Check the offset is right. */
   3639       Int  s32 = *(Int*)(&p[1]);
   3640       Long s64 = (Long)s32;
   3641       if ((UChar*)p + 5 + s64 == (UChar*)place_to_jump_to_EXPECTED) {
   3642          valid = True;
   3643          if (0)
   3644             vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
   3645       }
   3646    }
   3647    vassert(valid);
   3648    /* And what we want to change it to is:
   3649         movabsq $disp_cp_chain_me, %r11
   3650         call *%r11
   3651       viz
   3652         49 BB <8 bytes value == disp_cp_chain_me>
   3653         41 FF D3
   3654       So it's the same length (convenient, huh).
   3655    */
   3656    p[0] = 0x49;
   3657    p[1] = 0xBB;
   3658    *(ULong*)(&p[2]) = Ptr_to_ULong(disp_cp_chain_me);
   3659    p[10] = 0x41;
   3660    p[11] = 0xFF;
   3661    p[12] = 0xD3;
   3662    VexInvalRange vir = {0, 0};
   3663    return vir;
   3664 }
   3665 
   3666 
   3667 /* Patch the counter address into a profile inc point, as previously
   3668    created by the Ain_ProfInc case for emit_AMD64Instr. */
   3669 VexInvalRange patchProfInc_AMD64 ( void*  place_to_patch,
   3670                                    ULong* location_of_counter )
   3671 {
   3672    vassert(sizeof(ULong*) == 8);
   3673    UChar* p = (UChar*)place_to_patch;
   3674    vassert(p[0] == 0x49);
   3675    vassert(p[1] == 0xBB);
   3676    vassert(p[2] == 0x00);
   3677    vassert(p[3] == 0x00);
   3678    vassert(p[4] == 0x00);
   3679    vassert(p[5] == 0x00);
   3680    vassert(p[6] == 0x00);
   3681    vassert(p[7] == 0x00);
   3682    vassert(p[8] == 0x00);
   3683    vassert(p[9] == 0x00);
   3684    vassert(p[10] == 0x49);
   3685    vassert(p[11] == 0xFF);
   3686    vassert(p[12] == 0x03);
   3687    ULong imm64 = (ULong)Ptr_to_ULong(location_of_counter);
   3688    p[2] = imm64 & 0xFF; imm64 >>= 8;
   3689    p[3] = imm64 & 0xFF; imm64 >>= 8;
   3690    p[4] = imm64 & 0xFF; imm64 >>= 8;
   3691    p[5] = imm64 & 0xFF; imm64 >>= 8;
   3692    p[6] = imm64 & 0xFF; imm64 >>= 8;
   3693    p[7] = imm64 & 0xFF; imm64 >>= 8;
   3694    p[8] = imm64 & 0xFF; imm64 >>= 8;
   3695    p[9] = imm64 & 0xFF; imm64 >>= 8;
   3696    VexInvalRange vir = {0, 0};
   3697    return vir;
   3698 }
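/* So if, say, location_of_counter is 0x00007F0011223344, the patched
   code reads 49 BB 44 33 22 11 00 7F 00 00 49 FF 03, that is,
   movabsq $0x7F0011223344, %r11 ; incq (%r11). */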
   3699 
   3700 
   3701 /*---------------------------------------------------------------*/
   3702 /*--- end                                   host_amd64_defs.c ---*/
   3703 /*---------------------------------------------------------------*/
   3704