      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                                 host_amd64_defs.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2013 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "libvex.h"
     38 #include "libvex_trc_values.h"
     39 
     40 #include "main_util.h"
     41 #include "host_generic_regs.h"
     42 #include "host_amd64_defs.h"
     43 
     44 
     45 /* --------- Registers. --------- */
     46 
     47 void ppHRegAMD64 ( HReg reg )
     48 {
     49    Int r;
     50    static const HChar* ireg64_names[16]
     51      = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
     52          "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
     53    /* Be generic for all virtual regs. */
     54    if (hregIsVirtual(reg)) {
     55       ppHReg(reg);
     56       return;
     57    }
     58    /* But specific for real regs. */
     59    switch (hregClass(reg)) {
     60       case HRcInt64:
     61          r = hregNumber(reg);
     62          vassert(r >= 0 && r < 16);
     63          vex_printf("%s", ireg64_names[r]);
     64          return;
     65       case HRcFlt64:
     66          r = hregNumber(reg);
     67          vassert(r >= 0 && r < 6);
     68          vex_printf("%%fake%d", r);
     69          return;
     70       case HRcVec128:
     71          r = hregNumber(reg);
     72          vassert(r >= 0 && r < 16);
     73          vex_printf("%%xmm%d", r);
     74          return;
     75       default:
     76          vpanic("ppHRegAMD64");
     77    }
     78 }
     79 
     80 static void ppHRegAMD64_lo32 ( HReg reg )
     81 {
     82    Int r;
     83    static const HChar* ireg32_names[16]
     84      = { "%eax",  "%ecx",  "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
     85          "%r8d",  "%r9d",  "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
     86    /* Be generic for all virtual regs. */
     87    if (hregIsVirtual(reg)) {
     88       ppHReg(reg);
     89       vex_printf("d");
     90       return;
     91    }
     92    /* But specific for real regs. */
     93    switch (hregClass(reg)) {
     94       case HRcInt64:
     95          r = hregNumber(reg);
     96          vassert(r >= 0 && r < 16);
     97          vex_printf("%s", ireg32_names[r]);
     98          return;
     99       default:
    100          vpanic("ppHRegAMD64_lo32: invalid regclass");
    101    }
    102 }
    103 
    104 HReg hregAMD64_RAX ( void ) { return mkHReg( 0, HRcInt64, False); }
    105 HReg hregAMD64_RCX ( void ) { return mkHReg( 1, HRcInt64, False); }
    106 HReg hregAMD64_RDX ( void ) { return mkHReg( 2, HRcInt64, False); }
    107 HReg hregAMD64_RBX ( void ) { return mkHReg( 3, HRcInt64, False); }
    108 HReg hregAMD64_RSP ( void ) { return mkHReg( 4, HRcInt64, False); }
    109 HReg hregAMD64_RBP ( void ) { return mkHReg( 5, HRcInt64, False); }
    110 HReg hregAMD64_RSI ( void ) { return mkHReg( 6, HRcInt64, False); }
    111 HReg hregAMD64_RDI ( void ) { return mkHReg( 7, HRcInt64, False); }
    112 HReg hregAMD64_R8  ( void ) { return mkHReg( 8, HRcInt64, False); }
    113 HReg hregAMD64_R9  ( void ) { return mkHReg( 9, HRcInt64, False); }
    114 HReg hregAMD64_R10 ( void ) { return mkHReg(10, HRcInt64, False); }
    115 HReg hregAMD64_R11 ( void ) { return mkHReg(11, HRcInt64, False); }
    116 HReg hregAMD64_R12 ( void ) { return mkHReg(12, HRcInt64, False); }
    117 HReg hregAMD64_R13 ( void ) { return mkHReg(13, HRcInt64, False); }
    118 HReg hregAMD64_R14 ( void ) { return mkHReg(14, HRcInt64, False); }
    119 HReg hregAMD64_R15 ( void ) { return mkHReg(15, HRcInt64, False); }
    120 
    121 HReg hregAMD64_XMM0  ( void ) { return mkHReg( 0, HRcVec128, False); }
    122 HReg hregAMD64_XMM1  ( void ) { return mkHReg( 1, HRcVec128, False); }
    123 HReg hregAMD64_XMM3  ( void ) { return mkHReg( 3, HRcVec128, False); }
    124 HReg hregAMD64_XMM4  ( void ) { return mkHReg( 4, HRcVec128, False); }
    125 HReg hregAMD64_XMM5  ( void ) { return mkHReg( 5, HRcVec128, False); }
    126 HReg hregAMD64_XMM6  ( void ) { return mkHReg( 6, HRcVec128, False); }
    127 HReg hregAMD64_XMM7  ( void ) { return mkHReg( 7, HRcVec128, False); }
    128 HReg hregAMD64_XMM8  ( void ) { return mkHReg( 8, HRcVec128, False); }
    129 HReg hregAMD64_XMM9  ( void ) { return mkHReg( 9, HRcVec128, False); }
    130 HReg hregAMD64_XMM10 ( void ) { return mkHReg(10, HRcVec128, False); }
    131 HReg hregAMD64_XMM11 ( void ) { return mkHReg(11, HRcVec128, False); }
    132 HReg hregAMD64_XMM12 ( void ) { return mkHReg(12, HRcVec128, False); }
    133 
    134 
    135 void getAllocableRegs_AMD64 ( Int* nregs, HReg** arr )
    136 {
    137 #if 0
    138    *nregs = 6;
    139    *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
    140    (*arr)[ 0] = hregAMD64_RSI();
    141    (*arr)[ 1] = hregAMD64_RDI();
    142    (*arr)[ 2] = hregAMD64_RBX();
    143 
    144    (*arr)[ 3] = hregAMD64_XMM7();
    145    (*arr)[ 4] = hregAMD64_XMM8();
    146    (*arr)[ 5] = hregAMD64_XMM9();
    147 #endif
    148 #if 1
    149    *nregs = 20;
    150    *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
    151    (*arr)[ 0] = hregAMD64_RSI();
    152    (*arr)[ 1] = hregAMD64_RDI();
    153    (*arr)[ 2] = hregAMD64_R8();
    154    (*arr)[ 3] = hregAMD64_R9();
    155    (*arr)[ 4] = hregAMD64_R12();
    156    (*arr)[ 5] = hregAMD64_R13();
    157    (*arr)[ 6] = hregAMD64_R14();
    158    (*arr)[ 7] = hregAMD64_R15();
    159    (*arr)[ 8] = hregAMD64_RBX();
    160 
    161    (*arr)[ 9] = hregAMD64_XMM3();
    162    (*arr)[10] = hregAMD64_XMM4();
    163    (*arr)[11] = hregAMD64_XMM5();
    164    (*arr)[12] = hregAMD64_XMM6();
    165    (*arr)[13] = hregAMD64_XMM7();
    166    (*arr)[14] = hregAMD64_XMM8();
    167    (*arr)[15] = hregAMD64_XMM9();
    168    (*arr)[16] = hregAMD64_XMM10();
    169    (*arr)[17] = hregAMD64_XMM11();
    170    (*arr)[18] = hregAMD64_XMM12();
    171    (*arr)[19] = hregAMD64_R10();
    172 #endif
    173 }
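         /* Note that %rax, %rcx, %rdx, %rsp, %rbp and %r11 are deliberately
            not offered to the allocator: %rsp is the stack pointer, %rax and
            %rdx are implicitly used by MulL/Div, %rcx by variable-amount
            shifts, %r11 is the scratch register for calls and chained exits,
            and %rbp is written by the assisted-exit code (see Ain_XAssisted
            below).  The remaining xmm registers (0, 1, 2, 13..15) are
            presumably held back as temporaries for instruction selection. */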
    174 
    175 
    176 /* --------- Condition codes, Intel encoding. --------- */
    177 
    178 const HChar* showAMD64CondCode ( AMD64CondCode cond )
    179 {
    180    switch (cond) {
    181       case Acc_O:      return "o";
    182       case Acc_NO:     return "no";
    183       case Acc_B:      return "b";
    184       case Acc_NB:     return "nb";
    185       case Acc_Z:      return "z";
    186       case Acc_NZ:     return "nz";
    187       case Acc_BE:     return "be";
    188       case Acc_NBE:    return "nbe";
    189       case Acc_S:      return "s";
    190       case Acc_NS:     return "ns";
    191       case Acc_P:      return "p";
    192       case Acc_NP:     return "np";
    193       case Acc_L:      return "l";
    194       case Acc_NL:     return "nl";
    195       case Acc_LE:     return "le";
    196       case Acc_NLE:    return "nle";
    197       case Acc_ALWAYS: return "ALWAYS";
     198       default: vpanic("showAMD64CondCode");
    199    }
    200 }
    201 
    202 
    203 /* --------- AMD64AMode: memory address expressions. --------- */
    204 
    205 AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
    206    AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
    207    am->tag        = Aam_IR;
    208    am->Aam.IR.imm = imm32;
    209    am->Aam.IR.reg = reg;
    210    return am;
    211 }
    212 AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
    213    AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
    214    am->tag = Aam_IRRS;
    215    am->Aam.IRRS.imm   = imm32;
    216    am->Aam.IRRS.base  = base;
    217    am->Aam.IRRS.index = indEx;
    218    am->Aam.IRRS.shift = shift;
    219    vassert(shift >= 0 && shift <= 3);
    220    return am;
    221 }
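         /* Example: the amode 0x10(%rsp) is AMD64AMode_IR(0x10, hregAMD64_RSP()),
            and 0x18(%rbp,%rsi,8) is
            AMD64AMode_IRRS(0x18, hregAMD64_RBP(), hregAMD64_RSI(), 3),
            the last argument being log2 of the scale.  ppAMD64AMode below
            shows how each form is printed. */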
    222 
    223 void ppAMD64AMode ( AMD64AMode* am ) {
    224    switch (am->tag) {
    225       case Aam_IR:
    226          if (am->Aam.IR.imm == 0)
    227             vex_printf("(");
    228          else
    229             vex_printf("0x%x(", am->Aam.IR.imm);
    230          ppHRegAMD64(am->Aam.IR.reg);
    231          vex_printf(")");
    232          return;
    233       case Aam_IRRS:
    234          vex_printf("0x%x(", am->Aam.IRRS.imm);
    235          ppHRegAMD64(am->Aam.IRRS.base);
    236          vex_printf(",");
    237          ppHRegAMD64(am->Aam.IRRS.index);
    238          vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
    239          return;
    240       default:
    241          vpanic("ppAMD64AMode");
    242    }
    243 }
    244 
    245 static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
    246    switch (am->tag) {
    247       case Aam_IR:
    248          addHRegUse(u, HRmRead, am->Aam.IR.reg);
    249          return;
    250       case Aam_IRRS:
    251          addHRegUse(u, HRmRead, am->Aam.IRRS.base);
    252          addHRegUse(u, HRmRead, am->Aam.IRRS.index);
    253          return;
    254       default:
    255          vpanic("addRegUsage_AMD64AMode");
    256    }
    257 }
    258 
    259 static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
    260    switch (am->tag) {
    261       case Aam_IR:
    262          am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
    263          return;
    264       case Aam_IRRS:
    265          am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
    266          am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
    267          return;
    268       default:
    269          vpanic("mapRegs_AMD64AMode");
    270    }
    271 }
    272 
    273 /* --------- Operand, which can be reg, immediate or memory. --------- */
    274 
    275 AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
    276    AMD64RMI* op       = LibVEX_Alloc(sizeof(AMD64RMI));
    277    op->tag            = Armi_Imm;
    278    op->Armi.Imm.imm32 = imm32;
    279    return op;
    280 }
    281 AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
    282    AMD64RMI* op     = LibVEX_Alloc(sizeof(AMD64RMI));
    283    op->tag          = Armi_Reg;
    284    op->Armi.Reg.reg = reg;
    285    return op;
    286 }
    287 AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
    288    AMD64RMI* op    = LibVEX_Alloc(sizeof(AMD64RMI));
    289    op->tag         = Armi_Mem;
    290    op->Armi.Mem.am = am;
    291    return op;
    292 }
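         /* Example: the three operand forms and their printed renderings
            (see ppAMD64RMI below):
               AMD64RMI_Imm(0x2A)                                  $0x2a
               AMD64RMI_Reg(hregAMD64_RDI())                       %rdi
               AMD64RMI_Mem(AMD64AMode_IR(8, hregAMD64_RSP()))     0x8(%rsp)
         */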
    293 
    294 static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
    295    switch (op->tag) {
    296       case Armi_Imm:
    297          vex_printf("$0x%x", op->Armi.Imm.imm32);
    298          return;
    299       case Armi_Reg:
    300          if (lo32)
    301             ppHRegAMD64_lo32(op->Armi.Reg.reg);
    302          else
    303             ppHRegAMD64(op->Armi.Reg.reg);
    304          return;
    305       case Armi_Mem:
    306          ppAMD64AMode(op->Armi.Mem.am);
    307          return;
     308       default:
    309          vpanic("ppAMD64RMI");
    310    }
    311 }
    312 void ppAMD64RMI ( AMD64RMI* op ) {
    313    ppAMD64RMI_wrk(op, False/*!lo32*/);
    314 }
    315 void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
    316    ppAMD64RMI_wrk(op, True/*lo32*/);
    317 }
    318 
    319 /* An AMD64RMI can only be used in a "read" context (what would it mean
    320    to write or modify a literal?) and so we enumerate its registers
    321    accordingly. */
    322 static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
    323    switch (op->tag) {
    324       case Armi_Imm:
    325          return;
    326       case Armi_Reg:
    327          addHRegUse(u, HRmRead, op->Armi.Reg.reg);
    328          return;
    329       case Armi_Mem:
    330          addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
    331          return;
    332       default:
    333          vpanic("addRegUsage_AMD64RMI");
    334    }
    335 }
    336 
    337 static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
    338    switch (op->tag) {
    339       case Armi_Imm:
    340          return;
    341       case Armi_Reg:
    342          op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
    343          return;
    344       case Armi_Mem:
    345          mapRegs_AMD64AMode(m, op->Armi.Mem.am);
    346          return;
    347       default:
    348          vpanic("mapRegs_AMD64RMI");
    349    }
    350 }
    351 
    352 
    353 /* --------- Operand, which can be reg or immediate only. --------- */
    354 
    355 AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
    356    AMD64RI* op       = LibVEX_Alloc(sizeof(AMD64RI));
    357    op->tag           = Ari_Imm;
    358    op->Ari.Imm.imm32 = imm32;
    359    return op;
    360 }
    361 AMD64RI* AMD64RI_Reg ( HReg reg ) {
    362    AMD64RI* op     = LibVEX_Alloc(sizeof(AMD64RI));
    363    op->tag         = Ari_Reg;
    364    op->Ari.Reg.reg = reg;
    365    return op;
    366 }
    367 
    368 void ppAMD64RI ( AMD64RI* op ) {
    369    switch (op->tag) {
    370       case Ari_Imm:
    371          vex_printf("$0x%x", op->Ari.Imm.imm32);
    372          return;
    373       case Ari_Reg:
    374          ppHRegAMD64(op->Ari.Reg.reg);
    375          return;
     376       default:
    377          vpanic("ppAMD64RI");
    378    }
    379 }
    380 
    381 /* An AMD64RI can only be used in a "read" context (what would it mean
    382    to write or modify a literal?) and so we enumerate its registers
    383    accordingly. */
    384 static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
    385    switch (op->tag) {
    386       case Ari_Imm:
    387          return;
    388       case Ari_Reg:
    389          addHRegUse(u, HRmRead, op->Ari.Reg.reg);
    390          return;
    391       default:
    392          vpanic("addRegUsage_AMD64RI");
    393    }
    394 }
    395 
    396 static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
    397    switch (op->tag) {
    398       case Ari_Imm:
    399          return;
    400       case Ari_Reg:
    401          op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
    402          return;
    403       default:
    404          vpanic("mapRegs_AMD64RI");
    405    }
    406 }
    407 
    408 
    409 /* --------- Operand, which can be reg or memory only. --------- */
    410 
    411 AMD64RM* AMD64RM_Reg ( HReg reg ) {
     412    AMD64RM* op     = LibVEX_Alloc(sizeof(AMD64RM));
    413    op->tag         = Arm_Reg;
    414    op->Arm.Reg.reg = reg;
    415    return op;
    416 }
    417 AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
    418    AMD64RM* op    = LibVEX_Alloc(sizeof(AMD64RM));
    419    op->tag        = Arm_Mem;
    420    op->Arm.Mem.am = am;
    421    return op;
    422 }
    423 
    424 void ppAMD64RM ( AMD64RM* op ) {
    425    switch (op->tag) {
    426       case Arm_Mem:
    427          ppAMD64AMode(op->Arm.Mem.am);
    428          return;
    429       case Arm_Reg:
    430          ppHRegAMD64(op->Arm.Reg.reg);
    431          return;
     432       default:
    433          vpanic("ppAMD64RM");
    434    }
    435 }
    436 
     437 /* Because an AMD64RM can be either a source or a destination operand, we
    438    have to supply a mode -- pertaining to the operand as a whole --
    439    indicating how it's being used. */
    440 static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
    441    switch (op->tag) {
    442       case Arm_Mem:
    443          /* Memory is read, written or modified.  So we just want to
    444             know the regs read by the amode. */
    445          addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
    446          return;
    447       case Arm_Reg:
    448          /* reg is read, written or modified.  Add it in the
    449             appropriate way. */
    450          addHRegUse(u, mode, op->Arm.Reg.reg);
    451          return;
     452       default:
    453          vpanic("addRegUsage_AMD64RM");
    454    }
    455 }
    456 
    457 static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
    458 {
    459    switch (op->tag) {
    460       case Arm_Mem:
    461          mapRegs_AMD64AMode(m, op->Arm.Mem.am);
    462          return;
    463       case Arm_Reg:
    464          op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
    465          return;
     466       default:
    467          vpanic("mapRegs_AMD64RM");
    468    }
    469 }
    470 
    471 
    472 /* --------- Instructions. --------- */
    473 
    474 static const HChar* showAMD64ScalarSz ( Int sz ) {
    475    switch (sz) {
    476       case 2: return "w";
    477       case 4: return "l";
    478       case 8: return "q";
    479       default: vpanic("showAMD64ScalarSz");
    480    }
    481 }
    482 
    483 const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
    484    switch (op) {
    485       case Aun_NOT: return "not";
    486       case Aun_NEG: return "neg";
    487       default: vpanic("showAMD64UnaryOp");
    488    }
    489 }
    490 
    491 const HChar* showAMD64AluOp ( AMD64AluOp op ) {
    492    switch (op) {
    493       case Aalu_MOV:  return "mov";
    494       case Aalu_CMP:  return "cmp";
    495       case Aalu_ADD:  return "add";
    496       case Aalu_SUB:  return "sub";
    497       case Aalu_ADC:  return "adc";
    498       case Aalu_SBB:  return "sbb";
    499       case Aalu_AND:  return "and";
    500       case Aalu_OR:   return "or";
    501       case Aalu_XOR:  return "xor";
    502       case Aalu_MUL:  return "imul";
    503       default: vpanic("showAMD64AluOp");
    504    }
    505 }
    506 
    507 const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
    508    switch (op) {
    509       case Ash_SHL: return "shl";
    510       case Ash_SHR: return "shr";
    511       case Ash_SAR: return "sar";
    512       default: vpanic("showAMD64ShiftOp");
    513    }
    514 }
    515 
    516 const HChar* showA87FpOp ( A87FpOp op ) {
    517    switch (op) {
    518       case Afp_SCALE:  return "scale";
    519       case Afp_ATAN:   return "atan";
    520       case Afp_YL2X:   return "yl2x";
    521       case Afp_YL2XP1: return "yl2xp1";
    522       case Afp_PREM:   return "prem";
    523       case Afp_PREM1:  return "prem1";
    524       case Afp_SQRT:   return "sqrt";
    525       case Afp_SIN:    return "sin";
    526       case Afp_COS:    return "cos";
    527       case Afp_TAN:    return "tan";
    528       case Afp_ROUND:  return "round";
    529       case Afp_2XM1:   return "2xm1";
    530       default: vpanic("showA87FpOp");
    531    }
    532 }
    533 
    534 const HChar* showAMD64SseOp ( AMD64SseOp op ) {
    535    switch (op) {
    536       case Asse_MOV:      return "movups";
    537       case Asse_ADDF:     return "add";
    538       case Asse_SUBF:     return "sub";
    539       case Asse_MULF:     return "mul";
    540       case Asse_DIVF:     return "div";
    541       case Asse_MAXF:     return "max";
    542       case Asse_MINF:     return "min";
    543       case Asse_CMPEQF:   return "cmpFeq";
    544       case Asse_CMPLTF:   return "cmpFlt";
    545       case Asse_CMPLEF:   return "cmpFle";
    546       case Asse_CMPUNF:   return "cmpFun";
    547       case Asse_RCPF:     return "rcp";
    548       case Asse_RSQRTF:   return "rsqrt";
    549       case Asse_SQRTF:    return "sqrt";
    550       case Asse_AND:      return "and";
    551       case Asse_OR:       return "or";
    552       case Asse_XOR:      return "xor";
    553       case Asse_ANDN:     return "andn";
    554       case Asse_ADD8:     return "paddb";
    555       case Asse_ADD16:    return "paddw";
    556       case Asse_ADD32:    return "paddd";
    557       case Asse_ADD64:    return "paddq";
    558       case Asse_QADD8U:   return "paddusb";
    559       case Asse_QADD16U:  return "paddusw";
    560       case Asse_QADD8S:   return "paddsb";
    561       case Asse_QADD16S:  return "paddsw";
    562       case Asse_SUB8:     return "psubb";
    563       case Asse_SUB16:    return "psubw";
    564       case Asse_SUB32:    return "psubd";
    565       case Asse_SUB64:    return "psubq";
    566       case Asse_QSUB8U:   return "psubusb";
    567       case Asse_QSUB16U:  return "psubusw";
    568       case Asse_QSUB8S:   return "psubsb";
    569       case Asse_QSUB16S:  return "psubsw";
    570       case Asse_MUL16:    return "pmullw";
    571       case Asse_MULHI16U: return "pmulhuw";
    572       case Asse_MULHI16S: return "pmulhw";
    573       case Asse_AVG8U:    return "pavgb";
    574       case Asse_AVG16U:   return "pavgw";
    575       case Asse_MAX16S:   return "pmaxw";
    576       case Asse_MAX8U:    return "pmaxub";
    577       case Asse_MIN16S:   return "pminw";
    578       case Asse_MIN8U:    return "pminub";
    579       case Asse_CMPEQ8:   return "pcmpeqb";
    580       case Asse_CMPEQ16:  return "pcmpeqw";
    581       case Asse_CMPEQ32:  return "pcmpeqd";
    582       case Asse_CMPGT8S:  return "pcmpgtb";
    583       case Asse_CMPGT16S: return "pcmpgtw";
    584       case Asse_CMPGT32S: return "pcmpgtd";
    585       case Asse_SHL16:    return "psllw";
    586       case Asse_SHL32:    return "pslld";
    587       case Asse_SHL64:    return "psllq";
    588       case Asse_SHR16:    return "psrlw";
    589       case Asse_SHR32:    return "psrld";
    590       case Asse_SHR64:    return "psrlq";
    591       case Asse_SAR16:    return "psraw";
    592       case Asse_SAR32:    return "psrad";
    593       case Asse_PACKSSD:  return "packssdw";
    594       case Asse_PACKSSW:  return "packsswb";
    595       case Asse_PACKUSW:  return "packuswb";
    596       case Asse_UNPCKHB:  return "punpckhb";
    597       case Asse_UNPCKHW:  return "punpckhw";
    598       case Asse_UNPCKHD:  return "punpckhd";
    599       case Asse_UNPCKHQ:  return "punpckhq";
    600       case Asse_UNPCKLB:  return "punpcklb";
    601       case Asse_UNPCKLW:  return "punpcklw";
    602       case Asse_UNPCKLD:  return "punpckld";
    603       case Asse_UNPCKLQ:  return "punpcklq";
    604       default: vpanic("showAMD64SseOp");
    605    }
    606 }
    607 
    608 AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
    609    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    610    i->tag             = Ain_Imm64;
    611    i->Ain.Imm64.imm64 = imm64;
    612    i->Ain.Imm64.dst   = dst;
    613    return i;
    614 }
    615 AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
    616    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    617    i->tag            = Ain_Alu64R;
    618    i->Ain.Alu64R.op  = op;
    619    i->Ain.Alu64R.src = src;
    620    i->Ain.Alu64R.dst = dst;
    621    return i;
    622 }
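         /* Example: AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(0x10),
            hregAMD64_RDI()) denotes "addq $0x10,%rdi"; see ppAMD64Instr
            below for the printed forms. */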
    623 AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
    624    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    625    i->tag            = Ain_Alu64M;
    626    i->Ain.Alu64M.op  = op;
    627    i->Ain.Alu64M.src = src;
    628    i->Ain.Alu64M.dst = dst;
    629    vassert(op != Aalu_MUL);
    630    return i;
    631 }
    632 AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
    633    AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
    634    i->tag          = Ain_Sh64;
    635    i->Ain.Sh64.op  = op;
    636    i->Ain.Sh64.src = src;
    637    i->Ain.Sh64.dst = dst;
    638    return i;
    639 }
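         /* A 'src' of 0 means "shift by %cl" rather than by an immediate;
            see the Ain_Sh64 cases of ppAMD64Instr and getRegUsage_AMD64Instr
            below. */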
    640 AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
    641    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    642    i->tag              = Ain_Test64;
    643    i->Ain.Test64.imm32 = imm32;
    644    i->Ain.Test64.dst   = dst;
    645    return i;
    646 }
    647 AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
    648    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    649    i->tag             = Ain_Unary64;
    650    i->Ain.Unary64.op  = op;
    651    i->Ain.Unary64.dst = dst;
    652    return i;
    653 }
    654 AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
    655    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    656    i->tag             = Ain_Lea64;
    657    i->Ain.Lea64.am    = am;
    658    i->Ain.Lea64.dst   = dst;
    659    return i;
    660 }
    661 AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
    662    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    663    i->tag            = Ain_Alu32R;
    664    i->Ain.Alu32R.op  = op;
    665    i->Ain.Alu32R.src = src;
    666    i->Ain.Alu32R.dst = dst;
    667    switch (op) {
    668       case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
    669       case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
    670       default: vassert(0);
    671    }
    672    return i;
    673 }
    674 AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
    675    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    676    i->tag            = Ain_MulL;
    677    i->Ain.MulL.syned = syned;
    678    i->Ain.MulL.src   = src;
    679    return i;
    680 }
    681 AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
    682    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    683    i->tag            = Ain_Div;
    684    i->Ain.Div.syned  = syned;
    685    i->Ain.Div.sz     = sz;
    686    i->Ain.Div.src    = src;
    687    vassert(sz == 4 || sz == 8);
    688    return i;
    689 }
    690 AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
    691    AMD64Instr* i   = LibVEX_Alloc(sizeof(AMD64Instr));
    692    i->tag          = Ain_Push;
    693    i->Ain.Push.src = src;
    694    return i;
    695 }
    696 AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
    697                               RetLoc rloc ) {
    698    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
    699    i->tag               = Ain_Call;
    700    i->Ain.Call.cond     = cond;
    701    i->Ain.Call.target   = target;
    702    i->Ain.Call.regparms = regparms;
    703    i->Ain.Call.rloc     = rloc;
    704    vassert(regparms >= 0 && regparms <= 6);
    705    vassert(is_sane_RetLoc(rloc));
    706    return i;
    707 }
    708 
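         /* XDirect, XIndir and XAssisted are the three ways of leaving the
            translation.  Judging by their printed forms in ppAMD64Instr
            below: XDirect transfers to a known guest address through the
            patchable chain-me stubs (disp_cp_chain_me_to_{fast,slow}EP),
            XIndir transfers to a guest address held in a register via
            disp_indir, and XAssisted additionally passes a TRC value derived
            from its IRJumpKind to the dispatcher via disp_assisted.  All
            three are guarded by a condition code. */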
    709 AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
    710                                  AMD64CondCode cond, Bool toFastEP ) {
    711    AMD64Instr* i           = LibVEX_Alloc(sizeof(AMD64Instr));
    712    i->tag                  = Ain_XDirect;
    713    i->Ain.XDirect.dstGA    = dstGA;
    714    i->Ain.XDirect.amRIP    = amRIP;
    715    i->Ain.XDirect.cond     = cond;
    716    i->Ain.XDirect.toFastEP = toFastEP;
    717    return i;
    718 }
    719 AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
    720                                 AMD64CondCode cond ) {
    721    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    722    i->tag              = Ain_XIndir;
    723    i->Ain.XIndir.dstGA = dstGA;
    724    i->Ain.XIndir.amRIP = amRIP;
    725    i->Ain.XIndir.cond  = cond;
    726    return i;
    727 }
    728 AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
    729                                    AMD64CondCode cond, IRJumpKind jk ) {
    730    AMD64Instr* i          = LibVEX_Alloc(sizeof(AMD64Instr));
    731    i->tag                 = Ain_XAssisted;
    732    i->Ain.XAssisted.dstGA = dstGA;
    733    i->Ain.XAssisted.amRIP = amRIP;
    734    i->Ain.XAssisted.cond  = cond;
    735    i->Ain.XAssisted.jk    = jk;
    736    return i;
    737 }
    738 
    739 AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, AMD64RM* src, HReg dst ) {
    740    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    741    i->tag             = Ain_CMov64;
    742    i->Ain.CMov64.cond = cond;
    743    i->Ain.CMov64.src  = src;
    744    i->Ain.CMov64.dst  = dst;
    745    vassert(cond != Acc_ALWAYS);
    746    return i;
    747 }
    748 AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
    749    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    750    i->tag              = Ain_MovxLQ;
    751    i->Ain.MovxLQ.syned = syned;
    752    i->Ain.MovxLQ.src   = src;
    753    i->Ain.MovxLQ.dst   = dst;
    754    return i;
    755 }
    756 AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
    757                                 AMD64AMode* src, HReg dst ) {
    758    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    759    i->tag                = Ain_LoadEX;
    760    i->Ain.LoadEX.szSmall = szSmall;
    761    i->Ain.LoadEX.syned   = syned;
    762    i->Ain.LoadEX.src     = src;
    763    i->Ain.LoadEX.dst     = dst;
    764    vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
    765    return i;
    766 }
    767 AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
    768    AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
    769    i->tag           = Ain_Store;
    770    i->Ain.Store.sz  = sz;
    771    i->Ain.Store.src = src;
    772    i->Ain.Store.dst = dst;
    773    vassert(sz == 1 || sz == 2 || sz == 4);
    774    return i;
    775 }
    776 AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
    777    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    778    i->tag            = Ain_Set64;
    779    i->Ain.Set64.cond = cond;
    780    i->Ain.Set64.dst  = dst;
    781    return i;
    782 }
    783 AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
    784    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
    785    i->tag               = Ain_Bsfr64;
    786    i->Ain.Bsfr64.isFwds = isFwds;
    787    i->Ain.Bsfr64.src    = src;
    788    i->Ain.Bsfr64.dst    = dst;
    789    return i;
    790 }
    791 AMD64Instr* AMD64Instr_MFence ( void ) {
    792    AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
    793    i->tag        = Ain_MFence;
    794    return i;
    795 }
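         /* ACAS is an atomic compare-and-swap on a memory location of width
            sz; DACAS is the double-width variant.  As the printed forms in
            ppAMD64Instr below indicate, they follow the cmpxchg convention:
            the expected value is supplied in %rax (%rdx:%rax for DACAS) and
            the replacement value in %rbx (%rcx:%rbx for DACAS). */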
    796 AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
    797    AMD64Instr* i    = LibVEX_Alloc(sizeof(AMD64Instr));
    798    i->tag           = Ain_ACAS;
    799    i->Ain.ACAS.addr = addr;
    800    i->Ain.ACAS.sz   = sz;
    801    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
    802    return i;
    803 }
    804 AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
    805    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    806    i->tag            = Ain_DACAS;
    807    i->Ain.DACAS.addr = addr;
    808    i->Ain.DACAS.sz   = sz;
    809    vassert(sz == 8 || sz == 4);
    810    return i;
    811 }
    812 
    813 AMD64Instr* AMD64Instr_A87Free ( Int nregs )
    814 {
    815    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
    816    i->tag               = Ain_A87Free;
    817    i->Ain.A87Free.nregs = nregs;
    818    vassert(nregs >= 1 && nregs <= 7);
    819    return i;
    820 }
    821 AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
    822 {
    823    AMD64Instr* i            = LibVEX_Alloc(sizeof(AMD64Instr));
    824    i->tag                   = Ain_A87PushPop;
    825    i->Ain.A87PushPop.addr   = addr;
    826    i->Ain.A87PushPop.isPush = isPush;
    827    i->Ain.A87PushPop.szB    = szB;
    828    vassert(szB == 8 || szB == 4);
    829    return i;
    830 }
    831 AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
    832 {
    833    AMD64Instr* i     = LibVEX_Alloc(sizeof(AMD64Instr));
    834    i->tag            = Ain_A87FpOp;
    835    i->Ain.A87FpOp.op = op;
    836    return i;
    837 }
    838 AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
    839 {
    840    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    841    i->tag              = Ain_A87LdCW;
    842    i->Ain.A87LdCW.addr = addr;
    843    return i;
    844 }
    845 AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
    846 {
    847    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    848    i->tag              = Ain_A87StSW;
    849    i->Ain.A87StSW.addr = addr;
    850    return i;
    851 }
    852 AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
    853    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    854    i->tag                = Ain_LdMXCSR;
    855    i->Ain.LdMXCSR.addr   = addr;
    856    return i;
    857 }
    858 AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
    859    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    860    i->tag                = Ain_SseUComIS;
    861    i->Ain.SseUComIS.sz   = toUChar(sz);
    862    i->Ain.SseUComIS.srcL = srcL;
    863    i->Ain.SseUComIS.srcR = srcR;
    864    i->Ain.SseUComIS.dst  = dst;
    865    vassert(sz == 4 || sz == 8);
    866    return i;
    867 }
    868 AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
    869    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    870    i->tag              = Ain_SseSI2SF;
    871    i->Ain.SseSI2SF.szS = toUChar(szS);
    872    i->Ain.SseSI2SF.szD = toUChar(szD);
    873    i->Ain.SseSI2SF.src = src;
    874    i->Ain.SseSI2SF.dst = dst;
    875    vassert(szS == 4 || szS == 8);
    876    vassert(szD == 4 || szD == 8);
    877    return i;
    878 }
    879 AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
    880    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    881    i->tag              = Ain_SseSF2SI;
    882    i->Ain.SseSF2SI.szS = toUChar(szS);
    883    i->Ain.SseSF2SI.szD = toUChar(szD);
    884    i->Ain.SseSF2SI.src = src;
    885    i->Ain.SseSF2SI.dst = dst;
    886    vassert(szS == 4 || szS == 8);
    887    vassert(szD == 4 || szD == 8);
    888    return i;
    889 }
    890 AMD64Instr* AMD64Instr_SseSDSS   ( Bool from64, HReg src, HReg dst )
    891 {
    892    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    893    i->tag                = Ain_SseSDSS;
    894    i->Ain.SseSDSS.from64 = from64;
    895    i->Ain.SseSDSS.src    = src;
    896    i->Ain.SseSDSS.dst    = dst;
    897    return i;
    898 }
    899 AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
    900                                  HReg reg, AMD64AMode* addr ) {
    901    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    902    i->tag                = Ain_SseLdSt;
    903    i->Ain.SseLdSt.isLoad = isLoad;
    904    i->Ain.SseLdSt.sz     = toUChar(sz);
    905    i->Ain.SseLdSt.reg    = reg;
    906    i->Ain.SseLdSt.addr   = addr;
    907    vassert(sz == 4 || sz == 8 || sz == 16);
    908    return i;
    909 }
    910 AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
    911 {
    912    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    913    i->tag                = Ain_SseLdzLO;
    914    i->Ain.SseLdzLO.sz    = sz;
    915    i->Ain.SseLdzLO.reg   = reg;
    916    i->Ain.SseLdzLO.addr  = addr;
    917    vassert(sz == 4 || sz == 8);
    918    return i;
    919 }
    920 AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
    921    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    922    i->tag              = Ain_Sse32Fx4;
    923    i->Ain.Sse32Fx4.op  = op;
    924    i->Ain.Sse32Fx4.src = src;
    925    i->Ain.Sse32Fx4.dst = dst;
    926    vassert(op != Asse_MOV);
    927    return i;
    928 }
    929 AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
    930    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    931    i->tag              = Ain_Sse32FLo;
    932    i->Ain.Sse32FLo.op  = op;
    933    i->Ain.Sse32FLo.src = src;
    934    i->Ain.Sse32FLo.dst = dst;
    935    vassert(op != Asse_MOV);
    936    return i;
    937 }
    938 AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
    939    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    940    i->tag              = Ain_Sse64Fx2;
    941    i->Ain.Sse64Fx2.op  = op;
    942    i->Ain.Sse64Fx2.src = src;
    943    i->Ain.Sse64Fx2.dst = dst;
    944    vassert(op != Asse_MOV);
    945    return i;
    946 }
    947 AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
    948    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    949    i->tag              = Ain_Sse64FLo;
    950    i->Ain.Sse64FLo.op  = op;
    951    i->Ain.Sse64FLo.src = src;
    952    i->Ain.Sse64FLo.dst = dst;
    953    vassert(op != Asse_MOV);
    954    return i;
    955 }
    956 AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
    957    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    958    i->tag             = Ain_SseReRg;
    959    i->Ain.SseReRg.op  = op;
    960    i->Ain.SseReRg.src = re;
    961    i->Ain.SseReRg.dst = rg;
    962    return i;
    963 }
    964 AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
    965    AMD64Instr* i       = LibVEX_Alloc(sizeof(AMD64Instr));
    966    i->tag              = Ain_SseCMov;
    967    i->Ain.SseCMov.cond = cond;
    968    i->Ain.SseCMov.src  = src;
    969    i->Ain.SseCMov.dst  = dst;
    970    vassert(cond != Acc_ALWAYS);
    971    return i;
    972 }
    973 AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
    974    AMD64Instr* i        = LibVEX_Alloc(sizeof(AMD64Instr));
    975    i->tag               = Ain_SseShuf;
    976    i->Ain.SseShuf.order = order;
    977    i->Ain.SseShuf.src   = src;
    978    i->Ain.SseShuf.dst   = dst;
    979    vassert(order >= 0 && order <= 0xFF);
    980    return i;
    981 }
    982 //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
    983 //uu                                  HReg reg, AMD64AMode* addr ) {
    984 //uu    AMD64Instr* i         = LibVEX_Alloc(sizeof(AMD64Instr));
    985 //uu    i->tag                = Ain_AvxLdSt;
    986 //uu    i->Ain.AvxLdSt.isLoad = isLoad;
    987 //uu    i->Ain.AvxLdSt.reg    = reg;
    988 //uu    i->Ain.AvxLdSt.addr   = addr;
    989 //uu    return i;
    990 //uu }
    991 //uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
    992 //uu    AMD64Instr* i      = LibVEX_Alloc(sizeof(AMD64Instr));
    993 //uu    i->tag             = Ain_AvxReRg;
    994 //uu    i->Ain.AvxReRg.op  = op;
    995 //uu    i->Ain.AvxReRg.src = re;
    996 //uu    i->Ain.AvxReRg.dst = rg;
    997 //uu    return i;
    998 //uu }
    999 AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
   1000                                  AMD64AMode* amFailAddr ) {
   1001    AMD64Instr* i             = LibVEX_Alloc(sizeof(AMD64Instr));
   1002    i->tag                    = Ain_EvCheck;
   1003    i->Ain.EvCheck.amCounter  = amCounter;
   1004    i->Ain.EvCheck.amFailAddr = amFailAddr;
   1005    return i;
   1006 }
   1007 AMD64Instr* AMD64Instr_ProfInc ( void ) {
   1008    AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
   1009    i->tag        = Ain_ProfInc;
   1010    return i;
   1011 }
   1012 
   1013 void ppAMD64Instr ( AMD64Instr* i, Bool mode64 )
   1014 {
   1015    vassert(mode64 == True);
   1016    switch (i->tag) {
   1017       case Ain_Imm64:
   1018          vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
   1019          ppHRegAMD64(i->Ain.Imm64.dst);
   1020          return;
   1021       case Ain_Alu64R:
   1022          vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
   1023          ppAMD64RMI(i->Ain.Alu64R.src);
   1024          vex_printf(",");
   1025          ppHRegAMD64(i->Ain.Alu64R.dst);
   1026          return;
   1027       case Ain_Alu64M:
   1028          vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
   1029          ppAMD64RI(i->Ain.Alu64M.src);
   1030          vex_printf(",");
   1031          ppAMD64AMode(i->Ain.Alu64M.dst);
   1032          return;
   1033       case Ain_Sh64:
   1034          vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
   1035          if (i->Ain.Sh64.src == 0)
   1036             vex_printf("%%cl,");
   1037          else
   1038             vex_printf("$%d,", (Int)i->Ain.Sh64.src);
   1039          ppHRegAMD64(i->Ain.Sh64.dst);
   1040          return;
   1041       case Ain_Test64:
   1042          vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
   1043          ppHRegAMD64(i->Ain.Test64.dst);
   1044          return;
   1045       case Ain_Unary64:
   1046          vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
   1047          ppHRegAMD64(i->Ain.Unary64.dst);
   1048          return;
   1049       case Ain_Lea64:
   1050          vex_printf("leaq ");
   1051          ppAMD64AMode(i->Ain.Lea64.am);
   1052          vex_printf(",");
   1053          ppHRegAMD64(i->Ain.Lea64.dst);
   1054          return;
   1055       case Ain_Alu32R:
   1056          vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
   1057          ppAMD64RMI_lo32(i->Ain.Alu32R.src);
   1058          vex_printf(",");
   1059          ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
   1060          return;
   1061       case Ain_MulL:
   1062          vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
   1063          ppAMD64RM(i->Ain.MulL.src);
   1064          return;
   1065       case Ain_Div:
   1066          vex_printf("%cdiv%s ",
   1067                     i->Ain.Div.syned ? 's' : 'u',
   1068                     showAMD64ScalarSz(i->Ain.Div.sz));
   1069          ppAMD64RM(i->Ain.Div.src);
   1070          return;
   1071       case Ain_Push:
   1072          vex_printf("pushq ");
   1073          ppAMD64RMI(i->Ain.Push.src);
   1074          return;
   1075       case Ain_Call:
   1076          vex_printf("call%s[%d,",
   1077                     i->Ain.Call.cond==Acc_ALWAYS
   1078                        ? "" : showAMD64CondCode(i->Ain.Call.cond),
   1079                     i->Ain.Call.regparms );
   1080          ppRetLoc(i->Ain.Call.rloc);
   1081          vex_printf("] 0x%llx", i->Ain.Call.target);
   1082          break;
   1083 
   1084       case Ain_XDirect:
   1085          vex_printf("(xDirect) ");
   1086          vex_printf("if (%%rflags.%s) { ",
   1087                     showAMD64CondCode(i->Ain.XDirect.cond));
   1088          vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
   1089          vex_printf("movq %%r11,");
   1090          ppAMD64AMode(i->Ain.XDirect.amRIP);
   1091          vex_printf("; ");
   1092          vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
   1093                     i->Ain.XDirect.toFastEP ? "fast" : "slow");
   1094          return;
   1095       case Ain_XIndir:
   1096          vex_printf("(xIndir) ");
   1097          vex_printf("if (%%rflags.%s) { ",
   1098                     showAMD64CondCode(i->Ain.XIndir.cond));
   1099          vex_printf("movq ");
   1100          ppHRegAMD64(i->Ain.XIndir.dstGA);
   1101          vex_printf(",");
   1102          ppAMD64AMode(i->Ain.XIndir.amRIP);
   1103          vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
   1104          return;
   1105       case Ain_XAssisted:
   1106          vex_printf("(xAssisted) ");
   1107          vex_printf("if (%%rflags.%s) { ",
   1108                     showAMD64CondCode(i->Ain.XAssisted.cond));
   1109          vex_printf("movq ");
   1110          ppHRegAMD64(i->Ain.XAssisted.dstGA);
   1111          vex_printf(",");
   1112          ppAMD64AMode(i->Ain.XAssisted.amRIP);
   1113          vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
   1114                     (Int)i->Ain.XAssisted.jk);
   1115          vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
   1116          return;
   1117 
   1118       case Ain_CMov64:
   1119          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
   1120          ppAMD64RM(i->Ain.CMov64.src);
   1121          vex_printf(",");
   1122          ppHRegAMD64(i->Ain.CMov64.dst);
   1123          return;
   1124       case Ain_MovxLQ:
   1125          vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
   1126          ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
   1127          vex_printf(",");
   1128          ppHRegAMD64(i->Ain.MovxLQ.dst);
   1129          return;
   1130       case Ain_LoadEX:
   1131          if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
   1132             vex_printf("movl ");
   1133             ppAMD64AMode(i->Ain.LoadEX.src);
   1134             vex_printf(",");
   1135             ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
   1136          } else {
   1137             vex_printf("mov%c%cq ",
   1138                        i->Ain.LoadEX.syned ? 's' : 'z',
   1139                        i->Ain.LoadEX.szSmall==1
   1140                           ? 'b'
   1141                           : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
   1142             ppAMD64AMode(i->Ain.LoadEX.src);
   1143             vex_printf(",");
   1144             ppHRegAMD64(i->Ain.LoadEX.dst);
   1145          }
   1146          return;
   1147       case Ain_Store:
   1148          vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
   1149                               : (i->Ain.Store.sz==2 ? 'w' : 'l'));
   1150          ppHRegAMD64(i->Ain.Store.src);
   1151          vex_printf(",");
   1152          ppAMD64AMode(i->Ain.Store.dst);
   1153          return;
   1154       case Ain_Set64:
   1155          vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
   1156          ppHRegAMD64(i->Ain.Set64.dst);
   1157          return;
   1158       case Ain_Bsfr64:
   1159          vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
   1160          ppHRegAMD64(i->Ain.Bsfr64.src);
   1161          vex_printf(",");
   1162          ppHRegAMD64(i->Ain.Bsfr64.dst);
   1163          return;
   1164       case Ain_MFence:
   1165          vex_printf("mfence" );
   1166          return;
   1167       case Ain_ACAS:
   1168          vex_printf("lock cmpxchg%c ",
   1169                      i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
   1170                      : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
   1171          vex_printf("{%%rax->%%rbx},");
   1172          ppAMD64AMode(i->Ain.ACAS.addr);
   1173          return;
   1174       case Ain_DACAS:
   1175          vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
   1176                     (Int)(2 * i->Ain.DACAS.sz));
   1177          ppAMD64AMode(i->Ain.DACAS.addr);
   1178          return;
   1179       case Ain_A87Free:
   1180          vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
   1181          break;
   1182       case Ain_A87PushPop:
   1183          vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
   1184                     i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
   1185          ppAMD64AMode(i->Ain.A87PushPop.addr);
   1186          break;
   1187       case Ain_A87FpOp:
   1188          vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
   1189          break;
   1190       case Ain_A87LdCW:
   1191          vex_printf("fldcw ");
   1192          ppAMD64AMode(i->Ain.A87LdCW.addr);
   1193          break;
   1194       case Ain_A87StSW:
   1195          vex_printf("fstsw ");
   1196          ppAMD64AMode(i->Ain.A87StSW.addr);
   1197          break;
   1198       case Ain_LdMXCSR:
   1199          vex_printf("ldmxcsr ");
   1200          ppAMD64AMode(i->Ain.LdMXCSR.addr);
   1201          break;
   1202       case Ain_SseUComIS:
   1203          vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
   1204          ppHRegAMD64(i->Ain.SseUComIS.srcL);
   1205          vex_printf(",");
   1206          ppHRegAMD64(i->Ain.SseUComIS.srcR);
   1207          vex_printf(" ; pushfq ; popq ");
   1208          ppHRegAMD64(i->Ain.SseUComIS.dst);
   1209          break;
   1210       case Ain_SseSI2SF:
   1211          vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
   1212          (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1213             (i->Ain.SseSI2SF.src);
   1214          vex_printf(",");
   1215          ppHRegAMD64(i->Ain.SseSI2SF.dst);
   1216          break;
   1217       case Ain_SseSF2SI:
   1218          vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
   1219          ppHRegAMD64(i->Ain.SseSF2SI.src);
   1220          vex_printf(",");
   1221          (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1222             (i->Ain.SseSF2SI.dst);
   1223          break;
   1224       case Ain_SseSDSS:
   1225          vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
   1226          ppHRegAMD64(i->Ain.SseSDSS.src);
   1227          vex_printf(",");
   1228          ppHRegAMD64(i->Ain.SseSDSS.dst);
   1229          break;
   1230       case Ain_SseLdSt:
   1231          switch (i->Ain.SseLdSt.sz) {
   1232             case 4:  vex_printf("movss "); break;
   1233             case 8:  vex_printf("movsd "); break;
   1234             case 16: vex_printf("movups "); break;
   1235             default: vassert(0);
   1236          }
   1237          if (i->Ain.SseLdSt.isLoad) {
   1238             ppAMD64AMode(i->Ain.SseLdSt.addr);
   1239             vex_printf(",");
   1240             ppHRegAMD64(i->Ain.SseLdSt.reg);
   1241          } else {
   1242             ppHRegAMD64(i->Ain.SseLdSt.reg);
   1243             vex_printf(",");
   1244             ppAMD64AMode(i->Ain.SseLdSt.addr);
   1245          }
   1246          return;
   1247       case Ain_SseLdzLO:
   1248          vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
   1249          ppAMD64AMode(i->Ain.SseLdzLO.addr);
   1250          vex_printf(",");
   1251          ppHRegAMD64(i->Ain.SseLdzLO.reg);
   1252          return;
   1253       case Ain_Sse32Fx4:
   1254          vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
   1255          ppHRegAMD64(i->Ain.Sse32Fx4.src);
   1256          vex_printf(",");
   1257          ppHRegAMD64(i->Ain.Sse32Fx4.dst);
   1258          return;
   1259       case Ain_Sse32FLo:
   1260          vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
   1261          ppHRegAMD64(i->Ain.Sse32FLo.src);
   1262          vex_printf(",");
   1263          ppHRegAMD64(i->Ain.Sse32FLo.dst);
   1264          return;
   1265       case Ain_Sse64Fx2:
   1266          vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
   1267          ppHRegAMD64(i->Ain.Sse64Fx2.src);
   1268          vex_printf(",");
   1269          ppHRegAMD64(i->Ain.Sse64Fx2.dst);
   1270          return;
   1271       case Ain_Sse64FLo:
   1272          vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
   1273          ppHRegAMD64(i->Ain.Sse64FLo.src);
   1274          vex_printf(",");
   1275          ppHRegAMD64(i->Ain.Sse64FLo.dst);
   1276          return;
   1277       case Ain_SseReRg:
   1278          vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
   1279          ppHRegAMD64(i->Ain.SseReRg.src);
   1280          vex_printf(",");
   1281          ppHRegAMD64(i->Ain.SseReRg.dst);
   1282          return;
   1283       case Ain_SseCMov:
   1284          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
   1285          ppHRegAMD64(i->Ain.SseCMov.src);
   1286          vex_printf(",");
   1287          ppHRegAMD64(i->Ain.SseCMov.dst);
   1288          return;
   1289       case Ain_SseShuf:
   1290          vex_printf("pshufd $0x%x,", i->Ain.SseShuf.order);
   1291          ppHRegAMD64(i->Ain.SseShuf.src);
   1292          vex_printf(",");
   1293          ppHRegAMD64(i->Ain.SseShuf.dst);
   1294          return;
   1295       //uu case Ain_AvxLdSt:
   1296       //uu    vex_printf("vmovups ");
   1297       //uu    if (i->Ain.AvxLdSt.isLoad) {
   1298       //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
   1299       //uu       vex_printf(",");
   1300       //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
   1301       //uu    } else {
   1302       //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
   1303       //uu       vex_printf(",");
   1304       //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
   1305       //uu    }
   1306       //uu    return;
   1307       //uu case Ain_AvxReRg:
   1308       //uu    vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
   1309       //uu    ppHRegAMD64(i->Ain.AvxReRg.src);
   1310       //uu    vex_printf(",");
   1311       //uu    ppHRegAMD64(i->Ain.AvxReRg.dst);
   1312       //uu    return;
   1313       case Ain_EvCheck:
   1314          vex_printf("(evCheck) decl ");
   1315          ppAMD64AMode(i->Ain.EvCheck.amCounter);
   1316          vex_printf("; jns nofail; jmp *");
   1317          ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
   1318          vex_printf("; nofail:");
   1319          return;
   1320       case Ain_ProfInc:
   1321          vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
   1322          return;
   1323       default:
   1324          vpanic("ppAMD64Instr");
   1325    }
   1326 }
   1327 
   1328 /* --------- Helpers for register allocation. --------- */
   1329 
   1330 void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 )
   1331 {
   1332    Bool unary;
   1333    vassert(mode64 == True);
   1334    initHRegUsage(u);
   1335    switch (i->tag) {
   1336       case Ain_Imm64:
   1337          addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
   1338          return;
   1339       case Ain_Alu64R:
   1340          addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
   1341          if (i->Ain.Alu64R.op == Aalu_MOV) {
   1342             addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
   1343             return;
   1344          }
   1345          if (i->Ain.Alu64R.op == Aalu_CMP) {
   1346             addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
   1347             return;
   1348          }
   1349          addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
   1350          return;
   1351       case Ain_Alu64M:
   1352          addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
   1353          addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
   1354          return;
   1355       case Ain_Sh64:
   1356          addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
   1357          if (i->Ain.Sh64.src == 0)
   1358             addHRegUse(u, HRmRead, hregAMD64_RCX());
   1359          return;
   1360       case Ain_Test64:
   1361          addHRegUse(u, HRmRead, i->Ain.Test64.dst);
   1362          return;
   1363       case Ain_Unary64:
   1364          addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
   1365          return;
   1366       case Ain_Lea64:
   1367          addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
   1368          addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
   1369          return;
   1370       case Ain_Alu32R:
   1371          vassert(i->Ain.Alu32R.op != Aalu_MOV);
   1372          addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
   1373          if (i->Ain.Alu32R.op == Aalu_CMP) {
   1374             addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
   1375             return;
   1376          }
   1377          addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
   1378          return;
   1379       case Ain_MulL:
   1380          addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
   1381          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1382          addHRegUse(u, HRmWrite, hregAMD64_RDX());
   1383          return;
   1384       case Ain_Div:
   1385          addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
   1386          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1387          addHRegUse(u, HRmModify, hregAMD64_RDX());
   1388          return;
   1389       case Ain_Push:
   1390          addRegUsage_AMD64RMI(u, i->Ain.Push.src);
   1391          addHRegUse(u, HRmModify, hregAMD64_RSP());
   1392          return;
   1393       case Ain_Call:
   1394          /* This is a bit subtle. */
   1395          /* First off, claim it trashes all the caller-saved regs
   1396             which fall within the register allocator's jurisdiction.
   1397             These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
   1398             and all the xmm registers.
   1399          */
   1400          addHRegUse(u, HRmWrite, hregAMD64_RAX());
   1401          addHRegUse(u, HRmWrite, hregAMD64_RCX());
   1402          addHRegUse(u, HRmWrite, hregAMD64_RDX());
   1403          addHRegUse(u, HRmWrite, hregAMD64_RSI());
   1404          addHRegUse(u, HRmWrite, hregAMD64_RDI());
   1405          addHRegUse(u, HRmWrite, hregAMD64_R8());
   1406          addHRegUse(u, HRmWrite, hregAMD64_R9());
   1407          addHRegUse(u, HRmWrite, hregAMD64_R10());
   1408          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1409          addHRegUse(u, HRmWrite, hregAMD64_XMM0());
   1410          addHRegUse(u, HRmWrite, hregAMD64_XMM1());
   1411          addHRegUse(u, HRmWrite, hregAMD64_XMM3());
   1412          addHRegUse(u, HRmWrite, hregAMD64_XMM4());
   1413          addHRegUse(u, HRmWrite, hregAMD64_XMM5());
   1414          addHRegUse(u, HRmWrite, hregAMD64_XMM6());
   1415          addHRegUse(u, HRmWrite, hregAMD64_XMM7());
   1416          addHRegUse(u, HRmWrite, hregAMD64_XMM8());
   1417          addHRegUse(u, HRmWrite, hregAMD64_XMM9());
   1418          addHRegUse(u, HRmWrite, hregAMD64_XMM10());
   1419          addHRegUse(u, HRmWrite, hregAMD64_XMM11());
   1420          addHRegUse(u, HRmWrite, hregAMD64_XMM12());
   1421 
   1422          /* Now we have to state any parameter-carrying registers
   1423             which might be read.  This depends on the regparmness. */
   1424          switch (i->Ain.Call.regparms) {
   1425             case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
   1426             case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
   1427             case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
   1428             case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
   1429             case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
   1430             case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
   1431             case 0: break;
   1432             default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
   1433          }
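                  /* Worked example (informal): with regparms == 2 the switch
                     above falls through from case 2 into case 1, so %rsi and
                     %rdi are additionally marked as Read. */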
   1434          /* Finally, there is the issue that the insn trashes a
   1435             register because the literal target address has to be
   1436             loaded into a register.  Fortunately, r11 is stated in the
   1437             ABI as a scratch register, and so seems a suitable victim.  */
   1438          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1439          /* Upshot of this is that the assembler really must use r11,
   1440             and no other, as a destination temporary. */
   1441          return;
   1442       /* XDirect/XIndir/XAssisted are also a bit subtle.  They
   1443          conditionally exit the block.  Hence we only need to list (1)
   1444          the registers that they read, and (2) the registers that they
   1445          write in the case where the block is not exited.  (2) is
   1446          empty, hence only (1) is relevant here. */
   1447       case Ain_XDirect:
   1448          /* Don't bother to mention the write to %r11, since it is not
   1449             available to the allocator. */
   1450          addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
   1451          return;
   1452       case Ain_XIndir:
   1453          /* Ditto re %r11 */
   1454          addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
   1455          addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
   1456          return;
   1457       case Ain_XAssisted:
   1458          /* Ditto re %r11 and %rbp (the baseblock ptr) */
   1459          addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
   1460          addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
   1461          return;
   1462       case Ain_CMov64:
   1463          addRegUsage_AMD64RM(u, i->Ain.CMov64.src, HRmRead);
   1464          addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
   1465          return;
   1466       case Ain_MovxLQ:
   1467          addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
   1468          addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
   1469          return;
   1470       case Ain_LoadEX:
   1471          addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
   1472          addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
   1473          return;
   1474       case Ain_Store:
   1475          addHRegUse(u, HRmRead, i->Ain.Store.src);
   1476          addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
   1477          return;
   1478       case Ain_Set64:
   1479          addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
   1480          return;
   1481       case Ain_Bsfr64:
   1482          addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
   1483          addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
   1484          return;
   1485       case Ain_MFence:
   1486          return;
   1487       case Ain_ACAS:
   1488          addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
   1489          addHRegUse(u, HRmRead, hregAMD64_RBX());
   1490          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1491          return;
   1492       case Ain_DACAS:
   1493          addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
   1494          addHRegUse(u, HRmRead, hregAMD64_RCX());
   1495          addHRegUse(u, HRmRead, hregAMD64_RBX());
   1496          addHRegUse(u, HRmModify, hregAMD64_RDX());
   1497          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1498          return;
   1499       case Ain_A87Free:
   1500          return;
   1501       case Ain_A87PushPop:
   1502          addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
   1503          return;
   1504       case Ain_A87FpOp:
   1505          return;
   1506       case Ain_A87LdCW:
   1507          addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
   1508          return;
   1509       case Ain_A87StSW:
   1510          addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
   1511          return;
   1512       case Ain_LdMXCSR:
   1513          addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
   1514          return;
   1515       case Ain_SseUComIS:
   1516          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
   1517          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
   1518          addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
   1519          return;
   1520       case Ain_SseSI2SF:
   1521          addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
   1522          addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
   1523          return;
   1524       case Ain_SseSF2SI:
   1525          addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
   1526          addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
   1527          return;
   1528       case Ain_SseSDSS:
   1529          addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
   1530          addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
   1531          return;
   1532       case Ain_SseLdSt:
   1533          addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
   1534          addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
   1535                        i->Ain.SseLdSt.reg);
   1536          return;
   1537       case Ain_SseLdzLO:
   1538          addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
   1539          addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
   1540          return;
   1541       case Ain_Sse32Fx4:
   1542          vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
   1543          unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
   1544                          || i->Ain.Sse32Fx4.op == Asse_RSQRTF
   1545                          || i->Ain.Sse32Fx4.op == Asse_SQRTF );
   1546          addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
   1547          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1548                        i->Ain.Sse32Fx4.dst);
   1549          return;
   1550       case Ain_Sse32FLo:
   1551          vassert(i->Ain.Sse32FLo.op != Asse_MOV);
   1552          unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
   1553                          || i->Ain.Sse32FLo.op == Asse_RSQRTF
   1554                          || i->Ain.Sse32FLo.op == Asse_SQRTF );
   1555          addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
   1556          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1557                        i->Ain.Sse32FLo.dst);
   1558          return;
   1559       case Ain_Sse64Fx2:
   1560          vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
   1561          unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
   1562                          || i->Ain.Sse64Fx2.op == Asse_RSQRTF
   1563                          || i->Ain.Sse64Fx2.op == Asse_SQRTF );
   1564          addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
   1565          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1566                        i->Ain.Sse64Fx2.dst);
   1567          return;
   1568       case Ain_Sse64FLo:
   1569          vassert(i->Ain.Sse64FLo.op != Asse_MOV);
   1570          unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
   1571                          || i->Ain.Sse64FLo.op == Asse_RSQRTF
   1572                          || i->Ain.Sse64FLo.op == Asse_SQRTF );
   1573          addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
   1574          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1575                        i->Ain.Sse64FLo.dst);
   1576          return;
   1577       case Ain_SseReRg:
   1578          if ( (i->Ain.SseReRg.op == Asse_XOR
   1579                || i->Ain.SseReRg.op == Asse_CMPEQ32)
   1580               && sameHReg(i->Ain.SseReRg.src, i->Ain.SseReRg.dst)) {
   1581             /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
   1582                r,r' as a write of a value to r, and independent of any
   1583                previous value in r */
   1584             /* (as opposed to a rite of passage :-) */
   1585             addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
   1586          } else {
   1587             addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
   1588             addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
   1589                              ? HRmWrite : HRmModify,
   1590                           i->Ain.SseReRg.dst);
   1591          }
   1592          return;
   1593       case Ain_SseCMov:
   1594          addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
   1595          addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
   1596          return;
   1597       case Ain_SseShuf:
   1598          addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
   1599          addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
   1600          return;
   1601       //uu case Ain_AvxLdSt:
   1602       //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
   1603       //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
   1604       //uu               i->Ain.AvxLdSt.reg);
   1605       //uu return;
   1606       //uu case Ain_AvxReRg:
   1607       //uu    if ( (i->Ain.AvxReRg.op == Asse_XOR
   1608       //uu          || i->Ain.AvxReRg.op == Asse_CMPEQ32)
   1609       //uu         && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
   1610       //uu       /* See comments on the case for Ain_SseReRg. */
   1611       //uu       addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
   1612       //uu    } else {
   1613       //uu       addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
   1614       //uu       addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
   1615       //uu                        ? HRmWrite : HRmModify,
   1616       //uu                     i->Ain.AvxReRg.dst);
   1617       //uu    }
   1618       //uu    return;
   1619       case Ain_EvCheck:
   1620          /* We expect both amodes only to mention %rbp, so this is in
   1621             fact pointless, since %rbp isn't allocatable, but anyway.. */
   1622          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
   1623          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
   1624          return;
   1625       case Ain_ProfInc:
   1626          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1627          return;
   1628       default:
   1629          ppAMD64Instr(i, mode64);
   1630          vpanic("getRegUsage_AMD64Instr");
   1631    }
   1632 }
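
         /* Informal example of the above: for "addq %rcx, %rdx", that is,
            Ain_Alu64R with op = Aalu_ADD, src = Armi_Reg(%rcx) and dst = %rdx,
            the usage recorded is Read(%rcx) plus Modify(%rdx); with Aalu_MOV
            the dst would instead be Write-only, and with Aalu_CMP Read-only. */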
   1633 
   1634 /* local helper */
   1635 static inline void mapReg(HRegRemap* m, HReg* r)
   1636 {
   1637    *r = lookupHRegRemap(m, *r);
   1638 }
   1639 
   1640 void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
   1641 {
   1642    vassert(mode64 == True);
   1643    switch (i->tag) {
   1644       case Ain_Imm64:
   1645          mapReg(m, &i->Ain.Imm64.dst);
   1646          return;
   1647       case Ain_Alu64R:
   1648          mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
   1649          mapReg(m, &i->Ain.Alu64R.dst);
   1650          return;
   1651       case Ain_Alu64M:
   1652          mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
   1653          mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
   1654          return;
   1655       case Ain_Sh64:
   1656          mapReg(m, &i->Ain.Sh64.dst);
   1657          return;
   1658       case Ain_Test64:
   1659          mapReg(m, &i->Ain.Test64.dst);
   1660          return;
   1661       case Ain_Unary64:
   1662          mapReg(m, &i->Ain.Unary64.dst);
   1663          return;
   1664       case Ain_Lea64:
   1665          mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
   1666          mapReg(m, &i->Ain.Lea64.dst);
   1667          return;
   1668       case Ain_Alu32R:
   1669          mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
   1670          mapReg(m, &i->Ain.Alu32R.dst);
   1671          return;
   1672       case Ain_MulL:
   1673          mapRegs_AMD64RM(m, i->Ain.MulL.src);
   1674          return;
   1675       case Ain_Div:
   1676          mapRegs_AMD64RM(m, i->Ain.Div.src);
   1677          return;
   1678       case Ain_Push:
   1679          mapRegs_AMD64RMI(m, i->Ain.Push.src);
   1680          return;
   1681       case Ain_Call:
   1682          return;
   1683       case Ain_XDirect:
   1684          mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
   1685          return;
   1686       case Ain_XIndir:
   1687          mapReg(m, &i->Ain.XIndir.dstGA);
   1688          mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
   1689          return;
   1690       case Ain_XAssisted:
   1691          mapReg(m, &i->Ain.XAssisted.dstGA);
   1692          mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
   1693          return;
   1694       case Ain_CMov64:
   1695          mapRegs_AMD64RM(m, i->Ain.CMov64.src);
   1696          mapReg(m, &i->Ain.CMov64.dst);
   1697          return;
   1698       case Ain_MovxLQ:
   1699          mapReg(m, &i->Ain.MovxLQ.src);
   1700          mapReg(m, &i->Ain.MovxLQ.dst);
   1701          return;
   1702       case Ain_LoadEX:
   1703          mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
   1704          mapReg(m, &i->Ain.LoadEX.dst);
   1705          return;
   1706       case Ain_Store:
   1707          mapReg(m, &i->Ain.Store.src);
   1708          mapRegs_AMD64AMode(m, i->Ain.Store.dst);
   1709          return;
   1710       case Ain_Set64:
   1711          mapReg(m, &i->Ain.Set64.dst);
   1712          return;
   1713       case Ain_Bsfr64:
   1714          mapReg(m, &i->Ain.Bsfr64.src);
   1715          mapReg(m, &i->Ain.Bsfr64.dst);
   1716          return;
   1717       case Ain_MFence:
   1718          return;
   1719       case Ain_ACAS:
   1720          mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
   1721          return;
   1722       case Ain_DACAS:
   1723          mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
   1724          return;
   1725       case Ain_A87Free:
   1726          return;
   1727       case Ain_A87PushPop:
   1728          mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
   1729          return;
   1730       case Ain_A87FpOp:
   1731          return;
   1732       case Ain_A87LdCW:
   1733          mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
   1734          return;
   1735       case Ain_A87StSW:
   1736          mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
   1737          return;
   1738       case Ain_LdMXCSR:
   1739          mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
   1740          return;
   1741       case Ain_SseUComIS:
   1742          mapReg(m, &i->Ain.SseUComIS.srcL);
   1743          mapReg(m, &i->Ain.SseUComIS.srcR);
   1744          mapReg(m, &i->Ain.SseUComIS.dst);
   1745          return;
   1746       case Ain_SseSI2SF:
   1747          mapReg(m, &i->Ain.SseSI2SF.src);
   1748          mapReg(m, &i->Ain.SseSI2SF.dst);
   1749          return;
   1750       case Ain_SseSF2SI:
   1751          mapReg(m, &i->Ain.SseSF2SI.src);
   1752          mapReg(m, &i->Ain.SseSF2SI.dst);
   1753          return;
   1754       case Ain_SseSDSS:
   1755          mapReg(m, &i->Ain.SseSDSS.src);
   1756          mapReg(m, &i->Ain.SseSDSS.dst);
   1757          return;
   1758       case Ain_SseLdSt:
   1759          mapReg(m, &i->Ain.SseLdSt.reg);
   1760          mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
   1761          break;
   1762       case Ain_SseLdzLO:
   1763          mapReg(m, &i->Ain.SseLdzLO.reg);
   1764          mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
   1765          break;
   1766       case Ain_Sse32Fx4:
   1767          mapReg(m, &i->Ain.Sse32Fx4.src);
   1768          mapReg(m, &i->Ain.Sse32Fx4.dst);
   1769          return;
   1770       case Ain_Sse32FLo:
   1771          mapReg(m, &i->Ain.Sse32FLo.src);
   1772          mapReg(m, &i->Ain.Sse32FLo.dst);
   1773          return;
   1774       case Ain_Sse64Fx2:
   1775          mapReg(m, &i->Ain.Sse64Fx2.src);
   1776          mapReg(m, &i->Ain.Sse64Fx2.dst);
   1777          return;
   1778       case Ain_Sse64FLo:
   1779          mapReg(m, &i->Ain.Sse64FLo.src);
   1780          mapReg(m, &i->Ain.Sse64FLo.dst);
   1781          return;
   1782       case Ain_SseReRg:
   1783          mapReg(m, &i->Ain.SseReRg.src);
   1784          mapReg(m, &i->Ain.SseReRg.dst);
   1785          return;
   1786       case Ain_SseCMov:
   1787          mapReg(m, &i->Ain.SseCMov.src);
   1788          mapReg(m, &i->Ain.SseCMov.dst);
   1789          return;
   1790       case Ain_SseShuf:
   1791          mapReg(m, &i->Ain.SseShuf.src);
   1792          mapReg(m, &i->Ain.SseShuf.dst);
   1793          return;
   1794       //uu case Ain_AvxLdSt:
   1795       //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
   1796       //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
   1797       //uu    break;
   1798       //uu case Ain_AvxReRg:
   1799       //uu    mapReg(m, &i->Ain.AvxReRg.src);
   1800       //uu    mapReg(m, &i->Ain.AvxReRg.dst);
   1801       //uu    return;
   1802       case Ain_EvCheck:
   1803          /* We expect both amodes only to mention %rbp, so this is in
   1804             fact pointless, since %rbp isn't allocatable, but anyway.. */
   1805          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
   1806          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
   1807          return;
   1808       case Ain_ProfInc:
   1809          /* hardwires r11 -- nothing to modify. */
   1810          return;
   1811       default:
   1812          ppAMD64Instr(i, mode64);
   1813          vpanic("mapRegs_AMD64Instr");
   1814    }
   1815 }
   1816 
   1817 /* Figure out if i represents a reg-reg move, and if so assign the
   1818    source and destination to *src and *dst.  If in doubt say No.  Used
   1819    by the register allocator to do move coalescing.
   1820 */
   1821 Bool isMove_AMD64Instr ( AMD64Instr* i, HReg* src, HReg* dst )
   1822 {
   1823    switch (i->tag) {
   1824       case Ain_Alu64R:
   1825          /* Moves between integer regs */
   1826          if (i->Ain.Alu64R.op != Aalu_MOV)
   1827             return False;
   1828          if (i->Ain.Alu64R.src->tag != Armi_Reg)
   1829             return False;
   1830          *src = i->Ain.Alu64R.src->Armi.Reg.reg;
   1831          *dst = i->Ain.Alu64R.dst;
   1832          return True;
   1833       case Ain_SseReRg:
   1834          /* Moves between SSE regs */
   1835          if (i->Ain.SseReRg.op != Asse_MOV)
   1836             return False;
   1837          *src = i->Ain.SseReRg.src;
   1838          *dst = i->Ain.SseReRg.dst;
   1839          return True;
   1840       //uu case Ain_AvxReRg:
   1841       //uu    /* Moves between AVX regs */
   1842       //uu    if (i->Ain.AvxReRg.op != Asse_MOV)
   1843       //uu       return False;
   1844       //uu    *src = i->Ain.AvxReRg.src;
   1845       //uu    *dst = i->Ain.AvxReRg.dst;
   1846       //uu    return True;
   1847       default:
   1848          return False;
   1849    }
   1850    /*NOTREACHED*/
   1851 }
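
         /* Illustrative sketch only (not part of Valgrind itself): one way a
            caller could use isMove_AMD64Instr when hunting for coalescable
            moves.  'instrs' and 'nInstrs' are hypothetical parameters; the
            point is just that *src and *dst are defined exactly when True is
            returned. */
         #if 0
         static void countCoalescableMoves ( AMD64Instr** instrs, Int nInstrs )
         {
            Int  n, nMoves = 0;
            HReg src, dst;
            for (n = 0; n < nInstrs; n++) {
               if (isMove_AMD64Instr(instrs[n], &src, &dst))
                  nMoves++;   /* reg-reg copy; a candidate for coalescing */
            }
            vex_printf("coalescable moves: %d\n", nMoves);
         }
         #endif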
   1852 
   1853 
   1854 /* Generate amd64 spill/reload instructions under the direction of the
   1855    register allocator.  Note it's critical these don't write the
   1856    condition codes. */
   1857 
   1858 void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1859                       HReg rreg, Int offsetB, Bool mode64 )
   1860 {
   1861    AMD64AMode* am;
   1862    vassert(offsetB >= 0);
   1863    vassert(!hregIsVirtual(rreg));
   1864    vassert(mode64 == True);
   1865    *i1 = *i2 = NULL;
   1866    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   1867    switch (hregClass(rreg)) {
   1868       case HRcInt64:
   1869          *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
   1870          return;
   1871       case HRcVec128:
   1872          *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
   1873          return;
   1874       default:
   1875          ppHRegClass(hregClass(rreg));
   1876          vpanic("genSpill_AMD64: unimplemented regclass");
   1877    }
   1878 }
   1879 
   1880 void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1881                        HReg rreg, Int offsetB, Bool mode64 )
   1882 {
   1883    AMD64AMode* am;
   1884    vassert(offsetB >= 0);
   1885    vassert(!hregIsVirtual(rreg));
   1886    vassert(mode64 == True);
   1887    *i1 = *i2 = NULL;
   1888    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   1889    switch (hregClass(rreg)) {
   1890       case HRcInt64:
   1891          *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
   1892          return;
   1893       case HRcVec128:
   1894          *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
   1895          return;
   1896       default:
   1897          ppHRegClass(hregClass(rreg));
   1898          vpanic("genReload_AMD64: unimplemented regclass");
   1899    }
   1900 }
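
         /* Worked example (informal): spilling the Int64 real register %r10
            at offsetB == 48 yields "movq %r10, 48(%rbp)", and the matching
            reload is "movq 48(%rbp), %r10"; a Vec128 register instead gets a
            16-byte SSE load/store against the same %rbp-relative slot. */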
   1901 
   1902 
   1903 /* --------- The amd64 assembler (bleh.) --------- */
   1904 
   1905 /* Produce the low three bits of an integer register number. */
   1906 static UChar iregBits210 ( HReg r )
   1907 {
   1908    UInt n;
   1909    vassert(hregClass(r) == HRcInt64);
   1910    vassert(!hregIsVirtual(r));
   1911    n = hregNumber(r);
   1912    vassert(n <= 15);
   1913    return toUChar(n & 7);
   1914 }
   1915 
   1916 /* Produce bit 3 of an integer register number. */
   1917 static UChar iregBit3 ( HReg r )
   1918 {
   1919    UInt n;
   1920    vassert(hregClass(r) == HRcInt64);
   1921    vassert(!hregIsVirtual(r));
   1922    n = hregNumber(r);
   1923    vassert(n <= 15);
   1924    return toUChar((n >> 3) & 1);
   1925 }
   1926 
   1927 /* Produce a complete 4-bit integer register number. */
   1928 static UChar iregBits3210 ( HReg r )
   1929 {
   1930    UInt n;
   1931    vassert(hregClass(r) == HRcInt64);
   1932    vassert(!hregIsVirtual(r));
   1933    n = hregNumber(r);
   1934    vassert(n <= 15);
   1935    return toUChar(n);
   1936 }
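
         /* Example: %r13 is register number 13 (binary 1101), so iregBits210
            gives 5, iregBit3 gives 1 and iregBits3210 gives 13; for %rdx
            (number 2) the three results are 2, 0 and 2. */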
   1937 
    1938 /* Given an xmm (128-bit V-class) register number, produce the
   1939    equivalent numbered register in 64-bit I-class.  This is a bit of
   1940    fakery which facilitates using functions that work on integer
   1941    register numbers to be used when assembling SSE instructions
   1942    too. */
   1943 static HReg vreg2ireg ( HReg r )
   1944 {
   1945    UInt n;
   1946    vassert(hregClass(r) == HRcVec128);
   1947    vassert(!hregIsVirtual(r));
   1948    n = hregNumber(r);
   1949    vassert(n <= 15);
   1950    return mkHReg(n, HRcInt64, False);
   1951 }
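
         /* Example: vreg2ireg(%xmm9) returns the I-class register numbered 9,
            so the iregBit* helpers above can then supply the REX and modrm
            bits for an SSE instruction that mentions %xmm9. */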
   1952 
   1953 //uu /* Ditto for ymm regs. */
   1954 //uu static HReg dvreg2ireg ( HReg r )
   1955 //uu {
   1956 //uu    UInt n;
   1957 //uu    vassert(hregClass(r) == HRcVec256);
   1958 //uu    vassert(!hregIsVirtual(r));
   1959 //uu    n = hregNumber(r);
   1960 //uu    vassert(n <= 15);
   1961 //uu    return mkHReg(n, HRcInt64, False);
   1962 //uu }
   1963 
   1964 static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
   1965 {
   1966    vassert(mod < 4);
   1967    vassert((reg|regmem) < 8);
   1968    return toUChar( ((mod & 3) << 6)
   1969                    | ((reg & 7) << 3)
   1970                    | (regmem & 7) );
   1971 }
   1972 
   1973 static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
   1974 {
   1975    vassert(shift < 4);
   1976    vassert((regindex|regbase) < 8);
   1977    return toUChar( ((shift & 3) << 6)
   1978                    | ((regindex & 7) << 3)
   1979                    | (regbase & 7) );
   1980 }
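
         /* Worked examples: mkModRegRM(1, 3, 5) packs 01|011|101 = 0x5D, and
            mkSIB(2, 1, 4) packs 10|001|100 = 0x8C, ie scale 4, index %rcx and
            base %rsp or %r12 (which of the two is selected by REX.B). */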
   1981 
   1982 static UChar* emit32 ( UChar* p, UInt w32 )
   1983 {
   1984    *p++ = toUChar((w32)       & 0x000000FF);
   1985    *p++ = toUChar((w32 >>  8) & 0x000000FF);
   1986    *p++ = toUChar((w32 >> 16) & 0x000000FF);
   1987    *p++ = toUChar((w32 >> 24) & 0x000000FF);
   1988    return p;
   1989 }
   1990 
   1991 static UChar* emit64 ( UChar* p, ULong w64 )
   1992 {
   1993    p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
   1994    p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
   1995    return p;
   1996 }
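
         /* Example: emit32(p, 0x12345678) stores the bytes 78 56 34 12 in
            that order (little-endian); emit64 likewise emits the low 32 bits
            before the high 32 bits. */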
   1997 
   1998 /* Does a sign-extend of the lowest 8 bits give
   1999    the original number? */
   2000 static Bool fits8bits ( UInt w32 )
   2001 {
   2002    Int i32 = (Int)w32;
   2003    return toBool(i32 == ((i32 << 24) >> 24));
   2004 }
   2005 /* Can the lower 32 bits be signedly widened to produce the whole
   2006    64-bit value?  In other words, are the top 33 bits either all 0 or
   2007    all 1 ? */
   2008 static Bool fitsIn32Bits ( ULong x )
   2009 {
   2010    Long y0 = (Long)x;
   2011    Long y1 = y0;
   2012    y1 <<= 32;
   2013    y1 >>=/*s*/ 32;
   2014    return toBool(x == y1);
   2015 }
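
         /* A minimal self-check sketch of the two predicates above (not
            built; the values are only illustrative). */
         #if 0
         static void check_fits_helpers ( void )
         {
            vassert( fits8bits(0x7F) );
            vassert( fits8bits(0xFFFFFF80U) );           /* ie -128 */
            vassert( ! fits8bits(0x80) );
            vassert( fitsIn32Bits(0xFFFFFFFF80000000ULL) );
            vassert( ! fitsIn32Bits(0x80000000ULL) );
         }
         #endif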
   2016 
   2017 
   2018 /* Forming mod-reg-rm bytes and scale-index-base bytes.
   2019 
   2020      greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
   2021                        =  00 greg ereg
   2022 
   2023      greg,  d8(ereg)   |  ereg is neither of: RSP R12
   2024                        =  01 greg ereg, d8
   2025 
   2026      greg,  d32(ereg)  |  ereg is neither of: RSP R12
   2027                        =  10 greg ereg, d32
   2028 
   2029      greg,  d8(ereg)   |  ereg is either: RSP R12
   2030                        =  01 greg 100, 0x24, d8
   2031                        (lowest bit of rex distinguishes R12/RSP)
   2032 
   2033      greg,  d32(ereg)  |  ereg is either: RSP R12
   2034                        =  10 greg 100, 0x24, d32
   2035                        (lowest bit of rex distinguishes R12/RSP)
   2036 
   2037      -----------------------------------------------
   2038 
   2039      greg,  d8(base,index,scale)
   2040                |  index != RSP
   2041                =  01 greg 100, scale index base, d8
   2042 
   2043      greg,  d32(base,index,scale)
   2044                |  index != RSP
   2045                =  10 greg 100, scale index base, d32
   2046 */
   2047 static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
   2048 {
   2049    if (am->tag == Aam_IR) {
   2050       if (am->Aam.IR.imm == 0
   2051           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2052           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
   2053           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
   2054           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
   2055          ) {
   2056          *p++ = mkModRegRM(0, iregBits210(greg),
   2057                               iregBits210(am->Aam.IR.reg));
   2058          return p;
   2059       }
   2060       if (fits8bits(am->Aam.IR.imm)
   2061           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2062           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
   2063          ) {
   2064          *p++ = mkModRegRM(1, iregBits210(greg),
   2065                               iregBits210(am->Aam.IR.reg));
   2066          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
   2067          return p;
   2068       }
   2069       if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2070           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
   2071          ) {
   2072          *p++ = mkModRegRM(2, iregBits210(greg),
   2073                               iregBits210(am->Aam.IR.reg));
   2074          p = emit32(p, am->Aam.IR.imm);
   2075          return p;
   2076       }
   2077       if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2078            || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
   2079           && fits8bits(am->Aam.IR.imm)) {
    2080          *p++ = mkModRegRM(1, iregBits210(greg), 4);
   2081          *p++ = 0x24;
   2082          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
   2083          return p;
   2084       }
    2085       if (/* sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
    2086              || %rsp case omitted here, awaiting a test case */
    2087           sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
    2088          *p++ = mkModRegRM(2, iregBits210(greg), 4);
   2089          *p++ = 0x24;
   2090          p = emit32(p, am->Aam.IR.imm);
   2091          return p;
   2092       }
   2093       ppAMD64AMode(am);
   2094       vpanic("doAMode_M: can't emit amode IR");
   2095       /*NOTREACHED*/
   2096    }
   2097    if (am->tag == Aam_IRRS) {
   2098       if (fits8bits(am->Aam.IRRS.imm)
   2099           && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
   2100          *p++ = mkModRegRM(1, iregBits210(greg), 4);
   2101          *p++ = mkSIB(am->Aam.IRRS.shift, iregBits210(am->Aam.IRRS.index),
   2102                                           iregBits210(am->Aam.IRRS.base));
   2103          *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
   2104          return p;
   2105       }
   2106       if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
   2107          *p++ = mkModRegRM(2, iregBits210(greg), 4);
   2108          *p++ = mkSIB(am->Aam.IRRS.shift, iregBits210(am->Aam.IRRS.index),
   2109                                           iregBits210(am->Aam.IRRS.base));
   2110          p = emit32(p, am->Aam.IRRS.imm);
   2111          return p;
   2112       }
   2113       ppAMD64AMode(am);
   2114       vpanic("doAMode_M: can't emit amode IRRS");
   2115       /*NOTREACHED*/
   2116    }
   2117    vpanic("doAMode_M: unknown amode");
   2118    /*NOTREACHED*/
   2119 }
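
         /* Worked example (informal): for greg = %rax (0) and am = 8(%rbp),
            doAMode_M takes the d8 branch above, since 8 fits in 8 bits and
            %rbp is neither %rsp nor %r12: it emits mkModRegRM(1, 0, 5) = 0x45
            followed by the displacement byte 0x08.  The REX prefix itself is
            the caller's business (see rexAMode_M below). */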
   2120 
   2121 
   2122 /* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
   2123 static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
   2124 {
   2125    *p++ = mkModRegRM(3, iregBits210(greg), iregBits210(ereg));
   2126    return p;
   2127 }
   2128 
   2129 
   2130 /* Clear the W bit on a REX byte, thereby changing the operand size
   2131    back to whatever that instruction's default operand size is. */
   2132 static inline UChar clearWBit ( UChar rex )
   2133 {
   2134    return toUChar(rex & ~(1<<3));
   2135 }
   2136 
   2137 
   2138 /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
   2139 static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
   2140 {
   2141    if (am->tag == Aam_IR) {
   2142       UChar W = 1;  /* we want 64-bit mode */
   2143       UChar R = iregBit3(greg);
   2144       UChar X = 0; /* not relevant */
   2145       UChar B = iregBit3(am->Aam.IR.reg);
   2146       return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
   2147    }
   2148    if (am->tag == Aam_IRRS) {
   2149       UChar W = 1;  /* we want 64-bit mode */
   2150       UChar R = iregBit3(greg);
   2151       UChar X = iregBit3(am->Aam.IRRS.index);
   2152       UChar B = iregBit3(am->Aam.IRRS.base);
   2153       return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
   2154    }
   2155    vassert(0);
   2156    return 0; /*NOTREACHED*/
   2157 }
   2158 
   2159 /* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
   2160 static UChar rexAMode_R ( HReg greg, HReg ereg )
   2161 {
   2162    UChar W = 1;  /* we want 64-bit mode */
   2163    UChar R = iregBit3(greg);
   2164    UChar X = 0; /* not relevant */
   2165    UChar B = iregBit3(ereg);
   2166    return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
   2167 }
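
         /* Worked example: rexAMode_R(%rdx, %r9) has W=1, R=iregBit3(2)=0,
            X=0, B=iregBit3(9)=1, hence 0x40 + 0b1001 = 0x49; clearWBit turns
            that into 0x41, the form wanted for default-sized operations. */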
   2168 
   2169 
   2170 //uu /* May 2012: this VEX prefix stuff is currently unused, but has
    2171 //uu    been verified correct (I reckon).  Certainly it has been known to
   2172 //uu    produce correct VEX prefixes during testing. */
   2173 //uu
   2174 //uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
    2175 //uu    notVvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
   2176 //uu    in verbatim.  There's no range checking on the bits. */
   2177 //uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
   2178 //uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
   2179 //uu                             UInt L, UInt pp )
   2180 //uu {
   2181 //uu    UChar byte0 = 0;
   2182 //uu    UChar byte1 = 0;
   2183 //uu    UChar byte2 = 0;
   2184 //uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
   2185 //uu       /* 2 byte encoding is possible. */
   2186 //uu       byte0 = 0xC5;
   2187 //uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
   2188 //uu               | (L << 2) | pp;
   2189 //uu    } else {
   2190 //uu       /* 3 byte encoding is needed. */
   2191 //uu       byte0 = 0xC4;
   2192 //uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
   2193 //uu               | ((rexB ^ 1) << 5) | mmmmm;
   2194 //uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
   2195 //uu    }
   2196 //uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
   2197 //uu }
   2198 //uu
   2199 //uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
   2200 //uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
   2201 //uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
   2202 //uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
   2203 //uu    vvvv=1111 (unused 3rd reg). */
   2204 //uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
   2205 //uu {
   2206 //uu    UChar L       = 1; /* size = 256 */
   2207 //uu    UChar pp      = 0; /* no SIMD prefix */
   2208 //uu    UChar mmmmm   = 1; /* 0F */
   2209 //uu    UChar notVvvv = 0; /* unused */
   2210 //uu    UChar rexW    = 0;
   2211 //uu    UChar rexR    = 0;
   2212 //uu    UChar rexX    = 0;
   2213 //uu    UChar rexB    = 0;
   2214 //uu    /* Same logic as in rexAMode_M. */
   2215 //uu    if (am->tag == Aam_IR) {
   2216 //uu       rexR = iregBit3(greg);
   2217 //uu       rexX = 0; /* not relevant */
   2218 //uu       rexB = iregBit3(am->Aam.IR.reg);
   2219 //uu    }
   2220 //uu    else if (am->tag == Aam_IRRS) {
   2221 //uu       rexR = iregBit3(greg);
   2222 //uu       rexX = iregBit3(am->Aam.IRRS.index);
   2223 //uu       rexB = iregBit3(am->Aam.IRRS.base);
   2224 //uu    } else {
   2225 //uu       vassert(0);
   2226 //uu    }
   2227 //uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
   2228 //uu }
   2229 //uu
   2230 //uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
   2231 //uu {
   2232 //uu    switch (vex & 0xFF) {
   2233 //uu       case 0xC5:
   2234 //uu          *p++ = 0xC5;
   2235 //uu          *p++ = (vex >> 8) & 0xFF;
   2236 //uu          vassert(0 == (vex >> 16));
   2237 //uu          break;
   2238 //uu       case 0xC4:
   2239 //uu          *p++ = 0xC4;
   2240 //uu          *p++ = (vex >> 8) & 0xFF;
   2241 //uu          *p++ = (vex >> 16) & 0xFF;
   2242 //uu          vassert(0 == (vex >> 24));
   2243 //uu          break;
   2244 //uu       default:
   2245 //uu          vassert(0);
   2246 //uu    }
   2247 //uu    return p;
   2248 //uu }
   2249 
   2250 
   2251 /* Emit ffree %st(N) */
   2252 static UChar* do_ffree_st ( UChar* p, Int n )
   2253 {
   2254    vassert(n >= 0 && n <= 7);
   2255    *p++ = 0xDD;
   2256    *p++ = toUChar(0xC0 + n);
   2257    return p;
   2258 }
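
         /* Example: do_ffree_st(p, 3) emits the two bytes DD C3, ie
            "ffree %st(3)". */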
   2259 
   2260 /* Emit an instruction into buf and return the number of bytes used.
   2261    Note that buf is not the insn's final place, and therefore it is
   2262    imperative to emit position-independent code.  If the emitted
   2263    instruction was a profiler inc, set *is_profInc to True, else
   2264    leave it unchanged. */
   2265 
   2266 Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
   2267                       UChar* buf, Int nbuf, AMD64Instr* i,
   2268                       Bool mode64,
   2269                       void* disp_cp_chain_me_to_slowEP,
   2270                       void* disp_cp_chain_me_to_fastEP,
   2271                       void* disp_cp_xindir,
   2272                       void* disp_cp_xassisted )
   2273 {
   2274    UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
   2275    UInt   xtra;
   2276    UInt   reg;
   2277    UChar  rex;
   2278    UChar* p = &buf[0];
   2279    UChar* ptmp;
   2280    Int    j;
   2281    vassert(nbuf >= 32);
   2282    vassert(mode64 == True);
   2283 
    2284    /* Wrap an integer as an int register, for use when assembling
   2285       GrpN insns, in which the greg field is used as a sub-opcode
   2286       and does not really contain a register. */
   2287 #  define fake(_n) mkHReg((_n), HRcInt64, False)
   2288 
   2289    /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
   2290 
   2291    switch (i->tag) {
   2292 
   2293    case Ain_Imm64:
   2294       if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
    2295          /* Use the short form (load into 32 bit reg, + default
    2296             widening rule) for constants that fit in 20 bits, ie
    2297             0 .. 0xFFFFF.  We could use this form for the range 0
    2298             to 0x7FFFFFFF inclusive, but limit it to a smaller
    2299             range for verifiability purposes. */
   2299          if (1 & iregBit3(i->Ain.Imm64.dst))
   2300             *p++ = 0x41;
   2301          *p++ = 0xB8 + iregBits210(i->Ain.Imm64.dst);
   2302          p = emit32(p, (UInt)i->Ain.Imm64.imm64);
   2303       } else {
   2304          *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Imm64.dst)));
   2305          *p++ = toUChar(0xB8 + iregBits210(i->Ain.Imm64.dst));
   2306          p = emit64(p, i->Ain.Imm64.imm64);
   2307       }
   2308       goto done;
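               /* Example: Imm64($0x1234, %rbx) takes the short form and emits
                  BB 34 12 00 00 ("movl $0x1234, %ebx"; the default widening
                  rule zeroes the upper half), while a genuinely 64-bit
                  constant destined for %rbx emits 48 BB followed by 8
                  immediate bytes ("movabsq"). */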
   2309 
   2310    case Ain_Alu64R:
   2311       /* Deal specially with MOV */
   2312       if (i->Ain.Alu64R.op == Aalu_MOV) {
   2313          switch (i->Ain.Alu64R.src->tag) {
   2314             case Armi_Imm:
   2315                if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
   2316                   /* Actually we could use this form for constants in
   2317                      the range 0 through 0x7FFFFFFF inclusive, but
   2318                      limit it to a small range for verifiability
   2319                      purposes. */
   2320                   /* Generate "movl $imm32, 32-bit-register" and let
   2321                      the default zero-extend rule cause the upper half
   2322                      of the dst to be zeroed out too.  This saves 1
   2323                      and sometimes 2 bytes compared to the more
   2324                      obvious encoding in the 'else' branch. */
   2325                   if (1 & iregBit3(i->Ain.Alu64R.dst))
   2326                      *p++ = 0x41;
   2327                   *p++ = 0xB8 + iregBits210(i->Ain.Alu64R.dst);
   2328                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2329                } else {
   2330                   *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Alu64R.dst)));
   2331                   *p++ = 0xC7;
   2332                   *p++ = toUChar(0xC0 + iregBits210(i->Ain.Alu64R.dst));
   2333                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2334                }
   2335                goto done;
   2336             case Armi_Reg:
   2337                *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
   2338                                   i->Ain.Alu64R.dst );
   2339                *p++ = 0x89;
   2340                p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
   2341                                 i->Ain.Alu64R.dst);
   2342                goto done;
   2343             case Armi_Mem:
   2344                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
   2345                                  i->Ain.Alu64R.src->Armi.Mem.am);
   2346                *p++ = 0x8B;
   2347                p = doAMode_M(p, i->Ain.Alu64R.dst,
   2348                                 i->Ain.Alu64R.src->Armi.Mem.am);
   2349                goto done;
   2350             default:
   2351                goto bad;
   2352          }
   2353       }
   2354       /* MUL */
   2355       if (i->Ain.Alu64R.op == Aalu_MUL) {
   2356          switch (i->Ain.Alu64R.src->tag) {
   2357             case Armi_Reg:
   2358                *p++ = rexAMode_R( i->Ain.Alu64R.dst,
   2359                                   i->Ain.Alu64R.src->Armi.Reg.reg);
   2360                *p++ = 0x0F;
   2361                *p++ = 0xAF;
   2362                p = doAMode_R(p, i->Ain.Alu64R.dst,
   2363                                 i->Ain.Alu64R.src->Armi.Reg.reg);
   2364                goto done;
   2365             case Armi_Mem:
   2366                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
   2367                                  i->Ain.Alu64R.src->Armi.Mem.am);
   2368                *p++ = 0x0F;
   2369                *p++ = 0xAF;
   2370                p = doAMode_M(p, i->Ain.Alu64R.dst,
   2371                                 i->Ain.Alu64R.src->Armi.Mem.am);
   2372                goto done;
   2373             case Armi_Imm:
   2374                if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2375                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2376                   *p++ = 0x6B;
   2377                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2378                   *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
   2379                } else {
   2380                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2381                   *p++ = 0x69;
   2382                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2383                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2384                }
   2385                goto done;
   2386             default:
   2387                goto bad;
   2388          }
   2389       }
   2390       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
   2391       opc = opc_rr = subopc_imm = opc_imma = 0;
   2392       switch (i->Ain.Alu64R.op) {
   2393          case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
   2394                         subopc_imm = 2; opc_imma = 0x15; break;
   2395          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
   2396                         subopc_imm = 0; opc_imma = 0x05; break;
   2397          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
   2398                         subopc_imm = 5; opc_imma = 0x2D; break;
   2399          case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
   2400                         subopc_imm = 3; opc_imma = 0x1D; break;
   2401          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
   2402                         subopc_imm = 4; opc_imma = 0x25; break;
   2403          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
   2404                         subopc_imm = 6; opc_imma = 0x35; break;
   2405          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
   2406                         subopc_imm = 1; opc_imma = 0x0D; break;
   2407          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
   2408                         subopc_imm = 7; opc_imma = 0x3D; break;
   2409          default: goto bad;
   2410       }
   2411       switch (i->Ain.Alu64R.src->tag) {
   2412          case Armi_Imm:
   2413             if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
   2414                 && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2415                goto bad; /* FIXME: awaiting test case */
   2416                *p++ = toUChar(opc_imma);
   2417                p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2418             } else
   2419             if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2420                *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst );
   2421                *p++ = 0x83;
   2422                p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
   2423                *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
   2424             } else {
   2425                *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst);
   2426                *p++ = 0x81;
   2427                p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
   2428                p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2429             }
   2430             goto done;
   2431          case Armi_Reg:
   2432             *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
   2433                                i->Ain.Alu64R.dst);
   2434             *p++ = toUChar(opc_rr);
   2435             p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
   2436                              i->Ain.Alu64R.dst);
   2437             goto done;
   2438          case Armi_Mem:
   2439             *p++ = rexAMode_M( i->Ain.Alu64R.dst,
   2440                                i->Ain.Alu64R.src->Armi.Mem.am);
   2441             *p++ = toUChar(opc);
   2442             p = doAMode_M(p, i->Ain.Alu64R.dst,
   2443                              i->Ain.Alu64R.src->Armi.Mem.am);
   2444             goto done;
   2445          default:
   2446             goto bad;
   2447       }
   2448       break;
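               /* Example: Alu64R(Aalu_ADD, Imm(5), %rcx) takes the fits8bits
                  path and emits 48 83 C1 05, ie "addq $5, %rcx"; with a
                  register source, Alu64R(Aalu_ADD, Reg(%rbx), %rcx) emits
                  48 01 D9. */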
   2449 
   2450    case Ain_Alu64M:
   2451       /* Deal specially with MOV */
   2452       if (i->Ain.Alu64M.op == Aalu_MOV) {
   2453          switch (i->Ain.Alu64M.src->tag) {
   2454             case Ari_Reg:
   2455                *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
   2456                                  i->Ain.Alu64M.dst);
   2457                *p++ = 0x89;
   2458                p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
   2459                                 i->Ain.Alu64M.dst);
   2460                goto done;
   2461             case Ari_Imm:
   2462                *p++ = rexAMode_M(fake(0), i->Ain.Alu64M.dst);
   2463                *p++ = 0xC7;
   2464                p = doAMode_M(p, fake(0), i->Ain.Alu64M.dst);
   2465                p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
   2466                goto done;
   2467             default:
   2468                goto bad;
   2469          }
   2470       }
   2471       break;
   2472 
   2473    case Ain_Sh64:
   2474       opc_cl = opc_imm = subopc = 0;
   2475       switch (i->Ain.Sh64.op) {
   2476          case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
   2477          case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
   2478          case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
   2479          default: goto bad;
   2480       }
   2481       if (i->Ain.Sh64.src == 0) {
   2482          *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
   2483          *p++ = toUChar(opc_cl);
   2484          p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
   2485          goto done;
   2486       } else {
   2487          *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
   2488          *p++ = toUChar(opc_imm);
   2489          p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
   2490          *p++ = (UChar)(i->Ain.Sh64.src);
   2491          goto done;
   2492       }
   2493       break;
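               /* Example: Sh64(Ash_SHL, 2, %rax) emits 48 C1 E0 02, ie
                  "shlq $2, %rax"; with src == 0 the shift count is taken
                  from %cl and the D3 form is used instead (48 D3 E0). */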
   2494 
   2495    case Ain_Test64:
   2496       /* testq sign-extend($imm32), %reg */
   2497       *p++ = rexAMode_R(fake(0), i->Ain.Test64.dst);
   2498       *p++ = 0xF7;
   2499       p = doAMode_R(p, fake(0), i->Ain.Test64.dst);
   2500       p = emit32(p, i->Ain.Test64.imm32);
   2501       goto done;
   2502 
   2503    case Ain_Unary64:
   2504       if (i->Ain.Unary64.op == Aun_NOT) {
   2505          *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
   2506          *p++ = 0xF7;
   2507          p = doAMode_R(p, fake(2), i->Ain.Unary64.dst);
   2508          goto done;
   2509       }
   2510       if (i->Ain.Unary64.op == Aun_NEG) {
   2511          *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
   2512          *p++ = 0xF7;
   2513          p = doAMode_R(p, fake(3), i->Ain.Unary64.dst);
   2514          goto done;
   2515       }
   2516       break;
   2517 
   2518    case Ain_Lea64:
   2519       *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
   2520       *p++ = 0x8D;
   2521       p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
   2522       goto done;
   2523 
   2524    case Ain_Alu32R:
   2525       /* ADD/SUB/AND/OR/XOR/CMP */
   2526       opc = opc_rr = subopc_imm = opc_imma = 0;
   2527       switch (i->Ain.Alu32R.op) {
   2528          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
   2529                         subopc_imm = 0; opc_imma = 0x05; break;
   2530          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
   2531                         subopc_imm = 5; opc_imma = 0x2D; break;
   2532          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
   2533                         subopc_imm = 4; opc_imma = 0x25; break;
   2534          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
   2535                         subopc_imm = 6; opc_imma = 0x35; break;
   2536          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
   2537                         subopc_imm = 1; opc_imma = 0x0D; break;
   2538          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
   2539                         subopc_imm = 7; opc_imma = 0x3D; break;
   2540          default: goto bad;
   2541       }
   2542       switch (i->Ain.Alu32R.src->tag) {
   2543          case Armi_Imm:
   2544             if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
   2545                 && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
   2546                goto bad; /* FIXME: awaiting test case */
   2547                *p++ = toUChar(opc_imma);
   2548                p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
   2549             } else
   2550             if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
   2551                rex  = clearWBit( rexAMode_R( fake(0), i->Ain.Alu32R.dst ) );
   2552                if (rex != 0x40) *p++ = rex;
   2553                *p++ = 0x83;
   2554                p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu32R.dst);
   2555                *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
   2556             } else {
   2557                rex  = clearWBit( rexAMode_R( fake(0), i->Ain.Alu32R.dst) );
   2558                if (rex != 0x40) *p++ = rex;
   2559                *p++ = 0x81;
   2560                p    = doAMode_R(p, fake(subopc_imm), i->Ain.Alu32R.dst);
   2561                p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
   2562             }
   2563             goto done;
   2564          case Armi_Reg:
   2565             rex  = clearWBit(
   2566                    rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
   2567                                i->Ain.Alu32R.dst) );
   2568             if (rex != 0x40) *p++ = rex;
   2569             *p++ = toUChar(opc_rr);
   2570             p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
   2571                              i->Ain.Alu32R.dst);
   2572             goto done;
   2573          case Armi_Mem:
   2574             rex  = clearWBit(
   2575                    rexAMode_M( i->Ain.Alu32R.dst,
   2576                                i->Ain.Alu32R.src->Armi.Mem.am) );
   2577             if (rex != 0x40) *p++ = rex;
   2578             *p++ = toUChar(opc);
   2579             p = doAMode_M(p, i->Ain.Alu32R.dst,
   2580                              i->Ain.Alu32R.src->Armi.Mem.am);
   2581             goto done;
   2582          default:
   2583             goto bad;
   2584       }
   2585       break;
   2586 
   2587    case Ain_MulL:
   2588       subopc = i->Ain.MulL.syned ? 5 : 4;
   2589       switch (i->Ain.MulL.src->tag)  {
   2590          case Arm_Mem:
   2591             *p++ = rexAMode_M( fake(0),
   2592                                i->Ain.MulL.src->Arm.Mem.am);
   2593             *p++ = 0xF7;
   2594             p = doAMode_M(p, fake(subopc),
   2595                              i->Ain.MulL.src->Arm.Mem.am);
   2596             goto done;
   2597          case Arm_Reg:
   2598             *p++ = rexAMode_R(fake(0),
   2599                               i->Ain.MulL.src->Arm.Reg.reg);
   2600             *p++ = 0xF7;
   2601             p = doAMode_R(p, fake(subopc),
   2602                              i->Ain.MulL.src->Arm.Reg.reg);
   2603             goto done;
   2604          default:
   2605             goto bad;
   2606       }
   2607       break;
   2608 
   2609    case Ain_Div:
   2610       subopc = i->Ain.Div.syned ? 7 : 6;
   2611       if (i->Ain.Div.sz == 4) {
   2612          switch (i->Ain.Div.src->tag)  {
   2613             case Arm_Mem:
   2614                goto bad;
   2615                /*FIXME*/
   2616                *p++ = 0xF7;
   2617                p = doAMode_M(p, fake(subopc),
   2618                                 i->Ain.Div.src->Arm.Mem.am);
   2619                goto done;
   2620             case Arm_Reg:
   2621                *p++ = clearWBit(
   2622                       rexAMode_R( fake(0), i->Ain.Div.src->Arm.Reg.reg));
   2623                *p++ = 0xF7;
   2624                p = doAMode_R(p, fake(subopc),
   2625                                 i->Ain.Div.src->Arm.Reg.reg);
   2626                goto done;
   2627             default:
   2628                goto bad;
   2629          }
   2630       }
   2631       if (i->Ain.Div.sz == 8) {
   2632          switch (i->Ain.Div.src->tag)  {
   2633             case Arm_Mem:
   2634                *p++ = rexAMode_M( fake(0),
   2635                                   i->Ain.Div.src->Arm.Mem.am);
   2636                *p++ = 0xF7;
   2637                p = doAMode_M(p, fake(subopc),
   2638                                 i->Ain.Div.src->Arm.Mem.am);
   2639                goto done;
   2640             case Arm_Reg:
   2641                *p++ = rexAMode_R( fake(0),
   2642                                   i->Ain.Div.src->Arm.Reg.reg);
   2643                *p++ = 0xF7;
   2644                p = doAMode_R(p, fake(subopc),
   2645                                 i->Ain.Div.src->Arm.Reg.reg);
   2646                goto done;
   2647             default:
   2648                goto bad;
   2649          }
   2650       }
   2651       break;
   2652 
   2653    case Ain_Push:
   2654       switch (i->Ain.Push.src->tag) {
   2655          case Armi_Mem:
   2656             *p++ = clearWBit(
   2657                    rexAMode_M(fake(0), i->Ain.Push.src->Armi.Mem.am));
   2658             *p++ = 0xFF;
   2659             p = doAMode_M(p, fake(6), i->Ain.Push.src->Armi.Mem.am);
   2660             goto done;
   2661          case Armi_Imm:
   2662             *p++ = 0x68;
   2663             p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
   2664             goto done;
   2665          case Armi_Reg:
   2666             *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.Push.src->Armi.Reg.reg)));
   2667             *p++ = toUChar(0x50 + iregBits210(i->Ain.Push.src->Armi.Reg.reg));
   2668             goto done;
   2669         default:
   2670             goto bad;
   2671       }
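               /* Worked examples (illustrative only): pushq %r12 emits 41 54,
                  pushq %rax emits 40 50 (the 0x40 REX is redundant but harmless),
                  and the Armi_Imm form emits 68 <imm32>, which pushes the
                  sign-extended value as 8 bytes. */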
   2672 
   2673    case Ain_Call: {
   2674       if (i->Ain.Call.cond != Acc_ALWAYS
   2675           && i->Ain.Call.rloc.pri != RLPri_None) {
   2676          /* The call might not happen (it isn't unconditional) and it
   2677             returns a result.  In this case we will need to generate a
   2678             control flow diamond to put 0x555..555 in the return
   2679             register(s) in the case where the call doesn't happen.  If
   2680             this ever becomes necessary, maybe copy code from the ARM
   2681             equivalent.  Until that day, just give up. */
   2682          goto bad;
   2683       }
   2684       /* As per detailed comment for Ain_Call in
   2685          getRegUsage_AMD64Instr above, %r11 is used as an address
   2686          temporary. */
   2687       /* jump over the following two insns if the condition does not
   2688          hold */
   2689       Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
   2690       if (i->Ain.Call.cond != Acc_ALWAYS) {
   2691          *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
   2692          *p++ = shortImm ? 10 : 13;
   2693          /* 10 or 13 bytes in the next two insns */
   2694       }
   2695       if (shortImm) {
   2696          /* 7 bytes: movl sign-extend(imm32), %r11 */
   2697          *p++ = 0x49;
   2698          *p++ = 0xC7;
   2699          *p++ = 0xC3;
   2700          p = emit32(p, (UInt)i->Ain.Call.target);
   2701       } else {
   2702          /* 10 bytes: movabsq $target, %r11 */
   2703          *p++ = 0x49;
   2704          *p++ = 0xBB;
   2705          p = emit64(p, i->Ain.Call.target);
   2706       }
   2707       /* 3 bytes: call *%r11 */
   2708       *p++ = 0x41;
   2709       *p++ = 0xFF;
   2710       *p++ = 0xD3;
   2711       goto done;
   2712    }
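            /* Worked example (illustrative, assuming Acc_Z encodes as 0x4 as in
               host_amd64_defs.h): a call to target 0x1000 under Acc_Z emits
                 75 0A                   jnz  +10   (skip the call if Z doesn't hold)
                 49 C7 C3 00 10 00 00    movq $0x1000, %r11  (sign-extended imm32)
                 41 FF D3                call *%r11
               With Acc_ALWAYS the leading jcc pair is simply omitted. */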
   2713 
   2714    case Ain_XDirect: {
   2715       /* NB: what goes on here has to be very closely coordinated with the
   2716          chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
   2717       /* We're generating chain-me requests here, so we need to be
   2718          sure this is actually allowed -- no-redir translations can't
   2719          use chain-me's.  Hence: */
   2720       vassert(disp_cp_chain_me_to_slowEP != NULL);
   2721       vassert(disp_cp_chain_me_to_fastEP != NULL);
   2722 
   2723       HReg r11 = hregAMD64_R11();
   2724 
   2725       /* Use ptmp for backpatching conditional jumps. */
   2726       ptmp = NULL;
   2727 
   2728       /* First off, if this is conditional, create a conditional
   2729          jump over the rest of it. */
   2730       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
   2731          /* jmp fwds if !condition */
   2732          *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
   2733          ptmp = p; /* fill in this bit later */
   2734          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2735       }
   2736 
   2737       /* Update the guest RIP. */
   2738       if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
   2739          /* use a shorter encoding */
   2740          /* movl sign-extend(dstGA), %r11 */
   2741          *p++ = 0x49;
   2742          *p++ = 0xC7;
   2743          *p++ = 0xC3;
   2744          p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
   2745       } else {
   2746          /* movabsq $dstGA, %r11 */
   2747          *p++ = 0x49;
   2748          *p++ = 0xBB;
   2749          p = emit64(p, i->Ain.XDirect.dstGA);
   2750       }
   2751 
   2752       /* movq %r11, amRIP */
   2753       *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
   2754       *p++ = 0x89;
   2755       p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
   2756 
   2757       /* --- FIRST PATCHABLE BYTE follows --- */
   2758       /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
   2759          to) backs up the return address, so as to find the address of
   2760          the first patchable byte.  So: don't change the length of the
   2761          two instructions below. */
   2762       /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
   2763       *p++ = 0x49;
   2764       *p++ = 0xBB;
   2765       void* disp_cp_chain_me
   2766                = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
   2767                                          : disp_cp_chain_me_to_slowEP;
   2768       p = emit64(p, Ptr_to_ULong(disp_cp_chain_me));
   2769       /* call *%r11 */
   2770       *p++ = 0x41;
   2771       *p++ = 0xFF;
   2772       *p++ = 0xD3;
   2773       /* --- END of PATCHABLE BYTES --- */
   2774 
   2775       /* Fix up the conditional jump, if there was one. */
   2776       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
   2777          Int delta = p - ptmp;
   2778          vassert(delta > 0 && delta < 40);
   2779          *ptmp = toUChar(delta-1);
   2780       }
   2781       goto done;
   2782    }
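            /* Illustrative layout (a sketch, not emitted here): for an unconditional
               XDirect whose dstGA fits in 32 bits, the sequence is
                 49 C7 C3 <imm32>     movl $dstGA, %r11            (7 bytes)
                 <rex> 89 <modrm..>   movq %r11, amRIP             (amode-dependent)
                 49 BB <imm64>        movabsq $disp_cp_chain_me, %r11
                 41 FF D3             call *%r11
               The final 13 bytes (49 BB .. D3) are exactly what
               chainXDirect_AMD64/unchainXDirect_AMD64 below expect to patch. */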
   2783 
   2784    case Ain_XIndir: {
   2785       /* We're generating transfers that could lead indirectly to a
   2786          chain-me, so we need to be sure this is actually allowed --
   2787          no-redir translations are not allowed to reach normal
   2788          translations without going through the scheduler.  That means
   2789          no XDirects or XIndirs out from no-redir translations.
   2790          Hence: */
   2791       vassert(disp_cp_xindir != NULL);
   2792 
   2793       /* Use ptmp for backpatching conditional jumps. */
   2794       ptmp = NULL;
   2795 
   2796       /* First off, if this is conditional, create a conditional
   2797          jump over the rest of it. */
   2798       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
   2799          /* jmp fwds if !condition */
   2800          *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
   2801          ptmp = p; /* fill in this bit later */
   2802          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2803       }
   2804 
   2805       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
   2806       *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
   2807       *p++ = 0x89;
   2808       p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
   2809 
   2810       /* get $disp_cp_xindir into %r11 */
   2811       if (fitsIn32Bits(Ptr_to_ULong(disp_cp_xindir))) {
   2812          /* use a shorter encoding */
   2813          /* movl sign-extend(disp_cp_xindir), %r11 */
   2814          *p++ = 0x49;
   2815          *p++ = 0xC7;
   2816          *p++ = 0xC3;
   2817          p = emit32(p, (UInt)Ptr_to_ULong(disp_cp_xindir));
   2818       } else {
   2819          /* movabsq $disp_cp_xindir, %r11 */
   2820          *p++ = 0x49;
   2821          *p++ = 0xBB;
   2822          p = emit64(p, Ptr_to_ULong(disp_cp_xindir));
   2823       }
   2824 
   2825       /* jmp *%r11 */
   2826       *p++ = 0x41;
   2827       *p++ = 0xFF;
   2828       *p++ = 0xE3;
   2829 
   2830       /* Fix up the conditional jump, if there was one. */
   2831       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
   2832          Int delta = p - ptmp;
   2833          vassert(delta > 0 && delta < 40);
   2834          *ptmp = toUChar(delta-1);
   2835       }
   2836       goto done;
   2837    }
   2838 
   2839    case Ain_XAssisted: {
   2840       /* Use ptmp for backpatching conditional jumps. */
   2841       ptmp = NULL;
   2842 
   2843       /* First off, if this is conditional, create a conditional
   2844          jump over the rest of it. */
   2845       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
   2846          /* jmp fwds if !condition */
   2847          *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
   2848          ptmp = p; /* fill in this bit later */
   2849          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2850       }
   2851 
   2852       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
   2853       *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
   2854       *p++ = 0x89;
   2855       p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
   2856       /* movl $magic_number, %ebp.  Since these numbers are all small positive
   2857          integers, we can get away with "movl $N, %ebp" rather than
   2858          the longer "movq $N, %rbp". */
   2859       UInt trcval = 0;
   2860       switch (i->Ain.XAssisted.jk) {
   2861          case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
   2862          case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
   2863          case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
   2864          case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
   2865          case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
   2866          case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
   2867          case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
   2868          case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
   2869          case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
   2870          case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
   2871          case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
   2872          case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
   2873          /* We don't expect to see the following being assisted. */
   2874          case Ijk_Ret:
   2875          case Ijk_Call:
   2876          /* fallthrough */
   2877          default:
   2878             ppIRJumpKind(i->Ain.XAssisted.jk);
   2879             vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
   2880       }
   2881       vassert(trcval != 0);
   2882       *p++ = 0xBD;
   2883       p = emit32(p, trcval);
   2884       /* movabsq $disp_assisted, %r11 */
    2885       /* movabsq $disp_cp_xassisted, %r11 */
   2886       *p++ = 0xBB;
   2887       p = emit64(p, Ptr_to_ULong(disp_cp_xassisted));
   2888       /* jmp *%r11 */
   2889       *p++ = 0x41;
   2890       *p++ = 0xFF;
   2891       *p++ = 0xE3;
   2892 
   2893       /* Fix up the conditional jump, if there was one. */
   2894       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
   2895          Int delta = p - ptmp;
   2896          vassert(delta > 0 && delta < 40);
   2897          *ptmp = toUChar(delta-1);
   2898       }
   2899       goto done;
   2900    }
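            /* Illustrative tail (unconditional case only) for jk == Ijk_Boring:
               after the RIP store the bytes are
                 BD <imm32>           movl $VEX_TRC_JMP_BORING, %ebp
                 49 BB <imm64>        movabsq $disp_cp_xassisted, %r11
                 41 FF E3             jmp *%r11                                 */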
   2901 
   2902    case Ain_CMov64:
   2903       vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
   2904       if (i->Ain.CMov64.src->tag == Arm_Reg) {
   2905          *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
   2906          *p++ = 0x0F;
   2907          *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
   2908          p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
   2909          goto done;
   2910       }
   2911       if (i->Ain.CMov64.src->tag == Arm_Mem) {
   2912          *p++ = rexAMode_M(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
   2913          *p++ = 0x0F;
   2914          *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
   2915          p = doAMode_M(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
   2916          goto done;
   2917       }
   2918       break;
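               /* Worked example (illustrative, assuming Acc_Z == 0x4): a 64-bit
                  cmovz %rcx,%rax comes out as 48 0F 44 C1; the memory form
                  cmovz (%rdi),%rax as 48 0F 44 07. */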
   2919 
   2920    case Ain_MovxLQ:
   2921       /* No, _don't_ ask me why the sense of the args has to be
   2922          different in the S vs Z case.  I don't know. */
   2923       if (i->Ain.MovxLQ.syned) {
   2924          /* Need REX.W = 1 here, but rexAMode_R does that for us. */
   2925          *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
   2926          *p++ = 0x63;
   2927          p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
   2928       } else {
   2929          /* Produce a 32-bit reg-reg move, since the implicit
   2930             zero-extend does what we want. */
   2931          *p++ = clearWBit (
   2932                    rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
   2933          *p++ = 0x89;
   2934          p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
   2935       }
   2936       goto done;
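               /* Worked example (illustrative only): the signed case for
                  %ecx -> %rax emits 48 63 C1 (movslq %ecx,%rax); the unsigned case
                  emits 40 89 C8 (movl %ecx,%eax plus a redundant 0x40 REX), relying
                  on the implicit zero-extension of 32-bit writes. */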
   2937 
   2938    case Ain_LoadEX:
   2939       if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
   2940          /* movzbq */
   2941          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2942          *p++ = 0x0F;
   2943          *p++ = 0xB6;
   2944          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2945          goto done;
   2946       }
   2947       if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
   2948          /* movzwq */
   2949          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2950          *p++ = 0x0F;
   2951          *p++ = 0xB7;
   2952          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2953          goto done;
   2954       }
   2955       if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
   2956          /* movzlq */
   2957          /* This isn't really an existing AMD64 instruction per se.
   2958             Rather, we have to do a 32-bit load.  Because a 32-bit
   2959             write implicitly clears the upper 32 bits of the target
   2960             register, we get what we want. */
   2961          *p++ = clearWBit(
   2962                 rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
   2963          *p++ = 0x8B;
   2964          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   2965          goto done;
   2966       }
   2967       break;
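               /* Worked examples (illustrative, taking src = (%rdi), dst = %rax):
                  szSmall==1 gives 48 0F B6 07 (movzbq), szSmall==2 gives
                  48 0F B7 07 (movzwq), and szSmall==4 gives 40 8B 07, a plain
                  32-bit load whose implicit zero-extend does the job. */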
   2968 
   2969    case Ain_Set64:
   2970       /* Make the destination register be 1 or 0, depending on whether
   2971          the relevant condition holds.  Complication: the top 56 bits
   2972          of the destination should be forced to zero, but doing 'xorq
   2973          %r,%r' kills the flag(s) we are about to read.  Sigh.  So
    2974          start off by moving $0 into the dest. */
   2975       reg = iregBits3210(i->Ain.Set64.dst);
   2976       vassert(reg < 16);
   2977 
   2978       /* movq $0, %dst */
   2979       *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
   2980       *p++ = 0xC7;
   2981       *p++ = toUChar(0xC0 + (reg & 7));
   2982       p = emit32(p, 0);
   2983 
   2984       /* setb lo8(%dst) */
    2985       /* note, 8-bit register rex trickiness.  Be careful here. */
   2986       *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
   2987       *p++ = 0x0F;
   2988       *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
   2989       *p++ = toUChar(0xC0 + (reg & 7));
   2990       goto done;
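               /* Worked example (illustrative, assuming Acc_Z == 0x4): Set64 into
                  %rax emits 48 C7 C0 00 00 00 00 (movq $0,%rax) then 40 0F 94 C0
                  (sete %al); into %r9 it emits 49 C7 C1 00 00 00 00 then
                  41 0F 94 C1, where the REX.B bit is what makes the final C0+1
                  mean %r9b rather than %cl. */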
   2991 
   2992    case Ain_Bsfr64:
   2993       *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
   2994       *p++ = 0x0F;
   2995       if (i->Ain.Bsfr64.isFwds) {
   2996          *p++ = 0xBC;
   2997       } else {
   2998          *p++ = 0xBD;
   2999       }
   3000       p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
   3001       goto done;
   3002 
   3003    case Ain_MFence:
   3004       /* mfence */
   3005       *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
   3006       goto done;
   3007 
   3008    case Ain_ACAS:
   3009       /* lock */
   3010       *p++ = 0xF0;
   3011       if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
   3012       /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
   3013          in %rbx.  The new-value register is hardwired to be %rbx
   3014          since dealing with byte integer registers is too much hassle,
   3015          so we force the register operand to %rbx (could equally be
   3016          %rcx or %rdx). */
   3017       rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
   3018       if (i->Ain.ACAS.sz != 8)
   3019          rex = clearWBit(rex);
   3020 
   3021       *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
   3022       *p++ = 0x0F;
   3023       if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
   3024       p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
   3025       goto done;
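               /* Worked example (a sketch, assuming the amode is a zero-displacement
                  (%rdi) that doAMode_M encodes with mod=00): sz==8 gives
                  F0 48 0F B1 1F (lock cmpxchgq %rbx,(%rdi)); sz==4 gives
                  F0 40 0F B1 1F, i.e. the same with the W bit cleared. */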
   3026 
   3027    case Ain_DACAS:
   3028       /* lock */
   3029       *p++ = 0xF0;
   3030       /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
   3031          value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
   3032          aren't encoded in the insn. */
    3033       rex = rexAMode_M( fake(1), i->Ain.DACAS.addr );
    3034       if (i->Ain.DACAS.sz != 8)
   3035          rex = clearWBit(rex);
   3036       *p++ = rex;
   3037       *p++ = 0x0F;
   3038       *p++ = 0xC7;
   3039       p = doAMode_M(p, fake(1), i->Ain.DACAS.addr);
   3040       goto done;
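               /* Worked example (same (%rdi) assumption as for Ain_ACAS above):
                  sz==8 gives F0 48 0F C7 0F (lock cmpxchg16b (%rdi)) and sz==4
                  gives F0 40 0F C7 0F (lock cmpxchg8b (%rdi)). */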
   3041 
   3042    case Ain_A87Free:
   3043       vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
   3044       for (j = 0; j < i->Ain.A87Free.nregs; j++) {
   3045          p = do_ffree_st(p, 7-j);
   3046       }
   3047       goto done;
   3048 
   3049    case Ain_A87PushPop:
   3050       vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
   3051       if (i->Ain.A87PushPop.isPush) {
   3052          /* Load from memory into %st(0): flds/fldl amode */
   3053          *p++ = clearWBit(
   3054                    rexAMode_M(fake(0), i->Ain.A87PushPop.addr) );
   3055          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
    3056          p = doAMode_M(p, fake(0)/*subopcode*/, i->Ain.A87PushPop.addr);
   3057       } else {
   3058          /* Dump %st(0) to memory: fstps/fstpl amode */
   3059          *p++ = clearWBit(
   3060                    rexAMode_M(fake(3), i->Ain.A87PushPop.addr) );
   3061          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
   3062          p = doAMode_M(p, fake(3)/*subopcode*/, i->Ain.A87PushPop.addr);
   3063          goto done;
   3064       }
   3065       goto done;
   3066 
   3067    case Ain_A87FpOp:
   3068       switch (i->Ain.A87FpOp.op) {
   3069          case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
   3070          case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
   3071          case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
   3072          case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
   3073          case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
   3074          case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
   3075          case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
   3076          case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
   3077          case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
   3078          case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
   3079          case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
   3080          case Afp_TAN:
   3081             /* fptan pushes 1.0 on the FP stack, except when the
   3082                argument is out of range.  Hence we have to do the
   3083                instruction, then inspect C2 to see if there is an out
   3084                of range condition.  If there is, we skip the fincstp
   3085                that is used by the in-range case to get rid of this
   3086                extra 1.0 value. */
   3087             *p++ = 0xD9; *p++ = 0xF2; // fptan
   3088             *p++ = 0x50;              // pushq %rax
   3089             *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
   3090             *p++ = 0x66; *p++ = 0xA9;
   3091             *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
   3092             *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
   3093             *p++ = 0xD9; *p++ = 0xF7; // fincstp
   3094             *p++ = 0x58;              // after_fincstp: popq %rax
   3095             break;
   3096          default:
   3097             goto bad;
   3098       }
   3099       goto done;
   3100 
   3101    case Ain_A87LdCW:
   3102       *p++ = clearWBit(
   3103                 rexAMode_M(fake(5), i->Ain.A87LdCW.addr) );
   3104       *p++ = 0xD9;
   3105       p = doAMode_M(p, fake(5)/*subopcode*/, i->Ain.A87LdCW.addr);
   3106       goto done;
   3107 
   3108    case Ain_A87StSW:
   3109       *p++ = clearWBit(
   3110                 rexAMode_M(fake(7), i->Ain.A87StSW.addr) );
   3111       *p++ = 0xDD;
   3112       p = doAMode_M(p, fake(7)/*subopcode*/, i->Ain.A87StSW.addr);
   3113       goto done;
   3114 
   3115    case Ain_Store:
   3116       if (i->Ain.Store.sz == 2) {
    3117          /* This just goes to show the craziness of the instruction
   3118             set encoding.  We have to insert two prefix bytes, but be
   3119             careful to avoid a conflict in what the size should be, by
   3120             ensuring that REX.W = 0. */
   3121          *p++ = 0x66; /* override to 16-bits */
    3122          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3123          *p++ = 0x89;
   3124          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3125          goto done;
   3126       }
   3127       if (i->Ain.Store.sz == 4) {
    3128          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3129          *p++ = 0x89;
   3130          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3131          goto done;
   3132       }
   3133       if (i->Ain.Store.sz == 1) {
   3134          /* This is one place where it would be wrong to skip emitting
   3135             a rex byte of 0x40, since the mere presence of rex changes
   3136             the meaning of the byte register access.  Be careful. */
    3137          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3138          *p++ = 0x88;
   3139          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3140          goto done;
   3141       }
   3142       break;
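               /* Worked examples (illustrative, dst = (%rdi)): sz==2 with src %rcx
                  gives 66 40 89 0F (movw %cx,(%rdi)); sz==1 with src %rsi gives
                  40 88 37 (movb %sil,(%rdi)) -- without the 0x40 REX, 88 37 would
                  instead mean movb %dh,(%rdi), which is why it cannot be dropped. */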
   3143 
   3144    case Ain_LdMXCSR:
   3145       *p++ = clearWBit(rexAMode_M( fake(0), i->Ain.LdMXCSR.addr));
   3146       *p++ = 0x0F;
   3147       *p++ = 0xAE;
   3148       p = doAMode_M(p, fake(2)/*subopcode*/, i->Ain.LdMXCSR.addr);
   3149       goto done;
   3150 
   3151    case Ain_SseUComIS:
   3152       /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
   3153       /* ucomi[sd] %srcL, %srcR */
   3154       if (i->Ain.SseUComIS.sz == 8) {
   3155          *p++ = 0x66;
   3156       } else {
    3157          /* sz == 4 (ucomiss) is not handled here. */
    3158          vassert(i->Ain.SseUComIS.sz == 4);
                  goto bad;
   3159       }
   3160       *p++ = clearWBit (
   3161              rexAMode_R( vreg2ireg(i->Ain.SseUComIS.srcL),
   3162                          vreg2ireg(i->Ain.SseUComIS.srcR) ));
   3163       *p++ = 0x0F;
   3164       *p++ = 0x2E;
   3165       p = doAMode_R(p, vreg2ireg(i->Ain.SseUComIS.srcL),
   3166                        vreg2ireg(i->Ain.SseUComIS.srcR) );
   3167       /* pushfq */
   3168       *p++ = 0x9C;
   3169       /* popq %dst */
   3170       *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.SseUComIS.dst)));
   3171       *p++ = toUChar(0x58 + iregBits210(i->Ain.SseUComIS.dst));
   3172       goto done;
   3173 
   3174    case Ain_SseSI2SF:
    3175       /* cvtsi2s[sd] %src, %dst */
   3176       rex = rexAMode_R( vreg2ireg(i->Ain.SseSI2SF.dst),
   3177                         i->Ain.SseSI2SF.src );
   3178       *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
   3179       *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
   3180       *p++ = 0x0F;
   3181       *p++ = 0x2A;
   3182       p = doAMode_R( p, vreg2ireg(i->Ain.SseSI2SF.dst),
   3183                         i->Ain.SseSI2SF.src );
   3184       goto done;
   3185 
   3186    case Ain_SseSF2SI:
    3187       /* cvts[sd]2si %src, %dst */
   3188       rex = rexAMode_R( i->Ain.SseSF2SI.dst,
   3189                         vreg2ireg(i->Ain.SseSF2SI.src) );
   3190       *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
   3191       *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
   3192       *p++ = 0x0F;
   3193       *p++ = 0x2D;
   3194       p = doAMode_R( p, i->Ain.SseSF2SI.dst,
   3195                         vreg2ireg(i->Ain.SseSF2SI.src) );
   3196       goto done;
   3197 
   3198    case Ain_SseSDSS:
   3199       /* cvtsd2ss/cvtss2sd %src, %dst */
   3200       *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
   3201       *p++ = clearWBit(
   3202               rexAMode_R( vreg2ireg(i->Ain.SseSDSS.dst),
   3203                           vreg2ireg(i->Ain.SseSDSS.src) ));
   3204       *p++ = 0x0F;
   3205       *p++ = 0x5A;
   3206       p = doAMode_R( p, vreg2ireg(i->Ain.SseSDSS.dst),
   3207                         vreg2ireg(i->Ain.SseSDSS.src) );
   3208       goto done;
   3209 
   3210    case Ain_SseLdSt:
   3211       if (i->Ain.SseLdSt.sz == 8) {
   3212          *p++ = 0xF2;
   3213       } else
   3214       if (i->Ain.SseLdSt.sz == 4) {
   3215          *p++ = 0xF3;
   3216       } else
   3217       if (i->Ain.SseLdSt.sz != 16) {
   3218          vassert(0);
   3219       }
   3220       *p++ = clearWBit(
   3221              rexAMode_M( vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr));
   3222       *p++ = 0x0F;
   3223       *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
   3224       p = doAMode_M(p, vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr);
   3225       goto done;
   3226 
   3227    case Ain_SseLdzLO:
   3228       vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
   3229       /* movs[sd] amode, %xmm-dst */
   3230       *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
   3231       *p++ = clearWBit(
   3232              rexAMode_M(vreg2ireg(i->Ain.SseLdzLO.reg),
   3233                         i->Ain.SseLdzLO.addr));
   3234       *p++ = 0x0F;
   3235       *p++ = 0x10;
   3236       p = doAMode_M(p, vreg2ireg(i->Ain.SseLdzLO.reg),
   3237                        i->Ain.SseLdzLO.addr);
   3238       goto done;
   3239 
   3240    case Ain_Sse32Fx4:
   3241       xtra = 0;
   3242       *p++ = clearWBit(
   3243              rexAMode_R( vreg2ireg(i->Ain.Sse32Fx4.dst),
   3244                          vreg2ireg(i->Ain.Sse32Fx4.src) ));
   3245       *p++ = 0x0F;
   3246       switch (i->Ain.Sse32Fx4.op) {
   3247          case Asse_ADDF:   *p++ = 0x58; break;
   3248          case Asse_DIVF:   *p++ = 0x5E; break;
   3249          case Asse_MAXF:   *p++ = 0x5F; break;
   3250          case Asse_MINF:   *p++ = 0x5D; break;
   3251          case Asse_MULF:   *p++ = 0x59; break;
   3252          case Asse_RCPF:   *p++ = 0x53; break;
   3253          case Asse_RSQRTF: *p++ = 0x52; break;
   3254          case Asse_SQRTF:  *p++ = 0x51; break;
   3255          case Asse_SUBF:   *p++ = 0x5C; break;
   3256          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3257          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3258          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3259          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3260          default: goto bad;
   3261       }
   3262       p = doAMode_R(p, vreg2ireg(i->Ain.Sse32Fx4.dst),
   3263                        vreg2ireg(i->Ain.Sse32Fx4.src) );
   3264       if (xtra & 0x100)
   3265          *p++ = toUChar(xtra & 0xFF);
   3266       goto done;
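               /* Worked example (a sketch, taking dst = %xmm1, src = %xmm2 after
                  register allocation): Asse_CMPLTF emits 40 0F C2 CA 01, i.e.
                  cmpltps %xmm2,%xmm1 with trailing imm8 1; the low byte of xtra is
                  the SSE compare predicate (0=EQ, 1=LT, 2=LE, 3=UNORD). */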
   3267 
   3268    case Ain_Sse64Fx2:
   3269       xtra = 0;
   3270       *p++ = 0x66;
   3271       *p++ = clearWBit(
   3272              rexAMode_R( vreg2ireg(i->Ain.Sse64Fx2.dst),
   3273                          vreg2ireg(i->Ain.Sse64Fx2.src) ));
   3274       *p++ = 0x0F;
   3275       switch (i->Ain.Sse64Fx2.op) {
   3276          case Asse_ADDF:   *p++ = 0x58; break;
   3277          case Asse_DIVF:   *p++ = 0x5E; break;
   3278          case Asse_MAXF:   *p++ = 0x5F; break;
   3279          case Asse_MINF:   *p++ = 0x5D; break;
   3280          case Asse_MULF:   *p++ = 0x59; break;
   3281          case Asse_SQRTF:  *p++ = 0x51; break;
   3282          case Asse_SUBF:   *p++ = 0x5C; break;
   3283          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3284          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3285          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3286          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3287          default: goto bad;
   3288       }
   3289       p = doAMode_R(p, vreg2ireg(i->Ain.Sse64Fx2.dst),
   3290                        vreg2ireg(i->Ain.Sse64Fx2.src) );
   3291       if (xtra & 0x100)
   3292          *p++ = toUChar(xtra & 0xFF);
   3293       goto done;
   3294 
   3295    case Ain_Sse32FLo:
   3296       xtra = 0;
   3297       *p++ = 0xF3;
   3298       *p++ = clearWBit(
   3299              rexAMode_R( vreg2ireg(i->Ain.Sse32FLo.dst),
   3300                          vreg2ireg(i->Ain.Sse32FLo.src) ));
   3301       *p++ = 0x0F;
   3302       switch (i->Ain.Sse32FLo.op) {
   3303          case Asse_ADDF:   *p++ = 0x58; break;
   3304          case Asse_DIVF:   *p++ = 0x5E; break;
   3305          case Asse_MAXF:   *p++ = 0x5F; break;
   3306          case Asse_MINF:   *p++ = 0x5D; break;
   3307          case Asse_MULF:   *p++ = 0x59; break;
   3308          case Asse_RCPF:   *p++ = 0x53; break;
   3309          case Asse_RSQRTF: *p++ = 0x52; break;
   3310          case Asse_SQRTF:  *p++ = 0x51; break;
   3311          case Asse_SUBF:   *p++ = 0x5C; break;
   3312          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3313          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3314          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3315          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3316          default: goto bad;
   3317       }
   3318       p = doAMode_R(p, vreg2ireg(i->Ain.Sse32FLo.dst),
   3319                        vreg2ireg(i->Ain.Sse32FLo.src) );
   3320       if (xtra & 0x100)
   3321          *p++ = toUChar(xtra & 0xFF);
   3322       goto done;
   3323 
   3324    case Ain_Sse64FLo:
   3325       xtra = 0;
   3326       *p++ = 0xF2;
   3327       *p++ = clearWBit(
   3328              rexAMode_R( vreg2ireg(i->Ain.Sse64FLo.dst),
   3329                          vreg2ireg(i->Ain.Sse64FLo.src) ));
   3330       *p++ = 0x0F;
   3331       switch (i->Ain.Sse64FLo.op) {
   3332          case Asse_ADDF:   *p++ = 0x58; break;
   3333          case Asse_DIVF:   *p++ = 0x5E; break;
   3334          case Asse_MAXF:   *p++ = 0x5F; break;
   3335          case Asse_MINF:   *p++ = 0x5D; break;
   3336          case Asse_MULF:   *p++ = 0x59; break;
   3337          case Asse_SQRTF:  *p++ = 0x51; break;
   3338          case Asse_SUBF:   *p++ = 0x5C; break;
   3339          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3340          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3341          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3342          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3343          default: goto bad;
   3344       }
   3345       p = doAMode_R(p, vreg2ireg(i->Ain.Sse64FLo.dst),
   3346                        vreg2ireg(i->Ain.Sse64FLo.src) );
   3347       if (xtra & 0x100)
   3348          *p++ = toUChar(xtra & 0xFF);
   3349       goto done;
   3350 
   3351    case Ain_SseReRg:
   3352 #     define XX(_n) *p++ = (_n)
   3353 
   3354       rex = clearWBit(
   3355             rexAMode_R( vreg2ireg(i->Ain.SseReRg.dst),
   3356                         vreg2ireg(i->Ain.SseReRg.src) ));
   3357 
   3358       switch (i->Ain.SseReRg.op) {
   3359          case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
   3360          case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
   3361          case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
   3362          case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
   3363          case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
   3364          case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
   3365          case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
   3366          case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
   3367          case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
   3368          case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
   3369          case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
   3370          case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
   3371          case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
   3372          case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
   3373          case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
   3374          case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
   3375          case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
   3376          case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
   3377          case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
   3378          case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
   3379          case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
   3380          case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
   3381          case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
   3382          case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
   3383          case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
   3384          case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
   3385          case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
   3386          case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
   3387          case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
   3388          case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
   3389          case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
   3390          case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
   3391          case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
   3392          case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
   3393          case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
   3394          case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
   3395          case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
   3396          case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
   3397          case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
   3398          case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
   3399          case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
   3400          case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
   3401          case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
   3402          case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
   3403          case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
   3404          case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
   3405          case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
   3406          case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
   3407          case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
   3408          case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
   3409          case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
   3410          case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
   3411          case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
   3412          case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
   3413          case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
   3414          default: goto bad;
   3415       }
   3416       p = doAMode_R(p, vreg2ireg(i->Ain.SseReRg.dst),
   3417                        vreg2ireg(i->Ain.SseReRg.src) );
   3418 #     undef XX
   3419       goto done;
   3420 
   3421    case Ain_SseCMov:
   3422       /* jmp fwds if !condition */
    3423       *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCMov.cond ^ 1)));
   3424       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
   3425       ptmp = p;
   3426 
   3427       /* movaps %src, %dst */
   3428       *p++ = clearWBit(
   3429              rexAMode_R( vreg2ireg(i->Ain.SseCMov.dst),
   3430                          vreg2ireg(i->Ain.SseCMov.src) ));
   3431       *p++ = 0x0F;
   3432       *p++ = 0x28;
   3433       p = doAMode_R(p, vreg2ireg(i->Ain.SseCMov.dst),
   3434                        vreg2ireg(i->Ain.SseCMov.src) );
   3435 
   3436       /* Fill in the jump offset. */
   3437       *(ptmp-1) = toUChar(p - ptmp);
   3438       goto done;
   3439 
   3440    case Ain_SseShuf:
   3441       *p++ = 0x66;
   3442       *p++ = clearWBit(
   3443              rexAMode_R( vreg2ireg(i->Ain.SseShuf.dst),
   3444                          vreg2ireg(i->Ain.SseShuf.src) ));
   3445       *p++ = 0x0F;
   3446       *p++ = 0x70;
   3447       p = doAMode_R(p, vreg2ireg(i->Ain.SseShuf.dst),
   3448                        vreg2ireg(i->Ain.SseShuf.src) );
   3449       *p++ = (UChar)(i->Ain.SseShuf.order);
   3450       goto done;
   3451 
   3452    //uu case Ain_AvxLdSt: {
   3453    //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
   3454    //uu                           i->Ain.AvxLdSt.addr );
   3455    //uu    p = emitVexPrefix(p, vex);
   3456    //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
   3457    //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
   3458    //uu      goto done;
   3459    //uu }
   3460 
   3461    case Ain_EvCheck: {
   3462       /* We generate:
   3463             (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
   3464             (2 bytes)  jns  nofail     expected taken
   3465             (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
   3466             nofail:
   3467       */
   3468       /* This is heavily asserted re instruction lengths.  It needs to
   3469          be.  If we get given unexpected forms of .amCounter or
   3470          .amFailAddr -- basically, anything that's not of the form
    3471          uimm7(%rbp) -- the assertions below are likely to fail. */
   3472       /* Note also that after the decl we must be very careful not to
   3473          read the carry flag, else we get a partial flags stall.
   3474          js/jns avoids that, though. */
   3475       UChar* p0 = p;
   3476       /* ---  decl 8(%rbp) --- */
   3477       /* Need to compute the REX byte for the decl in order to prove
    3478          that we don't need it, since this is a 32-bit decl and all
   3479          registers involved in the amode are < r8.  "fake(1)" because
   3480          there's no register in this encoding; instead the register
   3481          field is used as a sub opcode.  The encoding for "decl r/m32"
   3482          is FF /1, hence the fake(1). */
   3483       rex = clearWBit(rexAMode_M(fake(1), i->Ain.EvCheck.amCounter));
   3484       if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
   3485       *p++ = 0xFF;
   3486       p = doAMode_M(p, fake(1), i->Ain.EvCheck.amCounter);
   3487       vassert(p - p0 == 3);
   3488       /* --- jns nofail --- */
   3489       *p++ = 0x79;
   3490       *p++ = 0x03; /* need to check this 0x03 after the next insn */
   3491       vassert(p - p0 == 5);
   3492       /* --- jmp* 0(%rbp) --- */
   3493       /* Once again, verify we don't need REX.  The encoding is FF /4.
   3494          We don't need REX.W since by default FF /4 in 64-bit mode
   3495          implies a 64 bit load. */
   3496       rex = clearWBit(rexAMode_M(fake(4), i->Ain.EvCheck.amFailAddr));
   3497       if (rex != 0x40) goto bad;
   3498       *p++ = 0xFF;
   3499       p = doAMode_M(p, fake(4), i->Ain.EvCheck.amFailAddr);
   3500       vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
   3501       /* And crosscheck .. */
   3502       vassert(evCheckSzB_AMD64() == 8);
   3503       goto done;
   3504    }
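            /* Worked bytes (a sketch, assuming amCounter is 8(%rbp) and amFailAddr
               is 0(%rbp) as anticipated above):
                 FF 4D 08    decl 8(%rbp)
                 79 03       jns  nofail
                 FF 65 00    jmp* 0(%rbp)
               nofail:       -- 8 bytes in total, matching evCheckSzB_AMD64(). */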
   3505 
   3506    case Ain_ProfInc: {
   3507       /* We generate   movabsq $0, %r11
   3508                        incq (%r11)
   3509          in the expectation that a later call to LibVEX_patchProfCtr
   3510          will be used to fill in the immediate field once the right
   3511          value is known.
   3512          49 BB 00 00 00 00 00 00 00 00
   3513          49 FF 03
   3514       */
   3515       *p++ = 0x49; *p++ = 0xBB;
   3516       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
   3517       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
   3518       *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
   3519       /* Tell the caller .. */
   3520       vassert(!(*is_profInc));
   3521       *is_profInc = True;
   3522       goto done;
   3523    }
   3524 
   3525    default:
   3526       goto bad;
   3527    }
   3528 
   3529   bad:
   3530    ppAMD64Instr(i, mode64);
   3531    vpanic("emit_AMD64Instr");
   3532    /*NOTREACHED*/
   3533 
   3534   done:
   3535    vassert(p - &buf[0] <= 32);
   3536    return p - &buf[0];
   3537 
   3538 #  undef fake
   3539 }
   3540 
   3541 
   3542 /* How big is an event check?  See case for Ain_EvCheck in
   3543    emit_AMD64Instr just above.  That crosschecks what this returns, so
   3544    we can tell if we're inconsistent. */
   3545 Int evCheckSzB_AMD64 ( void )
   3546 {
   3547    return 8;
   3548 }
   3549 
   3550 
   3551 /* NB: what goes on here has to be very closely coordinated with the
   3552    emitInstr case for XDirect, above. */
   3553 VexInvalRange chainXDirect_AMD64 ( void* place_to_chain,
   3554                                    void* disp_cp_chain_me_EXPECTED,
   3555                                    void* place_to_jump_to )
   3556 {
   3557    /* What we're expecting to see is:
   3558         movabsq $disp_cp_chain_me_EXPECTED, %r11
   3559         call *%r11
   3560       viz
   3561         49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
   3562         41 FF D3
   3563    */
   3564    UChar* p = (UChar*)place_to_chain;
   3565    vassert(p[0] == 0x49);
   3566    vassert(p[1] == 0xBB);
   3567    vassert(*(ULong*)(&p[2]) == Ptr_to_ULong(disp_cp_chain_me_EXPECTED));
   3568    vassert(p[10] == 0x41);
   3569    vassert(p[11] == 0xFF);
   3570    vassert(p[12] == 0xD3);
   3571    /* And what we want to change it to is either:
   3572         (general case):
   3573           movabsq $place_to_jump_to, %r11
   3574           jmpq *%r11
   3575         viz
   3576           49 BB <8 bytes value == place_to_jump_to>
   3577           41 FF E3
   3578         So it's the same length (convenient, huh) and we don't
   3579         need to change all the bits.
   3580       ---OR---
   3581         in the case where the displacement falls within 32 bits
   3582           jmpq disp32   where disp32 is relative to the next insn
   3583           ud2; ud2; ud2; ud2
   3584         viz
   3585           E9 <4 bytes == disp32>
   3586           0F 0B 0F 0B 0F 0B 0F 0B
   3587 
   3588       In both cases the replacement has the same length as the original.
   3589       To remain sane & verifiable,
   3590       (1) limit the displacement for the short form to
   3591           (say) +/- one billion, so as to avoid wraparound
   3592           off-by-ones
   3593       (2) even if the short form is applicable, once every (say)
   3594           1024 times use the long form anyway, so as to maintain
   3595           verifiability
   3596    */
   3597    /* This is the delta we need to put into a JMP d32 insn.  It's
   3598       relative to the start of the next insn, hence the -5.  */
   3599    Long delta   = (Long)((UChar*)place_to_jump_to - (UChar*)p) - (Long)5;
   3600    Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
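            /* Illustrative numbers only: if place_to_chain were 0x1000 and
               place_to_jump_to were 0x2000, delta would be 0xFFB and the short form
               below would write E9 FB 0F 00 00 followed by four ud2s (0F 0B). */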
   3601 
   3602    static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
   3603    if (shortOK) {
   3604       shortCTR++; // thread safety bleh
   3605       if (0 == (shortCTR & 0x3FF)) {
   3606          shortOK = False;
   3607          if (0)
   3608             vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
   3609                        "using long jmp\n", shortCTR);
   3610       }
   3611    }
   3612 
   3613    /* And make the modifications. */
   3614    if (shortOK) {
   3615       p[0]  = 0xE9;
   3616       p[1]  = (delta >> 0) & 0xFF;
   3617       p[2]  = (delta >> 8) & 0xFF;
   3618       p[3]  = (delta >> 16) & 0xFF;
   3619       p[4]  = (delta >> 24) & 0xFF;
   3620       p[5]  = 0x0F; p[6]  = 0x0B;
   3621       p[7]  = 0x0F; p[8]  = 0x0B;
   3622       p[9]  = 0x0F; p[10] = 0x0B;
   3623       p[11] = 0x0F; p[12] = 0x0B;
   3624       /* sanity check on the delta -- top 32 are all 0 or all 1 */
   3625       delta >>= 32;
   3626       vassert(delta == 0LL || delta == -1LL);
   3627    } else {
   3628       /* Minimal modifications from the starting sequence. */
   3629       *(ULong*)(&p[2]) = Ptr_to_ULong(place_to_jump_to);
   3630       p[12] = 0xE3;
   3631    }
   3632    VexInvalRange vir = { (HWord)place_to_chain, 13 };
   3633    return vir;
   3634 }
   3635 
   3636 
   3637 /* NB: what goes on here has to be very closely coordinated with the
   3638    emitInstr case for XDirect, above. */
   3639 VexInvalRange unchainXDirect_AMD64 ( void* place_to_unchain,
   3640                                      void* place_to_jump_to_EXPECTED,
   3641                                      void* disp_cp_chain_me )
   3642 {
   3643    /* What we're expecting to see is either:
   3644         (general case)
   3645           movabsq $place_to_jump_to_EXPECTED, %r11
   3646           jmpq *%r11
   3647         viz
   3648           49 BB <8 bytes value == place_to_jump_to_EXPECTED>
   3649           41 FF E3
   3650       ---OR---
   3651         in the case where the displacement falls within 32 bits
   3652           jmpq d32
   3653           ud2; ud2; ud2; ud2
   3654         viz
   3655           E9 <4 bytes == disp32>
   3656           0F 0B 0F 0B 0F 0B 0F 0B
   3657    */
   3658    UChar* p     = (UChar*)place_to_unchain;
   3659    Bool   valid = False;
   3660    if (p[0] == 0x49 && p[1] == 0xBB
   3661        && *(ULong*)(&p[2]) == Ptr_to_ULong(place_to_jump_to_EXPECTED)
   3662        && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
   3663       /* it's the long form */
   3664       valid = True;
   3665    }
   3666    else
   3667    if (p[0] == 0xE9
   3668        && p[5]  == 0x0F && p[6]  == 0x0B
   3669        && p[7]  == 0x0F && p[8]  == 0x0B
   3670        && p[9]  == 0x0F && p[10] == 0x0B
   3671        && p[11] == 0x0F && p[12] == 0x0B) {
   3672       /* It's the short form.  Check the offset is right. */
   3673       Int  s32 = *(Int*)(&p[1]);
   3674       Long s64 = (Long)s32;
   3675       if ((UChar*)p + 5 + s64 == (UChar*)place_to_jump_to_EXPECTED) {
   3676          valid = True;
   3677          if (0)
   3678             vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
   3679       }
   3680    }
   3681    vassert(valid);
   3682    /* And what we want to change it to is:
   3683         movabsq $disp_cp_chain_me, %r11
   3684         call *%r11
   3685       viz
   3686         49 BB <8 bytes value == disp_cp_chain_me>
   3687         41 FF D3
   3688       So it's the same length (convenient, huh).
   3689    */
   3690    p[0] = 0x49;
   3691    p[1] = 0xBB;
   3692    *(ULong*)(&p[2]) = Ptr_to_ULong(disp_cp_chain_me);
   3693    p[10] = 0x41;
   3694    p[11] = 0xFF;
   3695    p[12] = 0xD3;
   3696    VexInvalRange vir = { (HWord)place_to_unchain, 13 };
   3697    return vir;
   3698 }
   3699 
   3700 
   3701 /* Patch the counter address into a profile inc point, as previously
   3702    created by the Ain_ProfInc case for emit_AMD64Instr. */
   3703 VexInvalRange patchProfInc_AMD64 ( void*  place_to_patch,
   3704                                    ULong* location_of_counter )
   3705 {
   3706    vassert(sizeof(ULong*) == 8);
   3707    UChar* p = (UChar*)place_to_patch;
   3708    vassert(p[0] == 0x49);
   3709    vassert(p[1] == 0xBB);
   3710    vassert(p[2] == 0x00);
   3711    vassert(p[3] == 0x00);
   3712    vassert(p[4] == 0x00);
   3713    vassert(p[5] == 0x00);
   3714    vassert(p[6] == 0x00);
   3715    vassert(p[7] == 0x00);
   3716    vassert(p[8] == 0x00);
   3717    vassert(p[9] == 0x00);
   3718    vassert(p[10] == 0x49);
   3719    vassert(p[11] == 0xFF);
   3720    vassert(p[12] == 0x03);
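            /* Illustrative value only: if location_of_counter were
               0x00007FFF12345678, the stores below would place
               78 56 34 12 FF 7F 00 00 in p[2..9], the little-endian immediate of
               the movabsq. */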
   3721    ULong imm64 = (ULong)Ptr_to_ULong(location_of_counter);
   3722    p[2] = imm64 & 0xFF; imm64 >>= 8;
   3723    p[3] = imm64 & 0xFF; imm64 >>= 8;
   3724    p[4] = imm64 & 0xFF; imm64 >>= 8;
   3725    p[5] = imm64 & 0xFF; imm64 >>= 8;
   3726    p[6] = imm64 & 0xFF; imm64 >>= 8;
   3727    p[7] = imm64 & 0xFF; imm64 >>= 8;
   3728    p[8] = imm64 & 0xFF; imm64 >>= 8;
   3729    p[9] = imm64 & 0xFF; imm64 >>= 8;
   3730    VexInvalRange vir = { (HWord)place_to_patch, 13 };
   3731    return vir;
   3732 }
   3733 
   3734 
   3735 /*---------------------------------------------------------------*/
   3736 /*--- end                                   host_amd64_defs.c ---*/
   3737 /*---------------------------------------------------------------*/
   3738