      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                                 host_amd64_defs.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2017 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "libvex.h"
     38 #include "libvex_trc_values.h"
     39 
     40 #include "main_util.h"
     41 #include "host_generic_regs.h"
     42 #include "host_amd64_defs.h"
     43 
     44 
     45 /* --------- Registers. --------- */
     46 
     47 const RRegUniverse* getRRegUniverse_AMD64 ( void )
     48 {
     49    /* The real-register universe is a big constant, so we just want to
     50       initialise it once. */
     51    static RRegUniverse rRegUniverse_AMD64;
     52    static Bool         rRegUniverse_AMD64_initted = False;
     53 
     54    /* Handy shorthand, nothing more */
     55    RRegUniverse* ru = &rRegUniverse_AMD64;
     56 
     57    /* This isn't thread-safe.  Sigh. */
     58    if (LIKELY(rRegUniverse_AMD64_initted))
     59       return ru;
     60 
     61    RRegUniverse__init(ru);
     62 
     63    /* Add the registers.  The initial segment of this array must be
     64       those available for allocation by reg-alloc, and those that
     65       follow are not available for allocation. */
     66    ru->regs[ru->size++] = hregAMD64_RSI();
     67    ru->regs[ru->size++] = hregAMD64_RDI();
     68    ru->regs[ru->size++] = hregAMD64_R8();
     69    ru->regs[ru->size++] = hregAMD64_R9();
     70    ru->regs[ru->size++] = hregAMD64_R12();
     71    ru->regs[ru->size++] = hregAMD64_R13();
     72    ru->regs[ru->size++] = hregAMD64_R14();
     73    ru->regs[ru->size++] = hregAMD64_R15();
     74    ru->regs[ru->size++] = hregAMD64_RBX();
     75    ru->regs[ru->size++] = hregAMD64_XMM3();
     76    ru->regs[ru->size++] = hregAMD64_XMM4();
     77    ru->regs[ru->size++] = hregAMD64_XMM5();
     78    ru->regs[ru->size++] = hregAMD64_XMM6();
     79    ru->regs[ru->size++] = hregAMD64_XMM7();
     80    ru->regs[ru->size++] = hregAMD64_XMM8();
     81    ru->regs[ru->size++] = hregAMD64_XMM9();
     82    ru->regs[ru->size++] = hregAMD64_XMM10();
     83    ru->regs[ru->size++] = hregAMD64_XMM11();
     84    ru->regs[ru->size++] = hregAMD64_XMM12();
     85    ru->regs[ru->size++] = hregAMD64_R10();
     86    ru->allocable = ru->size;
     87    /* And other regs, not available to the allocator. */
     88    ru->regs[ru->size++] = hregAMD64_RAX();
     89    ru->regs[ru->size++] = hregAMD64_RCX();
     90    ru->regs[ru->size++] = hregAMD64_RDX();
     91    ru->regs[ru->size++] = hregAMD64_RSP();
     92    ru->regs[ru->size++] = hregAMD64_RBP();
     93    ru->regs[ru->size++] = hregAMD64_R11();
     94    ru->regs[ru->size++] = hregAMD64_XMM0();
     95    ru->regs[ru->size++] = hregAMD64_XMM1();
     96 
     97    rRegUniverse_AMD64_initted = True;
     98 
     99    RRegUniverse__check_is_sane(ru);
    100    return ru;
    101 }
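/* Note on the allocable/reserved split: the registers added after the
   ->allocable mark appear to be reserved for fixed roles elsewhere in
   this backend -- %rsp is the host stack pointer, %rax/%rdx are
   implicitly used by MulL/Div, %rcx carries variable shift counts for
   Sh64, %r11 is the scratch register in the XDirect/XIndir/XAssisted
   sequences shown by ppAMD64Instr, and %rbp/%xmm0/%xmm1 are presumably
   claimed as fixed or scratch registers by the instruction selector. */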
    102 
    103 
    104 void ppHRegAMD64 ( HReg reg )
    105 {
    106    Int r;
    107    static const HChar* ireg64_names[16]
    108      = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
    109          "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
    110    /* Be generic for all virtual regs. */
    111    if (hregIsVirtual(reg)) {
    112       ppHReg(reg);
    113       return;
    114    }
    115    /* But specific for real regs. */
    116    switch (hregClass(reg)) {
    117       case HRcInt64:
    118          r = hregEncoding(reg);
    119          vassert(r >= 0 && r < 16);
    120          vex_printf("%s", ireg64_names[r]);
    121          return;
    122       case HRcVec128:
    123          r = hregEncoding(reg);
    124          vassert(r >= 0 && r < 16);
    125          vex_printf("%%xmm%d", r);
    126          return;
    127       default:
    128          vpanic("ppHRegAMD64");
    129    }
    130 }
    131 
    132 static void ppHRegAMD64_lo32 ( HReg reg )
    133 {
    134    Int r;
    135    static const HChar* ireg32_names[16]
    136      = { "%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
    137          "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
    138    /* Be generic for all virtual regs. */
    139    if (hregIsVirtual(reg)) {
    140       ppHReg(reg);
    141       vex_printf("d");
    142       return;
    143    }
    144    /* But specific for real regs. */
    145    switch (hregClass(reg)) {
    146       case HRcInt64:
    147          r = hregEncoding(reg);
    148          vassert(r >= 0 && r < 16);
    149          vex_printf("%s", ireg32_names[r]);
    150          return;
    151       default:
    152          vpanic("ppHRegAMD64_lo32: invalid regclass");
    153    }
    154 }
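/* For virtual registers the lo32 printer simply appends a "d" to the
   generic name, mirroring the %r8d/%r9d style naming used for the low
   32 bits of the numbered real registers. */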
    155 
    156 
    157 /* --------- Condition codes, Intel encoding. --------- */
    158 
    159 const HChar* showAMD64CondCode ( AMD64CondCode cond )
    160 {
    161    switch (cond) {
    162       case Acc_O:      return "o";
    163       case Acc_NO:     return "no";
    164       case Acc_B:      return "b";
    165       case Acc_NB:     return "nb";
    166       case Acc_Z:      return "z";
    167       case Acc_NZ:     return "nz";
    168       case Acc_BE:     return "be";
    169       case Acc_NBE:    return "nbe";
    170       case Acc_S:      return "s";
    171       case Acc_NS:     return "ns";
    172       case Acc_P:      return "p";
    173       case Acc_NP:     return "np";
    174       case Acc_L:      return "l";
    175       case Acc_NL:     return "nl";
    176       case Acc_LE:     return "le";
    177       case Acc_NLE:    return "nle";
    178       case Acc_ALWAYS: return "ALWAYS";
     179       default: vpanic("showAMD64CondCode");
    180    }
    181 }
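/* These names follow the Intel encoding of the condition codes; the
   enumeration values are expected to match the 4-bit cc field used in
   the Jcc/SETcc/CMOVcc encodings (0x0 = o, 0x1 = no, ..., 0xF = nle),
   with Acc_ALWAYS as an extra pseudo-condition used only internally. */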
    182 
    183 
    184 /* --------- AMD64AMode: memory address expressions. --------- */
    185 
    186 AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
    187    AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
    188    am->tag        = Aam_IR;
    189    am->Aam.IR.imm = imm32;
    190    am->Aam.IR.reg = reg;
    191    return am;
    192 }
    193 AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
    194    AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
    195    am->tag = Aam_IRRS;
    196    am->Aam.IRRS.imm   = imm32;
    197    am->Aam.IRRS.base  = base;
    198    am->Aam.IRRS.index = indEx;
    199    am->Aam.IRRS.shift = shift;
    200    vassert(shift >= 0 && shift <= 3);
    201    return am;
    202 }
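/* The two address forms correspond to the usual x86-64 addressing modes:

      Aam_IR:   imm32 + reg
      Aam_IRRS: imm32 + base + (index << shift)

   For example (illustrative only),

      AMD64AMode_IR(0x10, hregAMD64_RBP())

   denotes %rbp + 0x10 and is rendered as "0x10(%rbp)", while

      AMD64AMode_IRRS(0x10, hregAMD64_RBX(), hregAMD64_RCX(), 2)

   denotes %rbx + %rcx*4 + 0x10 and is rendered as "0x10(%rbx,%rcx,4)"
   by ppAMD64AMode below. */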
    203 
    204 void ppAMD64AMode ( AMD64AMode* am ) {
    205    switch (am->tag) {
    206       case Aam_IR:
    207          if (am->Aam.IR.imm == 0)
    208             vex_printf("(");
    209          else
    210             vex_printf("0x%x(", am->Aam.IR.imm);
    211          ppHRegAMD64(am->Aam.IR.reg);
    212          vex_printf(")");
    213          return;
    214       case Aam_IRRS:
    215          vex_printf("0x%x(", am->Aam.IRRS.imm);
    216          ppHRegAMD64(am->Aam.IRRS.base);
    217          vex_printf(",");
    218          ppHRegAMD64(am->Aam.IRRS.index);
    219          vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
    220          return;
    221       default:
    222          vpanic("ppAMD64AMode");
    223    }
    224 }
    225 
    226 static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
    227    switch (am->tag) {
    228       case Aam_IR:
    229          addHRegUse(u, HRmRead, am->Aam.IR.reg);
    230          return;
    231       case Aam_IRRS:
    232          addHRegUse(u, HRmRead, am->Aam.IRRS.base);
    233          addHRegUse(u, HRmRead, am->Aam.IRRS.index);
    234          return;
    235       default:
    236          vpanic("addRegUsage_AMD64AMode");
    237    }
    238 }
    239 
    240 static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
    241    switch (am->tag) {
    242       case Aam_IR:
    243          am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
    244          return;
    245       case Aam_IRRS:
    246          am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
    247          am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
    248          return;
    249       default:
    250          vpanic("mapRegs_AMD64AMode");
    251    }
    252 }
    253 
    254 /* --------- Operand, which can be reg, immediate or memory. --------- */
    255 
    256 AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
    257    AMD64RMI* op       = LibVEX_Alloc_inline(sizeof(AMD64RMI));
    258    op->tag            = Armi_Imm;
    259    op->Armi.Imm.imm32 = imm32;
    260    return op;
    261 }
    262 AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
    263    AMD64RMI* op     = LibVEX_Alloc_inline(sizeof(AMD64RMI));
    264    op->tag          = Armi_Reg;
    265    op->Armi.Reg.reg = reg;
    266    return op;
    267 }
    268 AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
    269    AMD64RMI* op    = LibVEX_Alloc_inline(sizeof(AMD64RMI));
    270    op->tag         = Armi_Mem;
    271    op->Armi.Mem.am = am;
    272    return op;
    273 }
    274 
    275 static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
    276    switch (op->tag) {
    277       case Armi_Imm:
    278          vex_printf("$0x%x", op->Armi.Imm.imm32);
    279          return;
    280       case Armi_Reg:
    281          if (lo32)
    282             ppHRegAMD64_lo32(op->Armi.Reg.reg);
    283          else
    284             ppHRegAMD64(op->Armi.Reg.reg);
    285          return;
    286       case Armi_Mem:
    287          ppAMD64AMode(op->Armi.Mem.am);
    288          return;
     289       default:
    290          vpanic("ppAMD64RMI");
    291    }
    292 }
    293 void ppAMD64RMI ( AMD64RMI* op ) {
    294    ppAMD64RMI_wrk(op, False/*!lo32*/);
    295 }
    296 void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
    297    ppAMD64RMI_wrk(op, True/*lo32*/);
    298 }
    299 
    300 /* An AMD64RMI can only be used in a "read" context (what would it mean
    301    to write or modify a literal?) and so we enumerate its registers
    302    accordingly. */
    303 static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
    304    switch (op->tag) {
    305       case Armi_Imm:
    306          return;
    307       case Armi_Reg:
    308          addHRegUse(u, HRmRead, op->Armi.Reg.reg);
    309          return;
    310       case Armi_Mem:
    311          addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
    312          return;
    313       default:
    314          vpanic("addRegUsage_AMD64RMI");
    315    }
    316 }
    317 
    318 static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
    319    switch (op->tag) {
    320       case Armi_Imm:
    321          return;
    322       case Armi_Reg:
    323          op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
    324          return;
    325       case Armi_Mem:
    326          mapRegs_AMD64AMode(m, op->Armi.Mem.am);
    327          return;
    328       default:
    329          vpanic("mapRegs_AMD64RMI");
    330    }
    331 }
    332 
    333 
    334 /* --------- Operand, which can be reg or immediate only. --------- */
    335 
    336 AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
    337    AMD64RI* op       = LibVEX_Alloc_inline(sizeof(AMD64RI));
    338    op->tag           = Ari_Imm;
    339    op->Ari.Imm.imm32 = imm32;
    340    return op;
    341 }
    342 AMD64RI* AMD64RI_Reg ( HReg reg ) {
    343    AMD64RI* op     = LibVEX_Alloc_inline(sizeof(AMD64RI));
    344    op->tag         = Ari_Reg;
    345    op->Ari.Reg.reg = reg;
    346    return op;
    347 }
    348 
    349 void ppAMD64RI ( AMD64RI* op ) {
    350    switch (op->tag) {
    351       case Ari_Imm:
    352          vex_printf("$0x%x", op->Ari.Imm.imm32);
    353          return;
    354       case Ari_Reg:
    355          ppHRegAMD64(op->Ari.Reg.reg);
    356          return;
     357       default:
    358          vpanic("ppAMD64RI");
    359    }
    360 }
    361 
    362 /* An AMD64RI can only be used in a "read" context (what would it mean
    363    to write or modify a literal?) and so we enumerate its registers
    364    accordingly. */
    365 static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
    366    switch (op->tag) {
    367       case Ari_Imm:
    368          return;
    369       case Ari_Reg:
    370          addHRegUse(u, HRmRead, op->Ari.Reg.reg);
    371          return;
    372       default:
    373          vpanic("addRegUsage_AMD64RI");
    374    }
    375 }
    376 
    377 static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
    378    switch (op->tag) {
    379       case Ari_Imm:
    380          return;
    381       case Ari_Reg:
    382          op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
    383          return;
    384       default:
    385          vpanic("mapRegs_AMD64RI");
    386    }
    387 }
    388 
    389 
    390 /* --------- Operand, which can be reg or memory only. --------- */
    391 
    392 AMD64RM* AMD64RM_Reg ( HReg reg ) {
    393    AMD64RM* op       = LibVEX_Alloc_inline(sizeof(AMD64RM));
    394    op->tag         = Arm_Reg;
    395    op->Arm.Reg.reg = reg;
    396    return op;
    397 }
    398 AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
    399    AMD64RM* op    = LibVEX_Alloc_inline(sizeof(AMD64RM));
    400    op->tag        = Arm_Mem;
    401    op->Arm.Mem.am = am;
    402    return op;
    403 }
    404 
    405 void ppAMD64RM ( AMD64RM* op ) {
    406    switch (op->tag) {
    407       case Arm_Mem:
    408          ppAMD64AMode(op->Arm.Mem.am);
    409          return;
    410       case Arm_Reg:
    411          ppHRegAMD64(op->Arm.Reg.reg);
    412          return;
     413       default:
    414          vpanic("ppAMD64RM");
    415    }
    416 }
    417 
     418 /* Because an AMD64RM can be either a source or a destination operand, we
    419    have to supply a mode -- pertaining to the operand as a whole --
    420    indicating how it's being used. */
    421 static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
    422    switch (op->tag) {
    423       case Arm_Mem:
    424          /* Memory is read, written or modified.  So we just want to
    425             know the regs read by the amode. */
    426          addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
    427          return;
    428       case Arm_Reg:
    429          /* reg is read, written or modified.  Add it in the
    430             appropriate way. */
    431          addHRegUse(u, mode, op->Arm.Reg.reg);
    432          return;
     433       default:
    434          vpanic("addRegUsage_AMD64RM");
    435    }
    436 }
    437 
    438 static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
    439 {
    440    switch (op->tag) {
    441       case Arm_Mem:
    442          mapRegs_AMD64AMode(m, op->Arm.Mem.am);
    443          return;
    444       case Arm_Reg:
    445          op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
    446          return;
     447       default:
    448          vpanic("mapRegs_AMD64RM");
    449    }
    450 }
    451 
    452 
    453 /* --------- Instructions. --------- */
    454 
    455 static const HChar* showAMD64ScalarSz ( Int sz ) {
    456    switch (sz) {
    457       case 2: return "w";
    458       case 4: return "l";
    459       case 8: return "q";
    460       default: vpanic("showAMD64ScalarSz");
    461    }
    462 }
    463 
    464 const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
    465    switch (op) {
    466       case Aun_NOT: return "not";
    467       case Aun_NEG: return "neg";
    468       default: vpanic("showAMD64UnaryOp");
    469    }
    470 }
    471 
    472 const HChar* showAMD64AluOp ( AMD64AluOp op ) {
    473    switch (op) {
    474       case Aalu_MOV:  return "mov";
    475       case Aalu_CMP:  return "cmp";
    476       case Aalu_ADD:  return "add";
    477       case Aalu_SUB:  return "sub";
    478       case Aalu_ADC:  return "adc";
    479       case Aalu_SBB:  return "sbb";
    480       case Aalu_AND:  return "and";
    481       case Aalu_OR:   return "or";
    482       case Aalu_XOR:  return "xor";
    483       case Aalu_MUL:  return "imul";
    484       default: vpanic("showAMD64AluOp");
    485    }
    486 }
    487 
    488 const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
    489    switch (op) {
    490       case Ash_SHL: return "shl";
    491       case Ash_SHR: return "shr";
    492       case Ash_SAR: return "sar";
    493       default: vpanic("showAMD64ShiftOp");
    494    }
    495 }
    496 
    497 const HChar* showA87FpOp ( A87FpOp op ) {
    498    switch (op) {
    499       case Afp_SCALE:  return "scale";
    500       case Afp_ATAN:   return "atan";
    501       case Afp_YL2X:   return "yl2x";
    502       case Afp_YL2XP1: return "yl2xp1";
    503       case Afp_PREM:   return "prem";
    504       case Afp_PREM1:  return "prem1";
    505       case Afp_SQRT:   return "sqrt";
    506       case Afp_SIN:    return "sin";
    507       case Afp_COS:    return "cos";
    508       case Afp_TAN:    return "tan";
    509       case Afp_ROUND:  return "round";
    510       case Afp_2XM1:   return "2xm1";
    511       default: vpanic("showA87FpOp");
    512    }
    513 }
    514 
    515 const HChar* showAMD64SseOp ( AMD64SseOp op ) {
    516    switch (op) {
    517       case Asse_MOV:      return "movups";
    518       case Asse_ADDF:     return "add";
    519       case Asse_SUBF:     return "sub";
    520       case Asse_MULF:     return "mul";
    521       case Asse_DIVF:     return "div";
    522       case Asse_MAXF:     return "max";
    523       case Asse_MINF:     return "min";
    524       case Asse_CMPEQF:   return "cmpFeq";
    525       case Asse_CMPLTF:   return "cmpFlt";
    526       case Asse_CMPLEF:   return "cmpFle";
    527       case Asse_CMPUNF:   return "cmpFun";
    528       case Asse_RCPF:     return "rcp";
    529       case Asse_RSQRTF:   return "rsqrt";
    530       case Asse_SQRTF:    return "sqrt";
    531       case Asse_AND:      return "and";
    532       case Asse_OR:       return "or";
    533       case Asse_XOR:      return "xor";
    534       case Asse_ANDN:     return "andn";
    535       case Asse_ADD8:     return "paddb";
    536       case Asse_ADD16:    return "paddw";
    537       case Asse_ADD32:    return "paddd";
    538       case Asse_ADD64:    return "paddq";
    539       case Asse_QADD8U:   return "paddusb";
    540       case Asse_QADD16U:  return "paddusw";
    541       case Asse_QADD8S:   return "paddsb";
    542       case Asse_QADD16S:  return "paddsw";
    543       case Asse_SUB8:     return "psubb";
    544       case Asse_SUB16:    return "psubw";
    545       case Asse_SUB32:    return "psubd";
    546       case Asse_SUB64:    return "psubq";
    547       case Asse_QSUB8U:   return "psubusb";
    548       case Asse_QSUB16U:  return "psubusw";
    549       case Asse_QSUB8S:   return "psubsb";
    550       case Asse_QSUB16S:  return "psubsw";
    551       case Asse_MUL16:    return "pmullw";
    552       case Asse_MULHI16U: return "pmulhuw";
    553       case Asse_MULHI16S: return "pmulhw";
    554       case Asse_AVG8U:    return "pavgb";
    555       case Asse_AVG16U:   return "pavgw";
    556       case Asse_MAX16S:   return "pmaxw";
    557       case Asse_MAX8U:    return "pmaxub";
    558       case Asse_MIN16S:   return "pminw";
    559       case Asse_MIN8U:    return "pminub";
    560       case Asse_CMPEQ8:   return "pcmpeqb";
    561       case Asse_CMPEQ16:  return "pcmpeqw";
    562       case Asse_CMPEQ32:  return "pcmpeqd";
    563       case Asse_CMPGT8S:  return "pcmpgtb";
    564       case Asse_CMPGT16S: return "pcmpgtw";
    565       case Asse_CMPGT32S: return "pcmpgtd";
    566       case Asse_SHL16:    return "psllw";
    567       case Asse_SHL32:    return "pslld";
    568       case Asse_SHL64:    return "psllq";
    569       case Asse_SHR16:    return "psrlw";
    570       case Asse_SHR32:    return "psrld";
    571       case Asse_SHR64:    return "psrlq";
    572       case Asse_SAR16:    return "psraw";
    573       case Asse_SAR32:    return "psrad";
    574       case Asse_PACKSSD:  return "packssdw";
    575       case Asse_PACKSSW:  return "packsswb";
    576       case Asse_PACKUSW:  return "packuswb";
    577       case Asse_UNPCKHB:  return "punpckhb";
    578       case Asse_UNPCKHW:  return "punpckhw";
    579       case Asse_UNPCKHD:  return "punpckhd";
    580       case Asse_UNPCKHQ:  return "punpckhq";
    581       case Asse_UNPCKLB:  return "punpcklb";
    582       case Asse_UNPCKLW:  return "punpcklw";
    583       case Asse_UNPCKLD:  return "punpckld";
    584       case Asse_UNPCKLQ:  return "punpcklq";
    585       default: vpanic("showAMD64SseOp");
    586    }
    587 }
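/* Note that for the float ops (Asse_ADDF etc.) this returns only the
   operation stem; ppAMD64Instr appends the width/laneage suffix itself,
   giving e.g. "addps" for Ain_Sse32Fx4, "addss" for Ain_Sse32FLo,
   "addpd" for Ain_Sse64Fx2 and "addsd" for Ain_Sse64FLo. */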
    588 
    589 AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
    590    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    591    i->tag             = Ain_Imm64;
    592    i->Ain.Imm64.imm64 = imm64;
    593    i->Ain.Imm64.dst   = dst;
    594    return i;
    595 }
    596 AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
    597    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    598    i->tag            = Ain_Alu64R;
    599    i->Ain.Alu64R.op  = op;
    600    i->Ain.Alu64R.src = src;
    601    i->Ain.Alu64R.dst = dst;
    602    return i;
    603 }
    604 AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
    605    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    606    i->tag            = Ain_Alu64M;
    607    i->Ain.Alu64M.op  = op;
    608    i->Ain.Alu64M.src = src;
    609    i->Ain.Alu64M.dst = dst;
    610    vassert(op != Aalu_MUL);
    611    return i;
    612 }
    613 AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
    614    AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    615    i->tag          = Ain_Sh64;
    616    i->Ain.Sh64.op  = op;
    617    i->Ain.Sh64.src = src;
    618    i->Ain.Sh64.dst = dst;
    619    return i;
    620 }
    621 AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
    622    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    623    i->tag              = Ain_Test64;
    624    i->Ain.Test64.imm32 = imm32;
    625    i->Ain.Test64.dst   = dst;
    626    return i;
    627 }
    628 AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
    629    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    630    i->tag             = Ain_Unary64;
    631    i->Ain.Unary64.op  = op;
    632    i->Ain.Unary64.dst = dst;
    633    return i;
    634 }
    635 AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
    636    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    637    i->tag             = Ain_Lea64;
    638    i->Ain.Lea64.am    = am;
    639    i->Ain.Lea64.dst   = dst;
    640    return i;
    641 }
    642 AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
    643    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    644    i->tag            = Ain_Alu32R;
    645    i->Ain.Alu32R.op  = op;
    646    i->Ain.Alu32R.src = src;
    647    i->Ain.Alu32R.dst = dst;
    648    switch (op) {
    649       case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
    650       case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
    651       default: vassert(0);
    652    }
    653    return i;
    654 }
    655 AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
    656    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    657    i->tag            = Ain_MulL;
    658    i->Ain.MulL.syned = syned;
    659    i->Ain.MulL.src   = src;
    660    return i;
    661 }
    662 AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
    663    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    664    i->tag            = Ain_Div;
    665    i->Ain.Div.syned  = syned;
    666    i->Ain.Div.sz     = sz;
    667    i->Ain.Div.src    = src;
    668    vassert(sz == 4 || sz == 8);
    669    return i;
    670 }
    671 AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
    672    AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    673    i->tag          = Ain_Push;
    674    i->Ain.Push.src = src;
    675    return i;
    676 }
    677 AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
    678                               RetLoc rloc ) {
    679    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    680    i->tag               = Ain_Call;
    681    i->Ain.Call.cond     = cond;
    682    i->Ain.Call.target   = target;
    683    i->Ain.Call.regparms = regparms;
    684    i->Ain.Call.rloc     = rloc;
    685    vassert(regparms >= 0 && regparms <= 6);
    686    vassert(is_sane_RetLoc(rloc));
    687    return i;
    688 }
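/* The regparms field records how many arguments are passed in registers;
   the 0..6 limit matches the six integer argument registers of the SysV
   AMD64 calling convention (%rdi, %rsi, %rdx, %rcx, %r8, %r9), which is
   presumably what the call emitter relies on. */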
    689 
    690 AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
    691                                  AMD64CondCode cond, Bool toFastEP ) {
    692    AMD64Instr* i           = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    693    i->tag                  = Ain_XDirect;
    694    i->Ain.XDirect.dstGA    = dstGA;
    695    i->Ain.XDirect.amRIP    = amRIP;
    696    i->Ain.XDirect.cond     = cond;
    697    i->Ain.XDirect.toFastEP = toFastEP;
    698    return i;
    699 }
    700 AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
    701                                 AMD64CondCode cond ) {
    702    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    703    i->tag              = Ain_XIndir;
    704    i->Ain.XIndir.dstGA = dstGA;
    705    i->Ain.XIndir.amRIP = amRIP;
    706    i->Ain.XIndir.cond  = cond;
    707    return i;
    708 }
    709 AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
    710                                    AMD64CondCode cond, IRJumpKind jk ) {
    711    AMD64Instr* i          = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    712    i->tag                 = Ain_XAssisted;
    713    i->Ain.XAssisted.dstGA = dstGA;
    714    i->Ain.XAssisted.amRIP = amRIP;
    715    i->Ain.XAssisted.cond  = cond;
    716    i->Ain.XAssisted.jk    = jk;
    717    return i;
    718 }
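/* XDirect, XIndir and XAssisted are the three flavours of translation
   exit: XDirect jumps to a known guest address and can later be patched
   to chain directly to the destination translation (hence the slow/fast
   entry-point distinction carried by toFastEP), XIndir jumps to a guest
   address held in a register, and XAssisted returns to the dispatcher
   with an IRJumpKind explaining why.  The corresponding code sequences,
   all of which use %r11 as scratch, are sketched in ppAMD64Instr below. */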
    719 
    720 AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, HReg src, HReg dst ) {
    721    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    722    i->tag             = Ain_CMov64;
    723    i->Ain.CMov64.cond = cond;
    724    i->Ain.CMov64.src  = src;
    725    i->Ain.CMov64.dst  = dst;
    726    vassert(cond != Acc_ALWAYS);
    727    return i;
    728 }
    729 AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
    730                                AMD64AMode* addr, HReg dst ) {
    731    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    732    i->tag            = Ain_CLoad;
    733    i->Ain.CLoad.cond = cond;
    734    i->Ain.CLoad.szB  = szB;
    735    i->Ain.CLoad.addr = addr;
    736    i->Ain.CLoad.dst  = dst;
    737    vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
    738    return i;
    739 }
    740 AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
    741                                 HReg src, AMD64AMode* addr ) {
    742    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    743    i->tag             = Ain_CStore;
    744    i->Ain.CStore.cond = cond;
    745    i->Ain.CStore.szB  = szB;
    746    i->Ain.CStore.src  = src;
    747    i->Ain.CStore.addr = addr;
    748    vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
    749    return i;
    750 }
    751 AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
    752    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    753    i->tag              = Ain_MovxLQ;
    754    i->Ain.MovxLQ.syned = syned;
    755    i->Ain.MovxLQ.src   = src;
    756    i->Ain.MovxLQ.dst   = dst;
    757    return i;
    758 }
    759 AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
    760                                 AMD64AMode* src, HReg dst ) {
    761    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    762    i->tag                = Ain_LoadEX;
    763    i->Ain.LoadEX.szSmall = szSmall;
    764    i->Ain.LoadEX.syned   = syned;
    765    i->Ain.LoadEX.src     = src;
    766    i->Ain.LoadEX.dst     = dst;
    767    vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
    768    return i;
    769 }
    770 AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
    771    AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    772    i->tag           = Ain_Store;
    773    i->Ain.Store.sz  = sz;
    774    i->Ain.Store.src = src;
    775    i->Ain.Store.dst = dst;
    776    vassert(sz == 1 || sz == 2 || sz == 4);
    777    return i;
    778 }
    779 AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
    780    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    781    i->tag            = Ain_Set64;
    782    i->Ain.Set64.cond = cond;
    783    i->Ain.Set64.dst  = dst;
    784    return i;
    785 }
    786 AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
    787    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    788    i->tag               = Ain_Bsfr64;
    789    i->Ain.Bsfr64.isFwds = isFwds;
    790    i->Ain.Bsfr64.src    = src;
    791    i->Ain.Bsfr64.dst    = dst;
    792    return i;
    793 }
    794 AMD64Instr* AMD64Instr_MFence ( void ) {
    795    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    796    i->tag        = Ain_MFence;
    797    return i;
    798 }
    799 AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
    800    AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    801    i->tag           = Ain_ACAS;
    802    i->Ain.ACAS.addr = addr;
    803    i->Ain.ACAS.sz   = sz;
    804    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
    805    return i;
    806 }
    807 AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
    808    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    809    i->tag            = Ain_DACAS;
    810    i->Ain.DACAS.addr = addr;
    811    i->Ain.DACAS.sz   = sz;
    812    vassert(sz == 8 || sz == 4);
    813    return i;
    814 }
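/* ACAS and DACAS leave most of their operands implicit, following the
   hardware cmpxchg/cmpxchg8b/cmpxchg16b conventions: ACAS compares the
   memory operand against %rax and stores %rbx on success, while DACAS
   uses the pair %rdx:%rax as the expected value and %rcx:%rbx as the
   new value, as the ppAMD64Instr text for these two cases indicates. */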
    815 
    816 AMD64Instr* AMD64Instr_A87Free ( Int nregs )
    817 {
    818    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    819    i->tag               = Ain_A87Free;
    820    i->Ain.A87Free.nregs = nregs;
    821    vassert(nregs >= 1 && nregs <= 7);
    822    return i;
    823 }
    824 AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
    825 {
    826    AMD64Instr* i            = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    827    i->tag                   = Ain_A87PushPop;
    828    i->Ain.A87PushPop.addr   = addr;
    829    i->Ain.A87PushPop.isPush = isPush;
    830    i->Ain.A87PushPop.szB    = szB;
    831    vassert(szB == 8 || szB == 4);
    832    return i;
    833 }
    834 AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
    835 {
    836    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    837    i->tag            = Ain_A87FpOp;
    838    i->Ain.A87FpOp.op = op;
    839    return i;
    840 }
    841 AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
    842 {
    843    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    844    i->tag              = Ain_A87LdCW;
    845    i->Ain.A87LdCW.addr = addr;
    846    return i;
    847 }
    848 AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
    849 {
    850    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    851    i->tag              = Ain_A87StSW;
    852    i->Ain.A87StSW.addr = addr;
    853    return i;
    854 }
    855 AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
    856    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    857    i->tag                = Ain_LdMXCSR;
    858    i->Ain.LdMXCSR.addr   = addr;
    859    return i;
    860 }
    861 AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
    862    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    863    i->tag                = Ain_SseUComIS;
    864    i->Ain.SseUComIS.sz   = toUChar(sz);
    865    i->Ain.SseUComIS.srcL = srcL;
    866    i->Ain.SseUComIS.srcR = srcR;
    867    i->Ain.SseUComIS.dst  = dst;
    868    vassert(sz == 4 || sz == 8);
    869    return i;
    870 }
    871 AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
    872    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    873    i->tag              = Ain_SseSI2SF;
    874    i->Ain.SseSI2SF.szS = toUChar(szS);
    875    i->Ain.SseSI2SF.szD = toUChar(szD);
    876    i->Ain.SseSI2SF.src = src;
    877    i->Ain.SseSI2SF.dst = dst;
    878    vassert(szS == 4 || szS == 8);
    879    vassert(szD == 4 || szD == 8);
    880    return i;
    881 }
    882 AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
    883    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    884    i->tag              = Ain_SseSF2SI;
    885    i->Ain.SseSF2SI.szS = toUChar(szS);
    886    i->Ain.SseSF2SI.szD = toUChar(szD);
    887    i->Ain.SseSF2SI.src = src;
    888    i->Ain.SseSF2SI.dst = dst;
    889    vassert(szS == 4 || szS == 8);
    890    vassert(szD == 4 || szD == 8);
    891    return i;
    892 }
    893 AMD64Instr* AMD64Instr_SseSDSS   ( Bool from64, HReg src, HReg dst )
    894 {
    895    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    896    i->tag                = Ain_SseSDSS;
    897    i->Ain.SseSDSS.from64 = from64;
    898    i->Ain.SseSDSS.src    = src;
    899    i->Ain.SseSDSS.dst    = dst;
    900    return i;
    901 }
    902 AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
    903                                  HReg reg, AMD64AMode* addr ) {
    904    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    905    i->tag                = Ain_SseLdSt;
    906    i->Ain.SseLdSt.isLoad = isLoad;
    907    i->Ain.SseLdSt.sz     = toUChar(sz);
    908    i->Ain.SseLdSt.reg    = reg;
    909    i->Ain.SseLdSt.addr   = addr;
    910    vassert(sz == 4 || sz == 8 || sz == 16);
    911    return i;
    912 }
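/* The sz field selects the move width: 4 and 8 are scalar loads/stores
   (printed as movss/movsd), while 16 moves the whole vector register
   (movups).  See the Ain_SseLdSt case of ppAMD64Instr. */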
    913 AMD64Instr* AMD64Instr_SseCStore ( AMD64CondCode cond,
    914                                    HReg src, AMD64AMode* addr )
    915 {
    916    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    917    i->tag                = Ain_SseCStore;
    918    i->Ain.SseCStore.cond = cond;
    919    i->Ain.SseCStore.src  = src;
    920    i->Ain.SseCStore.addr = addr;
    921    vassert(cond != Acc_ALWAYS);
    922    return i;
    923 }
    924 AMD64Instr* AMD64Instr_SseCLoad ( AMD64CondCode cond,
    925                                   AMD64AMode* addr, HReg dst )
    926 {
    927    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    928    i->tag               = Ain_SseCLoad;
    929    i->Ain.SseCLoad.cond = cond;
    930    i->Ain.SseCLoad.addr = addr;
    931    i->Ain.SseCLoad.dst  = dst;
    932    vassert(cond != Acc_ALWAYS);
    933    return i;
    934 }
    935 AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
    936 {
    937    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    938    i->tag                = Ain_SseLdzLO;
    939    i->Ain.SseLdzLO.sz    = sz;
    940    i->Ain.SseLdzLO.reg   = reg;
    941    i->Ain.SseLdzLO.addr  = addr;
    942    vassert(sz == 4 || sz == 8);
    943    return i;
    944 }
    945 AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
    946    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    947    i->tag              = Ain_Sse32Fx4;
    948    i->Ain.Sse32Fx4.op  = op;
    949    i->Ain.Sse32Fx4.src = src;
    950    i->Ain.Sse32Fx4.dst = dst;
    951    vassert(op != Asse_MOV);
    952    return i;
    953 }
    954 AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
    955    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    956    i->tag              = Ain_Sse32FLo;
    957    i->Ain.Sse32FLo.op  = op;
    958    i->Ain.Sse32FLo.src = src;
    959    i->Ain.Sse32FLo.dst = dst;
    960    vassert(op != Asse_MOV);
    961    return i;
    962 }
    963 AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
    964    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    965    i->tag              = Ain_Sse64Fx2;
    966    i->Ain.Sse64Fx2.op  = op;
    967    i->Ain.Sse64Fx2.src = src;
    968    i->Ain.Sse64Fx2.dst = dst;
    969    vassert(op != Asse_MOV);
    970    return i;
    971 }
    972 AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
    973    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    974    i->tag              = Ain_Sse64FLo;
    975    i->Ain.Sse64FLo.op  = op;
    976    i->Ain.Sse64FLo.src = src;
    977    i->Ain.Sse64FLo.dst = dst;
    978    vassert(op != Asse_MOV);
    979    return i;
    980 }
    981 AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
    982    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    983    i->tag             = Ain_SseReRg;
    984    i->Ain.SseReRg.op  = op;
    985    i->Ain.SseReRg.src = re;
    986    i->Ain.SseReRg.dst = rg;
    987    return i;
    988 }
    989 AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
    990    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    991    i->tag              = Ain_SseCMov;
    992    i->Ain.SseCMov.cond = cond;
    993    i->Ain.SseCMov.src  = src;
    994    i->Ain.SseCMov.dst  = dst;
    995    vassert(cond != Acc_ALWAYS);
    996    return i;
    997 }
    998 AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
    999    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   1000    i->tag               = Ain_SseShuf;
   1001    i->Ain.SseShuf.order = order;
   1002    i->Ain.SseShuf.src   = src;
   1003    i->Ain.SseShuf.dst   = dst;
   1004    vassert(order >= 0 && order <= 0xFF);
   1005    return i;
   1006 }
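/* The order byte is the pshufd immediate: four 2-bit fields selecting,
   for each destination lane, which of the four source lanes to copy
   (bits 1:0 pick lane 0, bits 3:2 pick lane 1, and so on). */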
   1007 //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
   1008 //uu                                  HReg reg, AMD64AMode* addr ) {
   1009 //uu    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   1010 //uu    i->tag                = Ain_AvxLdSt;
   1011 //uu    i->Ain.AvxLdSt.isLoad = isLoad;
   1012 //uu    i->Ain.AvxLdSt.reg    = reg;
   1013 //uu    i->Ain.AvxLdSt.addr   = addr;
   1014 //uu    return i;
   1015 //uu }
   1016 //uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
   1017 //uu    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   1018 //uu    i->tag             = Ain_AvxReRg;
   1019 //uu    i->Ain.AvxReRg.op  = op;
   1020 //uu    i->Ain.AvxReRg.src = re;
   1021 //uu    i->Ain.AvxReRg.dst = rg;
   1022 //uu    return i;
   1023 //uu }
   1024 AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
   1025                                  AMD64AMode* amFailAddr ) {
   1026    AMD64Instr* i             = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   1027    i->tag                    = Ain_EvCheck;
   1028    i->Ain.EvCheck.amCounter  = amCounter;
   1029    i->Ain.EvCheck.amFailAddr = amFailAddr;
   1030    return i;
   1031 }
   1032 AMD64Instr* AMD64Instr_ProfInc ( void ) {
   1033    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   1034    i->tag        = Ain_ProfInc;
   1035    return i;
   1036 }
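/* A minimal sketch of how these constructors are typically used during
   instruction selection (addInstr, newVRegI and src stand for the usual
   isel helpers and values; they are assumptions here, not definitions
   from this file):

      HReg tmp = newVRegI(env);
      // tmp = 0x1000; tmp += src -- 64-bit ALU ops on a virtual register
      addInstr(env, AMD64Instr_Imm64(0x1000ULL, tmp));
      addInstr(env, AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Reg(src), tmp));

   Register allocation later maps the virtual registers onto the real
   ones enumerated in getRRegUniverse_AMD64. */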
   1037 
   1038 void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
   1039 {
   1040    vassert(mode64 == True);
   1041    switch (i->tag) {
   1042       case Ain_Imm64:
   1043          vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
   1044          ppHRegAMD64(i->Ain.Imm64.dst);
   1045          return;
   1046       case Ain_Alu64R:
   1047          vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
   1048          ppAMD64RMI(i->Ain.Alu64R.src);
   1049          vex_printf(",");
   1050          ppHRegAMD64(i->Ain.Alu64R.dst);
   1051          return;
   1052       case Ain_Alu64M:
   1053          vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
   1054          ppAMD64RI(i->Ain.Alu64M.src);
   1055          vex_printf(",");
   1056          ppAMD64AMode(i->Ain.Alu64M.dst);
   1057          return;
   1058       case Ain_Sh64:
   1059          vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
   1060          if (i->Ain.Sh64.src == 0)
   1061             vex_printf("%%cl,");
   1062          else
   1063             vex_printf("$%d,", (Int)i->Ain.Sh64.src);
   1064          ppHRegAMD64(i->Ain.Sh64.dst);
   1065          return;
   1066       case Ain_Test64:
   1067          vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
   1068          ppHRegAMD64(i->Ain.Test64.dst);
   1069          return;
   1070       case Ain_Unary64:
   1071          vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
   1072          ppHRegAMD64(i->Ain.Unary64.dst);
   1073          return;
   1074       case Ain_Lea64:
   1075          vex_printf("leaq ");
   1076          ppAMD64AMode(i->Ain.Lea64.am);
   1077          vex_printf(",");
   1078          ppHRegAMD64(i->Ain.Lea64.dst);
   1079          return;
   1080       case Ain_Alu32R:
   1081          vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
   1082          ppAMD64RMI_lo32(i->Ain.Alu32R.src);
   1083          vex_printf(",");
   1084          ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
   1085          return;
   1086       case Ain_MulL:
   1087          vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
   1088          ppAMD64RM(i->Ain.MulL.src);
   1089          return;
   1090       case Ain_Div:
   1091          vex_printf("%cdiv%s ",
   1092                     i->Ain.Div.syned ? 's' : 'u',
   1093                     showAMD64ScalarSz(i->Ain.Div.sz));
   1094          ppAMD64RM(i->Ain.Div.src);
   1095          return;
   1096       case Ain_Push:
   1097          vex_printf("pushq ");
   1098          ppAMD64RMI(i->Ain.Push.src);
   1099          return;
   1100       case Ain_Call:
   1101          vex_printf("call%s[%d,",
   1102                     i->Ain.Call.cond==Acc_ALWAYS
   1103                        ? "" : showAMD64CondCode(i->Ain.Call.cond),
   1104                     i->Ain.Call.regparms );
   1105          ppRetLoc(i->Ain.Call.rloc);
   1106          vex_printf("] 0x%llx", i->Ain.Call.target);
   1107          break;
   1108 
   1109       case Ain_XDirect:
   1110          vex_printf("(xDirect) ");
   1111          vex_printf("if (%%rflags.%s) { ",
   1112                     showAMD64CondCode(i->Ain.XDirect.cond));
   1113          vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
   1114          vex_printf("movq %%r11,");
   1115          ppAMD64AMode(i->Ain.XDirect.amRIP);
   1116          vex_printf("; ");
   1117          vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
   1118                     i->Ain.XDirect.toFastEP ? "fast" : "slow");
   1119          return;
   1120       case Ain_XIndir:
   1121          vex_printf("(xIndir) ");
   1122          vex_printf("if (%%rflags.%s) { ",
   1123                     showAMD64CondCode(i->Ain.XIndir.cond));
   1124          vex_printf("movq ");
   1125          ppHRegAMD64(i->Ain.XIndir.dstGA);
   1126          vex_printf(",");
   1127          ppAMD64AMode(i->Ain.XIndir.amRIP);
   1128          vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
   1129          return;
   1130       case Ain_XAssisted:
   1131          vex_printf("(xAssisted) ");
   1132          vex_printf("if (%%rflags.%s) { ",
   1133                     showAMD64CondCode(i->Ain.XAssisted.cond));
   1134          vex_printf("movq ");
   1135          ppHRegAMD64(i->Ain.XAssisted.dstGA);
   1136          vex_printf(",");
   1137          ppAMD64AMode(i->Ain.XAssisted.amRIP);
   1138          vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
   1139                     (Int)i->Ain.XAssisted.jk);
   1140          vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
   1141          return;
   1142 
   1143       case Ain_CMov64:
   1144          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
   1145          ppHRegAMD64(i->Ain.CMov64.src);
   1146          vex_printf(",");
   1147          ppHRegAMD64(i->Ain.CMov64.dst);
   1148          return;
   1149       case Ain_CLoad:
   1150          vex_printf("if (%%rflags.%s) { ",
   1151                     showAMD64CondCode(i->Ain.CLoad.cond));
   1152          vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
   1153          ppAMD64AMode(i->Ain.CLoad.addr);
   1154          vex_printf(", ");
   1155          (i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1156             (i->Ain.CLoad.dst);
   1157          vex_printf(" }");
   1158          return;
   1159       case Ain_CStore:
   1160          vex_printf("if (%%rflags.%s) { ",
   1161                     showAMD64CondCode(i->Ain.CStore.cond));
   1162          vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
   1163          (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1164             (i->Ain.CStore.src);
   1165          vex_printf(", ");
   1166          ppAMD64AMode(i->Ain.CStore.addr);
   1167          vex_printf(" }");
   1168          return;
   1169 
   1170       case Ain_MovxLQ:
   1171          vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
   1172          ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
   1173          vex_printf(",");
   1174          ppHRegAMD64(i->Ain.MovxLQ.dst);
   1175          return;
   1176       case Ain_LoadEX:
   1177          if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
   1178             vex_printf("movl ");
   1179             ppAMD64AMode(i->Ain.LoadEX.src);
   1180             vex_printf(",");
   1181             ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
   1182          } else {
   1183             vex_printf("mov%c%cq ",
   1184                        i->Ain.LoadEX.syned ? 's' : 'z',
   1185                        i->Ain.LoadEX.szSmall==1
   1186                           ? 'b'
   1187                           : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
   1188             ppAMD64AMode(i->Ain.LoadEX.src);
   1189             vex_printf(",");
   1190             ppHRegAMD64(i->Ain.LoadEX.dst);
   1191          }
   1192          return;
   1193       case Ain_Store:
   1194          vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
   1195                               : (i->Ain.Store.sz==2 ? 'w' : 'l'));
   1196          ppHRegAMD64(i->Ain.Store.src);
   1197          vex_printf(",");
   1198          ppAMD64AMode(i->Ain.Store.dst);
   1199          return;
   1200       case Ain_Set64:
   1201          vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
   1202          ppHRegAMD64(i->Ain.Set64.dst);
   1203          return;
   1204       case Ain_Bsfr64:
   1205          vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
   1206          ppHRegAMD64(i->Ain.Bsfr64.src);
   1207          vex_printf(",");
   1208          ppHRegAMD64(i->Ain.Bsfr64.dst);
   1209          return;
   1210       case Ain_MFence:
   1211          vex_printf("mfence" );
   1212          return;
   1213       case Ain_ACAS:
   1214          vex_printf("lock cmpxchg%c ",
   1215                      i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
   1216                      : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
   1217          vex_printf("{%%rax->%%rbx},");
   1218          ppAMD64AMode(i->Ain.ACAS.addr);
   1219          return;
   1220       case Ain_DACAS:
   1221          vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
   1222                     (Int)(2 * i->Ain.DACAS.sz));
   1223          ppAMD64AMode(i->Ain.DACAS.addr);
   1224          return;
   1225       case Ain_A87Free:
   1226          vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
   1227          break;
   1228       case Ain_A87PushPop:
   1229          vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
   1230                     i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
   1231          ppAMD64AMode(i->Ain.A87PushPop.addr);
   1232          break;
   1233       case Ain_A87FpOp:
   1234          vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
   1235          break;
   1236       case Ain_A87LdCW:
   1237          vex_printf("fldcw ");
   1238          ppAMD64AMode(i->Ain.A87LdCW.addr);
   1239          break;
   1240       case Ain_A87StSW:
   1241          vex_printf("fstsw ");
   1242          ppAMD64AMode(i->Ain.A87StSW.addr);
   1243          break;
   1244       case Ain_LdMXCSR:
   1245          vex_printf("ldmxcsr ");
   1246          ppAMD64AMode(i->Ain.LdMXCSR.addr);
   1247          break;
   1248       case Ain_SseUComIS:
   1249          vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
   1250          ppHRegAMD64(i->Ain.SseUComIS.srcL);
   1251          vex_printf(",");
   1252          ppHRegAMD64(i->Ain.SseUComIS.srcR);
   1253          vex_printf(" ; pushfq ; popq ");
   1254          ppHRegAMD64(i->Ain.SseUComIS.dst);
   1255          break;
   1256       case Ain_SseSI2SF:
   1257          vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
   1258          (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1259             (i->Ain.SseSI2SF.src);
   1260          vex_printf(",");
   1261          ppHRegAMD64(i->Ain.SseSI2SF.dst);
   1262          break;
   1263       case Ain_SseSF2SI:
   1264          vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
   1265          ppHRegAMD64(i->Ain.SseSF2SI.src);
   1266          vex_printf(",");
   1267          (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1268             (i->Ain.SseSF2SI.dst);
   1269          break;
   1270       case Ain_SseSDSS:
   1271          vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
   1272          ppHRegAMD64(i->Ain.SseSDSS.src);
   1273          vex_printf(",");
   1274          ppHRegAMD64(i->Ain.SseSDSS.dst);
   1275          break;
   1276       case Ain_SseLdSt:
   1277          switch (i->Ain.SseLdSt.sz) {
   1278             case 4:  vex_printf("movss "); break;
   1279             case 8:  vex_printf("movsd "); break;
   1280             case 16: vex_printf("movups "); break;
   1281             default: vassert(0);
   1282          }
   1283          if (i->Ain.SseLdSt.isLoad) {
   1284             ppAMD64AMode(i->Ain.SseLdSt.addr);
   1285             vex_printf(",");
   1286             ppHRegAMD64(i->Ain.SseLdSt.reg);
   1287          } else {
   1288             ppHRegAMD64(i->Ain.SseLdSt.reg);
   1289             vex_printf(",");
   1290             ppAMD64AMode(i->Ain.SseLdSt.addr);
   1291          }
   1292          return;
   1293       case Ain_SseCStore:
   1294          vex_printf("if (%%rflags.%s) { ",
   1295                     showAMD64CondCode(i->Ain.SseCStore.cond));
   1296          vex_printf("movups ");
   1297          ppHRegAMD64(i->Ain.SseCStore.src);
   1298          vex_printf(", ");
   1299          ppAMD64AMode(i->Ain.SseCStore.addr);
   1300          vex_printf(" }");
   1301          return;
   1302       case Ain_SseCLoad:
   1303          vex_printf("if (%%rflags.%s) { ",
   1304                     showAMD64CondCode(i->Ain.SseCLoad.cond));
   1305          vex_printf("movups ");
   1306          ppAMD64AMode(i->Ain.SseCLoad.addr);
   1307          vex_printf(", ");
   1308          ppHRegAMD64(i->Ain.SseCLoad.dst);
   1309          vex_printf(" }");
   1310          return;
   1311       case Ain_SseLdzLO:
   1312          vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
   1313          ppAMD64AMode(i->Ain.SseLdzLO.addr);
   1314          vex_printf(",");
   1315          ppHRegAMD64(i->Ain.SseLdzLO.reg);
   1316          return;
   1317       case Ain_Sse32Fx4:
   1318          vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
   1319          ppHRegAMD64(i->Ain.Sse32Fx4.src);
   1320          vex_printf(",");
   1321          ppHRegAMD64(i->Ain.Sse32Fx4.dst);
   1322          return;
   1323       case Ain_Sse32FLo:
   1324          vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
   1325          ppHRegAMD64(i->Ain.Sse32FLo.src);
   1326          vex_printf(",");
   1327          ppHRegAMD64(i->Ain.Sse32FLo.dst);
   1328          return;
   1329       case Ain_Sse64Fx2:
   1330          vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
   1331          ppHRegAMD64(i->Ain.Sse64Fx2.src);
   1332          vex_printf(",");
   1333          ppHRegAMD64(i->Ain.Sse64Fx2.dst);
   1334          return;
   1335       case Ain_Sse64FLo:
   1336          vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
   1337          ppHRegAMD64(i->Ain.Sse64FLo.src);
   1338          vex_printf(",");
   1339          ppHRegAMD64(i->Ain.Sse64FLo.dst);
   1340          return;
   1341       case Ain_SseReRg:
   1342          vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
   1343          ppHRegAMD64(i->Ain.SseReRg.src);
   1344          vex_printf(",");
   1345          ppHRegAMD64(i->Ain.SseReRg.dst);
   1346          return;
   1347       case Ain_SseCMov:
   1348          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
   1349          ppHRegAMD64(i->Ain.SseCMov.src);
   1350          vex_printf(",");
   1351          ppHRegAMD64(i->Ain.SseCMov.dst);
   1352          return;
   1353       case Ain_SseShuf:
   1354          vex_printf("pshufd $0x%x,", (UInt)i->Ain.SseShuf.order);
   1355          ppHRegAMD64(i->Ain.SseShuf.src);
   1356          vex_printf(",");
   1357          ppHRegAMD64(i->Ain.SseShuf.dst);
   1358          return;
   1359       //uu case Ain_AvxLdSt:
   1360       //uu    vex_printf("vmovups ");
   1361       //uu    if (i->Ain.AvxLdSt.isLoad) {
   1362       //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
   1363       //uu       vex_printf(",");
   1364       //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
   1365       //uu    } else {
   1366       //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
   1367       //uu       vex_printf(",");
   1368       //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
   1369       //uu    }
   1370       //uu    return;
   1371       //uu case Ain_AvxReRg:
   1372       //uu    vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
   1373       //uu    ppHRegAMD64(i->Ain.AvxReRg.src);
   1374       //uu    vex_printf(",");
   1375       //uu    ppHRegAMD64(i->Ain.AvxReRg.dst);
   1376       //uu    return;
   1377       case Ain_EvCheck:
   1378          vex_printf("(evCheck) decl ");
   1379          ppAMD64AMode(i->Ain.EvCheck.amCounter);
   1380          vex_printf("; jns nofail; jmp *");
   1381          ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
   1382          vex_printf("; nofail:");
   1383          return;
   1384       case Ain_ProfInc:
   1385          vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
   1386          return;
   1387       default:
   1388          vpanic("ppAMD64Instr");
   1389    }
   1390 }
   1391 
   1392 /* --------- Helpers for register allocation. --------- */
   1393 
   1394 void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
   1395 {
   1396    Bool unary;
   1397    vassert(mode64 == True);
   1398    initHRegUsage(u);
   1399    switch (i->tag) {
   1400       case Ain_Imm64:
   1401          addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
   1402          return;
   1403       case Ain_Alu64R:
   1404          addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
   1405          if (i->Ain.Alu64R.op == Aalu_MOV) {
   1406             addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
   1407             return;
   1408          }
   1409          if (i->Ain.Alu64R.op == Aalu_CMP) {
   1410             addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
   1411             return;
   1412          }
   1413          addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
   1414          return;
   1415       case Ain_Alu64M:
   1416          addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
   1417          addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
   1418          return;
   1419       case Ain_Sh64:
   1420          addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
   1421          if (i->Ain.Sh64.src == 0)
   1422             addHRegUse(u, HRmRead, hregAMD64_RCX());
   1423          return;
   1424       case Ain_Test64:
   1425          addHRegUse(u, HRmRead, i->Ain.Test64.dst);
   1426          return;
   1427       case Ain_Unary64:
   1428          addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
   1429          return;
   1430       case Ain_Lea64:
   1431          addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
   1432          addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
   1433          return;
   1434       case Ain_Alu32R:
   1435          vassert(i->Ain.Alu32R.op != Aalu_MOV);
   1436          addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
   1437          if (i->Ain.Alu32R.op == Aalu_CMP) {
   1438             addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
   1439             return;
   1440          }
   1441          addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
   1442          return;
   1443       case Ain_MulL:
   1444          addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
   1445          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1446          addHRegUse(u, HRmWrite, hregAMD64_RDX());
   1447          return;
   1448       case Ain_Div:
   1449          addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
   1450          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1451          addHRegUse(u, HRmModify, hregAMD64_RDX());
   1452          return;
   1453       case Ain_Push:
   1454          addRegUsage_AMD64RMI(u, i->Ain.Push.src);
   1455          addHRegUse(u, HRmModify, hregAMD64_RSP());
   1456          return;
   1457       case Ain_Call:
   1458          /* This is a bit subtle. */
   1459          /* First off, claim it trashes all the caller-saved regs
   1460             which fall within the register allocator's jurisdiction.
   1461             These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
   1462             and all the xmm registers.
   1463          */
   1464          addHRegUse(u, HRmWrite, hregAMD64_RAX());
   1465          addHRegUse(u, HRmWrite, hregAMD64_RCX());
   1466          addHRegUse(u, HRmWrite, hregAMD64_RDX());
   1467          addHRegUse(u, HRmWrite, hregAMD64_RSI());
   1468          addHRegUse(u, HRmWrite, hregAMD64_RDI());
   1469          addHRegUse(u, HRmWrite, hregAMD64_R8());
   1470          addHRegUse(u, HRmWrite, hregAMD64_R9());
   1471          addHRegUse(u, HRmWrite, hregAMD64_R10());
   1472          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1473          addHRegUse(u, HRmWrite, hregAMD64_XMM0());
   1474          addHRegUse(u, HRmWrite, hregAMD64_XMM1());
   1475          addHRegUse(u, HRmWrite, hregAMD64_XMM3());
   1476          addHRegUse(u, HRmWrite, hregAMD64_XMM4());
   1477          addHRegUse(u, HRmWrite, hregAMD64_XMM5());
   1478          addHRegUse(u, HRmWrite, hregAMD64_XMM6());
   1479          addHRegUse(u, HRmWrite, hregAMD64_XMM7());
   1480          addHRegUse(u, HRmWrite, hregAMD64_XMM8());
   1481          addHRegUse(u, HRmWrite, hregAMD64_XMM9());
   1482          addHRegUse(u, HRmWrite, hregAMD64_XMM10());
   1483          addHRegUse(u, HRmWrite, hregAMD64_XMM11());
   1484          addHRegUse(u, HRmWrite, hregAMD64_XMM12());
   1485 
   1486          /* Now we have to state any parameter-carrying registers
   1487             which might be read.  This depends on the regparmness. */
   1488          switch (i->Ain.Call.regparms) {
   1489             case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
   1490             case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
   1491             case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
   1492             case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
   1493             case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
   1494             case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
   1495             case 0: break;
   1496             default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
   1497          }
   1498          /* Finally, there is the issue that the insn trashes a
   1499             register because the literal target address has to be
   1500             loaded into a register.  Fortunately, r11 is stated in the
   1501             ABI as a scratch register, and so seems a suitable victim.  */
   1502          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1503          /* Upshot of this is that the assembler really must use r11,
   1504             and no other, as a destination temporary. */
   1505          return;
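                 /* Illustration: for a helper call with regparms == 2, the switch
                    above falls through from case 2 to case 1, so %rsi and %rdi get
                    marked as read; everything in the caller-saved list above, plus
                    %r11, gets marked as written. */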
   1506       /* XDirect/XIndir/XAssisted are also a bit subtle.  They
   1507          conditionally exit the block.  Hence we only need to list (1)
   1508          the registers that they read, and (2) the registers that they
   1509          write in the case where the block is not exited.  (2) is
   1510          empty, hence only (1) is relevant here. */
   1511       case Ain_XDirect:
   1512          /* Don't bother to mention the write to %r11, since it is not
   1513             available to the allocator. */
   1514          addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
   1515          return;
   1516       case Ain_XIndir:
   1517          /* Ditto re %r11 */
   1518          addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
   1519          addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
   1520          return;
   1521       case Ain_XAssisted:
   1522          /* Ditto re %r11 and %rbp (the baseblock ptr) */
   1523          addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
   1524          addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
   1525          return;
   1526       case Ain_CMov64:
   1527          addHRegUse(u, HRmRead,   i->Ain.CMov64.src);
   1528          addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
   1529          return;
   1530       case Ain_CLoad:
   1531          addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
   1532          addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
   1533          return;
   1534       case Ain_CStore:
   1535          addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
   1536          addHRegUse(u, HRmRead, i->Ain.CStore.src);
   1537          return;
   1538       case Ain_MovxLQ:
   1539          addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
   1540          addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
   1541          return;
   1542       case Ain_LoadEX:
   1543          addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
   1544          addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
   1545          return;
   1546       case Ain_Store:
   1547          addHRegUse(u, HRmRead, i->Ain.Store.src);
   1548          addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
   1549          return;
   1550       case Ain_Set64:
   1551          addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
   1552          return;
   1553       case Ain_Bsfr64:
   1554          addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
   1555          addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
   1556          return;
   1557       case Ain_MFence:
   1558          return;
   1559       case Ain_ACAS:
   1560          addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
   1561          addHRegUse(u, HRmRead, hregAMD64_RBX());
   1562          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1563          return;
   1564       case Ain_DACAS:
   1565          addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
   1566          addHRegUse(u, HRmRead, hregAMD64_RCX());
   1567          addHRegUse(u, HRmRead, hregAMD64_RBX());
   1568          addHRegUse(u, HRmModify, hregAMD64_RDX());
   1569          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1570          return;
   1571       case Ain_A87Free:
   1572          return;
   1573       case Ain_A87PushPop:
   1574          addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
   1575          return;
   1576       case Ain_A87FpOp:
   1577          return;
   1578       case Ain_A87LdCW:
   1579          addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
   1580          return;
   1581       case Ain_A87StSW:
   1582          addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
   1583          return;
   1584       case Ain_LdMXCSR:
   1585          addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
   1586          return;
   1587       case Ain_SseUComIS:
   1588          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
   1589          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
   1590          addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
   1591          return;
   1592       case Ain_SseSI2SF:
   1593          addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
   1594          addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
   1595          return;
   1596       case Ain_SseSF2SI:
   1597          addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
   1598          addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
   1599          return;
   1600       case Ain_SseSDSS:
   1601          addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
   1602          addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
   1603          return;
   1604       case Ain_SseLdSt:
   1605          addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
   1606          addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
   1607                        i->Ain.SseLdSt.reg);
   1608          return;
   1609       case Ain_SseCStore:
   1610          addRegUsage_AMD64AMode(u, i->Ain.SseCStore.addr);
   1611          addHRegUse(u, HRmRead, i->Ain.SseCStore.src);
   1612          return;
   1613       case Ain_SseCLoad:
   1614          addRegUsage_AMD64AMode(u, i->Ain.SseCLoad.addr);
   1615          addHRegUse(u, HRmModify, i->Ain.SseCLoad.dst);
   1616          return;
   1617       case Ain_SseLdzLO:
   1618          addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
   1619          addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
   1620          return;
   1621       case Ain_Sse32Fx4:
   1622          vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
   1623          unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
   1624                          || i->Ain.Sse32Fx4.op == Asse_RSQRTF
   1625                          || i->Ain.Sse32Fx4.op == Asse_SQRTF );
   1626          addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
   1627          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1628                        i->Ain.Sse32Fx4.dst);
   1629          return;
   1630       case Ain_Sse32FLo:
   1631          vassert(i->Ain.Sse32FLo.op != Asse_MOV);
   1632          unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
   1633                          || i->Ain.Sse32FLo.op == Asse_RSQRTF
   1634                          || i->Ain.Sse32FLo.op == Asse_SQRTF );
   1635          addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
   1636          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1637                        i->Ain.Sse32FLo.dst);
   1638          return;
   1639       case Ain_Sse64Fx2:
   1640          vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
   1641          unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
   1642                          || i->Ain.Sse64Fx2.op == Asse_RSQRTF
   1643                          || i->Ain.Sse64Fx2.op == Asse_SQRTF );
   1644          addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
   1645          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1646                        i->Ain.Sse64Fx2.dst);
   1647          return;
   1648       case Ain_Sse64FLo:
   1649          vassert(i->Ain.Sse64FLo.op != Asse_MOV);
   1650          unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
   1651                          || i->Ain.Sse64FLo.op == Asse_RSQRTF
   1652                          || i->Ain.Sse64FLo.op == Asse_SQRTF );
   1653          addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
   1654          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1655                        i->Ain.Sse64FLo.dst);
   1656          return;
   1657       case Ain_SseReRg:
   1658          if ( (i->Ain.SseReRg.op == Asse_XOR
   1659                || i->Ain.SseReRg.op == Asse_CMPEQ32)
   1660               && sameHReg(i->Ain.SseReRg.src, i->Ain.SseReRg.dst)) {
   1661             /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
   1662                r,r' as a write of a value to r, and independent of any
   1663                previous value in r */
   1664             /* (as opposed to a rite of passage :-) */
   1665             addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
   1666          } else {
   1667             addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
   1668             addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
   1669                              ? HRmWrite : HRmModify,
   1670                           i->Ain.SseReRg.dst);
   1671          }
   1672          return;
   1673       case Ain_SseCMov:
   1674          addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
   1675          addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
   1676          return;
   1677       case Ain_SseShuf:
   1678          addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
   1679          addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
   1680          return;
   1681       //uu case Ain_AvxLdSt:
   1682       //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
   1683       //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
   1684       //uu               i->Ain.AvxLdSt.reg);
   1685       //uu return;
   1686       //uu case Ain_AvxReRg:
   1687       //uu    if ( (i->Ain.AvxReRg.op == Asse_XOR
   1688       //uu          || i->Ain.AvxReRg.op == Asse_CMPEQ32)
   1689       //uu         && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
   1690       //uu       /* See comments on the case for Ain_SseReRg. */
   1691       //uu       addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
   1692       //uu    } else {
   1693       //uu       addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
   1694       //uu       addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
   1695       //uu                        ? HRmWrite : HRmModify,
   1696       //uu                     i->Ain.AvxReRg.dst);
   1697       //uu    }
   1698       //uu    return;
   1699       case Ain_EvCheck:
   1700          /* We expect both amodes only to mention %rbp, so this is in
   1701             fact pointless, since %rbp isn't allocatable, but anyway.. */
   1702          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
   1703          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
   1704          return;
   1705       case Ain_ProfInc:
   1706          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1707          return;
   1708       default:
   1709          ppAMD64Instr(i, mode64);
   1710          vpanic("getRegUsage_AMD64Instr");
   1711    }
   1712 }
   1713 
   1714 /* local helper */
   1715 static inline void mapReg(HRegRemap* m, HReg* r)
   1716 {
   1717    *r = lookupHRegRemap(m, *r);
   1718 }
   1719 
   1720 void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
   1721 {
   1722    vassert(mode64 == True);
   1723    switch (i->tag) {
   1724       case Ain_Imm64:
   1725          mapReg(m, &i->Ain.Imm64.dst);
   1726          return;
   1727       case Ain_Alu64R:
   1728          mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
   1729          mapReg(m, &i->Ain.Alu64R.dst);
   1730          return;
   1731       case Ain_Alu64M:
   1732          mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
   1733          mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
   1734          return;
   1735       case Ain_Sh64:
   1736          mapReg(m, &i->Ain.Sh64.dst);
   1737          return;
   1738       case Ain_Test64:
   1739          mapReg(m, &i->Ain.Test64.dst);
   1740          return;
   1741       case Ain_Unary64:
   1742          mapReg(m, &i->Ain.Unary64.dst);
   1743          return;
   1744       case Ain_Lea64:
   1745          mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
   1746          mapReg(m, &i->Ain.Lea64.dst);
   1747          return;
   1748       case Ain_Alu32R:
   1749          mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
   1750          mapReg(m, &i->Ain.Alu32R.dst);
   1751          return;
   1752       case Ain_MulL:
   1753          mapRegs_AMD64RM(m, i->Ain.MulL.src);
   1754          return;
   1755       case Ain_Div:
   1756          mapRegs_AMD64RM(m, i->Ain.Div.src);
   1757          return;
   1758       case Ain_Push:
   1759          mapRegs_AMD64RMI(m, i->Ain.Push.src);
   1760          return;
   1761       case Ain_Call:
   1762          return;
   1763       case Ain_XDirect:
   1764          mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
   1765          return;
   1766       case Ain_XIndir:
   1767          mapReg(m, &i->Ain.XIndir.dstGA);
   1768          mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
   1769          return;
   1770       case Ain_XAssisted:
   1771          mapReg(m, &i->Ain.XAssisted.dstGA);
   1772          mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
   1773          return;
   1774       case Ain_CMov64:
   1775          mapReg(m, &i->Ain.CMov64.src);
   1776          mapReg(m, &i->Ain.CMov64.dst);
   1777          return;
   1778       case Ain_CLoad:
   1779          mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
   1780          mapReg(m, &i->Ain.CLoad.dst);
   1781          return;
   1782       case Ain_CStore:
   1783          mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
   1784          mapReg(m, &i->Ain.CStore.src);
   1785          return;
   1786       case Ain_MovxLQ:
   1787          mapReg(m, &i->Ain.MovxLQ.src);
   1788          mapReg(m, &i->Ain.MovxLQ.dst);
   1789          return;
   1790       case Ain_LoadEX:
   1791          mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
   1792          mapReg(m, &i->Ain.LoadEX.dst);
   1793          return;
   1794       case Ain_Store:
   1795          mapReg(m, &i->Ain.Store.src);
   1796          mapRegs_AMD64AMode(m, i->Ain.Store.dst);
   1797          return;
   1798       case Ain_Set64:
   1799          mapReg(m, &i->Ain.Set64.dst);
   1800          return;
   1801       case Ain_Bsfr64:
   1802          mapReg(m, &i->Ain.Bsfr64.src);
   1803          mapReg(m, &i->Ain.Bsfr64.dst);
   1804          return;
   1805       case Ain_MFence:
   1806          return;
   1807       case Ain_ACAS:
   1808          mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
   1809          return;
   1810       case Ain_DACAS:
   1811          mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
   1812          return;
   1813       case Ain_A87Free:
   1814          return;
   1815       case Ain_A87PushPop:
   1816          mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
   1817          return;
   1818       case Ain_A87FpOp:
   1819          return;
   1820       case Ain_A87LdCW:
   1821          mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
   1822          return;
   1823       case Ain_A87StSW:
   1824          mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
   1825          return;
   1826       case Ain_LdMXCSR:
   1827          mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
   1828          return;
   1829       case Ain_SseUComIS:
   1830          mapReg(m, &i->Ain.SseUComIS.srcL);
   1831          mapReg(m, &i->Ain.SseUComIS.srcR);
   1832          mapReg(m, &i->Ain.SseUComIS.dst);
   1833          return;
   1834       case Ain_SseSI2SF:
   1835          mapReg(m, &i->Ain.SseSI2SF.src);
   1836          mapReg(m, &i->Ain.SseSI2SF.dst);
   1837          return;
   1838       case Ain_SseSF2SI:
   1839          mapReg(m, &i->Ain.SseSF2SI.src);
   1840          mapReg(m, &i->Ain.SseSF2SI.dst);
   1841          return;
   1842       case Ain_SseSDSS:
   1843          mapReg(m, &i->Ain.SseSDSS.src);
   1844          mapReg(m, &i->Ain.SseSDSS.dst);
   1845          return;
   1846       case Ain_SseLdSt:
   1847          mapReg(m, &i->Ain.SseLdSt.reg);
   1848          mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
   1849          break;
   1850       case Ain_SseCStore:
   1851          mapRegs_AMD64AMode(m, i->Ain.SseCStore.addr);
   1852          mapReg(m, &i->Ain.SseCStore.src);
   1853          return;
   1854       case Ain_SseCLoad:
   1855          mapRegs_AMD64AMode(m, i->Ain.SseCLoad.addr);
   1856          mapReg(m, &i->Ain.SseCLoad.dst);
   1857          return;
   1858       case Ain_SseLdzLO:
   1859          mapReg(m, &i->Ain.SseLdzLO.reg);
   1860          mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
   1861          break;
   1862       case Ain_Sse32Fx4:
   1863          mapReg(m, &i->Ain.Sse32Fx4.src);
   1864          mapReg(m, &i->Ain.Sse32Fx4.dst);
   1865          return;
   1866       case Ain_Sse32FLo:
   1867          mapReg(m, &i->Ain.Sse32FLo.src);
   1868          mapReg(m, &i->Ain.Sse32FLo.dst);
   1869          return;
   1870       case Ain_Sse64Fx2:
   1871          mapReg(m, &i->Ain.Sse64Fx2.src);
   1872          mapReg(m, &i->Ain.Sse64Fx2.dst);
   1873          return;
   1874       case Ain_Sse64FLo:
   1875          mapReg(m, &i->Ain.Sse64FLo.src);
   1876          mapReg(m, &i->Ain.Sse64FLo.dst);
   1877          return;
   1878       case Ain_SseReRg:
   1879          mapReg(m, &i->Ain.SseReRg.src);
   1880          mapReg(m, &i->Ain.SseReRg.dst);
   1881          return;
   1882       case Ain_SseCMov:
   1883          mapReg(m, &i->Ain.SseCMov.src);
   1884          mapReg(m, &i->Ain.SseCMov.dst);
   1885          return;
   1886       case Ain_SseShuf:
   1887          mapReg(m, &i->Ain.SseShuf.src);
   1888          mapReg(m, &i->Ain.SseShuf.dst);
   1889          return;
   1890       //uu case Ain_AvxLdSt:
   1891       //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
   1892       //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
   1893       //uu    break;
   1894       //uu case Ain_AvxReRg:
   1895       //uu    mapReg(m, &i->Ain.AvxReRg.src);
   1896       //uu    mapReg(m, &i->Ain.AvxReRg.dst);
   1897       //uu    return;
   1898       case Ain_EvCheck:
   1899          /* We expect both amodes only to mention %rbp, so this is in
   1900             fact pointless, since %rbp isn't allocatable, but anyway.. */
   1901          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
   1902          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
   1903          return;
   1904       case Ain_ProfInc:
   1905          /* hardwires r11 -- nothing to modify. */
   1906          return;
   1907       default:
   1908          ppAMD64Instr(i, mode64);
   1909          vpanic("mapRegs_AMD64Instr");
   1910    }
   1911 }
   1912 
   1913 /* Figure out if i represents a reg-reg move, and if so assign the
   1914    source and destination to *src and *dst.  If in doubt say No.  Used
   1915    by the register allocator to do move coalescing.
   1916 */
   1917 Bool isMove_AMD64Instr ( const AMD64Instr* i, HReg* src, HReg* dst )
   1918 {
   1919    switch (i->tag) {
   1920       case Ain_Alu64R:
   1921          /* Moves between integer regs */
   1922          if (i->Ain.Alu64R.op != Aalu_MOV)
   1923             return False;
   1924          if (i->Ain.Alu64R.src->tag != Armi_Reg)
   1925             return False;
   1926          *src = i->Ain.Alu64R.src->Armi.Reg.reg;
   1927          *dst = i->Ain.Alu64R.dst;
   1928          return True;
   1929       case Ain_SseReRg:
   1930          /* Moves between SSE regs */
   1931          if (i->Ain.SseReRg.op != Asse_MOV)
   1932             return False;
   1933          *src = i->Ain.SseReRg.src;
   1934          *dst = i->Ain.SseReRg.dst;
   1935          return True;
   1936       //uu case Ain_AvxReRg:
   1937       //uu    /* Moves between AVX regs */
   1938       //uu    if (i->Ain.AvxReRg.op != Asse_MOV)
   1939       //uu       return False;
   1940       //uu    *src = i->Ain.AvxReRg.src;
   1941       //uu    *dst = i->Ain.AvxReRg.dst;
   1942       //uu    return True;
   1943       default:
   1944          return False;
   1945    }
   1946    /*NOTREACHED*/
   1947 }
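        /* Illustration: a reg-reg "movq" (an Ain_Alu64R with op Aalu_MOV and an
           Armi_Reg source) reports its source and destination, so the allocator
           can try to put both in the same real register and elide the move; a
           MOV from an Armi_Imm or Armi_Mem source fails the Armi_Reg check and
           is reported as not-a-move. */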
   1948 
   1949 
   1950 /* Generate amd64 spill/reload instructions under the direction of the
   1951    register allocator.  Note it's critical these don't write the
   1952    condition codes. */
   1953 
   1954 void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1955                       HReg rreg, Int offsetB, Bool mode64 )
   1956 {
   1957    AMD64AMode* am;
   1958    vassert(offsetB >= 0);
   1959    vassert(!hregIsVirtual(rreg));
   1960    vassert(mode64 == True);
   1961    *i1 = *i2 = NULL;
   1962    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   1963    switch (hregClass(rreg)) {
   1964       case HRcInt64:
   1965          *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
   1966          return;
   1967       case HRcVec128:
   1968          *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
   1969          return;
   1970       default:
   1971          ppHRegClass(hregClass(rreg));
   1972          vpanic("genSpill_AMD64: unimplemented regclass");
   1973    }
   1974 }
   1975 
   1976 void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1977                        HReg rreg, Int offsetB, Bool mode64 )
   1978 {
   1979    AMD64AMode* am;
   1980    vassert(offsetB >= 0);
   1981    vassert(!hregIsVirtual(rreg));
   1982    vassert(mode64 == True);
   1983    *i1 = *i2 = NULL;
   1984    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   1985    switch (hregClass(rreg)) {
   1986       case HRcInt64:
   1987          *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
   1988          return;
   1989       case HRcVec128:
   1990          *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
   1991          return;
   1992       default:
   1993          ppHRegClass(hregClass(rreg));
   1994          vpanic("genReload_AMD64: unimplemented regclass");
   1995    }
   1996 }
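        /* Worked example (the offsets are illustrative only): spilling an
           HRcInt64 rreg at slot offset 48 produces "movq %reg, 48(%rbp)" (an
           Alu64M MOV) and the matching reload produces "movq 48(%rbp), %reg";
           an HRcVec128 rreg uses the 16-byte SseLdSt store/load forms instead.
           None of these write the condition codes, as required above. */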
   1997 
   1998 AMD64Instr* directReload_AMD64( AMD64Instr* i, HReg vreg, Short spill_off )
   1999 {
   2000    vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
   2001 
   2002    /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
   2003       Convert to: src=RMI_Mem, dst=Reg
   2004    */
   2005    if (i->tag == Ain_Alu64R
   2006        && (i->Ain.Alu64R.op == Aalu_MOV || i->Ain.Alu64R.op == Aalu_OR
   2007            || i->Ain.Alu64R.op == Aalu_XOR)
   2008        && i->Ain.Alu64R.src->tag == Armi_Reg
   2009        && sameHReg(i->Ain.Alu64R.src->Armi.Reg.reg, vreg)) {
   2010       vassert(! sameHReg(i->Ain.Alu64R.dst, vreg));
   2011       return AMD64Instr_Alu64R(
   2012                 i->Ain.Alu64R.op,
   2013                 AMD64RMI_Mem( AMD64AMode_IR( spill_off, hregAMD64_RBP())),
   2014                 i->Ain.Alu64R.dst
   2015              );
   2016    }
   2017 
   2018    /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
   2019       Convert to: src=RI_Imm, dst=Mem
   2020    */
   2021    if (i->tag == Ain_Alu64R
   2022        && (i->Ain.Alu64R.op == Aalu_CMP)
   2023        && i->Ain.Alu64R.src->tag == Armi_Imm
   2024        && sameHReg(i->Ain.Alu64R.dst, vreg)) {
   2025       return AMD64Instr_Alu64M(
   2026                 i->Ain.Alu64R.op,
   2027                 AMD64RI_Imm( i->Ain.Alu64R.src->Armi.Imm.imm32 ),
   2028                 AMD64AMode_IR( spill_off, hregAMD64_RBP())
   2029              );
   2030    }
   2031 
   2032    return NULL;
   2033 }
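        /* Illustration: if vreg lives in the spill slot at offset 24, then
           "orq %vreg, %rdx" folds to "orq 24(%rbp), %rdx" (the first form
           above) and "cmpq $7, %vreg" folds to "cmpq $7, 24(%rbp)" (the
           second form); anything else is left alone and NULL is returned. */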
   2034 
   2035 
   2036 /* --------- The amd64 assembler (bleh.) --------- */
   2037 
   2038 /* Produce the low three bits of an integer register number. */
   2039 inline static UInt iregEnc210 ( HReg r )
   2040 {
   2041    UInt n;
   2042    vassert(hregClass(r) == HRcInt64);
   2043    vassert(!hregIsVirtual(r));
   2044    n = hregEncoding(r);
   2045    vassert(n <= 15);
   2046    return n & 7;
   2047 }
   2048 
   2049 /* Produce bit 3 of an integer register number. */
   2050 inline static UInt iregEnc3 ( HReg r )
   2051 {
   2052    UInt n;
   2053    vassert(hregClass(r) == HRcInt64);
   2054    vassert(!hregIsVirtual(r));
   2055    n = hregEncoding(r);
   2056    vassert(n <= 15);
   2057    return (n >> 3) & 1;
   2058 }
   2059 
   2060 /* Produce a complete 4-bit integer register number. */
   2061 inline static UInt iregEnc3210 ( HReg r )
   2062 {
   2063    UInt n;
   2064    vassert(hregClass(r) == HRcInt64);
   2065    vassert(!hregIsVirtual(r));
   2066    n = hregEncoding(r);
   2067    vassert(n <= 15);
   2068    return n;
   2069 }
   2070 
   2071 /* Produce a complete 4-bit integer register number. */
   2072 inline static UInt vregEnc3210 ( HReg r )
   2073 {
   2074    UInt n;
   2075    vassert(hregClass(r) == HRcVec128);
   2076    vassert(!hregIsVirtual(r));
   2077    n = hregEncoding(r);
   2078    vassert(n <= 15);
   2079    return n;
   2080 }
   2081 
   2082 inline static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
   2083 {
   2084    vassert(mod < 4);
   2085    vassert((reg|regmem) < 8);
   2086    return (UChar)( ((mod & 3) << 6) | ((reg & 7) << 3) | (regmem & 7) );
   2087 }
   2088 
   2089 inline static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
   2090 {
   2091    vassert(shift < 4);
   2092    vassert((regindex|regbase) < 8);
   2093    return (UChar)( ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7) );
   2094 }
   2095 
   2096 static UChar* emit32 ( UChar* p, UInt w32 )
   2097 {
   2098    *p++ = toUChar((w32)       & 0x000000FF);
   2099    *p++ = toUChar((w32 >>  8) & 0x000000FF);
   2100    *p++ = toUChar((w32 >> 16) & 0x000000FF);
   2101    *p++ = toUChar((w32 >> 24) & 0x000000FF);
   2102    return p;
   2103 }
   2104 
   2105 static UChar* emit64 ( UChar* p, ULong w64 )
   2106 {
   2107    p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
   2108    p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
   2109    return p;
   2110 }
   2111 
   2112 /* Does a sign-extend of the lowest 8 bits give
   2113    the original number? */
   2114 static Bool fits8bits ( UInt w32 )
   2115 {
   2116    Int i32 = (Int)w32;
   2117    return toBool(i32 == ((Int)(w32 << 24) >> 24));
   2118 }
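        /* For instance, fits8bits(0x7F) and fits8bits(0xFFFFFF80) hold (127 and
           -128 survive the truncate-then-sign-extend round trip), whereas
           fits8bits(0x80) does not: sign-extending the low byte 0x80 gives
           0xFFFFFF80, not 0x00000080. */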
   2119 /* Can the lower 32 bits be sign-extended to produce the whole
   2120    64-bit value?  In other words, are the top 33 bits either
   2121    all 0 or all 1? */
   2122 static Bool fitsIn32Bits ( ULong x )
   2123 {
   2124    Long y1;
   2125    y1 = x << 32;
   2126    y1 >>=/*s*/ 32;
   2127    return toBool(x == y1);
   2128 }
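        /* For instance, fitsIn32Bits(0x7FFFFFFFULL) and
           fitsIn32Bits(0xFFFFFFFF80000000ULL) hold (top 33 bits all zero,
           respectively all one), but fitsIn32Bits(0x80000000ULL) does not. */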
   2129 
   2130 
   2131 /* Forming mod-reg-rm bytes and scale-index-base bytes.
   2132 
   2133      greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
   2134                        =  00 greg ereg
   2135 
   2136      greg,  d8(ereg)   |  ereg is neither of: RSP R12
   2137                        =  01 greg ereg, d8
   2138 
   2139      greg,  d32(ereg)  |  ereg is neither of: RSP R12
   2140                        =  10 greg ereg, d32
   2141 
   2142      greg,  d8(ereg)   |  ereg is either: RSP R12
   2143                        =  01 greg 100, 0x24, d8
   2144                        (lowest bit of rex distinguishes R12/RSP)
   2145 
   2146      greg,  d32(ereg)  |  ereg is either: RSP R12
   2147                        =  10 greg 100, 0x24, d32
   2148                        (lowest bit of rex distinguishes R12/RSP)
   2149 
   2150      -----------------------------------------------
   2151 
   2152      greg,  d8(base,index,scale)
   2153                |  index != RSP
   2154                =  01 greg 100, scale index base, d8
   2155 
   2156      greg,  d32(base,index,scale)
   2157                |  index != RSP
   2158                =  10 greg 100, scale index base, d32
   2159 */
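        /* A couple of worked instances of the rules above (REX prefix omitted):
        
             %rcx, 8(%rdx)         =  01 001 010, 0x08             i.e. 4A 08
             %rax, 8(%rsp)         =  01 000 100, 0x24, 0x08       i.e. 44 24 08
             %rax, 4(%rbx,%rcx,8)  =  01 000 100, 11 001 011, 0x04 i.e. 44 CB 04
        */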
   2160 static UChar* doAMode_M__wrk ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
   2161 {
   2162    UInt gregEnc210 = gregEnc3210 & 7;
   2163    if (am->tag == Aam_IR) {
   2164       if (am->Aam.IR.imm == 0
   2165           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2166           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
   2167           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
   2168           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
   2169          ) {
   2170          *p++ = mkModRegRM(0, gregEnc210, iregEnc210(am->Aam.IR.reg));
   2171          return p;
   2172       }
   2173       if (fits8bits(am->Aam.IR.imm)
   2174           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2175           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
   2176          ) {
   2177          *p++ = mkModRegRM(1, gregEnc210, iregEnc210(am->Aam.IR.reg));
   2178          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
   2179          return p;
   2180       }
   2181       if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2182           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
   2183          ) {
   2184          *p++ = mkModRegRM(2, gregEnc210, iregEnc210(am->Aam.IR.reg));
   2185          p = emit32(p, am->Aam.IR.imm);
   2186          return p;
   2187       }
   2188       if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2189            || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
   2190           && fits8bits(am->Aam.IR.imm)) {
   2191          *p++ = mkModRegRM(1, gregEnc210, 4);
   2192          *p++ = 0x24;
   2193          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
   2194          return p;
   2195       }
   2196       if (/* sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2197              || -- the RSP case is disabled, awaiting a test case */
   2198           sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
   2199          *p++ = mkModRegRM(2, gregEnc210, 4);
   2200          *p++ = 0x24;
   2201          p = emit32(p, am->Aam.IR.imm);
   2202          return p;
   2203       }
   2204       ppAMD64AMode(am);
   2205       vpanic("doAMode_M: can't emit amode IR");
   2206       /*NOTREACHED*/
   2207    }
   2208    if (am->tag == Aam_IRRS) {
   2209       if (fits8bits(am->Aam.IRRS.imm)
   2210           && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
   2211          *p++ = mkModRegRM(1, gregEnc210, 4);
   2212          *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
   2213                                           iregEnc210(am->Aam.IRRS.base));
   2214          *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
   2215          return p;
   2216       }
   2217       if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
   2218          *p++ = mkModRegRM(2, gregEnc210, 4);
   2219          *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
   2220                                           iregEnc210(am->Aam.IRRS.base));
   2221          p = emit32(p, am->Aam.IRRS.imm);
   2222          return p;
   2223       }
   2224       ppAMD64AMode(am);
   2225       vpanic("doAMode_M: can't emit amode IRRS");
   2226       /*NOTREACHED*/
   2227    }
   2228    vpanic("doAMode_M: unknown amode");
   2229    /*NOTREACHED*/
   2230 }
   2231 
   2232 static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
   2233 {
   2234    return doAMode_M__wrk(p, iregEnc3210(greg), am);
   2235 }
   2236 
   2237 static UChar* doAMode_M_enc ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
   2238 {
   2239    vassert(gregEnc3210 < 16);
   2240    return doAMode_M__wrk(p, gregEnc3210, am);
   2241 }
   2242 
   2243 
   2244 /* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
   2245 inline
   2246 static UChar* doAMode_R__wrk ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
   2247 {
   2248    *p++ = mkModRegRM(3, gregEnc3210 & 7, eregEnc3210 & 7);
   2249    return p;
   2250 }
   2251 
   2252 static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
   2253 {
   2254    return doAMode_R__wrk(p, iregEnc3210(greg), iregEnc3210(ereg));
   2255 }
   2256 
   2257 static UChar* doAMode_R_enc_reg ( UChar* p, UInt gregEnc3210, HReg ereg )
   2258 {
   2259    vassert(gregEnc3210 < 16);
   2260    return doAMode_R__wrk(p, gregEnc3210, iregEnc3210(ereg));
   2261 }
   2262 
   2263 static UChar* doAMode_R_reg_enc ( UChar* p, HReg greg, UInt eregEnc3210 )
   2264 {
   2265    vassert(eregEnc3210 < 16);
   2266    return doAMode_R__wrk(p, iregEnc3210(greg), eregEnc3210);
   2267 }
   2268 
   2269 static UChar* doAMode_R_enc_enc ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
   2270 {
   2271    vassert( (gregEnc3210|eregEnc3210) < 16);
   2272    return doAMode_R__wrk(p, gregEnc3210, eregEnc3210);
   2273 }
   2274 
   2275 
   2276 /* Clear the W bit on a REX byte, thereby changing the operand size
   2277    back to whatever that instruction's default operand size is. */
   2278 static inline UChar clearWBit ( UChar rex )
   2279 {
   2280    return rex & ~(1<<3);
   2281 }
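        /* For example, clearWBit(0x48) gives 0x40, a REX prefix with no
           extension bits set; the 32-bit ALU case below computes exactly this
           and then skips emitting the byte when it is just 0x40. */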
   2282 
   2283 
   2284 /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
   2285 inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am )
   2286 {
   2287    if (am->tag == Aam_IR) {
   2288       UChar W = 1;  /* we want 64-bit mode */
   2289       UChar R = (gregEnc3210 >> 3) & 1;
   2290       UChar X = 0; /* not relevant */
   2291       UChar B = iregEnc3(am->Aam.IR.reg);
   2292       return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
   2293    }
   2294    if (am->tag == Aam_IRRS) {
   2295       UChar W = 1;  /* we want 64-bit mode */
   2296       UChar R = (gregEnc3210 >> 3) & 1;
   2297       UChar X = iregEnc3(am->Aam.IRRS.index);
   2298       UChar B = iregEnc3(am->Aam.IRRS.base);
   2299       return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
   2300    }
   2301    vassert(0);
   2302    return 0; /*NOTREACHED*/
   2303 }
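        /* For example (Aam_IR): greg = %r9, am = 0(%rdx) gives W=1, R=1, X=0,
           B=0, hence 0x40 + 8 + 4 = 0x4C; greg = %rax, am = 0(%r12) gives
           W=1, R=0, X=0, B=1, hence 0x40 + 8 + 1 = 0x49. */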
   2304 
   2305 static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
   2306 {
   2307    return rexAMode_M__wrk(iregEnc3210(greg), am);
   2308 }
   2309 
   2310 static UChar rexAMode_M_enc ( UInt gregEnc3210, AMD64AMode* am )
   2311 {
   2312    vassert(gregEnc3210 < 16);
   2313    return rexAMode_M__wrk(gregEnc3210, am);
   2314 }
   2315 
   2316 
   2317 /* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
   2318 inline static UChar rexAMode_R__wrk ( UInt gregEnc3210, UInt eregEnc3210 )
   2319 {
   2320    UChar W = 1;  /* we want 64-bit mode */
   2321    UChar R = (gregEnc3210 >> 3) & 1;
   2322    UChar X = 0; /* not relevant */
   2323    UChar B = (eregEnc3210 >> 3) & 1;
   2324    return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
   2325 }
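        /* For example: greg = %rbx, ereg = %r15 gives W=1, R=0, B=1, hence
           0x49; greg = %r8, ereg = %rcx gives W=1, R=1, B=0, hence 0x4C. */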
   2326 
   2327 static UChar rexAMode_R ( HReg greg, HReg ereg )
   2328 {
   2329    return rexAMode_R__wrk(iregEnc3210(greg), iregEnc3210(ereg));
   2330 }
   2331 
   2332 static UChar rexAMode_R_enc_reg ( UInt gregEnc3210, HReg ereg )
   2333 {
   2334    vassert(gregEnc3210 < 16);
   2335    return rexAMode_R__wrk(gregEnc3210, iregEnc3210(ereg));
   2336 }
   2337 
   2338 static UChar rexAMode_R_reg_enc ( HReg greg, UInt eregEnc3210 )
   2339 {
   2340    vassert(eregEnc3210 < 16);
   2341    return rexAMode_R__wrk(iregEnc3210(greg), eregEnc3210);
   2342 }
   2343 
   2344 static UChar rexAMode_R_enc_enc ( UInt gregEnc3210, UInt eregEnc3210 )
   2345 {
   2346    vassert((gregEnc3210|eregEnc3210) < 16);
   2347    return rexAMode_R__wrk(gregEnc3210, eregEnc3210);
   2348 }
   2349 
   2350 
   2351 //uu /* May 2012: this VEX prefix stuff is currently unused, but has
   2352 //uu    been verified correct (I reckon).  Certainly it has been known
   2353 //uu    to produce correct VEX prefixes during testing. */
   2354 //uu
   2355 //uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
   2356 //uu    notVvvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
   2357 //uu    in verbatim.  There's no range checking on the bits. */
   2358 //uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
   2359 //uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
   2360 //uu                             UInt L, UInt pp )
   2361 //uu {
   2362 //uu    UChar byte0 = 0;
   2363 //uu    UChar byte1 = 0;
   2364 //uu    UChar byte2 = 0;
   2365 //uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
   2366 //uu       /* 2 byte encoding is possible. */
   2367 //uu       byte0 = 0xC5;
   2368 //uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
   2369 //uu               | (L << 2) | pp;
   2370 //uu    } else {
   2371 //uu       /* 3 byte encoding is needed. */
   2372 //uu       byte0 = 0xC4;
   2373 //uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
   2374 //uu               | ((rexB ^ 1) << 5) | mmmmm;
   2375 //uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
   2376 //uu    }
   2377 //uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
   2378 //uu }
   2379 //uu
   2380 //uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
   2381 //uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
   2382 //uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
   2383 //uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
   2384 //uu    vvvv=1111 (unused 3rd reg). */
   2385 //uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
   2386 //uu {
   2387 //uu    UChar L       = 1; /* size = 256 */
   2388 //uu    UChar pp      = 0; /* no SIMD prefix */
   2389 //uu    UChar mmmmm   = 1; /* 0F */
   2390 //uu    UChar notVvvv = 0; /* unused */
   2391 //uu    UChar rexW    = 0;
   2392 //uu    UChar rexR    = 0;
   2393 //uu    UChar rexX    = 0;
   2394 //uu    UChar rexB    = 0;
   2395 //uu    /* Same logic as in rexAMode_M. */
   2396 //uu    if (am->tag == Aam_IR) {
   2397 //uu       rexR = iregEnc3(greg);
   2398 //uu       rexX = 0; /* not relevant */
   2399 //uu       rexB = iregEnc3(am->Aam.IR.reg);
   2400 //uu    }
   2401 //uu    else if (am->tag == Aam_IRRS) {
   2402 //uu       rexR = iregEnc3(greg);
   2403 //uu       rexX = iregEnc3(am->Aam.IRRS.index);
   2404 //uu       rexB = iregEnc3(am->Aam.IRRS.base);
   2405 //uu    } else {
   2406 //uu       vassert(0);
   2407 //uu    }
   2408 //uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
   2409 //uu }
   2410 //uu
   2411 //uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
   2412 //uu {
   2413 //uu    switch (vex & 0xFF) {
   2414 //uu       case 0xC5:
   2415 //uu          *p++ = 0xC5;
   2416 //uu          *p++ = (vex >> 8) & 0xFF;
   2417 //uu          vassert(0 == (vex >> 16));
   2418 //uu          break;
   2419 //uu       case 0xC4:
   2420 //uu          *p++ = 0xC4;
   2421 //uu          *p++ = (vex >> 8) & 0xFF;
   2422 //uu          *p++ = (vex >> 16) & 0xFF;
   2423 //uu          vassert(0 == (vex >> 24));
   2424 //uu          break;
   2425 //uu       default:
   2426 //uu          vassert(0);
   2427 //uu    }
   2428 //uu    return p;
   2429 //uu }
   2430 
   2431 
   2432 /* Emit ffree %st(N) */
   2433 static UChar* do_ffree_st ( UChar* p, Int n )
   2434 {
   2435    vassert(n >= 0 && n <= 7);
   2436    *p++ = 0xDD;
   2437    *p++ = toUChar(0xC0 + n);
   2438    return p;
   2439 }
   2440 
   2441 /* Emit an instruction into buf and return the number of bytes used.
   2442    Note that buf is not the insn's final place, and therefore it is
   2443    imperative to emit position-independent code.  If the emitted
   2444    instruction was a profiler inc, set *is_profInc to True, else
   2445    leave it unchanged. */
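        /* (One way to read that contract: the caller hands in a scratch buffer
           of at least 64 bytes -- see the vassert on nbuf below -- later copies
           the returned number of bytes to the instruction's final home, and
           uses *is_profInc to remember that a ProfInc's "NotKnownYet" counter
           address still has to be patched in afterwards.) */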
   2446 
   2447 Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
   2448                       UChar* buf, Int nbuf, const AMD64Instr* i,
   2449                       Bool mode64, VexEndness endness_host,
   2450                       const void* disp_cp_chain_me_to_slowEP,
   2451                       const void* disp_cp_chain_me_to_fastEP,
   2452                       const void* disp_cp_xindir,
   2453                       const void* disp_cp_xassisted )
   2454 {
   2455    UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
   2456    UInt   xtra;
   2457    UInt   reg;
   2458    UChar  rex;
   2459    UChar* p = &buf[0];
   2460    UChar* ptmp;
   2461    Int    j;
   2462    vassert(nbuf >= 64);
   2463    vassert(mode64 == True);
   2464 
   2465    /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
   2466 
   2467    switch (i->tag) {
   2468 
   2469    case Ain_Imm64:
   2470       if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
   2471          /* Use the short form (load into 32 bit reg, + default
   2472             widening rule) for constants under 1 million.  We could
   2473             use this form for the range 0 to 0x7FFFFFFF inclusive, but
   2474             limit it to a smaller range for verifiability purposes. */
   2475          if (1 & iregEnc3(i->Ain.Imm64.dst))
   2476             *p++ = 0x41;
   2477          *p++ = 0xB8 + iregEnc210(i->Ain.Imm64.dst);
   2478          p = emit32(p, (UInt)i->Ain.Imm64.imm64);
   2479       } else {
   2480          *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Imm64.dst)));
   2481          *p++ = toUChar(0xB8 + iregEnc210(i->Ain.Imm64.dst));
   2482          p = emit64(p, i->Ain.Imm64.imm64);
   2483       }
   2484       goto done;
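              /* E.g. Imm64 $0x1234 into %r10 takes the short form and emits
                 41 BA 34 12 00 00, whereas $0x123456789 into %rax emits the
                 full 48 B8 89 67 45 23 01 00 00 00. */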
   2485 
   2486    case Ain_Alu64R:
   2487       /* Deal specially with MOV */
   2488       if (i->Ain.Alu64R.op == Aalu_MOV) {
   2489          switch (i->Ain.Alu64R.src->tag) {
   2490             case Armi_Imm:
   2491                if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
   2492                   /* Actually we could use this form for constants in
   2493                      the range 0 through 0x7FFFFFFF inclusive, but
   2494                      limit it to a small range for verifiability
   2495                      purposes. */
   2496                   /* Generate "movl $imm32, 32-bit-register" and let
   2497                      the default zero-extend rule cause the upper half
   2498                      of the dst to be zeroed out too.  This saves 1
   2499                      and sometimes 2 bytes compared to the more
   2500                      obvious encoding in the 'else' branch. */
   2501                   if (1 & iregEnc3(i->Ain.Alu64R.dst))
   2502                      *p++ = 0x41;
   2503                   *p++ = 0xB8 + iregEnc210(i->Ain.Alu64R.dst);
   2504                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2505                } else {
   2506                   *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Alu64R.dst)));
   2507                   *p++ = 0xC7;
   2508                   *p++ = toUChar(0xC0 + iregEnc210(i->Ain.Alu64R.dst));
   2509                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2510                }
   2511                goto done;
   2512             case Armi_Reg:
   2513                *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
   2514                                   i->Ain.Alu64R.dst );
   2515                *p++ = 0x89;
   2516                p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
   2517                                 i->Ain.Alu64R.dst);
   2518                goto done;
   2519             case Armi_Mem:
   2520                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
   2521                                  i->Ain.Alu64R.src->Armi.Mem.am);
   2522                *p++ = 0x8B;
   2523                p = doAMode_M(p, i->Ain.Alu64R.dst,
   2524                                 i->Ain.Alu64R.src->Armi.Mem.am);
   2525                goto done;
   2526             default:
   2527                goto bad;
   2528          }
   2529       }
   2530       /* MUL */
   2531       if (i->Ain.Alu64R.op == Aalu_MUL) {
   2532          switch (i->Ain.Alu64R.src->tag) {
   2533             case Armi_Reg:
   2534                *p++ = rexAMode_R( i->Ain.Alu64R.dst,
   2535                                   i->Ain.Alu64R.src->Armi.Reg.reg);
   2536                *p++ = 0x0F;
   2537                *p++ = 0xAF;
   2538                p = doAMode_R(p, i->Ain.Alu64R.dst,
   2539                                 i->Ain.Alu64R.src->Armi.Reg.reg);
   2540                goto done;
   2541             case Armi_Mem:
   2542                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
   2543                                  i->Ain.Alu64R.src->Armi.Mem.am);
   2544                *p++ = 0x0F;
   2545                *p++ = 0xAF;
   2546                p = doAMode_M(p, i->Ain.Alu64R.dst,
   2547                                 i->Ain.Alu64R.src->Armi.Mem.am);
   2548                goto done;
   2549             case Armi_Imm:
   2550                if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2551                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2552                   *p++ = 0x6B;
   2553                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2554                   *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
   2555                } else {
   2556                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2557                   *p++ = 0x69;
   2558                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2559                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2560                }
   2561                goto done;
   2562             default:
   2563                goto bad;
   2564          }
   2565       }
   2566       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
   2567       opc = opc_rr = subopc_imm = opc_imma = 0;
   2568       switch (i->Ain.Alu64R.op) {
   2569          case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
   2570                         subopc_imm = 2; opc_imma = 0x15; break;
   2571          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
   2572                         subopc_imm = 0; opc_imma = 0x05; break;
   2573          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
   2574                         subopc_imm = 5; opc_imma = 0x2D; break;
   2575          case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
   2576                         subopc_imm = 3; opc_imma = 0x1D; break;
   2577          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
   2578                         subopc_imm = 4; opc_imma = 0x25; break;
   2579          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
   2580                         subopc_imm = 6; opc_imma = 0x35; break;
   2581          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
   2582                         subopc_imm = 1; opc_imma = 0x0D; break;
   2583          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
   2584                         subopc_imm = 7; opc_imma = 0x3D; break;
   2585          default: goto bad;
   2586       }
   2587       switch (i->Ain.Alu64R.src->tag) {
   2588          case Armi_Imm:
   2589             if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
   2590                 && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2591                goto bad; /* FIXME: awaiting test case */
   2592                *p++ = toUChar(opc_imma);
   2593                p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2594             } else
   2595             if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2596                *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst );
   2597                *p++ = 0x83;
   2598                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
   2599                *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
   2600             } else {
   2601                *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst);
   2602                *p++ = 0x81;
   2603                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
   2604                p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2605             }
   2606             goto done;
   2607          case Armi_Reg:
   2608             *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
   2609                                i->Ain.Alu64R.dst);
   2610             *p++ = toUChar(opc_rr);
   2611             p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
   2612                              i->Ain.Alu64R.dst);
   2613             goto done;
   2614          case Armi_Mem:
   2615             *p++ = rexAMode_M( i->Ain.Alu64R.dst,
   2616                                i->Ain.Alu64R.src->Armi.Mem.am);
   2617             *p++ = toUChar(opc);
   2618             p = doAMode_M(p, i->Ain.Alu64R.dst,
   2619                              i->Ain.Alu64R.src->Armi.Mem.am);
   2620             goto done;
   2621          default:
   2622             goto bad;
   2623       }
   2624       break;
   2625 
   2626    case Ain_Alu64M:
   2627       /* Deal specially with MOV */
   2628       if (i->Ain.Alu64M.op == Aalu_MOV) {
   2629          switch (i->Ain.Alu64M.src->tag) {
   2630             case Ari_Reg:
   2631                *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
   2632                                  i->Ain.Alu64M.dst);
   2633                *p++ = 0x89;
   2634                p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
   2635                                 i->Ain.Alu64M.dst);
   2636                goto done;
   2637             case Ari_Imm:
   2638                *p++ = rexAMode_M_enc(0, i->Ain.Alu64M.dst);
   2639                *p++ = 0xC7;
   2640                p = doAMode_M_enc(p, 0, i->Ain.Alu64M.dst);
   2641                p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
   2642                goto done;
   2643             default:
   2644                goto bad;
   2645          }
   2646       }
   2647       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not allowed here.
   2648          (This is derived from the x86 version of the same code.) */
   2649       opc = subopc_imm = opc_imma = 0;
   2650       switch (i->Ain.Alu64M.op) {
   2651          case Aalu_CMP: opc = 0x39; subopc_imm = 7; break;
   2652          default: goto bad;
   2653       }
   2654       switch (i->Ain.Alu64M.src->tag) {
   2655          /*
   2656          case Xri_Reg:
   2657             *p++ = toUChar(opc);
   2658             p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
   2659                              i->Xin.Alu32M.dst);
   2660             goto done;
   2661          */
   2662          case Ari_Imm:
   2663             if (fits8bits(i->Ain.Alu64M.src->Ari.Imm.imm32)) {
   2664                *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
   2665                *p++ = 0x83;
   2666                p    = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
   2667                *p++ = toUChar(0xFF & i->Ain.Alu64M.src->Ari.Imm.imm32);
   2668                goto done;
   2669             } else {
   2670                *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
   2671                *p++ = 0x81;
   2672                p    = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
   2673                p    = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
   2674                goto done;
   2675             }
   2676          default:
   2677             goto bad;
   2678       }
   2679 
   2680       break;
   2681 
   2682    case Ain_Sh64:
   2683       opc_cl = opc_imm = subopc = 0;
   2684       switch (i->Ain.Sh64.op) {
   2685          case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
   2686          case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
   2687          case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
   2688          default: goto bad;
   2689       }
   2690       if (i->Ain.Sh64.src == 0) {
   2691          *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
   2692          *p++ = toUChar(opc_cl);
   2693          p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
   2694          goto done;
   2695       } else {
   2696          *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
   2697          *p++ = toUChar(opc_imm);
   2698          p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
   2699          *p++ = (UChar)(i->Ain.Sh64.src);
   2700          goto done;
   2701       }
   2702       break;
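              /* Worked example (a sanity check, assuming the standard REX and
                 ModRM rules): with subopc 4 (SHL), "shlq $3, %rdx" should be
                 emitted as 48 C1 E2 03, and the %cl-count form
                 "shlq %cl, %rdx" as 48 D3 E2. */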
   2703 
   2704    case Ain_Test64:
   2705       /* testq sign-extend($imm32), %reg */
   2706       *p++ = rexAMode_R_enc_reg(0, i->Ain.Test64.dst);
   2707       *p++ = 0xF7;
   2708       p = doAMode_R_enc_reg(p, 0, i->Ain.Test64.dst);
   2709       p = emit32(p, i->Ain.Test64.imm32);
   2710       goto done;
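              /* Worked example (a sanity check): for a dst of %r9 the REX byte
                 picks up REX.B, so "testq $imm32, %r9" should come out as
                 49 F7 C1 followed by the four little-endian immediate bytes. */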
   2711 
   2712    case Ain_Unary64:
   2713       if (i->Ain.Unary64.op == Aun_NOT) {
   2714          *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
   2715          *p++ = 0xF7;
   2716          p = doAMode_R_enc_reg(p, 2, i->Ain.Unary64.dst);
   2717          goto done;
   2718       }
   2719       if (i->Ain.Unary64.op == Aun_NEG) {
   2720          *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
   2721          *p++ = 0xF7;
   2722          p = doAMode_R_enc_reg(p, 3, i->Ain.Unary64.dst);
   2723          goto done;
   2724       }
   2725       break;
   2726 
   2727    case Ain_Lea64:
   2728       *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
   2729       *p++ = 0x8D;
   2730       p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
   2731       goto done;
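              /* Worked example (a sanity check, assuming doAMode_M picks the
                 disp8 form when the displacement fits in 8 bits):
                 "leaq 8(%rdi,%rsi,4), %rax" should be emitted as
                 48 8D 44 B7 08 (REX.W, 0x8D, ModRM 0x44, SIB 0xB7, disp8 0x08). */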
   2732 
   2733    case Ain_Alu32R:
   2734       /* ADD/SUB/AND/OR/XOR/CMP */
   2735       opc = opc_rr = subopc_imm = opc_imma = 0;
   2736       switch (i->Ain.Alu32R.op) {
   2737          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
   2738                         subopc_imm = 0; opc_imma = 0x05; break;
   2739          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
   2740                         subopc_imm = 5; opc_imma = 0x2D; break;
   2741          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
   2742                         subopc_imm = 4; opc_imma = 0x25; break;
   2743          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
   2744                         subopc_imm = 6; opc_imma = 0x35; break;
   2745          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
   2746                         subopc_imm = 1; opc_imma = 0x0D; break;
   2747          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
   2748                         subopc_imm = 7; opc_imma = 0x3D; break;
   2749          default: goto bad;
   2750       }
   2751       switch (i->Ain.Alu32R.src->tag) {
   2752          case Armi_Imm:
   2753             if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
   2754                 && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
   2755                goto bad; /* FIXME: awaiting test case */
   2756                *p++ = toUChar(opc_imma);
   2757                p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
   2758             } else
   2759             if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
   2760                rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst ) );
   2761                if (rex != 0x40) *p++ = rex;
   2762                *p++ = 0x83;
   2763                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
   2764                *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
   2765             } else {
   2766                rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst) );
   2767                if (rex != 0x40) *p++ = rex;
   2768                *p++ = 0x81;
   2769                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
   2770                p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
   2771             }
   2772             goto done;
   2773          case Armi_Reg:
   2774             rex  = clearWBit(
   2775                    rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
   2776                                i->Ain.Alu32R.dst) );
   2777             if (rex != 0x40) *p++ = rex;
   2778             *p++ = toUChar(opc_rr);
   2779             p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
   2780                              i->Ain.Alu32R.dst);
   2781             goto done;
   2782          case Armi_Mem:
   2783             rex  = clearWBit(
   2784                    rexAMode_M( i->Ain.Alu32R.dst,
   2785                                i->Ain.Alu32R.src->Armi.Mem.am) );
   2786             if (rex != 0x40) *p++ = rex;
   2787             *p++ = toUChar(opc);
   2788             p = doAMode_M(p, i->Ain.Alu32R.dst,
   2789                              i->Ain.Alu32R.src->Armi.Mem.am);
   2790             goto done;
   2791          default:
   2792             goto bad;
   2793       }
   2794       break;
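              /* Worked example (a sanity check): because the W bit is cleared
                 and a bare 0x40 REX is skipped, "addl %ebx, %ecx" should be
                 just 01 D9, whereas the Alu64R equivalent "addq %rbx, %rcx"
                 is 48 01 D9. */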
   2795 
   2796    case Ain_MulL:
   2797       subopc = i->Ain.MulL.syned ? 5 : 4;
   2798       switch (i->Ain.MulL.src->tag)  {
   2799          case Arm_Mem:
   2800             *p++ = rexAMode_M_enc(0, i->Ain.MulL.src->Arm.Mem.am);
   2801             *p++ = 0xF7;
   2802             p = doAMode_M_enc(p, subopc, i->Ain.MulL.src->Arm.Mem.am);
   2803             goto done;
   2804          case Arm_Reg:
   2805             *p++ = rexAMode_R_enc_reg(0, i->Ain.MulL.src->Arm.Reg.reg);
   2806             *p++ = 0xF7;
   2807             p = doAMode_R_enc_reg(p, subopc, i->Ain.MulL.src->Arm.Reg.reg);
   2808             goto done;
   2809          default:
   2810             goto bad;
   2811       }
   2812       break;
   2813 
   2814    case Ain_Div:
   2815       subopc = i->Ain.Div.syned ? 7 : 6;
   2816       if (i->Ain.Div.sz == 4) {
   2817          switch (i->Ain.Div.src->tag)  {
   2818             case Arm_Mem:
   2819                goto bad;
   2820                /*FIXME*/
   2821                *p++ = 0xF7;
   2822                p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
   2823                goto done;
   2824             case Arm_Reg:
   2825                *p++ = clearWBit(
   2826                       rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg));
   2827                *p++ = 0xF7;
   2828                p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
   2829                goto done;
   2830             default:
   2831                goto bad;
   2832          }
   2833       }
   2834       if (i->Ain.Div.sz == 8) {
   2835          switch (i->Ain.Div.src->tag)  {
   2836             case Arm_Mem:
   2837                *p++ = rexAMode_M_enc(0, i->Ain.Div.src->Arm.Mem.am);
   2838                *p++ = 0xF7;
   2839                p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
   2840                goto done;
   2841             case Arm_Reg:
   2842                *p++ = rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg);
   2843                *p++ = 0xF7;
   2844                p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
   2845                goto done;
   2846             default:
   2847                goto bad;
   2848          }
   2849       }
   2850       break;
   2851 
   2852    case Ain_Push:
   2853       switch (i->Ain.Push.src->tag) {
   2854          case Armi_Mem:
   2855             *p++ = clearWBit(
   2856                    rexAMode_M_enc(0, i->Ain.Push.src->Armi.Mem.am));
   2857             *p++ = 0xFF;
   2858             p = doAMode_M_enc(p, 6, i->Ain.Push.src->Armi.Mem.am);
   2859             goto done;
   2860          case Armi_Imm:
   2861             *p++ = 0x68;
   2862             p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
   2863             goto done;
   2864          case Armi_Reg:
   2865             *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.Push.src->Armi.Reg.reg)));
   2866             *p++ = toUChar(0x50 + iregEnc210(i->Ain.Push.src->Armi.Reg.reg));
   2867             goto done;
   2868         default:
   2869             goto bad;
   2870       }
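              /* Worked example (a sanity check): "pushq %r12" should be emitted
                 as 41 54.  Note that the Armi_Reg arm always emits a REX byte,
                 so "pushq %rax" comes out as 40 50 -- the 0x40 is redundant
                 but harmless. */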
   2871 
   2872    case Ain_Call: {
   2873       /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr
   2874          above, %r11 is used as an address temporary. */
   2875       /* If we don't need to do any fixup actions in the case that the
   2876          call doesn't happen, just do the simple thing and emit
   2877          straight-line code.  This is usually the case. */
   2878       if (i->Ain.Call.cond == Acc_ALWAYS/*call always happens*/
   2879           || i->Ain.Call.rloc.pri == RLPri_None/*no fixup action*/) {
   2880          /* jump over the following two insns if the condition does
   2881             not hold */
   2882          Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
   2883          if (i->Ain.Call.cond != Acc_ALWAYS) {
   2884             *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
   2885             *p++ = shortImm ? 10 : 13;
   2886             /* 10 or 13 bytes in the next two insns */
   2887          }
   2888          if (shortImm) {
   2889             /* 7 bytes: movl sign-extend(imm32), %r11 */
   2890             *p++ = 0x49;
   2891             *p++ = 0xC7;
   2892             *p++ = 0xC3;
   2893             p = emit32(p, (UInt)i->Ain.Call.target);
   2894          } else {
   2895             /* 10 bytes: movabsq $target, %r11 */
   2896             *p++ = 0x49;
   2897             *p++ = 0xBB;
   2898             p = emit64(p, i->Ain.Call.target);
   2899          }
   2900          /* 3 bytes: call *%r11 */
   2901          *p++ = 0x41;
   2902          *p++ = 0xFF;
   2903          *p++ = 0xD3;
   2904       } else {
   2905          Int delta;
   2906          /* Complex case.  We have to generate an if-then-else diamond. */
   2907          // before:
   2908          //   j{!cond} else:
   2909          //   movabsq $target, %r11
   2910          //   call* %r11
   2911          // preElse:
   2912          //   jmp after:
   2913          // else:
   2914          //   movabsq $0x5555555555555555, %rax  // possibly
   2915          //   movq %rax, %rdx                    // possibly
   2916          // after:
   2917 
   2918          // before:
   2919          UChar* pBefore = p;
   2920 
   2921          //   j{!cond} else:
   2922          *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
   2923          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2924 
   2925          //   movabsq $target, %r11
   2926          *p++ = 0x49;
   2927          *p++ = 0xBB;
   2928          p = emit64(p, i->Ain.Call.target);
   2929 
   2930          //   call* %r11
   2931          *p++ = 0x41;
   2932          *p++ = 0xFF;
   2933          *p++ = 0xD3;
   2934 
   2935          // preElse:
   2936          UChar* pPreElse = p;
   2937 
   2938          //   jmp after:
   2939          *p++ = 0xEB;
   2940          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2941 
   2942          // else:
   2943          UChar* pElse = p;
   2944 
   2945          /* Do the 'else' actions */
   2946          switch (i->Ain.Call.rloc.pri) {
   2947             case RLPri_Int:
   2948                // movabsq $0x5555555555555555, %rax
   2949                *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
   2950                break;
   2951             case RLPri_2Int:
   2952                goto bad; //ATC
   2953                // movabsq $0x5555555555555555, %rax
   2954                *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
   2955                // movq %rax, %rdx
   2956                *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
   2957                break;
   2958             case RLPri_V128SpRel:
   2959                if (i->Ain.Call.rloc.spOff == 0) {
   2960                   // We could accept any |spOff| here, but that's more
   2961                   // hassle, and the only value we're ever going to get
   2962                   // is zero (I believe).  Hence take the easy path :)
   2963                   // We need a scratch register -- %r11 can be it.
   2964                   // movabsq $0x5555555555555555, %r11
   2965                   *p++ = 0x49; *p++ = 0xBB;
   2966                   p = emit64(p, 0x5555555555555555ULL);
   2967                   // movq %r11, 0(%rsp)
   2968                   *p++ = 0x4C; *p++ = 0x89; *p++ = 0x1C; *p++ = 0x24;
   2969                   // movq %r11, 8(%rsp)
   2970                   *p++ = 0x4C; *p++ = 0x89; *p++ = 0x5C; *p++ = 0x24;
   2971                   *p++ = 0x08;
   2972                   break;
   2973                }
   2974                goto bad; //ATC for all other spOff values
   2975             case RLPri_V256SpRel:
   2976                goto bad; //ATC
   2977             case RLPri_None: case RLPri_INVALID: default:
   2978                vassert(0); // should never get here
   2979          }
   2980 
   2981          // after:
   2982          UChar* pAfter = p;
   2983 
   2984          // Fix up the branch offsets.  The +2s in the offset
   2985          // calculations are there because x86 requires conditional
   2986          // branches to have their offset stated relative to the
   2987          // instruction immediately following the branch insn.  And in
   2988          // both cases the branch insns are 2 bytes long.
   2989 
   2990          // First, the "j{!cond} else:" at pBefore.
   2991          delta = (Int)(Long)(pElse - (pBefore + 2));
   2992          vassert(delta >= 0 && delta < 100/*arbitrary*/);
   2993          *(pBefore+1) = (UChar)delta;
   2994 
   2995          // And secondly, the "jmp after:" at pPreElse.
   2996          delta = (Int)(Long)(pAfter - (pPreElse + 2));
   2997          vassert(delta >= 0 && delta < 100/*arbitrary*/);
   2998          *(pPreElse+1) = (UChar)delta;
   2999       }
   3000       goto done;
   3001    }
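           /* Worked example of the diamond fixups above (a sanity check): the
              skipped-over block is movabsq (10 bytes) + call (3 bytes) +
              jmp-rel8 (2 bytes), so the "j{!cond} else:" byte at pBefore+1
              becomes 15; and for the RLPri_Int else-action (a 10-byte
              movabsq), the "jmp after:" byte at pPreElse+1 becomes 10. */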
   3002 
   3003    case Ain_XDirect: {
   3004       /* NB: what goes on here has to be very closely coordinated with the
   3005          chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
   3006       /* We're generating chain-me requests here, so we need to be
   3007          sure this is actually allowed -- no-redir translations can't
   3008          use chain-me's.  Hence: */
   3009       vassert(disp_cp_chain_me_to_slowEP != NULL);
   3010       vassert(disp_cp_chain_me_to_fastEP != NULL);
   3011 
   3012       HReg r11 = hregAMD64_R11();
   3013 
   3014       /* Use ptmp for backpatching conditional jumps. */
   3015       ptmp = NULL;
   3016 
   3017       /* First off, if this is conditional, create a conditional
   3018          jump over the rest of it. */
   3019       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
   3020          /* jmp fwds if !condition */
   3021          *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
   3022          ptmp = p; /* fill in this bit later */
   3023          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3024       }
   3025 
   3026       /* Update the guest RIP. */
   3027       if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
   3028          /* use a shorter encoding */
   3029          /* movl sign-extend(dstGA), %r11 */
   3030          *p++ = 0x49;
   3031          *p++ = 0xC7;
   3032          *p++ = 0xC3;
   3033          p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
   3034       } else {
   3035          /* movabsq $dstGA, %r11 */
   3036          *p++ = 0x49;
   3037          *p++ = 0xBB;
   3038          p = emit64(p, i->Ain.XDirect.dstGA);
   3039       }
   3040 
   3041       /* movq %r11, amRIP */
   3042       *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
   3043       *p++ = 0x89;
   3044       p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
   3045 
   3046       /* --- FIRST PATCHABLE BYTE follows --- */
   3047       /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
   3048          to) backs up the return address, so as to find the address of
   3049          the first patchable byte.  So: don't change the length of the
   3050          two instructions below. */
   3051       /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
   3052       *p++ = 0x49;
   3053       *p++ = 0xBB;
   3054       const void* disp_cp_chain_me
   3055                = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
   3056                                          : disp_cp_chain_me_to_slowEP;
   3057       p = emit64(p, (Addr)disp_cp_chain_me);
   3058       /* call *%r11 */
   3059       *p++ = 0x41;
   3060       *p++ = 0xFF;
   3061       *p++ = 0xD3;
   3062       /* --- END of PATCHABLE BYTES --- */
   3063 
   3064       /* Fix up the conditional jump, if there was one. */
   3065       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
   3066          Int delta = p - ptmp;
   3067          vassert(delta > 0 && delta < 40);
   3068          *ptmp = toUChar(delta-1);
   3069       }
   3070       goto done;
   3071    }
   3072 
   3073    case Ain_XIndir: {
   3074       /* We're generating transfers that could lead indirectly to a
   3075          chain-me, so we need to be sure this is actually allowed --
   3076          no-redir translations are not allowed to reach normal
   3077          translations without going through the scheduler.  That means
   3078          no XDirects or XIndirs out from no-redir translations.
   3079          Hence: */
   3080       vassert(disp_cp_xindir != NULL);
   3081 
   3082       /* Use ptmp for backpatching conditional jumps. */
   3083       ptmp = NULL;
   3084 
   3085       /* First off, if this is conditional, create a conditional
   3086          jump over the rest of it. */
   3087       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
   3088          /* jmp fwds if !condition */
   3089          *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
   3090          ptmp = p; /* fill in this bit later */
   3091          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3092       }
   3093 
   3094       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
   3095       *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
   3096       *p++ = 0x89;
   3097       p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
   3098 
   3099       /* get $disp_cp_xindir into %r11 */
   3100       if (fitsIn32Bits((Addr)disp_cp_xindir)) {
   3101          /* use a shorter encoding */
   3102          /* movl sign-extend(disp_cp_xindir), %r11 */
   3103          *p++ = 0x49;
   3104          *p++ = 0xC7;
   3105          *p++ = 0xC3;
   3106          p = emit32(p, (UInt)(Addr)disp_cp_xindir);
   3107       } else {
   3108          /* movabsq $disp_cp_xindir, %r11 */
   3109          *p++ = 0x49;
   3110          *p++ = 0xBB;
   3111          p = emit64(p, (Addr)disp_cp_xindir);
   3112       }
   3113 
   3114       /* jmp *%r11 */
   3115       *p++ = 0x41;
   3116       *p++ = 0xFF;
   3117       *p++ = 0xE3;
   3118 
   3119       /* Fix up the conditional jump, if there was one. */
   3120       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
   3121          Int delta = p - ptmp;
   3122          vassert(delta > 0 && delta < 40);
   3123          *ptmp = toUChar(delta-1);
   3124       }
   3125       goto done;
   3126    }
   3127 
   3128    case Ain_XAssisted: {
   3129       /* Use ptmp for backpatching conditional jumps. */
   3130       ptmp = NULL;
   3131 
   3132       /* First off, if this is conditional, create a conditional
   3133          jump over the rest of it. */
   3134       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
   3135          /* jmp fwds if !condition */
   3136          *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
   3137          ptmp = p; /* fill in this bit later */
   3138          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3139       }
   3140 
   3141       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
   3142       *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
   3143       *p++ = 0x89;
   3144       p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
   3145       /* movl $magic_number, %ebp.  Since these numbers are all small positive
   3146          integers, we can get away with "movl $N, %ebp" rather than
   3147          the longer "movq $N, %rbp". */
   3148       UInt trcval = 0;
   3149       switch (i->Ain.XAssisted.jk) {
   3150          case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
   3151          case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
   3152          case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
   3153          case Ijk_Sys_int210:  trcval = VEX_TRC_JMP_SYS_INT210;  break;
   3154          case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
   3155          case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
   3156          case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
   3157          case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
   3158          case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
   3159          case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
   3160          case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
   3161          case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
   3162          case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
   3163          /* We don't expect to see the following being assisted. */
   3164          case Ijk_Ret:
   3165          case Ijk_Call:
   3166          /* fallthrough */
   3167          default:
   3168             ppIRJumpKind(i->Ain.XAssisted.jk);
   3169             vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
   3170       }
   3171       vassert(trcval != 0);
   3172       *p++ = 0xBD;
   3173       p = emit32(p, trcval);
   3174       /* movabsq $disp_cp_xassisted, %r11 */
   3175       *p++ = 0x49;
   3176       *p++ = 0xBB;
   3177       p = emit64(p, (Addr)disp_cp_xassisted);
   3178       /* jmp *%r11 */
   3179       *p++ = 0x41;
   3180       *p++ = 0xFF;
   3181       *p++ = 0xE3;
   3182 
   3183       /* Fix up the conditional jump, if there was one. */
   3184       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
   3185          Int delta = p - ptmp;
   3186          vassert(delta > 0 && delta < 40);
   3187          *ptmp = toUChar(delta-1);
   3188       }
   3189       goto done;
   3190    }
   3191 
   3192    case Ain_CMov64:
   3193       vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
   3194       *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src);
   3195       *p++ = 0x0F;
   3196       *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
   3197       p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src);
   3198       goto done;
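              /* Worked example (a sanity check, assuming the Acc_* condition
                 codes follow the x86 cc numbering, e.g. Acc_NZ == 5):
                 "cmovne %rax, %rbx" should be emitted as 48 0F 45 D8. */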
   3199 
   3200    case Ain_CLoad: {
   3201       vassert(i->Ain.CLoad.cond != Acc_ALWAYS);
   3202 
   3203       /* Only 32- or 64-bit variants are allowed. */
   3204       vassert(i->Ain.CLoad.szB == 4 || i->Ain.CLoad.szB == 8);
   3205 
   3206       /* Use ptmp for backpatching conditional jumps. */
   3207       ptmp = NULL;
   3208 
   3209       /* jmp fwds if !condition */
   3210       *p++ = toUChar(0x70 + (0xF & (i->Ain.CLoad.cond ^ 1)));
   3211       ptmp = p; /* fill in this bit later */
   3212       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3213 
   3214       /* Now the load.  Either a normal 64 bit load or a normal 32 bit
   3215          load, which, by the default zero-extension rule, zeroes out
   3216          the upper half of the destination, as required. */
   3217       rex = rexAMode_M(i->Ain.CLoad.dst, i->Ain.CLoad.addr);
   3218       *p++ = i->Ain.CLoad.szB == 4 ? clearWBit(rex) : rex;
   3219       *p++ = 0x8B;
   3220       p = doAMode_M(p, i->Ain.CLoad.dst, i->Ain.CLoad.addr);
   3221 
   3222       /* Fix up the conditional branch */
   3223       Int delta = p - ptmp;
   3224       vassert(delta > 0 && delta < 40);
   3225       *ptmp = toUChar(delta-1);
   3226       goto done;
   3227    }
   3228 
   3229    case Ain_CStore: {
   3230       /* AFAICS this is identical to Ain_CLoad except that the opcode
   3231          is 0x89 instead of 0x8B. */
   3232       vassert(i->Ain.CStore.cond != Acc_ALWAYS);
   3233 
   3234       /* Only 32- or 64-bit variants are allowed. */
   3235       vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
   3236 
   3237       /* Use ptmp for backpatching conditional jumps. */
   3238       ptmp = NULL;
   3239 
   3240       /* jmp fwds if !condition */
   3241       *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
   3242       ptmp = p; /* fill in this bit later */
   3243       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3244 
   3245       /* Now the store. */
   3246       rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
   3247       *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
   3248       *p++ = 0x89;
   3249       p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
   3250 
   3251       /* Fix up the conditional branch */
   3252       Int delta = p - ptmp;
   3253       vassert(delta > 0 && delta < 40);
   3254       *ptmp = toUChar(delta-1);
   3255       goto done;
   3256    }
   3257 
   3258    case Ain_MovxLQ:
   3259       /* No, _don't_ ask me why the sense of the args has to be
   3260          different in the S vs Z case.  I don't know. */
   3261       if (i->Ain.MovxLQ.syned) {
   3262          /* Need REX.W = 1 here, but rexAMode_R does that for us. */
   3263          *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
   3264          *p++ = 0x63;
   3265          p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
   3266       } else {
   3267          /* Produce a 32-bit reg-reg move, since the implicit
   3268             zero-extend does what we want. */
   3269          *p++ = clearWBit (
   3270                    rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
   3271          *p++ = 0x89;
   3272          p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
   3273       }
   3274       goto done;
   3275 
   3276    case Ain_LoadEX:
   3277       if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
   3278          /* movzbq */
   3279          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   3280          *p++ = 0x0F;
   3281          *p++ = 0xB6;
   3282          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   3283          goto done;
   3284       }
   3285       if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
   3286          /* movzwq */
   3287          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   3288          *p++ = 0x0F;
   3289          *p++ = 0xB7;
   3290          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   3291          goto done;
   3292       }
   3293       if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
   3294          /* movzlq */
   3295          /* This isn't really an existing AMD64 instruction per se.
   3296             Rather, we have to do a 32-bit load.  Because a 32-bit
   3297             write implicitly clears the upper 32 bits of the target
   3298             register, we get what we want. */
   3299          *p++ = clearWBit(
   3300                 rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
   3301          *p++ = 0x8B;
   3302          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   3303          goto done;
   3304       }
   3305       break;
   3306 
   3307    case Ain_Set64:
   3308       /* Make the destination register be 1 or 0, depending on whether
   3309          the relevant condition holds.  Complication: the top 56 bits
   3310          of the destination should be forced to zero, but doing 'xorq
   3311          %r,%r' kills the flag(s) we are about to read.  Sigh.  So
   3312          start off by moving $0 into the dest. */
   3313       reg = iregEnc3210(i->Ain.Set64.dst);
   3314       vassert(reg < 16);
   3315 
   3316       /* movq $0, %dst */
   3317       *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
   3318       *p++ = 0xC7;
   3319       *p++ = toUChar(0xC0 + (reg & 7));
   3320       p = emit32(p, 0);
   3321 
   3322       /* setb lo8(%dst) */
   3323       /* Note the 8-bit register REX trickiness; be careful here. */
   3324       *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
   3325       *p++ = 0x0F;
   3326       *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
   3327       *p++ = toUChar(0xC0 + (reg & 7));
   3328       goto done;
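              /* Worked example (a sanity check, assuming Acc_Z encodes as 4):
                 for a dst of %r9 this emits 49 C7 C1 00 00 00 00
                 (movq $0, %r9) followed by 41 0F 94 C1 (sete %r9b); the
                 leading 0x41 is the REX.B needed to reach %r9b. */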
   3329 
   3330    case Ain_Bsfr64:
   3331       *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
   3332       *p++ = 0x0F;
   3333       if (i->Ain.Bsfr64.isFwds) {
   3334          *p++ = 0xBC;
   3335       } else {
   3336          *p++ = 0xBD;
   3337       }
   3338       p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
   3339       goto done;
   3340 
   3341    case Ain_MFence:
   3342       /* mfence */
   3343       *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
   3344       goto done;
   3345 
   3346    case Ain_ACAS:
   3347       /* lock */
   3348       *p++ = 0xF0;
   3349       if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
   3350       /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
   3351          in %rbx.  The new-value register is hardwired to be %rbx
   3352          since dealing with byte integer registers is too much hassle,
   3353          so we force the register operand to %rbx (could equally be
   3354          %rcx or %rdx). */
   3355       rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
   3356       if (i->Ain.ACAS.sz != 8)
   3357          rex = clearWBit(rex);
   3358 
   3359       *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
   3360       *p++ = 0x0F;
   3361       if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
   3362       p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
   3363       goto done;
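              /* Worked example (a sanity check): for sz == 8 and an addr of
                 (%rdi) this should emit F0 48 0F B1 1F, i.e.
                 "lock cmpxchgq %rbx, (%rdi)"; for sz == 4 the REX byte
                 degenerates to the harmless 0x40 noted above. */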
   3364 
   3365    case Ain_DACAS:
   3366       /* lock */
   3367       *p++ = 0xF0;
   3368       /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
   3369          value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
   3370          aren't encoded in the insn. */
   3371       rex = rexAMode_M_enc(1, i->Ain.DACAS.addr );
   3372       if (i->Ain.DACAS.sz != 8)
   3373          rex = clearWBit(rex);
   3374       *p++ = rex;
   3375       *p++ = 0x0F;
   3376       *p++ = 0xC7;
   3377       p = doAMode_M_enc(p, 1, i->Ain.DACAS.addr);
   3378       goto done;
   3379 
   3380    case Ain_A87Free:
   3381       vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
   3382       for (j = 0; j < i->Ain.A87Free.nregs; j++) {
   3383          p = do_ffree_st(p, 7-j);
   3384       }
   3385       goto done;
   3386 
   3387    case Ain_A87PushPop:
   3388       vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
   3389       if (i->Ain.A87PushPop.isPush) {
   3390          /* Load from memory into %st(0): flds/fldl amode */
   3391          *p++ = clearWBit(
   3392                    rexAMode_M_enc(0, i->Ain.A87PushPop.addr) );
   3393          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
   3394          p = doAMode_M_enc(p, 0/*subopcode*/, i->Ain.A87PushPop.addr);
   3395       } else {
   3396          /* Dump %st(0) to memory: fstps/fstpl amode */
   3397          *p++ = clearWBit(
   3398                    rexAMode_M_enc(3, i->Ain.A87PushPop.addr) );
   3399          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
   3400          p = doAMode_M_enc(p, 3/*subopcode*/, i->Ain.A87PushPop.addr);
   3401          goto done;
   3402       }
   3403       goto done;
   3404 
   3405    case Ain_A87FpOp:
   3406       switch (i->Ain.A87FpOp.op) {
   3407          case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
   3408          case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
   3409          case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
   3410          case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
   3411          case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
   3412          case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
   3413          case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
   3414          case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
   3415          case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
   3416          case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
   3417          case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
   3418          case Afp_TAN:
   3419             /* fptan pushes 1.0 on the FP stack, except when the
   3420                argument is out of range.  Hence we have to do the
   3421                instruction, then inspect C2 to see if there is an out
   3422                of range condition.  If there is, we skip the fincstp
   3423                that is used by the in-range case to get rid of this
   3424                extra 1.0 value. */
   3425             *p++ = 0xD9; *p++ = 0xF2; // fptan
   3426             *p++ = 0x50;              // pushq %rax
   3427             *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
   3428             *p++ = 0x66; *p++ = 0xA9;
   3429             *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
   3430             *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
   3431             *p++ = 0xD9; *p++ = 0xF7; // fincstp
   3432             *p++ = 0x58;              // after_fincstp: popq %rax
   3433             break;
   3434          default:
   3435             goto bad;
   3436       }
   3437       goto done;
   3438 
   3439    case Ain_A87LdCW:
   3440       *p++ = clearWBit(
   3441                 rexAMode_M_enc(5, i->Ain.A87LdCW.addr) );
   3442       *p++ = 0xD9;
   3443       p = doAMode_M_enc(p, 5/*subopcode*/, i->Ain.A87LdCW.addr);
   3444       goto done;
   3445 
   3446    case Ain_A87StSW:
   3447       *p++ = clearWBit(
   3448                 rexAMode_M_enc(7, i->Ain.A87StSW.addr) );
   3449       *p++ = 0xDD;
   3450       p = doAMode_M_enc(p, 7/*subopcode*/, i->Ain.A87StSW.addr);
   3451       goto done;
   3452 
   3453    case Ain_Store:
   3454       if (i->Ain.Store.sz == 2) {
   3455          /* This just goes to show the craziness of the instruction
   3456             set encoding.  We have to insert two prefix bytes, but be
   3457             careful to avoid a conflict over what the size should be, by
   3458             ensuring that REX.W = 0. */
   3459          *p++ = 0x66; /* override to 16-bits */
   3460          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3461          *p++ = 0x89;
   3462          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3463          goto done;
   3464       }
   3465       if (i->Ain.Store.sz == 4) {
   3466          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3467          *p++ = 0x89;
   3468          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3469          goto done;
   3470       }
   3471       if (i->Ain.Store.sz == 1) {
   3472          /* This is one place where it would be wrong to skip emitting
   3473             a rex byte of 0x40, since the mere presence of rex changes
   3474             the meaning of the byte register access.  Be careful. */
   3475          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3476          *p++ = 0x88;
   3477          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3478          goto done;
   3479       }
   3480       break;
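              /* Worked example (a sanity check): "movw %cx, (%rax)" should come
                 out as 66 40 89 08.  The sz == 1 comment above matters because,
                 e.g., 40 88 30 is "movb %sil, (%rax)" whereas a bare 88 30
                 would instead mean "movb %dh, (%rax)". */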
   3481 
   3482    case Ain_LdMXCSR:
   3483       *p++ = clearWBit(rexAMode_M_enc(0, i->Ain.LdMXCSR.addr));
   3484       *p++ = 0x0F;
   3485       *p++ = 0xAE;
   3486       p = doAMode_M_enc(p, 2/*subopcode*/, i->Ain.LdMXCSR.addr);
   3487       goto done;
   3488 
   3489    case Ain_SseUComIS:
   3490       /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
   3491       /* ucomi[sd] %srcL, %srcR */
   3492       if (i->Ain.SseUComIS.sz == 8) {
   3493          *p++ = 0x66;
   3494       } else {
   3495          goto bad;
   3496          vassert(i->Ain.SseUComIS.sz == 4);
   3497       }
   3498       *p++ = clearWBit (
   3499              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseUComIS.srcL),
   3500                                  vregEnc3210(i->Ain.SseUComIS.srcR) ));
   3501       *p++ = 0x0F;
   3502       *p++ = 0x2E;
   3503       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseUComIS.srcL),
   3504                                vregEnc3210(i->Ain.SseUComIS.srcR) );
   3505       /* pushfq */
   3506       *p++ = 0x9C;
   3507       /* popq %dst */
   3508       *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.SseUComIS.dst)));
   3509       *p++ = toUChar(0x58 + iregEnc210(i->Ain.SseUComIS.dst));
   3510       goto done;
   3511 
   3512    case Ain_SseSI2SF:
   3513       /* cvtsi2s[sd] %src, %dst */
   3514       rex = rexAMode_R_enc_reg( vregEnc3210(i->Ain.SseSI2SF.dst),
   3515                                 i->Ain.SseSI2SF.src );
   3516       *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
   3517       *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
   3518       *p++ = 0x0F;
   3519       *p++ = 0x2A;
   3520       p = doAMode_R_enc_reg( p, vregEnc3210(i->Ain.SseSI2SF.dst),
   3521                                 i->Ain.SseSI2SF.src );
   3522       goto done;
   3523 
   3524    case Ain_SseSF2SI:
   3525       /* cvts[sd]2si %src, %dst */
   3526       rex = rexAMode_R_reg_enc( i->Ain.SseSF2SI.dst,
   3527                                 vregEnc3210(i->Ain.SseSF2SI.src) );
   3528       *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
   3529       *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
   3530       *p++ = 0x0F;
   3531       *p++ = 0x2D;
   3532       p = doAMode_R_reg_enc( p, i->Ain.SseSF2SI.dst,
   3533                                 vregEnc3210(i->Ain.SseSF2SI.src) );
   3534       goto done;
   3535 
   3536    case Ain_SseSDSS:
   3537       /* cvtsd2ss/cvtss2sd %src, %dst */
   3538       *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
   3539       *p++ = clearWBit(
   3540               rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseSDSS.dst),
   3541                                   vregEnc3210(i->Ain.SseSDSS.src) ));
   3542       *p++ = 0x0F;
   3543       *p++ = 0x5A;
   3544       p = doAMode_R_enc_enc( p, vregEnc3210(i->Ain.SseSDSS.dst),
   3545                                 vregEnc3210(i->Ain.SseSDSS.src) );
   3546       goto done;
   3547 
   3548    case Ain_SseLdSt:
   3549       if (i->Ain.SseLdSt.sz == 8) {
   3550          *p++ = 0xF2;
   3551       } else
   3552       if (i->Ain.SseLdSt.sz == 4) {
   3553          *p++ = 0xF3;
   3554       } else
   3555       if (i->Ain.SseLdSt.sz != 16) {
   3556          vassert(0);
   3557       }
   3558       *p++ = clearWBit(
   3559              rexAMode_M_enc(vregEnc3210(i->Ain.SseLdSt.reg),
   3560                             i->Ain.SseLdSt.addr));
   3561       *p++ = 0x0F;
   3562       *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
   3563       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdSt.reg),
   3564                            i->Ain.SseLdSt.addr);
   3565       goto done;
   3566 
   3567    case Ain_SseCStore: {
   3568       vassert(i->Ain.SseCStore.cond != Acc_ALWAYS);
   3569 
   3570       /* Use ptmp for backpatching conditional jumps. */
   3571       ptmp = NULL;
   3572 
   3573       /* jmp fwds if !condition */
   3574       *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCStore.cond ^ 1)));
   3575       ptmp = p; /* fill in this bit later */
   3576       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3577 
   3578       /* Now the store. */
   3579       *p++ = clearWBit(
   3580              rexAMode_M_enc(vregEnc3210(i->Ain.SseCStore.src),
   3581                             i->Ain.SseCStore.addr));
   3582       *p++ = 0x0F;
   3583       *p++ = toUChar(0x11);
   3584       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCStore.src),
   3585                            i->Ain.SseCStore.addr);
   3586 
   3587       /* Fix up the conditional branch */
   3588       Int delta = p - ptmp;
   3589       vassert(delta > 0 && delta < 40);
   3590       *ptmp = toUChar(delta-1);
   3591       goto done;
   3592    }
   3593 
   3594    case Ain_SseCLoad: {
   3595       vassert(i->Ain.SseCLoad.cond != Acc_ALWAYS);
   3596 
   3597       /* Use ptmp for backpatching conditional jumps. */
   3598       ptmp = NULL;
   3599 
   3600       /* jmp fwds if !condition */
   3601       *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCLoad.cond ^ 1)));
   3602       ptmp = p; /* fill in this bit later */
   3603       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3604 
   3605       /* Now the load. */
   3606       *p++ = clearWBit(
   3607              rexAMode_M_enc(vregEnc3210(i->Ain.SseCLoad.dst),
   3608                             i->Ain.SseCLoad.addr));
   3609       *p++ = 0x0F;
   3610       *p++ = toUChar(0x10);
   3611       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCLoad.dst),
   3612                            i->Ain.SseCLoad.addr);
   3613 
   3614       /* Fix up the conditional branch */
   3615       Int delta = p - ptmp;
   3616       vassert(delta > 0 && delta < 40);
   3617       *ptmp = toUChar(delta-1);
   3618       goto done;
   3619    }
   3620 
   3621    case Ain_SseLdzLO:
   3622       vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
   3623       /* movs[sd] amode, %xmm-dst */
   3624       *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
   3625       *p++ = clearWBit(
   3626              rexAMode_M_enc(vregEnc3210(i->Ain.SseLdzLO.reg),
   3627                             i->Ain.SseLdzLO.addr));
   3628       *p++ = 0x0F;
   3629       *p++ = 0x10;
   3630       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdzLO.reg),
   3631                            i->Ain.SseLdzLO.addr);
   3632       goto done;
   3633 
   3634    case Ain_Sse32Fx4:
   3635       xtra = 0;
   3636       *p++ = clearWBit(
   3637              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32Fx4.dst),
   3638                                  vregEnc3210(i->Ain.Sse32Fx4.src) ));
   3639       *p++ = 0x0F;
   3640       switch (i->Ain.Sse32Fx4.op) {
   3641          case Asse_ADDF:   *p++ = 0x58; break;
   3642          case Asse_DIVF:   *p++ = 0x5E; break;
   3643          case Asse_MAXF:   *p++ = 0x5F; break;
   3644          case Asse_MINF:   *p++ = 0x5D; break;
   3645          case Asse_MULF:   *p++ = 0x59; break;
   3646          case Asse_RCPF:   *p++ = 0x53; break;
   3647          case Asse_RSQRTF: *p++ = 0x52; break;
   3648          case Asse_SQRTF:  *p++ = 0x51; break;
   3649          case Asse_SUBF:   *p++ = 0x5C; break;
   3650          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3651          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3652          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3653          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3654          default: goto bad;
   3655       }
   3656       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32Fx4.dst),
   3657                                vregEnc3210(i->Ain.Sse32Fx4.src) );
   3658       if (xtra & 0x100)
   3659          *p++ = toUChar(xtra & 0xFF);
   3660       goto done;
   3661 
   3662    case Ain_Sse64Fx2:
   3663       xtra = 0;
   3664       *p++ = 0x66;
   3665       *p++ = clearWBit(
   3666              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64Fx2.dst),
   3667                                  vregEnc3210(i->Ain.Sse64Fx2.src) ));
   3668       *p++ = 0x0F;
   3669       switch (i->Ain.Sse64Fx2.op) {
   3670          case Asse_ADDF:   *p++ = 0x58; break;
   3671          case Asse_DIVF:   *p++ = 0x5E; break;
   3672          case Asse_MAXF:   *p++ = 0x5F; break;
   3673          case Asse_MINF:   *p++ = 0x5D; break;
   3674          case Asse_MULF:   *p++ = 0x59; break;
   3675          case Asse_SQRTF:  *p++ = 0x51; break;
   3676          case Asse_SUBF:   *p++ = 0x5C; break;
   3677          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3678          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3679          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3680          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3681          default: goto bad;
   3682       }
   3683       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64Fx2.dst),
   3684                                vregEnc3210(i->Ain.Sse64Fx2.src) );
   3685       if (xtra & 0x100)
   3686          *p++ = toUChar(xtra & 0xFF);
   3687       goto done;
   3688 
   3689    case Ain_Sse32FLo:
   3690       xtra = 0;
   3691       *p++ = 0xF3;
   3692       *p++ = clearWBit(
   3693              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32FLo.dst),
   3694                                  vregEnc3210(i->Ain.Sse32FLo.src) ));
   3695       *p++ = 0x0F;
   3696       switch (i->Ain.Sse32FLo.op) {
   3697          case Asse_ADDF:   *p++ = 0x58; break;
   3698          case Asse_DIVF:   *p++ = 0x5E; break;
   3699          case Asse_MAXF:   *p++ = 0x5F; break;
   3700          case Asse_MINF:   *p++ = 0x5D; break;
   3701          case Asse_MULF:   *p++ = 0x59; break;
   3702          case Asse_RCPF:   *p++ = 0x53; break;
   3703          case Asse_RSQRTF: *p++ = 0x52; break;
   3704          case Asse_SQRTF:  *p++ = 0x51; break;
   3705          case Asse_SUBF:   *p++ = 0x5C; break;
   3706          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3707          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3708          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3709          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3710          default: goto bad;
   3711       }
   3712       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32FLo.dst),
   3713                                vregEnc3210(i->Ain.Sse32FLo.src) );
   3714       if (xtra & 0x100)
   3715          *p++ = toUChar(xtra & 0xFF);
   3716       goto done;
   3717 
   3718    case Ain_Sse64FLo:
   3719       xtra = 0;
   3720       *p++ = 0xF2;
   3721       *p++ = clearWBit(
   3722              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64FLo.dst),
   3723                                  vregEnc3210(i->Ain.Sse64FLo.src) ));
   3724       *p++ = 0x0F;
   3725       switch (i->Ain.Sse64FLo.op) {
   3726          case Asse_ADDF:   *p++ = 0x58; break;
   3727          case Asse_DIVF:   *p++ = 0x5E; break;
   3728          case Asse_MAXF:   *p++ = 0x5F; break;
   3729          case Asse_MINF:   *p++ = 0x5D; break;
   3730          case Asse_MULF:   *p++ = 0x59; break;
   3731          case Asse_SQRTF:  *p++ = 0x51; break;
   3732          case Asse_SUBF:   *p++ = 0x5C; break;
   3733          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3734          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3735          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3736          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3737          default: goto bad;
   3738       }
   3739       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64FLo.dst),
   3740                                vregEnc3210(i->Ain.Sse64FLo.src) );
   3741       if (xtra & 0x100)
   3742          *p++ = toUChar(xtra & 0xFF);
   3743       goto done;
   3744 
   3745    case Ain_SseReRg:
   3746 #     define XX(_n) *p++ = (_n)
   3747 
   3748       rex = clearWBit(
   3749             rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseReRg.dst),
   3750                                 vregEnc3210(i->Ain.SseReRg.src) ));
   3751 
   3752       switch (i->Ain.SseReRg.op) {
   3753          case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
   3754          case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
   3755          case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
   3756          case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
   3757          case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
   3758          case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
   3759          case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
   3760          case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
   3761          case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
   3762          case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
   3763          case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
   3764          case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
   3765          case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
   3766          case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
   3767          case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
   3768          case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
   3769          case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
   3770          case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
   3771          case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
   3772          case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
   3773          case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
   3774          case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
   3775          case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
   3776          case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
   3777          case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
   3778          case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
   3779          case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
   3780          case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
   3781          case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
   3782          case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
   3783          case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
   3784          case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
   3785          case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
   3786          case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
   3787          case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
   3788          case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
   3789          case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
   3790          case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
   3791          case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
   3792          case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
   3793          case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
   3794          case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
   3795          case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
   3796          case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
   3797          case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
   3798          case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
   3799          case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
   3800          case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
   3801          case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
   3802          case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
   3803          case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
   3804          case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
   3805          case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
   3806          case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
   3807          case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
   3808          default: goto bad;
   3809       }
   3810       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
   3811                                vregEnc3210(i->Ain.SseReRg.src) );
   3812 #     undef XX
   3813       goto done;
   3814 
   3815    case Ain_SseCMov:
   3816       /* jmp fwds if !condition */
   3817       *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
   3818       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
   3819       ptmp = p;
   3820 
   3821       /* movaps %src, %dst */
   3822       *p++ = clearWBit(
   3823              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseCMov.dst),
   3824                                  vregEnc3210(i->Ain.SseCMov.src) ));
   3825       *p++ = 0x0F;
   3826       *p++ = 0x28;
   3827       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseCMov.dst),
   3828                                vregEnc3210(i->Ain.SseCMov.src) );
   3829 
   3830       /* Fill in the jump offset. */
   3831       *(ptmp-1) = toUChar(p - ptmp);
   3832       goto done;
   3833 
   3834    case Ain_SseShuf:
   3835       *p++ = 0x66;
   3836       *p++ = clearWBit(
   3837              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseShuf.dst),
   3838                                  vregEnc3210(i->Ain.SseShuf.src) ));
   3839       *p++ = 0x0F;
   3840       *p++ = 0x70;
   3841       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseShuf.dst),
   3842                                vregEnc3210(i->Ain.SseShuf.src) );
   3843       *p++ = (UChar)(i->Ain.SseShuf.order);
   3844       goto done;
   3845 
   3846    //uu case Ain_AvxLdSt: {
   3847    //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
   3848    //uu                           i->Ain.AvxLdSt.addr );
   3849    //uu    p = emitVexPrefix(p, vex);
   3850    //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
   3851    //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
   3852    //uu      goto done;
   3853    //uu }
   3854 
   3855    case Ain_EvCheck: {
   3856       /* We generate:
   3857             (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
   3858             (2 bytes)  jns  nofail     expected taken
   3859             (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
   3860             nofail:
   3861       */
   3862       /* This is heavily asserted re instruction lengths.  It needs to
   3863          be.  If we are given unexpected forms of .amCounter or
   3864          .amFailAddr -- basically, anything that's not of the form
   3865          uimm7(%rbp) -- the assertions below are likely to fail. */
   3866       /* Note also that after the decl we must be very careful not to
   3867          read the carry flag, else we get a partial flags stall.
   3868          js/jns avoids that, though. */
   3869       UChar* p0 = p;
   3870       /* ---  decl 8(%rbp) --- */
   3871       /* Need to compute the REX byte for the decl in order to prove
   3872          that we don't need it, since this is a 32-bit dec and all
   3873          registers involved in the amode are < r8.  "1" because
   3874          there's no register in this encoding; instead the register
   3875          field is used as a sub opcode.  The encoding for "decl r/m32"
   3876          is FF /1, hence the "1". */
   3877       rex = clearWBit(rexAMode_M_enc(1, i->Ain.EvCheck.amCounter));
   3878       if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
   3879       *p++ = 0xFF;
   3880       p = doAMode_M_enc(p, 1, i->Ain.EvCheck.amCounter);
   3881       vassert(p - p0 == 3);
   3882       /* --- jns nofail --- */
   3883       *p++ = 0x79;
   3884       *p++ = 0x03; /* need to check this 0x03 after the next insn */
   3885       vassert(p - p0 == 5);
   3886       /* --- jmp* 0(%rbp) --- */
   3887       /* Once again, verify we don't need REX.  The encoding is FF /4.
   3888          We don't need REX.W since by default FF /4 in 64-bit mode
   3889          implies a 64 bit load. */
   3890       rex = clearWBit(rexAMode_M_enc(4, i->Ain.EvCheck.amFailAddr));
   3891       if (rex != 0x40) goto bad;
   3892       *p++ = 0xFF;
   3893       p = doAMode_M_enc(p, 4, i->Ain.EvCheck.amFailAddr);
   3894       vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
   3895       /* And crosscheck .. */
   3896       vassert(evCheckSzB_AMD64() == 8);
   3897       goto done;
   3898    }
   3899 
   3900    case Ain_ProfInc: {
   3901       /* We generate   movabsq $0, %r11
   3902                        incq (%r11)
   3903          in the expectation that a later call to LibVEX_patchProfCtr
   3904          will be used to fill in the immediate field once the right
   3905          value is known.
   3906          49 BB 00 00 00 00 00 00 00 00
   3907          49 FF 03
   3908       */
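      /* Decoding the fixed bytes: 49 is REX.W+REX.B, BB+reg is movabsq
         imm64 into %r11, and 49 FF with ModRM 03 (i.e. FF /0) is
         incq (%r11). */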
   3909       *p++ = 0x49; *p++ = 0xBB;
   3910       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
   3911       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
   3912       *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
   3913       /* Tell the caller .. */
   3914       vassert(!(*is_profInc));
   3915       *is_profInc = True;
   3916       goto done;
   3917    }
   3918 
   3919    default:
   3920       goto bad;
   3921    }
   3922 
   3923   bad:
   3924    ppAMD64Instr(i, mode64);
   3925    vpanic("emit_AMD64Instr");
   3926    /*NOTREACHED*/
   3927 
   3928   done:
   3929    vassert(p - &buf[0] <= 64);
   3930    return p - &buf[0];
   3931 }
   3932 
   3933 
   3934 /* How big is an event check?  See case for Ain_EvCheck in
   3935    emit_AMD64Instr just above.  That crosschecks what this returns, so
   3936    we can tell if we're inconsistent. */
   3937 Int evCheckSzB_AMD64 (void)
   3938 {
   3939    return 8;
   3940 }
   3941 
   3942 
   3943 /* NB: what goes on here has to be very closely coordinated with the
   3944    emitInstr case for XDirect, above. */
   3945 VexInvalRange chainXDirect_AMD64 ( VexEndness endness_host,
   3946                                    void* place_to_chain,
   3947                                    const void* disp_cp_chain_me_EXPECTED,
   3948                                    const void* place_to_jump_to )
   3949 {
   3950    vassert(endness_host == VexEndnessLE);
   3951 
   3952    /* What we're expecting to see is:
   3953         movabsq $disp_cp_chain_me_EXPECTED, %r11
   3954         call *%r11
   3955       viz
   3956         49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
   3957         41 FF D3
   3958    */
   3959    UChar* p = (UChar*)place_to_chain;
   3960    vassert(p[0] == 0x49);
   3961    vassert(p[1] == 0xBB);
   3962    vassert(read_misaligned_ULong_LE(&p[2]) == (Addr)disp_cp_chain_me_EXPECTED);
   3963    vassert(p[10] == 0x41);
   3964    vassert(p[11] == 0xFF);
   3965    vassert(p[12] == 0xD3);
   3966    /* And what we want to change it to is either:
   3967         (general case):
   3968           movabsq $place_to_jump_to, %r11
   3969           jmpq *%r11
   3970         viz
   3971           49 BB <8 bytes value == place_to_jump_to>
   3972           41 FF E3
   3973         So it's the same length (convenient, huh) and we don't
   3974         need to change all the bits.
   3975       ---OR---
   3976         in the case where the displacement falls within 32 bits
   3977           jmpq disp32   where disp32 is relative to the next insn
   3978           ud2; ud2; ud2; ud2
   3979         viz
   3980           E9 <4 bytes == disp32>
   3981           0F 0B 0F 0B 0F 0B 0F 0B
   3982 
   3983       In both cases the replacement has the same length as the original.
   3984       To remain sane & verifiable, we
   3985       (1) limit the displacement for the short form to
   3986           (say) +/- one billion, so as to avoid wraparound
   3987           off-by-ones, and
   3988       (2) even if the short form is applicable, once every (say)
   3989           1024 times use the long form anyway, so as to maintain
   3990           verifiability.
   3991    */
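   /* Worked example (hypothetical addresses only): if place_to_chain is at
      0x1000 and place_to_jump_to is at 0x2000, the delta computed below is
      0x2000 - 0x1000 - 5 = 0xFFB, which fits easily in 32 bits, so the
      short E9 form is normally usable. */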
   3992    /* This is the delta we need to put into a JMP d32 insn.  It's
   3993       relative to the start of the next insn, hence the -5.  */
   3994    Long delta   = (Long)((const UChar *)place_to_jump_to - (const UChar*)p) - 5;
   3995    Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
   3996 
   3997    static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
   3998    if (shortOK) {
   3999       shortCTR++; // thread safety bleh
   4000       if (0 == (shortCTR & 0x3FF)) {
   4001          shortOK = False;
   4002          if (0)
   4003             vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
   4004                        "using long jmp\n", shortCTR);
   4005       }
   4006    }
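   /* Hence, with shortCTR starting from zero, the 1024th, 2048th, ...
      eligible chainings take the long movabsq/jmpq form even though the
      short form would have fitted. */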
   4007 
   4008    /* And make the modifications. */
   4009    if (shortOK) {
   4010       p[0]  = 0xE9;
   4011       write_misaligned_UInt_LE(&p[1], (UInt)(Int)delta);
   4012       p[5]  = 0x0F; p[6]  = 0x0B;
   4013       p[7]  = 0x0F; p[8]  = 0x0B;
   4014       p[9]  = 0x0F; p[10] = 0x0B;
   4015       p[11] = 0x0F; p[12] = 0x0B;
   4016       /* sanity check on the delta -- top 32 bits are all 0 or all 1 */
   4017       delta >>= 32;
   4018       vassert(delta == 0LL || delta == -1LL);
   4019    } else {
   4020       /* Minimal modifications from the starting sequence. */
   4021       write_misaligned_ULong_LE(&p[2], (ULong)(Addr)place_to_jump_to);
   4022       p[12] = 0xE3;
   4023    }
   4024    VexInvalRange vir = { (HWord)place_to_chain, 13 };
   4025    return vir;
   4026 }
   4027 
   4028 
   4029 /* NB: what goes on here has to be very closely coordinated with the
   4030    emitInstr case for XDirect, above. */
   4031 VexInvalRange unchainXDirect_AMD64 ( VexEndness endness_host,
   4032                                      void* place_to_unchain,
   4033                                      const void* place_to_jump_to_EXPECTED,
   4034                                      const void* disp_cp_chain_me )
   4035 {
   4036    vassert(endness_host == VexEndnessLE);
   4037 
   4038    /* What we're expecting to see is either:
   4039         (general case)
   4040           movabsq $place_to_jump_to_EXPECTED, %r11
   4041           jmpq *%r11
   4042         viz
   4043           49 BB <8 bytes value == place_to_jump_to_EXPECTED>
   4044           41 FF E3
   4045       ---OR---
   4046         in the case where the displacement falls within 32 bits
   4047           jmpq d32
   4048           ud2; ud2; ud2; ud2
   4049         viz
   4050           E9 <4 bytes == disp32>
   4051           0F 0B 0F 0B 0F 0B 0F 0B
   4052    */
   4053    UChar* p     = (UChar*)place_to_unchain;
   4054    Bool   valid = False;
   4055    if (p[0] == 0x49 && p[1] == 0xBB
   4056        && read_misaligned_ULong_LE(&p[2])
   4057           == (ULong)(Addr)place_to_jump_to_EXPECTED
   4058        && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
   4059       /* it's the long form */
   4060       valid = True;
   4061    }
   4062    else
   4063    if (p[0] == 0xE9
   4064        && p[5]  == 0x0F && p[6]  == 0x0B
   4065        && p[7]  == 0x0F && p[8]  == 0x0B
   4066        && p[9]  == 0x0F && p[10] == 0x0B
   4067        && p[11] == 0x0F && p[12] == 0x0B) {
   4068       /* It's the short form.  Check the offset is right. */
   4069       Int  s32 = (Int)read_misaligned_UInt_LE(&p[1]);
   4070       Long s64 = (Long)s32;
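      /* The widening goes via Int so the disp32 is sign-extended; for
         example, a stored disp32 of 0xFFFFFFFB denotes a displacement of
         -5, i.e. a jump back to the E9 byte itself. */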
   4071       if ((UChar*)p + 5 + s64 == place_to_jump_to_EXPECTED) {
   4072          valid = True;
   4073          if (0)
   4074             vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
   4075       }
   4076    }
   4077    vassert(valid);
   4078    /* And what we want to change it to is:
   4079         movabsq $disp_cp_chain_me, %r11
   4080         call *%r11
   4081       viz
   4082         49 BB <8 bytes value == disp_cp_chain_me>
   4083         41 FF D3
   4084       So it's the same length (convenient, huh).
   4085    */
   4086    p[0] = 0x49;
   4087    p[1] = 0xBB;
   4088    write_misaligned_ULong_LE(&p[2], (ULong)(Addr)disp_cp_chain_me);
   4089    p[10] = 0x41;
   4090    p[11] = 0xFF;
   4091    p[12] = 0xD3;
   4092    VexInvalRange vir = { (HWord)place_to_unchain, 13 };
   4093    return vir;
   4094 }
   4095 
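/* Purely illustrative sketch, not part of VEX and hence disabled: a
   minimal round trip through the two routines above, assuming a 13-byte
   scratch buffer that already holds the movabsq $X, %r11 ; call *%r11
   sequence which emit_AMD64Instr produces for an XDirect whose chain-me
   address is X.  All addresses here are made up. */
#if 0
static void illustrate_chain_roundtrip ( void )
{
   UChar buf[13];
   ULong chain_me = 0x1122334455667788ULL;   /* hypothetical address */
   /* Build the unchained form: 49 BB <chain_me> 41 FF D3. */
   buf[0] = 0x49; buf[1] = 0xBB;
   write_misaligned_ULong_LE(&buf[2], chain_me);
   buf[10] = 0x41; buf[11] = 0xFF; buf[12] = 0xD3;
   /* Chain it to itself (the delta fits in 32 bits, so this normally
      takes the short E9 form), then undo the chaining again. */
   (void)chainXDirect_AMD64(VexEndnessLE, buf, (void*)(Addr)chain_me, buf);
   (void)unchainXDirect_AMD64(VexEndnessLE, buf, buf, (void*)(Addr)chain_me);
   /* buf once again holds 49 BB <chain_me> 41 FF D3. */
}
#endif
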
   4096 
   4097 /* Patch the counter address into a profile inc point, as previously
   4098    created by the Ain_ProfInc case for emit_AMD64Instr. */
   4099 VexInvalRange patchProfInc_AMD64 ( VexEndness endness_host,
   4100                                    void*  place_to_patch,
   4101                                    const ULong* location_of_counter )
   4102 {
   4103    vassert(endness_host == VexEndnessLE);
   4104    vassert(sizeof(ULong*) == 8);
   4105    UChar* p = (UChar*)place_to_patch;
   4106    vassert(p[0] == 0x49);
   4107    vassert(p[1] == 0xBB);
   4108    vassert(p[2] == 0x00);
   4109    vassert(p[3] == 0x00);
   4110    vassert(p[4] == 0x00);
   4111    vassert(p[5] == 0x00);
   4112    vassert(p[6] == 0x00);
   4113    vassert(p[7] == 0x00);
   4114    vassert(p[8] == 0x00);
   4115    vassert(p[9] == 0x00);
   4116    vassert(p[10] == 0x49);
   4117    vassert(p[11] == 0xFF);
   4118    vassert(p[12] == 0x03);
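   /* For example, with a (hypothetical) counter address of
      0x1122334455667788, the stores below leave p[2..9] holding
      88 77 66 55 44 33 22 11, i.e. the immediate in little-endian order. */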
   4119    ULong imm64 = (ULong)(Addr)location_of_counter;
   4120    p[2] = imm64 & 0xFF; imm64 >>= 8;
   4121    p[3] = imm64 & 0xFF; imm64 >>= 8;
   4122    p[4] = imm64 & 0xFF; imm64 >>= 8;
   4123    p[5] = imm64 & 0xFF; imm64 >>= 8;
   4124    p[6] = imm64 & 0xFF; imm64 >>= 8;
   4125    p[7] = imm64 & 0xFF; imm64 >>= 8;
   4126    p[8] = imm64 & 0xFF; imm64 >>= 8;
   4127    p[9] = imm64 & 0xFF; imm64 >>= 8;
   4128    VexInvalRange vir = { (HWord)place_to_patch, 13 };
   4129    return vir;
   4130 }
   4131 
   4132 
   4133 /*---------------------------------------------------------------*/
   4134 /*--- end                                   host_amd64_defs.c ---*/
   4135 /*---------------------------------------------------------------*/
   4136