      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                                 host_amd64_defs.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2015 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "libvex.h"
     38 #include "libvex_trc_values.h"
     39 
     40 #include "main_util.h"
     41 #include "host_generic_regs.h"
     42 #include "host_amd64_defs.h"
     43 
     44 
     45 /* --------- Registers. --------- */
     46 
     47 const RRegUniverse* getRRegUniverse_AMD64 ( void )
     48 {
     49    /* The real-register universe is a big constant, so we just want to
     50       initialise it once. */
     51    static RRegUniverse rRegUniverse_AMD64;
     52    static Bool         rRegUniverse_AMD64_initted = False;
     53 
     54    /* Handy shorthand, nothing more */
     55    RRegUniverse* ru = &rRegUniverse_AMD64;
     56 
     57    /* This isn't thread-safe.  Sigh. */
     58    if (LIKELY(rRegUniverse_AMD64_initted))
     59       return ru;
     60 
     61    RRegUniverse__init(ru);
     62 
     63    /* Add the registers.  The initial segment of this array must be
     64       those available for allocation by reg-alloc, and those that
     65       follow are not available for allocation. */
     66    ru->regs[ru->size++] = hregAMD64_RSI();
     67    ru->regs[ru->size++] = hregAMD64_RDI();
     68    ru->regs[ru->size++] = hregAMD64_R8();
     69    ru->regs[ru->size++] = hregAMD64_R9();
     70    ru->regs[ru->size++] = hregAMD64_R12();
     71    ru->regs[ru->size++] = hregAMD64_R13();
     72    ru->regs[ru->size++] = hregAMD64_R14();
     73    ru->regs[ru->size++] = hregAMD64_R15();
     74    ru->regs[ru->size++] = hregAMD64_RBX();
     75    ru->regs[ru->size++] = hregAMD64_XMM3();
     76    ru->regs[ru->size++] = hregAMD64_XMM4();
     77    ru->regs[ru->size++] = hregAMD64_XMM5();
     78    ru->regs[ru->size++] = hregAMD64_XMM6();
     79    ru->regs[ru->size++] = hregAMD64_XMM7();
     80    ru->regs[ru->size++] = hregAMD64_XMM8();
     81    ru->regs[ru->size++] = hregAMD64_XMM9();
     82    ru->regs[ru->size++] = hregAMD64_XMM10();
     83    ru->regs[ru->size++] = hregAMD64_XMM11();
     84    ru->regs[ru->size++] = hregAMD64_XMM12();
     85    ru->regs[ru->size++] = hregAMD64_R10();
     86    ru->allocable = ru->size;
     87    /* And other regs, not available to the allocator. */
     88    ru->regs[ru->size++] = hregAMD64_RAX();
     89    ru->regs[ru->size++] = hregAMD64_RCX();
     90    ru->regs[ru->size++] = hregAMD64_RDX();
     91    ru->regs[ru->size++] = hregAMD64_RSP();
     92    ru->regs[ru->size++] = hregAMD64_RBP();
     93    ru->regs[ru->size++] = hregAMD64_R11();
     94    ru->regs[ru->size++] = hregAMD64_XMM0();
     95    ru->regs[ru->size++] = hregAMD64_XMM1();
     96 
     97    rRegUniverse_AMD64_initted = True;
     98 
     99    RRegUniverse__check_is_sane(ru);
    100    return ru;
    101 }
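/* A minimal usage sketch (assuming only the RRegUniverse fields used
   above: regs[], allocable, size): entries regs[0 .. allocable-1] are
   the ones offered to the register allocator, and
   regs[allocable .. size-1] are not.

      const RRegUniverse* ru = getRRegUniverse_AMD64();
      for (UInt j = 0; j < ru->allocable; j++) {
         ppHReg(ru->regs[j]);        // %rsi, %rdi, ..., %xmm12, %r10
         vex_printf("\n");
      }
*/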
    102 
    103 
    104 void ppHRegAMD64 ( HReg reg )
    105 {
    106    Int r;
    107    static const HChar* ireg64_names[16]
    108      = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
    109          "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
    110    /* Be generic for all virtual regs. */
    111    if (hregIsVirtual(reg)) {
    112       ppHReg(reg);
    113       return;
    114    }
    115    /* But specific for real regs. */
    116    switch (hregClass(reg)) {
    117       case HRcInt64:
    118          r = hregEncoding(reg);
    119          vassert(r >= 0 && r < 16);
    120          vex_printf("%s", ireg64_names[r]);
    121          return;
    122       case HRcVec128:
    123          r = hregEncoding(reg);
    124          vassert(r >= 0 && r < 16);
    125          vex_printf("%%xmm%d", r);
    126          return;
    127       default:
    128          vpanic("ppHRegAMD64");
    129    }
    130 }
    131 
    132 static void ppHRegAMD64_lo32 ( HReg reg )
    133 {
    134    Int r;
    135    static const HChar* ireg32_names[16]
    136      = { "%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
    137          "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
    138    /* Be generic for all virtual regs. */
    139    if (hregIsVirtual(reg)) {
    140       ppHReg(reg);
    141       vex_printf("d");
    142       return;
    143    }
    144    /* But specific for real regs. */
    145    switch (hregClass(reg)) {
    146       case HRcInt64:
    147          r = hregEncoding(reg);
    148          vassert(r >= 0 && r < 16);
    149          vex_printf("%s", ireg32_names[r]);
    150          return;
    151       default:
    152          vpanic("ppHRegAMD64_lo32: invalid regclass");
    153    }
    154 }
    155 
    156 
    157 /* --------- Condition codes, Intel encoding. --------- */
    158 
    159 const HChar* showAMD64CondCode ( AMD64CondCode cond )
    160 {
    161    switch (cond) {
    162       case Acc_O:      return "o";
    163       case Acc_NO:     return "no";
    164       case Acc_B:      return "b";
    165       case Acc_NB:     return "nb";
    166       case Acc_Z:      return "z";
    167       case Acc_NZ:     return "nz";
    168       case Acc_BE:     return "be";
    169       case Acc_NBE:    return "nbe";
    170       case Acc_S:      return "s";
    171       case Acc_NS:     return "ns";
    172       case Acc_P:      return "p";
    173       case Acc_NP:     return "np";
    174       case Acc_L:      return "l";
    175       case Acc_NL:     return "nl";
    176       case Acc_LE:     return "le";
    177       case Acc_NLE:    return "nle";
    178       case Acc_ALWAYS: return "ALWAYS";
    179       default: vpanic("ppAMD64CondCode");
    180    }
    181 }
    182 
    183 
    184 /* --------- AMD64AMode: memory address expressions. --------- */
    185 
    186 AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
    187    AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
    188    am->tag        = Aam_IR;
    189    am->Aam.IR.imm = imm32;
    190    am->Aam.IR.reg = reg;
    191    return am;
    192 }
    193 AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
    194    AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
    195    am->tag = Aam_IRRS;
    196    am->Aam.IRRS.imm   = imm32;
    197    am->Aam.IRRS.base  = base;
    198    am->Aam.IRRS.index = indEx;
    199    am->Aam.IRRS.shift = shift;
    200    vassert(shift >= 0 && shift <= 3);
    201    return am;
    202 }
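/* A small sketch of the two amode forms built by the constructors
   above; the trailing comments show how ppAMD64AMode (below) renders
   them.

      AMD64AMode* a1 = AMD64AMode_IR(0x30, hregAMD64_RBP());
      // renders as:  0x30(%rbp)
      AMD64AMode* a2 = AMD64AMode_IRRS(0x8, hregAMD64_RDI(),
                                       hregAMD64_RCX(), 3);
      // renders as:  0x8(%rdi,%rcx,8)    -- the scale is 1 << shift
*/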
    203 
    204 void ppAMD64AMode ( AMD64AMode* am ) {
    205    switch (am->tag) {
    206       case Aam_IR:
    207          if (am->Aam.IR.imm == 0)
    208             vex_printf("(");
    209          else
    210             vex_printf("0x%x(", am->Aam.IR.imm);
    211          ppHRegAMD64(am->Aam.IR.reg);
    212          vex_printf(")");
    213          return;
    214       case Aam_IRRS:
    215          vex_printf("0x%x(", am->Aam.IRRS.imm);
    216          ppHRegAMD64(am->Aam.IRRS.base);
    217          vex_printf(",");
    218          ppHRegAMD64(am->Aam.IRRS.index);
    219          vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
    220          return;
    221       default:
    222          vpanic("ppAMD64AMode");
    223    }
    224 }
    225 
    226 static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
    227    switch (am->tag) {
    228       case Aam_IR:
    229          addHRegUse(u, HRmRead, am->Aam.IR.reg);
    230          return;
    231       case Aam_IRRS:
    232          addHRegUse(u, HRmRead, am->Aam.IRRS.base);
    233          addHRegUse(u, HRmRead, am->Aam.IRRS.index);
    234          return;
    235       default:
    236          vpanic("addRegUsage_AMD64AMode");
    237    }
    238 }
    239 
    240 static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
    241    switch (am->tag) {
    242       case Aam_IR:
    243          am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
    244          return;
    245       case Aam_IRRS:
    246          am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
    247          am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
    248          return;
    249       default:
    250          vpanic("mapRegs_AMD64AMode");
    251    }
    252 }
    253 
    254 /* --------- Operand, which can be reg, immediate or memory. --------- */
    255 
    256 AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
    257    AMD64RMI* op       = LibVEX_Alloc_inline(sizeof(AMD64RMI));
    258    op->tag            = Armi_Imm;
    259    op->Armi.Imm.imm32 = imm32;
    260    return op;
    261 }
    262 AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
    263    AMD64RMI* op     = LibVEX_Alloc_inline(sizeof(AMD64RMI));
    264    op->tag          = Armi_Reg;
    265    op->Armi.Reg.reg = reg;
    266    return op;
    267 }
    268 AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
    269    AMD64RMI* op    = LibVEX_Alloc_inline(sizeof(AMD64RMI));
    270    op->tag         = Armi_Mem;
    271    op->Armi.Mem.am = am;
    272    return op;
    273 }
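/* A small sketch of the three RMI forms; an RMI is read-only and is
   typically used as the source operand of an Alu64R instruction
   (constructed later in this file).  The trailing comments show how
   ppAMD64RMI renders each one.

      AMD64RMI* op_i = AMD64RMI_Imm(0x7F);                     // $0x7f
      AMD64RMI* op_r = AMD64RMI_Reg(hregAMD64_RBX());          // %rbx
      AMD64RMI* op_m
         = AMD64RMI_Mem(AMD64AMode_IR(0, hregAMD64_RSI()));    // (%rsi)
*/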
    274 
    275 static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
    276    switch (op->tag) {
    277       case Armi_Imm:
    278          vex_printf("$0x%x", op->Armi.Imm.imm32);
    279          return;
    280       case Armi_Reg:
    281          if (lo32)
    282             ppHRegAMD64_lo32(op->Armi.Reg.reg);
    283          else
    284             ppHRegAMD64(op->Armi.Reg.reg);
    285          return;
    286       case Armi_Mem:
    287          ppAMD64AMode(op->Armi.Mem.am);
    288          return;
    289      default:
    290          vpanic("ppAMD64RMI");
    291    }
    292 }
    293 void ppAMD64RMI ( AMD64RMI* op ) {
    294    ppAMD64RMI_wrk(op, False/*!lo32*/);
    295 }
    296 void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
    297    ppAMD64RMI_wrk(op, True/*lo32*/);
    298 }
    299 
    300 /* An AMD64RMI can only be used in a "read" context (what would it mean
    301    to write or modify a literal?) and so we enumerate its registers
    302    accordingly. */
    303 static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
    304    switch (op->tag) {
    305       case Armi_Imm:
    306          return;
    307       case Armi_Reg:
    308          addHRegUse(u, HRmRead, op->Armi.Reg.reg);
    309          return;
    310       case Armi_Mem:
    311          addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
    312          return;
    313       default:
    314          vpanic("addRegUsage_AMD64RMI");
    315    }
    316 }
    317 
    318 static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
    319    switch (op->tag) {
    320       case Armi_Imm:
    321          return;
    322       case Armi_Reg:
    323          op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
    324          return;
    325       case Armi_Mem:
    326          mapRegs_AMD64AMode(m, op->Armi.Mem.am);
    327          return;
    328       default:
    329          vpanic("mapRegs_AMD64RMI");
    330    }
    331 }
    332 
    333 
    334 /* --------- Operand, which can be reg or immediate only. --------- */
    335 
    336 AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
    337    AMD64RI* op       = LibVEX_Alloc_inline(sizeof(AMD64RI));
    338    op->tag           = Ari_Imm;
    339    op->Ari.Imm.imm32 = imm32;
    340    return op;
    341 }
    342 AMD64RI* AMD64RI_Reg ( HReg reg ) {
    343    AMD64RI* op     = LibVEX_Alloc_inline(sizeof(AMD64RI));
    344    op->tag         = Ari_Reg;
    345    op->Ari.Reg.reg = reg;
    346    return op;
    347 }
    348 
    349 void ppAMD64RI ( AMD64RI* op ) {
    350    switch (op->tag) {
    351       case Ari_Imm:
    352          vex_printf("$0x%x", op->Ari.Imm.imm32);
    353          return;
    354       case Ari_Reg:
    355          ppHRegAMD64(op->Ari.Reg.reg);
    356          return;
    357      default:
    358          vpanic("ppAMD64RI");
    359    }
    360 }
    361 
    362 /* An AMD64RI can only be used in a "read" context (what would it mean
    363    to write or modify a literal?) and so we enumerate its registers
    364    accordingly. */
    365 static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
    366    switch (op->tag) {
    367       case Ari_Imm:
    368          return;
    369       case Ari_Reg:
    370          addHRegUse(u, HRmRead, op->Ari.Reg.reg);
    371          return;
    372       default:
    373          vpanic("addRegUsage_AMD64RI");
    374    }
    375 }
    376 
    377 static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
    378    switch (op->tag) {
    379       case Ari_Imm:
    380          return;
    381       case Ari_Reg:
    382          op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
    383          return;
    384       default:
    385          vpanic("mapRegs_AMD64RI");
    386    }
    387 }
    388 
    389 
    390 /* --------- Operand, which can be reg or memory only. --------- */
    391 
    392 AMD64RM* AMD64RM_Reg ( HReg reg ) {
    393    AMD64RM* op       = LibVEX_Alloc_inline(sizeof(AMD64RM));
    394    op->tag         = Arm_Reg;
    395    op->Arm.Reg.reg = reg;
    396    return op;
    397 }
    398 AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
    399    AMD64RM* op    = LibVEX_Alloc_inline(sizeof(AMD64RM));
    400    op->tag        = Arm_Mem;
    401    op->Arm.Mem.am = am;
    402    return op;
    403 }
    404 
    405 void ppAMD64RM ( AMD64RM* op ) {
    406    switch (op->tag) {
    407       case Arm_Mem:
    408          ppAMD64AMode(op->Arm.Mem.am);
    409          return;
    410       case Arm_Reg:
    411          ppHRegAMD64(op->Arm.Reg.reg);
    412          return;
    413      default:
    414          vpanic("ppAMD64RM");
    415    }
    416 }
    417 
    418 /* Because an AMD64RM can be both a source or destination operand, we
    419    have to supply a mode -- pertaining to the operand as a whole --
    420    indicating how it's being used. */
    421 static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
    422    switch (op->tag) {
    423       case Arm_Mem:
    424          /* Memory is read, written or modified.  So we just want to
    425             know the regs read by the amode. */
    426          addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
    427          return;
    428       case Arm_Reg:
    429          /* reg is read, written or modified.  Add it in the
    430             appropriate way. */
    431          addHRegUse(u, mode, op->Arm.Reg.reg);
    432          return;
    433      default:
    434          vpanic("addRegUsage_AMD64RM");
    435    }
    436 }
    437 
    438 static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
    439 {
    440    switch (op->tag) {
    441       case Arm_Mem:
    442          mapRegs_AMD64AMode(m, op->Arm.Mem.am);
    443          return;
    444       case Arm_Reg:
    445          op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
    446          return;
    447      default:
    448          vpanic("mapRegs_AMD64RM");
    449    }
    450 }
    451 
    452 
    453 /* --------- Instructions. --------- */
    454 
    455 static const HChar* showAMD64ScalarSz ( Int sz ) {
    456    switch (sz) {
    457       case 2: return "w";
    458       case 4: return "l";
    459       case 8: return "q";
    460       default: vpanic("showAMD64ScalarSz");
    461    }
    462 }
    463 
    464 const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
    465    switch (op) {
    466       case Aun_NOT: return "not";
    467       case Aun_NEG: return "neg";
    468       default: vpanic("showAMD64UnaryOp");
    469    }
    470 }
    471 
    472 const HChar* showAMD64AluOp ( AMD64AluOp op ) {
    473    switch (op) {
    474       case Aalu_MOV:  return "mov";
    475       case Aalu_CMP:  return "cmp";
    476       case Aalu_ADD:  return "add";
    477       case Aalu_SUB:  return "sub";
    478       case Aalu_ADC:  return "adc";
    479       case Aalu_SBB:  return "sbb";
    480       case Aalu_AND:  return "and";
    481       case Aalu_OR:   return "or";
    482       case Aalu_XOR:  return "xor";
    483       case Aalu_MUL:  return "imul";
    484       default: vpanic("showAMD64AluOp");
    485    }
    486 }
    487 
    488 const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
    489    switch (op) {
    490       case Ash_SHL: return "shl";
    491       case Ash_SHR: return "shr";
    492       case Ash_SAR: return "sar";
    493       default: vpanic("showAMD64ShiftOp");
    494    }
    495 }
    496 
    497 const HChar* showA87FpOp ( A87FpOp op ) {
    498    switch (op) {
    499       case Afp_SCALE:  return "scale";
    500       case Afp_ATAN:   return "atan";
    501       case Afp_YL2X:   return "yl2x";
    502       case Afp_YL2XP1: return "yl2xp1";
    503       case Afp_PREM:   return "prem";
    504       case Afp_PREM1:  return "prem1";
    505       case Afp_SQRT:   return "sqrt";
    506       case Afp_SIN:    return "sin";
    507       case Afp_COS:    return "cos";
    508       case Afp_TAN:    return "tan";
    509       case Afp_ROUND:  return "round";
    510       case Afp_2XM1:   return "2xm1";
    511       default: vpanic("showA87FpOp");
    512    }
    513 }
    514 
    515 const HChar* showAMD64SseOp ( AMD64SseOp op ) {
    516    switch (op) {
    517       case Asse_MOV:      return "movups";
    518       case Asse_ADDF:     return "add";
    519       case Asse_SUBF:     return "sub";
    520       case Asse_MULF:     return "mul";
    521       case Asse_DIVF:     return "div";
    522       case Asse_MAXF:     return "max";
    523       case Asse_MINF:     return "min";
    524       case Asse_CMPEQF:   return "cmpFeq";
    525       case Asse_CMPLTF:   return "cmpFlt";
    526       case Asse_CMPLEF:   return "cmpFle";
    527       case Asse_CMPUNF:   return "cmpFun";
    528       case Asse_RCPF:     return "rcp";
    529       case Asse_RSQRTF:   return "rsqrt";
    530       case Asse_SQRTF:    return "sqrt";
    531       case Asse_AND:      return "and";
    532       case Asse_OR:       return "or";
    533       case Asse_XOR:      return "xor";
    534       case Asse_ANDN:     return "andn";
    535       case Asse_ADD8:     return "paddb";
    536       case Asse_ADD16:    return "paddw";
    537       case Asse_ADD32:    return "paddd";
    538       case Asse_ADD64:    return "paddq";
    539       case Asse_QADD8U:   return "paddusb";
    540       case Asse_QADD16U:  return "paddusw";
    541       case Asse_QADD8S:   return "paddsb";
    542       case Asse_QADD16S:  return "paddsw";
    543       case Asse_SUB8:     return "psubb";
    544       case Asse_SUB16:    return "psubw";
    545       case Asse_SUB32:    return "psubd";
    546       case Asse_SUB64:    return "psubq";
    547       case Asse_QSUB8U:   return "psubusb";
    548       case Asse_QSUB16U:  return "psubusw";
    549       case Asse_QSUB8S:   return "psubsb";
    550       case Asse_QSUB16S:  return "psubsw";
    551       case Asse_MUL16:    return "pmullw";
    552       case Asse_MULHI16U: return "pmulhuw";
    553       case Asse_MULHI16S: return "pmulhw";
    554       case Asse_AVG8U:    return "pavgb";
    555       case Asse_AVG16U:   return "pavgw";
    556       case Asse_MAX16S:   return "pmaxw";
    557       case Asse_MAX8U:    return "pmaxub";
    558       case Asse_MIN16S:   return "pminw";
    559       case Asse_MIN8U:    return "pminub";
    560       case Asse_CMPEQ8:   return "pcmpeqb";
    561       case Asse_CMPEQ16:  return "pcmpeqw";
    562       case Asse_CMPEQ32:  return "pcmpeqd";
    563       case Asse_CMPGT8S:  return "pcmpgtb";
    564       case Asse_CMPGT16S: return "pcmpgtw";
    565       case Asse_CMPGT32S: return "pcmpgtd";
    566       case Asse_SHL16:    return "psllw";
    567       case Asse_SHL32:    return "pslld";
    568       case Asse_SHL64:    return "psllq";
    569       case Asse_SHR16:    return "psrlw";
    570       case Asse_SHR32:    return "psrld";
    571       case Asse_SHR64:    return "psrlq";
    572       case Asse_SAR16:    return "psraw";
    573       case Asse_SAR32:    return "psrad";
    574       case Asse_PACKSSD:  return "packssdw";
    575       case Asse_PACKSSW:  return "packsswb";
    576       case Asse_PACKUSW:  return "packuswb";
    577       case Asse_UNPCKHB:  return "punpckhb";
    578       case Asse_UNPCKHW:  return "punpckhw";
    579       case Asse_UNPCKHD:  return "punpckhd";
    580       case Asse_UNPCKHQ:  return "punpckhq";
    581       case Asse_UNPCKLB:  return "punpcklb";
    582       case Asse_UNPCKLW:  return "punpcklw";
    583       case Asse_UNPCKLD:  return "punpckld";
    584       case Asse_UNPCKLQ:  return "punpcklq";
    585       default: vpanic("showAMD64SseOp");
    586    }
    587 }
    588 
    589 AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
    590    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    591    i->tag             = Ain_Imm64;
    592    i->Ain.Imm64.imm64 = imm64;
    593    i->Ain.Imm64.dst   = dst;
    594    return i;
    595 }
    596 AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
    597    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    598    i->tag            = Ain_Alu64R;
    599    i->Ain.Alu64R.op  = op;
    600    i->Ain.Alu64R.src = src;
    601    i->Ain.Alu64R.dst = dst;
    602    return i;
    603 }
    604 AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
    605    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    606    i->tag            = Ain_Alu64M;
    607    i->Ain.Alu64M.op  = op;
    608    i->Ain.Alu64M.src = src;
    609    i->Ain.Alu64M.dst = dst;
    610    vassert(op != Aalu_MUL);
    611    return i;
    612 }
    613 AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
    614    AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    615    i->tag          = Ain_Sh64;
    616    i->Ain.Sh64.op  = op;
    617    i->Ain.Sh64.src = src;
    618    i->Ain.Sh64.dst = dst;
    619    return i;
    620 }
    621 AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
    622    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    623    i->tag              = Ain_Test64;
    624    i->Ain.Test64.imm32 = imm32;
    625    i->Ain.Test64.dst   = dst;
    626    return i;
    627 }
    628 AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
    629    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    630    i->tag             = Ain_Unary64;
    631    i->Ain.Unary64.op  = op;
    632    i->Ain.Unary64.dst = dst;
    633    return i;
    634 }
    635 AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
    636    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    637    i->tag             = Ain_Lea64;
    638    i->Ain.Lea64.am    = am;
    639    i->Ain.Lea64.dst   = dst;
    640    return i;
    641 }
    642 AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
    643    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    644    i->tag            = Ain_Alu32R;
    645    i->Ain.Alu32R.op  = op;
    646    i->Ain.Alu32R.src = src;
    647    i->Ain.Alu32R.dst = dst;
    648    switch (op) {
    649       case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
    650       case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
    651       default: vassert(0);
    652    }
    653    return i;
    654 }
    655 AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
    656    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    657    i->tag            = Ain_MulL;
    658    i->Ain.MulL.syned = syned;
    659    i->Ain.MulL.src   = src;
    660    return i;
    661 }
    662 AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
    663    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    664    i->tag            = Ain_Div;
    665    i->Ain.Div.syned  = syned;
    666    i->Ain.Div.sz     = sz;
    667    i->Ain.Div.src    = src;
    668    vassert(sz == 4 || sz == 8);
    669    return i;
    670 }
    671 AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
    672    AMD64Instr* i   = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    673    i->tag          = Ain_Push;
    674    i->Ain.Push.src = src;
    675    return i;
    676 }
    677 AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
    678                               RetLoc rloc ) {
    679    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    680    i->tag               = Ain_Call;
    681    i->Ain.Call.cond     = cond;
    682    i->Ain.Call.target   = target;
    683    i->Ain.Call.regparms = regparms;
    684    i->Ain.Call.rloc     = rloc;
    685    vassert(regparms >= 0 && regparms <= 6);
    686    vassert(is_sane_RetLoc(rloc));
    687    return i;
    688 }
    689 
    690 AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
    691                                  AMD64CondCode cond, Bool toFastEP ) {
    692    AMD64Instr* i           = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    693    i->tag                  = Ain_XDirect;
    694    i->Ain.XDirect.dstGA    = dstGA;
    695    i->Ain.XDirect.amRIP    = amRIP;
    696    i->Ain.XDirect.cond     = cond;
    697    i->Ain.XDirect.toFastEP = toFastEP;
    698    return i;
    699 }
    700 AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
    701                                 AMD64CondCode cond ) {
    702    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    703    i->tag              = Ain_XIndir;
    704    i->Ain.XIndir.dstGA = dstGA;
    705    i->Ain.XIndir.amRIP = amRIP;
    706    i->Ain.XIndir.cond  = cond;
    707    return i;
    708 }
    709 AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
    710                                    AMD64CondCode cond, IRJumpKind jk ) {
    711    AMD64Instr* i          = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    712    i->tag                 = Ain_XAssisted;
    713    i->Ain.XAssisted.dstGA = dstGA;
    714    i->Ain.XAssisted.amRIP = amRIP;
    715    i->Ain.XAssisted.cond  = cond;
    716    i->Ain.XAssisted.jk    = jk;
    717    return i;
    718 }
    719 
    720 AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, HReg src, HReg dst ) {
    721    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    722    i->tag             = Ain_CMov64;
    723    i->Ain.CMov64.cond = cond;
    724    i->Ain.CMov64.src  = src;
    725    i->Ain.CMov64.dst  = dst;
    726    vassert(cond != Acc_ALWAYS);
    727    return i;
    728 }
    729 AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
    730                                AMD64AMode* addr, HReg dst ) {
    731    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    732    i->tag            = Ain_CLoad;
    733    i->Ain.CLoad.cond = cond;
    734    i->Ain.CLoad.szB  = szB;
    735    i->Ain.CLoad.addr = addr;
    736    i->Ain.CLoad.dst  = dst;
    737    vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
    738    return i;
    739 }
    740 AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
    741                                 HReg src, AMD64AMode* addr ) {
    742    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    743    i->tag             = Ain_CStore;
    744    i->Ain.CStore.cond = cond;
    745    i->Ain.CStore.szB  = szB;
    746    i->Ain.CStore.src  = src;
    747    i->Ain.CStore.addr = addr;
    748    vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
    749    return i;
    750 }
    751 AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
    752    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    753    i->tag              = Ain_MovxLQ;
    754    i->Ain.MovxLQ.syned = syned;
    755    i->Ain.MovxLQ.src   = src;
    756    i->Ain.MovxLQ.dst   = dst;
    757    return i;
    758 }
    759 AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
    760                                 AMD64AMode* src, HReg dst ) {
    761    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    762    i->tag                = Ain_LoadEX;
    763    i->Ain.LoadEX.szSmall = szSmall;
    764    i->Ain.LoadEX.syned   = syned;
    765    i->Ain.LoadEX.src     = src;
    766    i->Ain.LoadEX.dst     = dst;
    767    vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
    768    return i;
    769 }
    770 AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
    771    AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    772    i->tag           = Ain_Store;
    773    i->Ain.Store.sz  = sz;
    774    i->Ain.Store.src = src;
    775    i->Ain.Store.dst = dst;
    776    vassert(sz == 1 || sz == 2 || sz == 4);
    777    return i;
    778 }
    779 AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
    780    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    781    i->tag            = Ain_Set64;
    782    i->Ain.Set64.cond = cond;
    783    i->Ain.Set64.dst  = dst;
    784    return i;
    785 }
    786 AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
    787    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    788    i->tag               = Ain_Bsfr64;
    789    i->Ain.Bsfr64.isFwds = isFwds;
    790    i->Ain.Bsfr64.src    = src;
    791    i->Ain.Bsfr64.dst    = dst;
    792    return i;
    793 }
    794 AMD64Instr* AMD64Instr_MFence ( void ) {
    795    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    796    i->tag        = Ain_MFence;
    797    return i;
    798 }
    799 AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
    800    AMD64Instr* i    = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    801    i->tag           = Ain_ACAS;
    802    i->Ain.ACAS.addr = addr;
    803    i->Ain.ACAS.sz   = sz;
    804    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
    805    return i;
    806 }
    807 AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
    808    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    809    i->tag            = Ain_DACAS;
    810    i->Ain.DACAS.addr = addr;
    811    i->Ain.DACAS.sz   = sz;
    812    vassert(sz == 8 || sz == 4);
    813    return i;
    814 }
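/* A hedged sketch of the register conventions implied by the printer
   further down: ACAS is a lock cmpxchg with the expected value in
   %rax and the new value in %rbx; DACAS is a lock cmpxchg8b/16b with
   the expected value in %rdx:%rax and the new value in %rcx:%rbx.

      AMD64Instr* cas
         = AMD64Instr_ACAS(AMD64AMode_IR(0, hregAMD64_RSI()), 8);
      // prints as:  lock cmpxchgq {%rax->%rbx},(%rsi)
      AMD64Instr* dcas
         = AMD64Instr_DACAS(AMD64AMode_IR(0, hregAMD64_RSI()), 8);
      // prints as:  lock cmpxchg16b {%rdx:%rax->%rcx:%rbx},(%rsi)
*/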
    815 
    816 AMD64Instr* AMD64Instr_A87Free ( Int nregs )
    817 {
    818    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    819    i->tag               = Ain_A87Free;
    820    i->Ain.A87Free.nregs = nregs;
    821    vassert(nregs >= 1 && nregs <= 7);
    822    return i;
    823 }
    824 AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
    825 {
    826    AMD64Instr* i            = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    827    i->tag                   = Ain_A87PushPop;
    828    i->Ain.A87PushPop.addr   = addr;
    829    i->Ain.A87PushPop.isPush = isPush;
    830    i->Ain.A87PushPop.szB    = szB;
    831    vassert(szB == 8 || szB == 4);
    832    return i;
    833 }
    834 AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
    835 {
    836    AMD64Instr* i     = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    837    i->tag            = Ain_A87FpOp;
    838    i->Ain.A87FpOp.op = op;
    839    return i;
    840 }
    841 AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
    842 {
    843    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    844    i->tag              = Ain_A87LdCW;
    845    i->Ain.A87LdCW.addr = addr;
    846    return i;
    847 }
    848 AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
    849 {
    850    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    851    i->tag              = Ain_A87StSW;
    852    i->Ain.A87StSW.addr = addr;
    853    return i;
    854 }
    855 AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
    856    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    857    i->tag                = Ain_LdMXCSR;
    858    i->Ain.LdMXCSR.addr   = addr;
    859    return i;
    860 }
    861 AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
    862    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    863    i->tag                = Ain_SseUComIS;
    864    i->Ain.SseUComIS.sz   = toUChar(sz);
    865    i->Ain.SseUComIS.srcL = srcL;
    866    i->Ain.SseUComIS.srcR = srcR;
    867    i->Ain.SseUComIS.dst  = dst;
    868    vassert(sz == 4 || sz == 8);
    869    return i;
    870 }
    871 AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
    872    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    873    i->tag              = Ain_SseSI2SF;
    874    i->Ain.SseSI2SF.szS = toUChar(szS);
    875    i->Ain.SseSI2SF.szD = toUChar(szD);
    876    i->Ain.SseSI2SF.src = src;
    877    i->Ain.SseSI2SF.dst = dst;
    878    vassert(szS == 4 || szS == 8);
    879    vassert(szD == 4 || szD == 8);
    880    return i;
    881 }
    882 AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
    883    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    884    i->tag              = Ain_SseSF2SI;
    885    i->Ain.SseSF2SI.szS = toUChar(szS);
    886    i->Ain.SseSF2SI.szD = toUChar(szD);
    887    i->Ain.SseSF2SI.src = src;
    888    i->Ain.SseSF2SI.dst = dst;
    889    vassert(szS == 4 || szS == 8);
    890    vassert(szD == 4 || szD == 8);
    891    return i;
    892 }
    893 AMD64Instr* AMD64Instr_SseSDSS   ( Bool from64, HReg src, HReg dst )
    894 {
    895    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    896    i->tag                = Ain_SseSDSS;
    897    i->Ain.SseSDSS.from64 = from64;
    898    i->Ain.SseSDSS.src    = src;
    899    i->Ain.SseSDSS.dst    = dst;
    900    return i;
    901 }
    902 AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
    903                                  HReg reg, AMD64AMode* addr ) {
    904    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    905    i->tag                = Ain_SseLdSt;
    906    i->Ain.SseLdSt.isLoad = isLoad;
    907    i->Ain.SseLdSt.sz     = toUChar(sz);
    908    i->Ain.SseLdSt.reg    = reg;
    909    i->Ain.SseLdSt.addr   = addr;
    910    vassert(sz == 4 || sz == 8 || sz == 16);
    911    return i;
    912 }
    913 AMD64Instr* AMD64Instr_SseCStore ( AMD64CondCode cond,
    914                                    HReg src, AMD64AMode* addr )
    915 {
    916    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    917    i->tag                = Ain_SseCStore;
    918    i->Ain.SseCStore.cond = cond;
    919    i->Ain.SseCStore.src  = src;
    920    i->Ain.SseCStore.addr = addr;
    921    vassert(cond != Acc_ALWAYS);
    922    return i;
    923 }
    924 AMD64Instr* AMD64Instr_SseCLoad ( AMD64CondCode cond,
    925                                   AMD64AMode* addr, HReg dst )
    926 {
    927    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    928    i->tag               = Ain_SseCLoad;
    929    i->Ain.SseCLoad.cond = cond;
    930    i->Ain.SseCLoad.addr = addr;
    931    i->Ain.SseCLoad.dst  = dst;
    932    vassert(cond != Acc_ALWAYS);
    933    return i;
    934 }
    935 AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
    936 {
    937    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    938    i->tag                = Ain_SseLdzLO;
    939    i->Ain.SseLdzLO.sz    = sz;
    940    i->Ain.SseLdzLO.reg   = reg;
    941    i->Ain.SseLdzLO.addr  = addr;
    942    vassert(sz == 4 || sz == 8);
    943    return i;
    944 }
    945 AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
    946    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    947    i->tag              = Ain_Sse32Fx4;
    948    i->Ain.Sse32Fx4.op  = op;
    949    i->Ain.Sse32Fx4.src = src;
    950    i->Ain.Sse32Fx4.dst = dst;
    951    vassert(op != Asse_MOV);
    952    return i;
    953 }
    954 AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
    955    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    956    i->tag              = Ain_Sse32FLo;
    957    i->Ain.Sse32FLo.op  = op;
    958    i->Ain.Sse32FLo.src = src;
    959    i->Ain.Sse32FLo.dst = dst;
    960    vassert(op != Asse_MOV);
    961    return i;
    962 }
    963 AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
    964    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    965    i->tag              = Ain_Sse64Fx2;
    966    i->Ain.Sse64Fx2.op  = op;
    967    i->Ain.Sse64Fx2.src = src;
    968    i->Ain.Sse64Fx2.dst = dst;
    969    vassert(op != Asse_MOV);
    970    return i;
    971 }
    972 AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
    973    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    974    i->tag              = Ain_Sse64FLo;
    975    i->Ain.Sse64FLo.op  = op;
    976    i->Ain.Sse64FLo.src = src;
    977    i->Ain.Sse64FLo.dst = dst;
    978    vassert(op != Asse_MOV);
    979    return i;
    980 }
    981 AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
    982    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    983    i->tag             = Ain_SseReRg;
    984    i->Ain.SseReRg.op  = op;
    985    i->Ain.SseReRg.src = re;
    986    i->Ain.SseReRg.dst = rg;
    987    return i;
    988 }
    989 AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
    990    AMD64Instr* i       = LibVEX_Alloc_inline(sizeof(AMD64Instr));
    991    i->tag              = Ain_SseCMov;
    992    i->Ain.SseCMov.cond = cond;
    993    i->Ain.SseCMov.src  = src;
    994    i->Ain.SseCMov.dst  = dst;
    995    vassert(cond != Acc_ALWAYS);
    996    return i;
    997 }
    998 AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
    999    AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   1000    i->tag               = Ain_SseShuf;
   1001    i->Ain.SseShuf.order = order;
   1002    i->Ain.SseShuf.src   = src;
   1003    i->Ain.SseShuf.dst   = dst;
   1004    vassert(order >= 0 && order <= 0xFF);
   1005    return i;
   1006 }
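/* A small sketch: SseShuf is printed as a pshufd, so 'order' is the
   usual 8-bit immediate holding four 2-bit lane selectors.  For
   example 0x1B (binary 00 01 10 11) reverses the four 32-bit lanes.

      HReg src = hregAMD64_XMM3(), dst = hregAMD64_XMM4();
      AMD64Instr* sh = AMD64Instr_SseShuf(0x1B, src, dst);
      // prints as:  pshufd $0x1b,%xmm3,%xmm4
*/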
   1007 //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
   1008 //uu                                  HReg reg, AMD64AMode* addr ) {
   1009 //uu    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   1010 //uu    i->tag                = Ain_AvxLdSt;
   1011 //uu    i->Ain.AvxLdSt.isLoad = isLoad;
   1012 //uu    i->Ain.AvxLdSt.reg    = reg;
   1013 //uu    i->Ain.AvxLdSt.addr   = addr;
   1014 //uu    return i;
   1015 //uu }
   1016 //uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
   1017 //uu    AMD64Instr* i      = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   1018 //uu    i->tag             = Ain_AvxReRg;
   1019 //uu    i->Ain.AvxReRg.op  = op;
   1020 //uu    i->Ain.AvxReRg.src = re;
   1021 //uu    i->Ain.AvxReRg.dst = rg;
   1022 //uu    return i;
   1023 //uu }
   1024 AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
   1025                                  AMD64AMode* amFailAddr ) {
   1026    AMD64Instr* i             = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   1027    i->tag                    = Ain_EvCheck;
   1028    i->Ain.EvCheck.amCounter  = amCounter;
   1029    i->Ain.EvCheck.amFailAddr = amFailAddr;
   1030    return i;
   1031 }
   1032 AMD64Instr* AMD64Instr_ProfInc ( void ) {
   1033    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   1034    i->tag        = Ain_ProfInc;
   1035    return i;
   1036 }
   1037 
   1038 void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
   1039 {
   1040    vassert(mode64 == True);
   1041    switch (i->tag) {
   1042       case Ain_Imm64:
   1043          vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
   1044          ppHRegAMD64(i->Ain.Imm64.dst);
   1045          return;
   1046       case Ain_Alu64R:
   1047          vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
   1048          ppAMD64RMI(i->Ain.Alu64R.src);
   1049          vex_printf(",");
   1050          ppHRegAMD64(i->Ain.Alu64R.dst);
   1051          return;
   1052       case Ain_Alu64M:
   1053          vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
   1054          ppAMD64RI(i->Ain.Alu64M.src);
   1055          vex_printf(",");
   1056          ppAMD64AMode(i->Ain.Alu64M.dst);
   1057          return;
   1058       case Ain_Sh64:
   1059          vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
   1060          if (i->Ain.Sh64.src == 0)
   1061             vex_printf("%%cl,");
   1062          else
   1063             vex_printf("$%d,", (Int)i->Ain.Sh64.src);
   1064          ppHRegAMD64(i->Ain.Sh64.dst);
   1065          return;
   1066       case Ain_Test64:
   1067          vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
   1068          ppHRegAMD64(i->Ain.Test64.dst);
   1069          return;
   1070       case Ain_Unary64:
   1071          vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
   1072          ppHRegAMD64(i->Ain.Unary64.dst);
   1073          return;
   1074       case Ain_Lea64:
   1075          vex_printf("leaq ");
   1076          ppAMD64AMode(i->Ain.Lea64.am);
   1077          vex_printf(",");
   1078          ppHRegAMD64(i->Ain.Lea64.dst);
   1079          return;
   1080       case Ain_Alu32R:
   1081          vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
   1082          ppAMD64RMI_lo32(i->Ain.Alu32R.src);
   1083          vex_printf(",");
   1084          ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
   1085          return;
   1086       case Ain_MulL:
   1087          vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
   1088          ppAMD64RM(i->Ain.MulL.src);
   1089          return;
   1090       case Ain_Div:
   1091          vex_printf("%cdiv%s ",
   1092                     i->Ain.Div.syned ? 's' : 'u',
   1093                     showAMD64ScalarSz(i->Ain.Div.sz));
   1094          ppAMD64RM(i->Ain.Div.src);
   1095          return;
   1096       case Ain_Push:
   1097          vex_printf("pushq ");
   1098          ppAMD64RMI(i->Ain.Push.src);
   1099          return;
   1100       case Ain_Call:
   1101          vex_printf("call%s[%d,",
   1102                     i->Ain.Call.cond==Acc_ALWAYS
   1103                        ? "" : showAMD64CondCode(i->Ain.Call.cond),
   1104                     i->Ain.Call.regparms );
   1105          ppRetLoc(i->Ain.Call.rloc);
   1106          vex_printf("] 0x%llx", i->Ain.Call.target);
   1107          break;
   1108 
   1109       case Ain_XDirect:
   1110          vex_printf("(xDirect) ");
   1111          vex_printf("if (%%rflags.%s) { ",
   1112                     showAMD64CondCode(i->Ain.XDirect.cond));
   1113          vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
   1114          vex_printf("movq %%r11,");
   1115          ppAMD64AMode(i->Ain.XDirect.amRIP);
   1116          vex_printf("; ");
   1117          vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
   1118                     i->Ain.XDirect.toFastEP ? "fast" : "slow");
   1119          return;
   1120       case Ain_XIndir:
   1121          vex_printf("(xIndir) ");
   1122          vex_printf("if (%%rflags.%s) { ",
   1123                     showAMD64CondCode(i->Ain.XIndir.cond));
   1124          vex_printf("movq ");
   1125          ppHRegAMD64(i->Ain.XIndir.dstGA);
   1126          vex_printf(",");
   1127          ppAMD64AMode(i->Ain.XIndir.amRIP);
   1128          vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
   1129          return;
   1130       case Ain_XAssisted:
   1131          vex_printf("(xAssisted) ");
   1132          vex_printf("if (%%rflags.%s) { ",
   1133                     showAMD64CondCode(i->Ain.XAssisted.cond));
   1134          vex_printf("movq ");
   1135          ppHRegAMD64(i->Ain.XAssisted.dstGA);
   1136          vex_printf(",");
   1137          ppAMD64AMode(i->Ain.XAssisted.amRIP);
   1138          vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
   1139                     (Int)i->Ain.XAssisted.jk);
   1140          vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
   1141          return;
   1142 
   1143       case Ain_CMov64:
   1144          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
   1145          ppHRegAMD64(i->Ain.CMov64.src);
   1146          vex_printf(",");
   1147          ppHRegAMD64(i->Ain.CMov64.dst);
   1148          return;
   1149       case Ain_CLoad:
   1150          vex_printf("if (%%rflags.%s) { ",
   1151                     showAMD64CondCode(i->Ain.CLoad.cond));
   1152          vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
   1153          ppAMD64AMode(i->Ain.CLoad.addr);
   1154          vex_printf(", ");
   1155          (i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1156             (i->Ain.CLoad.dst);
   1157          vex_printf(" }");
   1158          return;
   1159       case Ain_CStore:
   1160          vex_printf("if (%%rflags.%s) { ",
   1161                     showAMD64CondCode(i->Ain.CStore.cond));
   1162          vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
   1163          (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1164             (i->Ain.CStore.src);
   1165          vex_printf(", ");
   1166          ppAMD64AMode(i->Ain.CStore.addr);
   1167          vex_printf(" }");
   1168          return;
   1169 
   1170       case Ain_MovxLQ:
   1171          vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
   1172          ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
   1173          vex_printf(",");
   1174          ppHRegAMD64(i->Ain.MovxLQ.dst);
   1175          return;
   1176       case Ain_LoadEX:
   1177          if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
   1178             vex_printf("movl ");
   1179             ppAMD64AMode(i->Ain.LoadEX.src);
   1180             vex_printf(",");
   1181             ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
   1182          } else {
   1183             vex_printf("mov%c%cq ",
   1184                        i->Ain.LoadEX.syned ? 's' : 'z',
   1185                        i->Ain.LoadEX.szSmall==1
   1186                           ? 'b'
   1187                           : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
   1188             ppAMD64AMode(i->Ain.LoadEX.src);
   1189             vex_printf(",");
   1190             ppHRegAMD64(i->Ain.LoadEX.dst);
   1191          }
   1192          return;
   1193       case Ain_Store:
   1194          vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
   1195                               : (i->Ain.Store.sz==2 ? 'w' : 'l'));
   1196          ppHRegAMD64(i->Ain.Store.src);
   1197          vex_printf(",");
   1198          ppAMD64AMode(i->Ain.Store.dst);
   1199          return;
   1200       case Ain_Set64:
   1201          vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
   1202          ppHRegAMD64(i->Ain.Set64.dst);
   1203          return;
   1204       case Ain_Bsfr64:
   1205          vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
   1206          ppHRegAMD64(i->Ain.Bsfr64.src);
   1207          vex_printf(",");
   1208          ppHRegAMD64(i->Ain.Bsfr64.dst);
   1209          return;
   1210       case Ain_MFence:
   1211          vex_printf("mfence" );
   1212          return;
   1213       case Ain_ACAS:
   1214          vex_printf("lock cmpxchg%c ",
   1215                      i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
   1216                      : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
   1217          vex_printf("{%%rax->%%rbx},");
   1218          ppAMD64AMode(i->Ain.ACAS.addr);
   1219          return;
   1220       case Ain_DACAS:
   1221          vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
   1222                     (Int)(2 * i->Ain.DACAS.sz));
   1223          ppAMD64AMode(i->Ain.DACAS.addr);
   1224          return;
   1225       case Ain_A87Free:
   1226          vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
   1227          break;
   1228       case Ain_A87PushPop:
   1229          vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
   1230                     i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
   1231          ppAMD64AMode(i->Ain.A87PushPop.addr);
   1232          break;
   1233       case Ain_A87FpOp:
   1234          vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
   1235          break;
   1236       case Ain_A87LdCW:
   1237          vex_printf("fldcw ");
   1238          ppAMD64AMode(i->Ain.A87LdCW.addr);
   1239          break;
   1240       case Ain_A87StSW:
   1241          vex_printf("fstsw ");
   1242          ppAMD64AMode(i->Ain.A87StSW.addr);
   1243          break;
   1244       case Ain_LdMXCSR:
   1245          vex_printf("ldmxcsr ");
   1246          ppAMD64AMode(i->Ain.LdMXCSR.addr);
   1247          break;
   1248       case Ain_SseUComIS:
   1249          vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
   1250          ppHRegAMD64(i->Ain.SseUComIS.srcL);
   1251          vex_printf(",");
   1252          ppHRegAMD64(i->Ain.SseUComIS.srcR);
   1253          vex_printf(" ; pushfq ; popq ");
   1254          ppHRegAMD64(i->Ain.SseUComIS.dst);
   1255          break;
   1256       case Ain_SseSI2SF:
   1257          vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
   1258          (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1259             (i->Ain.SseSI2SF.src);
   1260          vex_printf(",");
   1261          ppHRegAMD64(i->Ain.SseSI2SF.dst);
   1262          break;
   1263       case Ain_SseSF2SI:
   1264          vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
   1265          ppHRegAMD64(i->Ain.SseSF2SI.src);
   1266          vex_printf(",");
   1267          (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
   1268             (i->Ain.SseSF2SI.dst);
   1269          break;
   1270       case Ain_SseSDSS:
   1271          vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
   1272          ppHRegAMD64(i->Ain.SseSDSS.src);
   1273          vex_printf(",");
   1274          ppHRegAMD64(i->Ain.SseSDSS.dst);
   1275          break;
   1276       case Ain_SseLdSt:
   1277          switch (i->Ain.SseLdSt.sz) {
   1278             case 4:  vex_printf("movss "); break;
   1279             case 8:  vex_printf("movsd "); break;
   1280             case 16: vex_printf("movups "); break;
   1281             default: vassert(0);
   1282          }
   1283          if (i->Ain.SseLdSt.isLoad) {
   1284             ppAMD64AMode(i->Ain.SseLdSt.addr);
   1285             vex_printf(",");
   1286             ppHRegAMD64(i->Ain.SseLdSt.reg);
   1287          } else {
   1288             ppHRegAMD64(i->Ain.SseLdSt.reg);
   1289             vex_printf(",");
   1290             ppAMD64AMode(i->Ain.SseLdSt.addr);
   1291          }
   1292          return;
   1293       case Ain_SseCStore:
   1294          vex_printf("if (%%rflags.%s) { ",
   1295                     showAMD64CondCode(i->Ain.SseCStore.cond));
   1296          vex_printf("movups ");
   1297          ppHRegAMD64(i->Ain.SseCStore.src);
   1298          vex_printf(", ");
   1299          ppAMD64AMode(i->Ain.SseCStore.addr);
   1300          vex_printf(" }");
   1301          return;
   1302       case Ain_SseCLoad:
   1303          vex_printf("if (%%rflags.%s) { ",
   1304                     showAMD64CondCode(i->Ain.SseCLoad.cond));
   1305          vex_printf("movups ");
   1306          ppAMD64AMode(i->Ain.SseCLoad.addr);
   1307          vex_printf(", ");
   1308          ppHRegAMD64(i->Ain.SseCLoad.dst);
   1309          vex_printf(" }");
   1310          return;
   1311       case Ain_SseLdzLO:
   1312          vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
   1313          ppAMD64AMode(i->Ain.SseLdzLO.addr);
   1314          vex_printf(",");
   1315          ppHRegAMD64(i->Ain.SseLdzLO.reg);
   1316          return;
   1317       case Ain_Sse32Fx4:
   1318          vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
   1319          ppHRegAMD64(i->Ain.Sse32Fx4.src);
   1320          vex_printf(",");
   1321          ppHRegAMD64(i->Ain.Sse32Fx4.dst);
   1322          return;
   1323       case Ain_Sse32FLo:
   1324          vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
   1325          ppHRegAMD64(i->Ain.Sse32FLo.src);
   1326          vex_printf(",");
   1327          ppHRegAMD64(i->Ain.Sse32FLo.dst);
   1328          return;
   1329       case Ain_Sse64Fx2:
   1330          vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
   1331          ppHRegAMD64(i->Ain.Sse64Fx2.src);
   1332          vex_printf(",");
   1333          ppHRegAMD64(i->Ain.Sse64Fx2.dst);
   1334          return;
   1335       case Ain_Sse64FLo:
   1336          vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
   1337          ppHRegAMD64(i->Ain.Sse64FLo.src);
   1338          vex_printf(",");
   1339          ppHRegAMD64(i->Ain.Sse64FLo.dst);
   1340          return;
   1341       case Ain_SseReRg:
   1342          vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
   1343          ppHRegAMD64(i->Ain.SseReRg.src);
   1344          vex_printf(",");
   1345          ppHRegAMD64(i->Ain.SseReRg.dst);
   1346          return;
   1347       case Ain_SseCMov:
   1348          vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
   1349          ppHRegAMD64(i->Ain.SseCMov.src);
   1350          vex_printf(",");
   1351          ppHRegAMD64(i->Ain.SseCMov.dst);
   1352          return;
   1353       case Ain_SseShuf:
   1354          vex_printf("pshufd $0x%x,", (UInt)i->Ain.SseShuf.order);
   1355          ppHRegAMD64(i->Ain.SseShuf.src);
   1356          vex_printf(",");
   1357          ppHRegAMD64(i->Ain.SseShuf.dst);
   1358          return;
   1359       //uu case Ain_AvxLdSt:
   1360       //uu    vex_printf("vmovups ");
   1361       //uu    if (i->Ain.AvxLdSt.isLoad) {
   1362       //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
   1363       //uu       vex_printf(",");
   1364       //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
   1365       //uu    } else {
   1366       //uu       ppHRegAMD64(i->Ain.AvxLdSt.reg);
   1367       //uu       vex_printf(",");
   1368       //uu       ppAMD64AMode(i->Ain.AvxLdSt.addr);
   1369       //uu    }
   1370       //uu    return;
   1371       //uu case Ain_AvxReRg:
   1372       //uu    vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
   1373       //uu    ppHRegAMD64(i->Ain.AvxReRg.src);
   1374       //uu    vex_printf(",");
   1375       //uu    ppHRegAMD64(i->Ain.AvxReRg.dst);
   1376       //uu    return;
   1377       case Ain_EvCheck:
   1378          vex_printf("(evCheck) decl ");
   1379          ppAMD64AMode(i->Ain.EvCheck.amCounter);
   1380          vex_printf("; jns nofail; jmp *");
   1381          ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
   1382          vex_printf("; nofail:");
   1383          return;
   1384       case Ain_ProfInc:
   1385          vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
   1386          return;
   1387       default:
   1388          vpanic("ppAMD64Instr");
   1389    }
   1390 }
   1391 
   1392 /* --------- Helpers for register allocation. --------- */
   1393 
   1394 void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
   1395 {
   1396    Bool unary;
   1397    vassert(mode64 == True);
   1398    initHRegUsage(u);
   1399    switch (i->tag) {
   1400       case Ain_Imm64:
   1401          addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
   1402          return;
   1403       case Ain_Alu64R:
   1404          addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
   1405          if (i->Ain.Alu64R.op == Aalu_MOV) {
   1406             addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
   1407             return;
   1408          }
   1409          if (i->Ain.Alu64R.op == Aalu_CMP) {
   1410             addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
   1411             return;
   1412          }
   1413          addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
   1414          return;
   1415       case Ain_Alu64M:
   1416          addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
   1417          addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
   1418          return;
   1419       case Ain_Sh64:
   1420          addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
   1421          if (i->Ain.Sh64.src == 0)
   1422             addHRegUse(u, HRmRead, hregAMD64_RCX());
   1423          return;
   1424       case Ain_Test64:
   1425          addHRegUse(u, HRmRead, i->Ain.Test64.dst);
   1426          return;
   1427       case Ain_Unary64:
   1428          addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
   1429          return;
   1430       case Ain_Lea64:
   1431          addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
   1432          addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
   1433          return;
   1434       case Ain_Alu32R:
   1435          vassert(i->Ain.Alu32R.op != Aalu_MOV);
   1436          addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
   1437          if (i->Ain.Alu32R.op == Aalu_CMP) {
   1438             addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
   1439             return;
   1440          }
   1441          addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
   1442          return;
   1443       case Ain_MulL:
   1444          addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
   1445          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1446          addHRegUse(u, HRmWrite, hregAMD64_RDX());
   1447          return;
   1448       case Ain_Div:
   1449          addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
   1450          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1451          addHRegUse(u, HRmModify, hregAMD64_RDX());
   1452          return;
   1453       case Ain_Push:
   1454          addRegUsage_AMD64RMI(u, i->Ain.Push.src);
   1455          addHRegUse(u, HRmModify, hregAMD64_RSP());
   1456          return;
   1457       case Ain_Call:
   1458          /* This is a bit subtle. */
   1459          /* First off, claim it trashes all the caller-saved regs
   1460             which fall within the register allocator's jurisdiction.
   1461             These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
   1462             and all the xmm registers.
   1463          */
   1464          addHRegUse(u, HRmWrite, hregAMD64_RAX());
   1465          addHRegUse(u, HRmWrite, hregAMD64_RCX());
   1466          addHRegUse(u, HRmWrite, hregAMD64_RDX());
   1467          addHRegUse(u, HRmWrite, hregAMD64_RSI());
   1468          addHRegUse(u, HRmWrite, hregAMD64_RDI());
   1469          addHRegUse(u, HRmWrite, hregAMD64_R8());
   1470          addHRegUse(u, HRmWrite, hregAMD64_R9());
   1471          addHRegUse(u, HRmWrite, hregAMD64_R10());
   1472          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1473          addHRegUse(u, HRmWrite, hregAMD64_XMM0());
   1474          addHRegUse(u, HRmWrite, hregAMD64_XMM1());
   1475          addHRegUse(u, HRmWrite, hregAMD64_XMM3());
   1476          addHRegUse(u, HRmWrite, hregAMD64_XMM4());
   1477          addHRegUse(u, HRmWrite, hregAMD64_XMM5());
   1478          addHRegUse(u, HRmWrite, hregAMD64_XMM6());
   1479          addHRegUse(u, HRmWrite, hregAMD64_XMM7());
   1480          addHRegUse(u, HRmWrite, hregAMD64_XMM8());
   1481          addHRegUse(u, HRmWrite, hregAMD64_XMM9());
   1482          addHRegUse(u, HRmWrite, hregAMD64_XMM10());
   1483          addHRegUse(u, HRmWrite, hregAMD64_XMM11());
   1484          addHRegUse(u, HRmWrite, hregAMD64_XMM12());
   1485 
   1486          /* Now we have to state any parameter-carrying registers
   1487             which might be read.  This depends on the regparmness. */
   1488          switch (i->Ain.Call.regparms) {
   1489             case 6: addHRegUse(u, HRmRead, hregAMD64_R9());  /*fallthru*/
   1490             case 5: addHRegUse(u, HRmRead, hregAMD64_R8());  /*fallthru*/
   1491             case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
   1492             case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
   1493             case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
   1494             case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
   1495             case 0: break;
   1496             default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
   1497          }
   1498          /* Finally, there is the issue that the insn trashes a
   1499             register because the literal target address has to be
   1500             loaded into a register.  Fortunately, r11 is stated in the
   1501             ABI as a scratch register, and so seems a suitable victim.  */
   1502          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1503          /* Upshot of this is that the assembler really must use r11,
   1504             and no other, as a destination temporary. */
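                 /* For reference, the sequence the emitter is expected to
                    produce here is roughly
                       movabsq $target, %r11  ;  call *%r11
                    possibly guarded by a conditional jump over both insns. */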
   1505          return;
   1506       /* XDirect/XIndir/XAssisted are also a bit subtle.  They
   1507          conditionally exit the block.  Hence we only need to list (1)
   1508          the registers that they read, and (2) the registers that they
   1509          write in the case where the block is not exited.  (2) is
   1510          empty, hence only (1) is relevant here. */
   1511       case Ain_XDirect:
   1512          /* Don't bother to mention the write to %r11, since it is not
   1513             available to the allocator. */
   1514          addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
   1515          return;
   1516       case Ain_XIndir:
   1517          /* Ditto re %r11 */
   1518          addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
   1519          addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
   1520          return;
   1521       case Ain_XAssisted:
   1522          /* Ditto re %r11 and %rbp (the baseblock ptr) */
   1523          addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
   1524          addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
   1525          return;
   1526       case Ain_CMov64:
   1527          addHRegUse(u, HRmRead,   i->Ain.CMov64.src);
   1528          addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
   1529          return;
   1530       case Ain_CLoad:
   1531          addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
   1532          addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
   1533          return;
   1534       case Ain_CStore:
   1535          addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
   1536          addHRegUse(u, HRmRead, i->Ain.CStore.src);
   1537          return;
   1538       case Ain_MovxLQ:
   1539          addHRegUse(u, HRmRead,  i->Ain.MovxLQ.src);
   1540          addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
   1541          return;
   1542       case Ain_LoadEX:
   1543          addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
   1544          addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
   1545          return;
   1546       case Ain_Store:
   1547          addHRegUse(u, HRmRead, i->Ain.Store.src);
   1548          addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
   1549          return;
   1550       case Ain_Set64:
   1551          addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
   1552          return;
   1553       case Ain_Bsfr64:
   1554          addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
   1555          addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
   1556          return;
   1557       case Ain_MFence:
   1558          return;
   1559       case Ain_ACAS:
   1560          addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
   1561          addHRegUse(u, HRmRead, hregAMD64_RBX());
   1562          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1563          return;
   1564       case Ain_DACAS:
   1565          addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
   1566          addHRegUse(u, HRmRead, hregAMD64_RCX());
   1567          addHRegUse(u, HRmRead, hregAMD64_RBX());
   1568          addHRegUse(u, HRmModify, hregAMD64_RDX());
   1569          addHRegUse(u, HRmModify, hregAMD64_RAX());
   1570          return;
   1571       case Ain_A87Free:
   1572          return;
   1573       case Ain_A87PushPop:
   1574          addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
   1575          return;
   1576       case Ain_A87FpOp:
   1577          return;
   1578       case Ain_A87LdCW:
   1579          addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
   1580          return;
   1581       case Ain_A87StSW:
   1582          addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
   1583          return;
   1584       case Ain_LdMXCSR:
   1585          addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
   1586          return;
   1587       case Ain_SseUComIS:
   1588          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcL);
   1589          addHRegUse(u, HRmRead,  i->Ain.SseUComIS.srcR);
   1590          addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
   1591          return;
   1592       case Ain_SseSI2SF:
   1593          addHRegUse(u, HRmRead,  i->Ain.SseSI2SF.src);
   1594          addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
   1595          return;
   1596       case Ain_SseSF2SI:
   1597          addHRegUse(u, HRmRead,  i->Ain.SseSF2SI.src);
   1598          addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
   1599          return;
   1600       case Ain_SseSDSS:
   1601          addHRegUse(u, HRmRead,  i->Ain.SseSDSS.src);
   1602          addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
   1603          return;
   1604       case Ain_SseLdSt:
   1605          addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
   1606          addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
   1607                        i->Ain.SseLdSt.reg);
   1608          return;
   1609       case Ain_SseCStore:
   1610          addRegUsage_AMD64AMode(u, i->Ain.SseCStore.addr);
   1611          addHRegUse(u, HRmRead, i->Ain.SseCStore.src);
   1612          return;
   1613       case Ain_SseCLoad:
   1614          addRegUsage_AMD64AMode(u, i->Ain.SseCLoad.addr);
   1615          addHRegUse(u, HRmModify, i->Ain.SseCLoad.dst);
   1616          return;
   1617       case Ain_SseLdzLO:
   1618          addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
   1619          addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
   1620          return;
   1621       case Ain_Sse32Fx4:
   1622          vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
   1623          unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
   1624                          || i->Ain.Sse32Fx4.op == Asse_RSQRTF
   1625                          || i->Ain.Sse32Fx4.op == Asse_SQRTF );
   1626          addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
   1627          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1628                        i->Ain.Sse32Fx4.dst);
   1629          return;
   1630       case Ain_Sse32FLo:
   1631          vassert(i->Ain.Sse32FLo.op != Asse_MOV);
   1632          unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
   1633                          || i->Ain.Sse32FLo.op == Asse_RSQRTF
   1634                          || i->Ain.Sse32FLo.op == Asse_SQRTF );
   1635          addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
   1636          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1637                        i->Ain.Sse32FLo.dst);
   1638          return;
   1639       case Ain_Sse64Fx2:
   1640          vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
   1641          unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
   1642                          || i->Ain.Sse64Fx2.op == Asse_RSQRTF
   1643                          || i->Ain.Sse64Fx2.op == Asse_SQRTF );
   1644          addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
   1645          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1646                        i->Ain.Sse64Fx2.dst);
   1647          return;
   1648       case Ain_Sse64FLo:
   1649          vassert(i->Ain.Sse64FLo.op != Asse_MOV);
   1650          unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
   1651                          || i->Ain.Sse64FLo.op == Asse_RSQRTF
   1652                          || i->Ain.Sse64FLo.op == Asse_SQRTF );
   1653          addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
   1654          addHRegUse(u, unary ? HRmWrite : HRmModify,
   1655                        i->Ain.Sse64FLo.dst);
   1656          return;
   1657       case Ain_SseReRg:
   1658          if ( (i->Ain.SseReRg.op == Asse_XOR
   1659                || i->Ain.SseReRg.op == Asse_CMPEQ32)
   1660               && sameHReg(i->Ain.SseReRg.src, i->Ain.SseReRg.dst)) {
   1661             /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
   1662                r,r' as a write of a value to r, and independent of any
   1663                previous value in r */
   1664             /* (as opposed to a rite of passage :-) */
   1665             addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
   1666          } else {
   1667             addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
   1668             addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
   1669                              ? HRmWrite : HRmModify,
   1670                           i->Ain.SseReRg.dst);
   1671          }
   1672          return;
   1673       case Ain_SseCMov:
   1674          addHRegUse(u, HRmRead,   i->Ain.SseCMov.src);
   1675          addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
   1676          return;
   1677       case Ain_SseShuf:
   1678          addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
   1679          addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
   1680          return;
   1681       //uu case Ain_AvxLdSt:
   1682       //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
   1683       //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
   1684       //uu               i->Ain.AvxLdSt.reg);
   1685       //uu return;
   1686       //uu case Ain_AvxReRg:
   1687       //uu    if ( (i->Ain.AvxReRg.op == Asse_XOR
   1688       //uu          || i->Ain.AvxReRg.op == Asse_CMPEQ32)
   1689       //uu         && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
   1690       //uu       /* See comments on the case for Ain_SseReRg. */
   1691       //uu       addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
   1692       //uu    } else {
   1693       //uu       addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
   1694       //uu       addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
   1695       //uu                        ? HRmWrite : HRmModify,
   1696       //uu                     i->Ain.AvxReRg.dst);
   1697       //uu    }
   1698       //uu    return;
   1699       case Ain_EvCheck:
   1700          /* We expect both amodes only to mention %rbp, so this is in
   1701             fact pointless, since %rbp isn't allocatable, but anyway.. */
   1702          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
   1703          addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
   1704          return;
   1705       case Ain_ProfInc:
   1706          addHRegUse(u, HRmWrite, hregAMD64_R11());
   1707          return;
   1708       default:
   1709          ppAMD64Instr(i, mode64);
   1710          vpanic("getRegUsage_AMD64Instr");
   1711    }
   1712 }
   1713 
   1714 /* local helper */
   1715 static inline void mapReg(HRegRemap* m, HReg* r)
   1716 {
   1717    *r = lookupHRegRemap(m, *r);
   1718 }
   1719 
   1720 void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
   1721 {
   1722    vassert(mode64 == True);
   1723    switch (i->tag) {
   1724       case Ain_Imm64:
   1725          mapReg(m, &i->Ain.Imm64.dst);
   1726          return;
   1727       case Ain_Alu64R:
   1728          mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
   1729          mapReg(m, &i->Ain.Alu64R.dst);
   1730          return;
   1731       case Ain_Alu64M:
   1732          mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
   1733          mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
   1734          return;
   1735       case Ain_Sh64:
   1736          mapReg(m, &i->Ain.Sh64.dst);
   1737          return;
   1738       case Ain_Test64:
   1739          mapReg(m, &i->Ain.Test64.dst);
   1740          return;
   1741       case Ain_Unary64:
   1742          mapReg(m, &i->Ain.Unary64.dst);
   1743          return;
   1744       case Ain_Lea64:
   1745          mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
   1746          mapReg(m, &i->Ain.Lea64.dst);
   1747          return;
   1748       case Ain_Alu32R:
   1749          mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
   1750          mapReg(m, &i->Ain.Alu32R.dst);
   1751          return;
   1752       case Ain_MulL:
   1753          mapRegs_AMD64RM(m, i->Ain.MulL.src);
   1754          return;
   1755       case Ain_Div:
   1756          mapRegs_AMD64RM(m, i->Ain.Div.src);
   1757          return;
   1758       case Ain_Push:
   1759          mapRegs_AMD64RMI(m, i->Ain.Push.src);
   1760          return;
   1761       case Ain_Call:
   1762          return;
   1763       case Ain_XDirect:
   1764          mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
   1765          return;
   1766       case Ain_XIndir:
   1767          mapReg(m, &i->Ain.XIndir.dstGA);
   1768          mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
   1769          return;
   1770       case Ain_XAssisted:
   1771          mapReg(m, &i->Ain.XAssisted.dstGA);
   1772          mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
   1773          return;
   1774       case Ain_CMov64:
   1775          mapReg(m, &i->Ain.CMov64.src);
   1776          mapReg(m, &i->Ain.CMov64.dst);
   1777          return;
   1778       case Ain_CLoad:
   1779          mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
   1780          mapReg(m, &i->Ain.CLoad.dst);
   1781          return;
   1782       case Ain_CStore:
   1783          mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
   1784          mapReg(m, &i->Ain.CStore.src);
   1785          return;
   1786       case Ain_MovxLQ:
   1787          mapReg(m, &i->Ain.MovxLQ.src);
   1788          mapReg(m, &i->Ain.MovxLQ.dst);
   1789          return;
   1790       case Ain_LoadEX:
   1791          mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
   1792          mapReg(m, &i->Ain.LoadEX.dst);
   1793          return;
   1794       case Ain_Store:
   1795          mapReg(m, &i->Ain.Store.src);
   1796          mapRegs_AMD64AMode(m, i->Ain.Store.dst);
   1797          return;
   1798       case Ain_Set64:
   1799          mapReg(m, &i->Ain.Set64.dst);
   1800          return;
   1801       case Ain_Bsfr64:
   1802          mapReg(m, &i->Ain.Bsfr64.src);
   1803          mapReg(m, &i->Ain.Bsfr64.dst);
   1804          return;
   1805       case Ain_MFence:
   1806          return;
   1807       case Ain_ACAS:
   1808          mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
   1809          return;
   1810       case Ain_DACAS:
   1811          mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
   1812          return;
   1813       case Ain_A87Free:
   1814          return;
   1815       case Ain_A87PushPop:
   1816          mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
   1817          return;
   1818       case Ain_A87FpOp:
   1819          return;
   1820       case Ain_A87LdCW:
   1821          mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
   1822          return;
   1823       case Ain_A87StSW:
   1824          mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
   1825          return;
   1826       case Ain_LdMXCSR:
   1827          mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
   1828          return;
   1829       case Ain_SseUComIS:
   1830          mapReg(m, &i->Ain.SseUComIS.srcL);
   1831          mapReg(m, &i->Ain.SseUComIS.srcR);
   1832          mapReg(m, &i->Ain.SseUComIS.dst);
   1833          return;
   1834       case Ain_SseSI2SF:
   1835          mapReg(m, &i->Ain.SseSI2SF.src);
   1836          mapReg(m, &i->Ain.SseSI2SF.dst);
   1837          return;
   1838       case Ain_SseSF2SI:
   1839          mapReg(m, &i->Ain.SseSF2SI.src);
   1840          mapReg(m, &i->Ain.SseSF2SI.dst);
   1841          return;
   1842       case Ain_SseSDSS:
   1843          mapReg(m, &i->Ain.SseSDSS.src);
   1844          mapReg(m, &i->Ain.SseSDSS.dst);
   1845          return;
   1846       case Ain_SseLdSt:
   1847          mapReg(m, &i->Ain.SseLdSt.reg);
   1848          mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
   1849          break;
   1850       case Ain_SseCStore:
   1851          mapRegs_AMD64AMode(m, i->Ain.SseCStore.addr);
   1852          mapReg(m, &i->Ain.SseCStore.src);
   1853          return;
   1854       case Ain_SseCLoad:
   1855          mapRegs_AMD64AMode(m, i->Ain.SseCLoad.addr);
   1856          mapReg(m, &i->Ain.SseCLoad.dst);
   1857          return;
   1858       case Ain_SseLdzLO:
   1859          mapReg(m, &i->Ain.SseLdzLO.reg);
   1860          mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
   1861          break;
   1862       case Ain_Sse32Fx4:
   1863          mapReg(m, &i->Ain.Sse32Fx4.src);
   1864          mapReg(m, &i->Ain.Sse32Fx4.dst);
   1865          return;
   1866       case Ain_Sse32FLo:
   1867          mapReg(m, &i->Ain.Sse32FLo.src);
   1868          mapReg(m, &i->Ain.Sse32FLo.dst);
   1869          return;
   1870       case Ain_Sse64Fx2:
   1871          mapReg(m, &i->Ain.Sse64Fx2.src);
   1872          mapReg(m, &i->Ain.Sse64Fx2.dst);
   1873          return;
   1874       case Ain_Sse64FLo:
   1875          mapReg(m, &i->Ain.Sse64FLo.src);
   1876          mapReg(m, &i->Ain.Sse64FLo.dst);
   1877          return;
   1878       case Ain_SseReRg:
   1879          mapReg(m, &i->Ain.SseReRg.src);
   1880          mapReg(m, &i->Ain.SseReRg.dst);
   1881          return;
   1882       case Ain_SseCMov:
   1883          mapReg(m, &i->Ain.SseCMov.src);
   1884          mapReg(m, &i->Ain.SseCMov.dst);
   1885          return;
   1886       case Ain_SseShuf:
   1887          mapReg(m, &i->Ain.SseShuf.src);
   1888          mapReg(m, &i->Ain.SseShuf.dst);
   1889          return;
   1890       //uu case Ain_AvxLdSt:
   1891       //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
   1892       //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
   1893       //uu    break;
   1894       //uu case Ain_AvxReRg:
   1895       //uu    mapReg(m, &i->Ain.AvxReRg.src);
   1896       //uu    mapReg(m, &i->Ain.AvxReRg.dst);
   1897       //uu    return;
   1898       case Ain_EvCheck:
   1899          /* We expect both amodes only to mention %rbp, so this is in
   1900             fact pointless, since %rbp isn't allocatable, but anyway.. */
   1901          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
   1902          mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
   1903          return;
   1904       case Ain_ProfInc:
   1905          /* hardwires r11 -- nothing to modify. */
   1906          return;
   1907       default:
   1908          ppAMD64Instr(i, mode64);
   1909          vpanic("mapRegs_AMD64Instr");
   1910    }
   1911 }
   1912 
   1913 /* Figure out if i represents a reg-reg move, and if so assign the
   1914    source and destination to *src and *dst.  If in doubt say No.  Used
   1915    by the register allocator to do move coalescing.
   1916 */
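        /* For example, an Ain_Alu64R whose op is Aalu_MOV and whose src is
           an Armi_Reg -- a plain movq between integer registers -- is
           reported as a move, as is an Ain_SseReRg whose op is Asse_MOV. */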
   1917 Bool isMove_AMD64Instr ( const AMD64Instr* i, HReg* src, HReg* dst )
   1918 {
   1919    switch (i->tag) {
   1920       case Ain_Alu64R:
   1921          /* Moves between integer regs */
   1922          if (i->Ain.Alu64R.op != Aalu_MOV)
   1923             return False;
   1924          if (i->Ain.Alu64R.src->tag != Armi_Reg)
   1925             return False;
   1926          *src = i->Ain.Alu64R.src->Armi.Reg.reg;
   1927          *dst = i->Ain.Alu64R.dst;
   1928          return True;
   1929       case Ain_SseReRg:
   1930          /* Moves between SSE regs */
   1931          if (i->Ain.SseReRg.op != Asse_MOV)
   1932             return False;
   1933          *src = i->Ain.SseReRg.src;
   1934          *dst = i->Ain.SseReRg.dst;
   1935          return True;
   1936       //uu case Ain_AvxReRg:
   1937       //uu    /* Moves between AVX regs */
   1938       //uu    if (i->Ain.AvxReRg.op != Asse_MOV)
   1939       //uu       return False;
   1940       //uu    *src = i->Ain.AvxReRg.src;
   1941       //uu    *dst = i->Ain.AvxReRg.dst;
   1942       //uu    return True;
   1943       default:
   1944          return False;
   1945    }
   1946    /*NOTREACHED*/
   1947 }
   1948 
   1949 
   1950 /* Generate amd64 spill/reload instructions under the direction of the
   1951    register allocator.  Note it's critical these don't write the
   1952    condition codes. */
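        /* For illustration: spilling an HRcInt64 register, say %rax, at
           offsetB 48 produces "movq %rax, 48(%rbp)", and the matching
           reload is "movq 48(%rbp), %rax".  HRcVec128 registers use a
           16-byte movups through the same %rbp-relative amode; none of
           these forms touch %rflags. */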
   1953 
   1954 void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1955                       HReg rreg, Int offsetB, Bool mode64 )
   1956 {
   1957    AMD64AMode* am;
   1958    vassert(offsetB >= 0);
   1959    vassert(!hregIsVirtual(rreg));
   1960    vassert(mode64 == True);
   1961    *i1 = *i2 = NULL;
   1962    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   1963    switch (hregClass(rreg)) {
   1964       case HRcInt64:
   1965          *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
   1966          return;
   1967       case HRcVec128:
   1968          *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
   1969          return;
   1970       default:
   1971          ppHRegClass(hregClass(rreg));
   1972          vpanic("genSpill_AMD64: unimplemented regclass");
   1973    }
   1974 }
   1975 
   1976 void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
   1977                        HReg rreg, Int offsetB, Bool mode64 )
   1978 {
   1979    AMD64AMode* am;
   1980    vassert(offsetB >= 0);
   1981    vassert(!hregIsVirtual(rreg));
   1982    vassert(mode64 == True);
   1983    *i1 = *i2 = NULL;
   1984    am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   1985    switch (hregClass(rreg)) {
   1986       case HRcInt64:
   1987          *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
   1988          return;
   1989       case HRcVec128:
   1990          *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
   1991          return;
   1992       default:
   1993          ppHRegClass(hregClass(rreg));
   1994          vpanic("genReload_AMD64: unimplemented regclass");
   1995    }
   1996 }
   1997 
   1998 
   1999 /* --------- The amd64 assembler (bleh.) --------- */
   2000 
   2001 /* Produce the low three bits of an integer register number. */
   2002 inline static UInt iregEnc210 ( HReg r )
   2003 {
   2004    UInt n;
   2005    vassert(hregClass(r) == HRcInt64);
   2006    vassert(!hregIsVirtual(r));
   2007    n = hregEncoding(r);
   2008    vassert(n <= 15);
   2009    return n & 7;
   2010 }
   2011 
   2012 /* Produce bit 3 of an integer register number. */
   2013 inline static UInt iregEnc3 ( HReg r )
   2014 {
   2015    UInt n;
   2016    vassert(hregClass(r) == HRcInt64);
   2017    vassert(!hregIsVirtual(r));
   2018    n = hregEncoding(r);
   2019    vassert(n <= 15);
   2020    return (n >> 3) & 1;
   2021 }
   2022 
   2023 /* Produce a complete 4-bit integer register number. */
   2024 inline static UInt iregEnc3210 ( HReg r )
   2025 {
   2026    UInt n;
   2027    vassert(hregClass(r) == HRcInt64);
   2028    vassert(!hregIsVirtual(r));
   2029    n = hregEncoding(r);
   2030    vassert(n <= 15);
   2031    return n;
   2032 }
   2033 
   2034 /* Produce a complete 4-bit integer register number. */
   2035 inline static UInt vregEnc3210 ( HReg r )
   2036 {
   2037    UInt n;
   2038    vassert(hregClass(r) == HRcVec128);
   2039    vassert(!hregIsVirtual(r));
   2040    n = hregEncoding(r);
   2041    vassert(n <= 15);
   2042    return n;
   2043 }
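        /* Example, assuming the usual x86-64 numbering in which %r9 has
           encoding 9: iregEnc3210(%r9) == 9, iregEnc210(%r9) == 1 (the
           low three bits) and iregEnc3(%r9) == 1 (the REX extension bit). */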
   2044 
   2045 inline static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
   2046 {
   2047    vassert(mod < 4);
   2048    vassert((reg|regmem) < 8);
   2049    return (UChar)( ((mod & 3) << 6) | ((reg & 7) << 3) | (regmem & 7) );
   2050 }
   2051 
   2052 inline static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
   2053 {
   2054    vassert(shift < 4);
   2055    vassert((regindex|regbase) < 8);
   2056    return (UChar)( ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7) );
   2057 }
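        /* Quick check of the bit packing: mkModRegRM(1,0,1) == 0x41
           (mod=01, reg=000, rm=001) and mkSIB(2,1,0) == 0x88 (scale
           factor 4, index 1, base 0). */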
   2058 
   2059 static UChar* emit32 ( UChar* p, UInt w32 )
   2060 {
   2061    *p++ = toUChar((w32)       & 0x000000FF);
   2062    *p++ = toUChar((w32 >>  8) & 0x000000FF);
   2063    *p++ = toUChar((w32 >> 16) & 0x000000FF);
   2064    *p++ = toUChar((w32 >> 24) & 0x000000FF);
   2065    return p;
   2066 }
   2067 
   2068 static UChar* emit64 ( UChar* p, ULong w64 )
   2069 {
   2070    p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
   2071    p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
   2072    return p;
   2073 }
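        /* Both emit little-endian: emit32(p, 0x12345678) stores the bytes
           78 56 34 12, and emit64 emits the low 32-bit word first. */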
   2074 
   2075 /* Does a sign-extend of the lowest 8 bits give
   2076    the original number? */
   2077 static Bool fits8bits ( UInt w32 )
   2078 {
   2079    Int i32 = (Int)w32;
   2080    return toBool(i32 == ((Int)(w32 << 24) >> 24));
   2081 }
   2082 /* Can the lower 32 bits be signedly widened to produce the whole
   2083    64-bit value?  In other words, are the top 33 bits either all 0 or
   2084    all 1 ? */
   2085 static Bool fitsIn32Bits ( ULong x )
   2086 {
   2087    Long y1;
   2088    y1 = x << 32;
   2089    y1 >>=/*s*/ 32;
   2090    return toBool(x == y1);
   2091 }
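        /* Examples: fits8bits(0x7F) and fits8bits(0xFFFFFF80) hold, but
           fits8bits(0x80) does not, since its low byte sign-extends to
           0xFFFFFF80.  Likewise fitsIn32Bits(0xFFFFFFFF80000000ULL) holds
           while fitsIn32Bits(0x80000000ULL) does not. */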
   2092 
   2093 
   2094 /* Forming mod-reg-rm bytes and scale-index-base bytes.
   2095 
   2096      greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
   2097                        =  00 greg ereg
   2098 
   2099      greg,  d8(ereg)   |  ereg is neither of: RSP R12
   2100                        =  01 greg ereg, d8
   2101 
   2102      greg,  d32(ereg)  |  ereg is neither of: RSP R12
   2103                        =  10 greg ereg, d32
   2104 
   2105      greg,  d8(ereg)   |  ereg is either: RSP R12
   2106                        =  01 greg 100, 0x24, d8
   2107                        (lowest bit of rex distinguishes R12/RSP)
   2108 
   2109      greg,  d32(ereg)  |  ereg is either: RSP R12
   2110                        =  10 greg 100, 0x24, d32
   2111                        (lowest bit of rex distinguishes R12/RSP)
   2112 
   2113      -----------------------------------------------
   2114 
   2115      greg,  d8(base,index,scale)
   2116                |  index != RSP
   2117                =  01 greg 100, scale index base, d8
   2118 
   2119      greg,  d32(base,index,scale)
   2120                |  index != RSP
   2121                =  10 greg 100, scale index base, d32
   2122 */
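        /* Worked example: for greg = %rax (encoding 0) and the amode
           8(%rcx) (Aam_IR, imm 8, reg %rcx, encoding 1), %rcx is neither
           RSP nor R12 and 8 fits in 8 bits, so the "01 greg ereg, d8"
           case applies and the bytes emitted are 0x41 0x08. */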
   2123 static UChar* doAMode_M__wrk ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
   2124 {
   2125    UInt gregEnc210 = gregEnc3210 & 7;
   2126    if (am->tag == Aam_IR) {
   2127       if (am->Aam.IR.imm == 0
   2128           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2129           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
   2130           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
   2131           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
   2132          ) {
   2133          *p++ = mkModRegRM(0, gregEnc210, iregEnc210(am->Aam.IR.reg));
   2134          return p;
   2135       }
   2136       if (fits8bits(am->Aam.IR.imm)
   2137           && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2138           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
   2139          ) {
   2140          *p++ = mkModRegRM(1, gregEnc210, iregEnc210(am->Aam.IR.reg));
   2141          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
   2142          return p;
   2143       }
   2144       if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2145           && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
   2146          ) {
   2147          *p++ = mkModRegRM(2, gregEnc210, iregEnc210(am->Aam.IR.reg));
   2148          p = emit32(p, am->Aam.IR.imm);
   2149          return p;
   2150       }
   2151       if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
   2152            || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
   2153           && fits8bits(am->Aam.IR.imm)) {
   2154          *p++ = mkModRegRM(1, gregEnc210, 4);
   2155          *p++ = 0x24;
   2156          *p++ = toUChar(am->Aam.IR.imm & 0xFF);
   2157          return p;
   2158       }
   2159       if (/* sameHReg(am->Aam.IR.reg, hregAMD64_RSP()) ||
   2160              -- disabled: awaiting a test case for the RSP case */
   2161           sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
   2162          *p++ = mkModRegRM(2, gregEnc210, 4);
   2163          *p++ = 0x24;
   2164          p = emit32(p, am->Aam.IR.imm);
   2165          return p;
   2166       }
   2167       ppAMD64AMode(am);
   2168       vpanic("doAMode_M: can't emit amode IR");
   2169       /*NOTREACHED*/
   2170    }
   2171    if (am->tag == Aam_IRRS) {
   2172       if (fits8bits(am->Aam.IRRS.imm)
   2173           && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
   2174          *p++ = mkModRegRM(1, gregEnc210, 4);
   2175          *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
   2176                                           iregEnc210(am->Aam.IRRS.base));
   2177          *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
   2178          return p;
   2179       }
   2180       if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
   2181          *p++ = mkModRegRM(2, gregEnc210, 4);
   2182          *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
   2183                                           iregEnc210(am->Aam.IRRS.base));
   2184          p = emit32(p, am->Aam.IRRS.imm);
   2185          return p;
   2186       }
   2187       ppAMD64AMode(am);
   2188       vpanic("doAMode_M: can't emit amode IRRS");
   2189       /*NOTREACHED*/
   2190    }
   2191    vpanic("doAMode_M: unknown amode");
   2192    /*NOTREACHED*/
   2193 }
   2194 
   2195 static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
   2196 {
   2197    return doAMode_M__wrk(p, iregEnc3210(greg), am);
   2198 }
   2199 
   2200 static UChar* doAMode_M_enc ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
   2201 {
   2202    vassert(gregEnc3210 < 16);
   2203    return doAMode_M__wrk(p, gregEnc3210, am);
   2204 }
   2205 
   2206 
   2207 /* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
   2208 inline
   2209 static UChar* doAMode_R__wrk ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
   2210 {
   2211    *p++ = mkModRegRM(3, gregEnc3210 & 7, eregEnc3210 & 7);
   2212    return p;
   2213 }
   2214 
   2215 static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
   2216 {
   2217    return doAMode_R__wrk(p, iregEnc3210(greg), iregEnc3210(ereg));
   2218 }
   2219 
   2220 static UChar* doAMode_R_enc_reg ( UChar* p, UInt gregEnc3210, HReg ereg )
   2221 {
   2222    vassert(gregEnc3210 < 16);
   2223    return doAMode_R__wrk(p, gregEnc3210, iregEnc3210(ereg));
   2224 }
   2225 
   2226 static UChar* doAMode_R_reg_enc ( UChar* p, HReg greg, UInt eregEnc3210 )
   2227 {
   2228    vassert(eregEnc3210 < 16);
   2229    return doAMode_R__wrk(p, iregEnc3210(greg), eregEnc3210);
   2230 }
   2231 
   2232 static UChar* doAMode_R_enc_enc ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
   2233 {
   2234    vassert( (gregEnc3210|eregEnc3210) < 16);
   2235    return doAMode_R__wrk(p, gregEnc3210, eregEnc3210);
   2236 }
   2237 
   2238 
   2239 /* Clear the W bit on a REX byte, thereby changing the operand size
   2240    back to whatever that instruction's default operand size is. */
   2241 static inline UChar clearWBit ( UChar rex )
   2242 {
   2243    return rex & ~(1<<3);
   2244 }
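        /* For instance clearWBit(0x48) == 0x40; 32-bit emitters further
           down (e.g. the Ain_Alu32R case) then skip emitting the prefix
           when it is the redundant 0x40. */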
   2245 
   2246 
   2247 /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
   2248 inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am )
   2249 {
   2250    if (am->tag == Aam_IR) {
   2251       UChar W = 1;  /* we want 64-bit mode */
   2252       UChar R = (gregEnc3210 >> 3) & 1;
   2253       UChar X = 0; /* not relevant */
   2254       UChar B = iregEnc3(am->Aam.IR.reg);
   2255       return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
   2256    }
   2257    if (am->tag == Aam_IRRS) {
   2258       UChar W = 1;  /* we want 64-bit mode */
   2259       UChar R = (gregEnc3210 >> 3) & 1;
   2260       UChar X = iregEnc3(am->Aam.IRRS.index);
   2261       UChar B = iregEnc3(am->Aam.IRRS.base);
   2262       return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
   2263    }
   2264    vassert(0);
   2265    return 0; /*NOTREACHED*/
   2266 }
   2267 
   2268 static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
   2269 {
   2270    return rexAMode_M__wrk(iregEnc3210(greg), am);
   2271 }
   2272 
   2273 static UChar rexAMode_M_enc ( UInt gregEnc3210, AMD64AMode* am )
   2274 {
   2275    vassert(gregEnc3210 < 16);
   2276    return rexAMode_M__wrk(gregEnc3210, am);
   2277 }
   2278 
   2279 
   2280 /* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
   2281 inline static UChar rexAMode_R__wrk ( UInt gregEnc3210, UInt eregEnc3210 )
   2282 {
   2283    UChar W = 1;  /* we want 64-bit mode */
   2284    UChar R = (gregEnc3210 >> 3) & 1;
   2285    UChar X = 0; /* not relevant */
   2286    UChar B = (eregEnc3210 >> 3) & 1;
   2287    return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
   2288 }
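        /* Example: greg with encoding 0 (%rax) and ereg with encoding 9
           (%r9) give W=1 R=0 X=0 B=1, i.e. the byte 0x49. */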
   2289 
   2290 static UChar rexAMode_R ( HReg greg, HReg ereg )
   2291 {
   2292    return rexAMode_R__wrk(iregEnc3210(greg), iregEnc3210(ereg));
   2293 }
   2294 
   2295 static UChar rexAMode_R_enc_reg ( UInt gregEnc3210, HReg ereg )
   2296 {
   2297    vassert(gregEnc3210 < 16);
   2298    return rexAMode_R__wrk(gregEnc3210, iregEnc3210(ereg));
   2299 }
   2300 
   2301 static UChar rexAMode_R_reg_enc ( HReg greg, UInt eregEnc3210 )
   2302 {
   2303    vassert(eregEnc3210 < 16);
   2304    return rexAMode_R__wrk(iregEnc3210(greg), eregEnc3210);
   2305 }
   2306 
   2307 static UChar rexAMode_R_enc_enc ( UInt gregEnc3210, UInt eregEnc3210 )
   2308 {
   2309    vassert((gregEnc3210|eregEnc3210) < 16);
   2310    return rexAMode_R__wrk(gregEnc3210, eregEnc3210);
   2311 }
   2312 
   2313 
   2314 //uu /* May 2012: this VEX prefix stuff is currently unused, but has
   2315 //uu    verified correct (I reckon).  Certainly it has been known to
   2316 //uu    produce correct VEX prefixes during testing. */
   2317 //uu
   2318 //uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
   2319 //uu    notVvvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
   2320 //uu    in verbatim.  There's no range checking on the bits. */
   2321 //uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
   2322 //uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
   2323 //uu                             UInt L, UInt pp )
   2324 //uu {
   2325 //uu    UChar byte0 = 0;
   2326 //uu    UChar byte1 = 0;
   2327 //uu    UChar byte2 = 0;
   2328 //uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
   2329 //uu       /* 2 byte encoding is possible. */
   2330 //uu       byte0 = 0xC5;
   2331 //uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
   2332 //uu               | (L << 2) | pp;
   2333 //uu    } else {
   2334 //uu       /* 3 byte encoding is needed. */
   2335 //uu       byte0 = 0xC4;
   2336 //uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
   2337 //uu               | ((rexB ^ 1) << 5) | mmmmm;
   2338 //uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
   2339 //uu    }
   2340 //uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
   2341 //uu }
   2342 //uu
   2343 //uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
   2344 //uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
   2345 //uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
   2346 //uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
   2347 //uu    vvvv=1111 (unused 3rd reg). */
   2348 //uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
   2349 //uu {
   2350 //uu    UChar L       = 1; /* size = 256 */
   2351 //uu    UChar pp      = 0; /* no SIMD prefix */
   2352 //uu    UChar mmmmm   = 1; /* 0F */
   2353 //uu    UChar notVvvv = 0; /* unused */
   2354 //uu    UChar rexW    = 0;
   2355 //uu    UChar rexR    = 0;
   2356 //uu    UChar rexX    = 0;
   2357 //uu    UChar rexB    = 0;
   2358 //uu    /* Same logic as in rexAMode_M. */
   2359 //uu    if (am->tag == Aam_IR) {
   2360 //uu       rexR = iregEnc3(greg);
   2361 //uu       rexX = 0; /* not relevant */
   2362 //uu       rexB = iregEnc3(am->Aam.IR.reg);
   2363 //uu    }
   2364 //uu    else if (am->tag == Aam_IRRS) {
   2365 //uu       rexR = iregEnc3(greg);
   2366 //uu       rexX = iregEnc3(am->Aam.IRRS.index);
   2367 //uu       rexB = iregEnc3(am->Aam.IRRS.base);
   2368 //uu    } else {
   2369 //uu       vassert(0);
   2370 //uu    }
   2371 //uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
   2372 //uu }
   2373 //uu
   2374 //uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
   2375 //uu {
   2376 //uu    switch (vex & 0xFF) {
   2377 //uu       case 0xC5:
   2378 //uu          *p++ = 0xC5;
   2379 //uu          *p++ = (vex >> 8) & 0xFF;
   2380 //uu          vassert(0 == (vex >> 16));
   2381 //uu          break;
   2382 //uu       case 0xC4:
   2383 //uu          *p++ = 0xC4;
   2384 //uu          *p++ = (vex >> 8) & 0xFF;
   2385 //uu          *p++ = (vex >> 16) & 0xFF;
   2386 //uu          vassert(0 == (vex >> 24));
   2387 //uu          break;
   2388 //uu       default:
   2389 //uu          vassert(0);
   2390 //uu    }
   2391 //uu    return p;
   2392 //uu }
   2393 
   2394 
   2395 /* Emit ffree %st(N) */
   2396 static UChar* do_ffree_st ( UChar* p, Int n )
   2397 {
   2398    vassert(n >= 0 && n <= 7);
   2399    *p++ = 0xDD;
   2400    *p++ = toUChar(0xC0 + n);
   2401    return p;
   2402 }
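        /* E.g. do_ffree_st(p, 3) emits the bytes DD C3, i.e. "ffree %st(3)". */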
   2403 
   2404 /* Emit an instruction into buf and return the number of bytes used.
   2405    Note that buf is not the insn's final place, and therefore it is
   2406    imperative to emit position-independent code.  If the emitted
   2407    instruction was a profiler inc, set *is_profInc to True, else
   2408    leave it unchanged. */
   2409 
   2410 Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
   2411                       UChar* buf, Int nbuf, const AMD64Instr* i,
   2412                       Bool mode64, VexEndness endness_host,
   2413                       const void* disp_cp_chain_me_to_slowEP,
   2414                       const void* disp_cp_chain_me_to_fastEP,
   2415                       const void* disp_cp_xindir,
   2416                       const void* disp_cp_xassisted )
   2417 {
   2418    UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
   2419    UInt   xtra;
   2420    UInt   reg;
   2421    UChar  rex;
   2422    UChar* p = &buf[0];
   2423    UChar* ptmp;
   2424    Int    j;
   2425    vassert(nbuf >= 64);
   2426    vassert(mode64 == True);
   2427 
   2428    /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
   2429 
   2430    switch (i->tag) {
   2431 
   2432    case Ain_Imm64:
   2433       if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
   2434          /* Use the short form (load into a 32-bit reg and rely on
   2435             the implicit zero-extension) for constants up to 0xFFFFF.
   2436             This form is actually valid up to 0x7FFFFFFF, but limit
   2437             it to a smaller range for verifiability purposes. */
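                 /* E.g. an Imm64 of 0x1234 destined for %rcx (encoding 1, so
                    no REX prefix is needed) comes out as B9 34 12 00 00, that
                    is "movl $0x1234, %ecx"; the implicit zero-extension then
                    clears the upper half of %rcx. */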
   2438          if (1 & iregEnc3(i->Ain.Imm64.dst))
   2439             *p++ = 0x41;
   2440          *p++ = 0xB8 + iregEnc210(i->Ain.Imm64.dst);
   2441          p = emit32(p, (UInt)i->Ain.Imm64.imm64);
   2442       } else {
   2443          *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Imm64.dst)));
   2444          *p++ = toUChar(0xB8 + iregEnc210(i->Ain.Imm64.dst));
   2445          p = emit64(p, i->Ain.Imm64.imm64);
   2446       }
   2447       goto done;
   2448 
   2449    case Ain_Alu64R:
   2450       /* Deal specially with MOV */
   2451       if (i->Ain.Alu64R.op == Aalu_MOV) {
   2452          switch (i->Ain.Alu64R.src->tag) {
   2453             case Armi_Imm:
   2454                if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
   2455                   /* Actually we could use this form for constants in
   2456                      the range 0 through 0x7FFFFFFF inclusive, but
   2457                      limit it to a small range for verifiability
   2458                      purposes. */
   2459                   /* Generate "movl $imm32, 32-bit-register" and let
   2460                      the default zero-extend rule cause the upper half
   2461                      of the dst to be zeroed out too.  This saves 1
   2462                      and sometimes 2 bytes compared to the more
   2463                      obvious encoding in the 'else' branch. */
   2464                   if (1 & iregEnc3(i->Ain.Alu64R.dst))
   2465                      *p++ = 0x41;
   2466                   *p++ = 0xB8 + iregEnc210(i->Ain.Alu64R.dst);
   2467                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2468                } else {
   2469                   *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Alu64R.dst)));
   2470                   *p++ = 0xC7;
   2471                   *p++ = toUChar(0xC0 + iregEnc210(i->Ain.Alu64R.dst));
   2472                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2473                }
   2474                goto done;
   2475             case Armi_Reg:
   2476                *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
   2477                                   i->Ain.Alu64R.dst );
   2478                *p++ = 0x89;
   2479                p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
   2480                                 i->Ain.Alu64R.dst);
   2481                goto done;
   2482             case Armi_Mem:
   2483                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
   2484                                  i->Ain.Alu64R.src->Armi.Mem.am);
   2485                *p++ = 0x8B;
   2486                p = doAMode_M(p, i->Ain.Alu64R.dst,
   2487                                 i->Ain.Alu64R.src->Armi.Mem.am);
   2488                goto done;
   2489             default:
   2490                goto bad;
   2491          }
   2492       }
   2493       /* MUL */
   2494       if (i->Ain.Alu64R.op == Aalu_MUL) {
   2495          switch (i->Ain.Alu64R.src->tag) {
   2496             case Armi_Reg:
   2497                *p++ = rexAMode_R( i->Ain.Alu64R.dst,
   2498                                   i->Ain.Alu64R.src->Armi.Reg.reg);
   2499                *p++ = 0x0F;
   2500                *p++ = 0xAF;
   2501                p = doAMode_R(p, i->Ain.Alu64R.dst,
   2502                                 i->Ain.Alu64R.src->Armi.Reg.reg);
   2503                goto done;
   2504             case Armi_Mem:
   2505                *p++ = rexAMode_M(i->Ain.Alu64R.dst,
   2506                                  i->Ain.Alu64R.src->Armi.Mem.am);
   2507                *p++ = 0x0F;
   2508                *p++ = 0xAF;
   2509                p = doAMode_M(p, i->Ain.Alu64R.dst,
   2510                                 i->Ain.Alu64R.src->Armi.Mem.am);
   2511                goto done;
   2512             case Armi_Imm:
   2513                if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2514                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2515                   *p++ = 0x6B;
   2516                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2517                   *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
   2518                } else {
   2519                   *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2520                   *p++ = 0x69;
   2521                   p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
   2522                   p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2523                }
   2524                goto done;
   2525             default:
   2526                goto bad;
   2527          }
   2528       }
   2529       /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
   2530       opc = opc_rr = subopc_imm = opc_imma = 0;
   2531       switch (i->Ain.Alu64R.op) {
   2532          case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
   2533                         subopc_imm = 2; opc_imma = 0x15; break;
   2534          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
   2535                         subopc_imm = 0; opc_imma = 0x05; break;
   2536          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
   2537                         subopc_imm = 5; opc_imma = 0x2D; break;
   2538          case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
   2539                         subopc_imm = 3; opc_imma = 0x1D; break;
   2540          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
   2541                         subopc_imm = 4; opc_imma = 0x25; break;
   2542          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
   2543                         subopc_imm = 6; opc_imma = 0x35; break;
   2544          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
   2545                         subopc_imm = 1; opc_imma = 0x0D; break;
   2546          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
   2547                         subopc_imm = 7; opc_imma = 0x3D; break;
   2548          default: goto bad;
   2549       }
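              /* For instance, with Aalu_ADD and an Armi_Reg source the opc_rr
                 byte 0x01 is used, so "addq %rsi, %rdi" comes out as 48 01 F7. */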
   2550       switch (i->Ain.Alu64R.src->tag) {
   2551          case Armi_Imm:
   2552             if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
   2553                 && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2554                goto bad; /* FIXME: awaiting test case */
   2555                *p++ = toUChar(opc_imma);
   2556                p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2557             } else
   2558             if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
   2559                *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst );
   2560                *p++ = 0x83;
   2561                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
   2562                *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
   2563             } else {
   2564                *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst);
   2565                *p++ = 0x81;
   2566                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
   2567                p    = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
   2568             }
   2569             goto done;
   2570          case Armi_Reg:
   2571             *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
   2572                                i->Ain.Alu64R.dst);
   2573             *p++ = toUChar(opc_rr);
   2574             p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
   2575                              i->Ain.Alu64R.dst);
   2576             goto done;
   2577          case Armi_Mem:
   2578             *p++ = rexAMode_M( i->Ain.Alu64R.dst,
   2579                                i->Ain.Alu64R.src->Armi.Mem.am);
   2580             *p++ = toUChar(opc);
   2581             p = doAMode_M(p, i->Ain.Alu64R.dst,
   2582                              i->Ain.Alu64R.src->Armi.Mem.am);
   2583             goto done;
   2584          default:
   2585             goto bad;
   2586       }
   2587       break;
   2588 
   2589    case Ain_Alu64M:
   2590       /* Deal specially with MOV */
   2591       if (i->Ain.Alu64M.op == Aalu_MOV) {
   2592          switch (i->Ain.Alu64M.src->tag) {
   2593             case Ari_Reg:
   2594                *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
   2595                                  i->Ain.Alu64M.dst);
   2596                *p++ = 0x89;
   2597                p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
   2598                                 i->Ain.Alu64M.dst);
   2599                goto done;
   2600             case Ari_Imm:
   2601                *p++ = rexAMode_M_enc(0, i->Ain.Alu64M.dst);
   2602                *p++ = 0xC7;
   2603                p = doAMode_M_enc(p, 0, i->Ain.Alu64M.dst);
   2604                p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
   2605                goto done;
   2606             default:
   2607                goto bad;
   2608          }
   2609       }
   2610       break;
   2611 
   2612    case Ain_Sh64:
   2613       opc_cl = opc_imm = subopc = 0;
   2614       switch (i->Ain.Sh64.op) {
   2615          case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
   2616          case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
   2617          case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
   2618          default: goto bad;
   2619       }
   2620       if (i->Ain.Sh64.src == 0) {
   2621          *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
   2622          *p++ = toUChar(opc_cl);
   2623          p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
   2624          goto done;
   2625       } else {
   2626          *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
   2627          *p++ = toUChar(opc_imm);
   2628          p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
   2629          *p++ = (UChar)(i->Ain.Sh64.src);
   2630          goto done;
   2631       }
   2632       break;
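              /* For instance an Ash_SHL of %rdi by the immediate 3 takes the
                 second (immediate) path above and emits 48 C1 E7 03, that is
                 "shlq $3, %rdi". */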
   2633 
   2634    case Ain_Test64:
   2635       /* testq sign-extend($imm32), %reg */
   2636       *p++ = rexAMode_R_enc_reg(0, i->Ain.Test64.dst);
   2637       *p++ = 0xF7;
   2638       p = doAMode_R_enc_reg(p, 0, i->Ain.Test64.dst);
   2639       p = emit32(p, i->Ain.Test64.imm32);
   2640       goto done;
   2641 
   2642    case Ain_Unary64:
   2643       if (i->Ain.Unary64.op == Aun_NOT) {
   2644          *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
   2645          *p++ = 0xF7;
   2646          p = doAMode_R_enc_reg(p, 2, i->Ain.Unary64.dst);
   2647          goto done;
   2648       }
   2649       if (i->Ain.Unary64.op == Aun_NEG) {
   2650          *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
   2651          *p++ = 0xF7;
   2652          p = doAMode_R_enc_reg(p, 3, i->Ain.Unary64.dst);
   2653          goto done;
   2654       }
   2655       break;
   2656 
   2657    case Ain_Lea64:
   2658       *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
   2659       *p++ = 0x8D;
   2660       p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
   2661       goto done;
   2662 
   2663    case Ain_Alu32R:
   2664       /* ADD/SUB/AND/OR/XOR/CMP */
   2665       opc = opc_rr = subopc_imm = opc_imma = 0;
   2666       switch (i->Ain.Alu32R.op) {
   2667          case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
   2668                         subopc_imm = 0; opc_imma = 0x05; break;
   2669          case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
   2670                         subopc_imm = 5; opc_imma = 0x2D; break;
   2671          case Aalu_AND: opc = 0x23; opc_rr = 0x21;
   2672                         subopc_imm = 4; opc_imma = 0x25; break;
   2673          case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
   2674                         subopc_imm = 6; opc_imma = 0x35; break;
   2675          case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
   2676                         subopc_imm = 1; opc_imma = 0x0D; break;
   2677          case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
   2678                         subopc_imm = 7; opc_imma = 0x3D; break;
   2679          default: goto bad;
   2680       }
   2681       switch (i->Ain.Alu32R.src->tag) {
   2682          case Armi_Imm:
   2683             if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
   2684                 && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
   2685                goto bad; /* FIXME: awaiting test case */
   2686                *p++ = toUChar(opc_imma);
   2687                p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
   2688             } else
   2689             if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
   2690                rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst ) );
   2691                if (rex != 0x40) *p++ = rex;
   2692                *p++ = 0x83;
   2693                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
   2694                *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
   2695             } else {
   2696                rex  = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst) );
   2697                if (rex != 0x40) *p++ = rex;
   2698                *p++ = 0x81;
   2699                p    = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
   2700                p    = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
   2701             }
   2702             goto done;
   2703          case Armi_Reg:
   2704             rex  = clearWBit(
   2705                    rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
   2706                                i->Ain.Alu32R.dst) );
   2707             if (rex != 0x40) *p++ = rex;
   2708             *p++ = toUChar(opc_rr);
   2709             p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
   2710                              i->Ain.Alu32R.dst);
   2711             goto done;
   2712          case Armi_Mem:
   2713             rex  = clearWBit(
   2714                    rexAMode_M( i->Ain.Alu32R.dst,
   2715                                i->Ain.Alu32R.src->Armi.Mem.am) );
   2716             if (rex != 0x40) *p++ = rex;
   2717             *p++ = toUChar(opc);
   2718             p = doAMode_M(p, i->Ain.Alu32R.dst,
   2719                              i->Ain.Alu32R.src->Armi.Mem.am);
   2720             goto done;
   2721          default:
   2722             goto bad;
   2723       }
   2724       break;
   2725 
   2726    case Ain_MulL:
   2727       subopc = i->Ain.MulL.syned ? 5 : 4;
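              /* One-operand widening multiply: 0xF7 /4 is MUL (unsigned) and
                 0xF7 /5 is IMUL (signed); both compute RDX:RAX = RAX * src. */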
   2728       switch (i->Ain.MulL.src->tag)  {
   2729          case Arm_Mem:
   2730             *p++ = rexAMode_M_enc(0, i->Ain.MulL.src->Arm.Mem.am);
   2731             *p++ = 0xF7;
   2732             p = doAMode_M_enc(p, subopc, i->Ain.MulL.src->Arm.Mem.am);
   2733             goto done;
   2734          case Arm_Reg:
   2735             *p++ = rexAMode_R_enc_reg(0, i->Ain.MulL.src->Arm.Reg.reg);
   2736             *p++ = 0xF7;
   2737             p = doAMode_R_enc_reg(p, subopc, i->Ain.MulL.src->Arm.Reg.reg);
   2738             goto done;
   2739          default:
   2740             goto bad;
   2741       }
   2742       break;
   2743 
   2744    case Ain_Div:
   2745       subopc = i->Ain.Div.syned ? 7 : 6;
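              /* 0xF7 /6 is DIV (unsigned), 0xF7 /7 is IDIV (signed).  They
                 divide RDX:RAX by the operand -- or EDX:EAX when sz == 4,
                 which is why the 32-bit path below clears REX.W. */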
   2746       if (i->Ain.Div.sz == 4) {
   2747          switch (i->Ain.Div.src->tag)  {
   2748             case Arm_Mem:
   2749                goto bad;
   2750                /*FIXME*/
   2751                *p++ = 0xF7;
   2752                p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
   2753                goto done;
   2754             case Arm_Reg:
   2755                *p++ = clearWBit(
   2756                       rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg));
   2757                *p++ = 0xF7;
   2758                p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
   2759                goto done;
   2760             default:
   2761                goto bad;
   2762          }
   2763       }
   2764       if (i->Ain.Div.sz == 8) {
   2765          switch (i->Ain.Div.src->tag)  {
   2766             case Arm_Mem:
   2767                *p++ = rexAMode_M_enc(0, i->Ain.Div.src->Arm.Mem.am);
   2768                *p++ = 0xF7;
   2769                p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
   2770                goto done;
   2771             case Arm_Reg:
   2772                *p++ = rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg);
   2773                *p++ = 0xF7;
   2774                p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
   2775                goto done;
   2776             default:
   2777                goto bad;
   2778          }
   2779       }
   2780       break;
   2781 
   2782    case Ain_Push:
   2783       switch (i->Ain.Push.src->tag) {
   2784          case Armi_Mem:
   2785             *p++ = clearWBit(
   2786                    rexAMode_M_enc(0, i->Ain.Push.src->Armi.Mem.am));
   2787             *p++ = 0xFF;
   2788             p = doAMode_M_enc(p, 6, i->Ain.Push.src->Armi.Mem.am);
   2789             goto done;
   2790          case Armi_Imm:
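                    /* pushq $imm32: opcode 0x68; in 64-bit mode the imm32 is
                       sign-extended and pushed as an 8-byte quantity. */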
   2791             *p++ = 0x68;
   2792             p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
   2793             goto done;
   2794          case Armi_Reg:
   2795             *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.Push.src->Armi.Reg.reg)));
   2796             *p++ = toUChar(0x50 + iregEnc210(i->Ain.Push.src->Armi.Reg.reg));
   2797             goto done;
   2798          default:
   2799             goto bad;
   2800       }
   2801 
   2802    case Ain_Call: {
   2803       /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr
   2804          above, %r11 is used as an address temporary. */
   2805       /* If we don't need to do any fixup actions in the case that the
   2806          call doesn't happen, just do the simple thing and emit
   2807          straight-line code.  This is usually the case. */
   2808       if (i->Ain.Call.cond == Acc_ALWAYS/*call always happens*/
   2809           || i->Ain.Call.rloc.pri == RLPri_None/*no fixup action*/) {
   2810          /* jump over the following two insns if the condition does
   2811             not hold */
   2812          Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
   2813          if (i->Ain.Call.cond != Acc_ALWAYS) {
   2814             *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
   2815             *p++ = shortImm ? 10 : 13;
   2816             /* 10 or 13 bytes in the next two insns */
   2817          }
   2818          if (shortImm) {
   2819             /* 7 bytes: movl sign-extend(imm32), %r11 */
   2820             *p++ = 0x49;
   2821             *p++ = 0xC7;
   2822             *p++ = 0xC3;
   2823             p = emit32(p, (UInt)i->Ain.Call.target);
   2824          } else {
   2825             /* 10 bytes: movabsq $target, %r11 */
   2826             *p++ = 0x49;
   2827             *p++ = 0xBB;
   2828             p = emit64(p, i->Ain.Call.target);
   2829          }
   2830          /* 3 bytes: call *%r11 */
   2831          *p++ = 0x41;
   2832          *p++ = 0xFF;
   2833          *p++ = 0xD3;
   2834       } else {
   2835          Int delta;
   2836          /* Complex case.  We have to generate an if-then-else diamond. */
   2837          // before:
   2838          //   j{!cond} else:
   2839          //   movabsq $target, %r11
   2840          //   call* %r11
   2841          // preElse:
   2842          //   jmp after:
   2843          // else:
   2844          //   movabsq $0x5555555555555555, %rax  // possibly
   2845          //   movq %rax, %rdx                    // possibly
   2846          // after:
   2847 
   2848          // before:
   2849          UChar* pBefore = p;
   2850 
   2851          //   j{!cond} else:
   2852          *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
   2853          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2854 
   2855          //   movabsq $target, %r11
   2856          *p++ = 0x49;
   2857          *p++ = 0xBB;
   2858          p = emit64(p, i->Ain.Call.target);
   2859 
   2860          //   call* %r11
   2861          *p++ = 0x41;
   2862          *p++ = 0xFF;
   2863          *p++ = 0xD3;
   2864 
   2865          // preElse:
   2866          UChar* pPreElse = p;
   2867 
   2868          //   jmp after:
   2869          *p++ = 0xEB;
   2870          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2871 
   2872          // else:
   2873          UChar* pElse = p;
   2874 
   2875          /* Do the 'else' actions */
   2876          switch (i->Ain.Call.rloc.pri) {
   2877             case RLPri_Int:
   2878                // movabsq $0x5555555555555555, %rax
   2879                *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
   2880                break;
   2881             case RLPri_2Int:
   2882                goto bad; //ATC
   2883                // movabsq $0x5555555555555555, %rax
   2884                *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
   2885                // movq %rax, %rdx
   2886                *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
   2887                break;
   2888             case RLPri_V128SpRel:
   2889                if (i->Ain.Call.rloc.spOff == 0) {
   2890                   // We could accept any |spOff| here, but that's more
   2891                   // hassle and the only value we're ever going to get
   2892                   // is zero (I believe).  Hence take the easy path :)
   2893                   // We need a scratch register -- %r11 can be it.
   2894                   // movabsq $0x5555555555555555, %r11
   2895                   *p++ = 0x49; *p++ = 0xBB;
   2896                   p = emit64(p, 0x5555555555555555ULL);
   2897                   // movq %r11, 0(%rsp)
   2898                   *p++ = 0x4C; *p++ = 0x89; *p++ = 0x1C; *p++ = 0x24;
   2899                   // movq %r11, 8(%rsp)
   2900                   *p++ = 0x4C; *p++ = 0x89; *p++ = 0x5C; *p++ = 0x24;
   2901                   *p++ = 0x08;
   2902                   break;
   2903                }
   2904                goto bad; //ATC for all other spOff values
   2905             case RLPri_V256SpRel:
   2906                goto bad; //ATC
   2907             case RLPri_None: case RLPri_INVALID: default:
   2908                vassert(0); // should never get here
   2909          }
   2910 
   2911          // after:
   2912          UChar* pAfter = p;
   2913 
   2914          // Fix up the branch offsets.  The +2s in the offset
   2915          // calculations are there because x86 requires conditional
   2916          // branches to have their offset stated relative to the
   2917          // instruction immediately following the branch insn.  And in
   2918          // both cases the branch insns are 2 bytes long.
   2919 
   2920          // First, the "j{!cond} else:" at pBefore.
   2921          delta = (Int)(Long)(pElse - (pBefore + 2));
   2922          vassert(delta >= 0 && delta < 100/*arbitrary*/);
   2923          *(pBefore+1) = (UChar)delta;
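                 /* (Given the fixed 10-byte movabsq and 3-byte call in
                    between, pElse is always pBefore+17, so this delta is
                    always 15.) */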
   2924 
   2925          // And secondly, the "jmp after:" at pPreElse.
   2926          delta = (Int)(Long)(pAfter - (pPreElse + 2));
   2927          vassert(delta >= 0 && delta < 100/*arbitrary*/);
   2928          *(pPreElse+1) = (UChar)delta;
   2929       }
   2930       goto done;
   2931    }
   2932 
   2933    case Ain_XDirect: {
   2934       /* NB: what goes on here has to be very closely coordinated with the
   2935          chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
   2936       /* We're generating chain-me requests here, so we need to be
   2937          sure this is actually allowed -- no-redir translations can't
   2938          use chain-me's.  Hence: */
   2939       vassert(disp_cp_chain_me_to_slowEP != NULL);
   2940       vassert(disp_cp_chain_me_to_fastEP != NULL);
   2941 
   2942       HReg r11 = hregAMD64_R11();
   2943 
   2944       /* Use ptmp for backpatching conditional jumps. */
   2945       ptmp = NULL;
   2946 
   2947       /* First off, if this is conditional, create a conditional
   2948          jump over the rest of it. */
   2949       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
   2950          /* jmp fwds if !condition */
   2951          *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
   2952          ptmp = p; /* fill in this bit later */
   2953          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   2954       }
   2955 
   2956       /* Update the guest RIP. */
   2957       if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
   2958          /* use a shorter encoding */
   2959          /* movl sign-extend(dstGA), %r11 */
   2960          *p++ = 0x49;
   2961          *p++ = 0xC7;
   2962          *p++ = 0xC3;
   2963          p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
   2964       } else {
   2965          /* movabsq $dstGA, %r11 */
   2966          *p++ = 0x49;
   2967          *p++ = 0xBB;
   2968          p = emit64(p, i->Ain.XDirect.dstGA);
   2969       }
   2970 
   2971       /* movq %r11, amRIP */
   2972       *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
   2973       *p++ = 0x89;
   2974       p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
   2975 
   2976       /* --- FIRST PATCHABLE BYTE follows --- */
   2977       /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
   2978          to) backs up the return address, so as to find the address of
   2979          the first patchable byte.  So: don't change the length of the
   2980          two instructions below. */
   2981       /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
   2982       *p++ = 0x49;
   2983       *p++ = 0xBB;
   2984       const void* disp_cp_chain_me
   2985                = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
   2986                                          : disp_cp_chain_me_to_slowEP;
   2987       p = emit64(p, (Addr)disp_cp_chain_me);
   2988       /* call *%r11 */
   2989       *p++ = 0x41;
   2990       *p++ = 0xFF;
   2991       *p++ = 0xD3;
   2992       /* --- END of PATCHABLE BYTES --- */
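              /* That patchable region is exactly 13 bytes long
                 (49 BB <imm64> ; 41 FF D3).  chainXDirect_AMD64 and
                 unchainXDirect_AMD64 below expect and rewrite precisely
                 this pattern. */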
   2993 
   2994       /* Fix up the conditional jump, if there was one. */
   2995       if (i->Ain.XDirect.cond != Acc_ALWAYS) {
   2996          Int delta = p - ptmp;
   2997          vassert(delta > 0 && delta < 40);
   2998          *ptmp = toUChar(delta-1);
   2999       }
   3000       goto done;
   3001    }
   3002 
   3003    case Ain_XIndir: {
   3004       /* We're generating transfers that could lead indirectly to a
   3005          chain-me, so we need to be sure this is actually allowed --
   3006          no-redir translations are not allowed to reach normal
   3007          translations without going through the scheduler.  That means
   3008          no XDirects or XIndirs out from no-redir translations.
   3009          Hence: */
   3010       vassert(disp_cp_xindir != NULL);
   3011 
   3012       /* Use ptmp for backpatching conditional jumps. */
   3013       ptmp = NULL;
   3014 
   3015       /* First off, if this is conditional, create a conditional
   3016          jump over the rest of it. */
   3017       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
   3018          /* jmp fwds if !condition */
   3019          *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
   3020          ptmp = p; /* fill in this bit later */
   3021          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3022       }
   3023 
   3024       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
   3025       *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
   3026       *p++ = 0x89;
   3027       p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
   3028 
   3029       /* get $disp_cp_xindir into %r11 */
   3030       if (fitsIn32Bits((Addr)disp_cp_xindir)) {
   3031          /* use a shorter encoding */
   3032          /* movl sign-extend(disp_cp_xindir), %r11 */
   3033          *p++ = 0x49;
   3034          *p++ = 0xC7;
   3035          *p++ = 0xC3;
   3036          p = emit32(p, (UInt)(Addr)disp_cp_xindir);
   3037       } else {
   3038          /* movabsq $disp_cp_xindir, %r11 */
   3039          *p++ = 0x49;
   3040          *p++ = 0xBB;
   3041          p = emit64(p, (Addr)disp_cp_xindir);
   3042       }
   3043 
   3044       /* jmp *%r11 */
   3045       *p++ = 0x41;
   3046       *p++ = 0xFF;
   3047       *p++ = 0xE3;
   3048 
   3049       /* Fix up the conditional jump, if there was one. */
   3050       if (i->Ain.XIndir.cond != Acc_ALWAYS) {
   3051          Int delta = p - ptmp;
   3052          vassert(delta > 0 && delta < 40);
   3053          *ptmp = toUChar(delta-1);
   3054       }
   3055       goto done;
   3056    }
   3057 
   3058    case Ain_XAssisted: {
   3059       /* Use ptmp for backpatching conditional jumps. */
   3060       ptmp = NULL;
   3061 
   3062       /* First off, if this is conditional, create a conditional
   3063          jump over the rest of it. */
   3064       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
   3065          /* jmp fwds if !condition */
   3066          *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
   3067          ptmp = p; /* fill in this bit later */
   3068          *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3069       }
   3070 
   3071       /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
   3072       *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
   3073       *p++ = 0x89;
   3074       p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
   3075       /* movl $magic_number, %ebp.  Since these numbers are all small positive
   3076          integers, we can get away with "movl $N, %ebp" rather than
   3077          the longer "movq $N, %rbp". */
   3078       UInt trcval = 0;
   3079       switch (i->Ain.XAssisted.jk) {
   3080          case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
   3081          case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
   3082          case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
   3083          case Ijk_Sys_int210:  trcval = VEX_TRC_JMP_SYS_INT210;  break;
   3084          case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
   3085          case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
   3086          case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
   3087          case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
   3088          case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
   3089          case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
   3090          case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
   3091          case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
   3092          case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
   3093          /* We don't expect to see the following being assisted. */
   3094          case Ijk_Ret:
   3095          case Ijk_Call:
   3096          /* fallthrough */
   3097          default:
   3098             ppIRJumpKind(i->Ain.XAssisted.jk);
   3099             vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
   3100       }
   3101       vassert(trcval != 0);
   3102       *p++ = 0xBD;
   3103       p = emit32(p, trcval);
   3104       /* movabsq $disp_cp_xassisted, %r11 */
   3105       *p++ = 0x49;
   3106       *p++ = 0xBB;
   3107       p = emit64(p, (Addr)disp_cp_xassisted);
   3108       /* jmp *%r11 */
   3109       *p++ = 0x41;
   3110       *p++ = 0xFF;
   3111       *p++ = 0xE3;
   3112 
   3113       /* Fix up the conditional jump, if there was one. */
   3114       if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
   3115          Int delta = p - ptmp;
   3116          vassert(delta > 0 && delta < 40);
   3117          *ptmp = toUChar(delta-1);
   3118       }
   3119       goto done;
   3120    }
   3121 
   3122    case Ain_CMov64:
   3123       vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
   3124       *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src);
   3125       *p++ = 0x0F;
   3126       *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
   3127       p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src);
   3128       goto done;
   3129 
   3130    case Ain_CLoad: {
   3131       vassert(i->Ain.CLoad.cond != Acc_ALWAYS);
   3132 
   3133       /* Only 32- or 64-bit variants are allowed. */
   3134       vassert(i->Ain.CLoad.szB == 4 || i->Ain.CLoad.szB == 8);
   3135 
   3136       /* Use ptmp for backpatching conditional jumps. */
   3137       ptmp = NULL;
   3138 
   3139       /* jmp fwds if !condition */
   3140       *p++ = toUChar(0x70 + (0xF & (i->Ain.CLoad.cond ^ 1)));
   3141       ptmp = p; /* fill in this bit later */
   3142       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3143 
   3144       /* Now the load.  Either a normal 64 bit load or a normal 32 bit
   3145          load, which, by the default zero-extension rule, zeroes out
   3146          the upper half of the destination, as required. */
   3147       rex = rexAMode_M(i->Ain.CLoad.dst, i->Ain.CLoad.addr);
   3148       *p++ = i->Ain.CLoad.szB == 4 ? clearWBit(rex) : rex;
   3149       *p++ = 0x8B;
   3150       p = doAMode_M(p, i->Ain.CLoad.dst, i->Ain.CLoad.addr);
   3151 
   3152       /* Fix up the conditional branch */
   3153       Int delta = p - ptmp;
   3154       vassert(delta > 0 && delta < 40);
   3155       *ptmp = toUChar(delta-1);
   3156       goto done;
   3157    }
   3158 
   3159    case Ain_CStore: {
   3160       /* AFAICS this is identical to Ain_CLoad except that the opcode
   3161          is 0x89 instead of 0x8B. */
   3162       vassert(i->Ain.CStore.cond != Acc_ALWAYS);
   3163 
   3164       /* Only 32- or 64-bit variants are allowed. */
   3165       vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
   3166 
   3167       /* Use ptmp for backpatching conditional jumps. */
   3168       ptmp = NULL;
   3169 
   3170       /* jmp fwds if !condition */
   3171       *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
   3172       ptmp = p; /* fill in this bit later */
   3173       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3174 
   3175       /* Now the store. */
   3176       rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
   3177       *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
   3178       *p++ = 0x89;
   3179       p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
   3180 
   3181       /* Fix up the conditional branch */
   3182       Int delta = p - ptmp;
   3183       vassert(delta > 0 && delta < 40);
   3184       *ptmp = toUChar(delta-1);
   3185       goto done;
   3186    }
   3187 
   3188    case Ain_MovxLQ:
   3189       /* No, _don't_ ask me why the sense of the args has to be
   3190          different in the S vs Z case.  I don't know. */
   3191       if (i->Ain.MovxLQ.syned) {
   3192          /* Need REX.W = 1 here, but rexAMode_R does that for us. */
   3193          *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
   3194          *p++ = 0x63;
   3195          p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
   3196       } else {
   3197          /* Produce a 32-bit reg-reg move, since the implicit
   3198             zero-extend does what we want. */
   3199          *p++ = clearWBit (
   3200                    rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
   3201          *p++ = 0x89;
   3202          p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
   3203       }
   3204       goto done;
   3205 
   3206    case Ain_LoadEX:
   3207       if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
   3208          /* movzbq */
   3209          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   3210          *p++ = 0x0F;
   3211          *p++ = 0xB6;
   3212          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   3213          goto done;
   3214       }
   3215       if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
   3216          /* movzwq */
   3217          *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   3218          *p++ = 0x0F;
   3219          *p++ = 0xB7;
   3220          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   3221          goto done;
   3222       }
   3223       if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
   3224          /* movzlq */
   3225          /* This isn't really an existing AMD64 instruction per se.
   3226             Rather, we have to do a 32-bit load.  Because a 32-bit
   3227             write implicitly clears the upper 32 bits of the target
   3228             register, we get what we want. */
   3229          *p++ = clearWBit(
   3230                 rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
   3231          *p++ = 0x8B;
   3232          p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
   3233          goto done;
   3234       }
   3235       break;
   3236 
   3237    case Ain_Set64:
   3238       /* Make the destination register be 1 or 0, depending on whether
   3239          the relevant condition holds.  Complication: the top 56 bits
   3240          of the destination should be forced to zero, but doing 'xorq
   3241          %r,%r' kills the flag(s) we are about to read.  Sigh.  So
   3242          start off by moving $0 into the dest. */
   3243       reg = iregEnc3210(i->Ain.Set64.dst);
   3244       vassert(reg < 16);
   3245 
   3246       /* movq $0, %dst */
   3247       *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
   3248       *p++ = 0xC7;
   3249       *p++ = toUChar(0xC0 + (reg & 7));
   3250       p = emit32(p, 0);
   3251 
   3252       /* setb lo8(%dst) */
   3253       /* note, 8-bit register rex trickiness.  Be careful here. */
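              /* With any REX prefix present -- even a "pointless" 0x40 --
                 r/m encodings 4..7 select %spl/%bpl/%sil/%dil rather than
                 %ah/%ch/%dh/%bh, so the setcc really does write the low
                 byte of the intended register. */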
   3254       *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
   3255       *p++ = 0x0F;
   3256       *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
   3257       *p++ = toUChar(0xC0 + (reg & 7));
   3258       goto done;
   3259 
   3260    case Ain_Bsfr64:
   3261       *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
   3262       *p++ = 0x0F;
   3263       if (i->Ain.Bsfr64.isFwds) {
   3264          *p++ = 0xBC;
   3265       } else {
   3266          *p++ = 0xBD;
   3267       }
   3268       p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
   3269       goto done;
   3270 
   3271    case Ain_MFence:
   3272       /* mfence */
   3273       *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
   3274       goto done;
   3275 
   3276    case Ain_ACAS:
   3277       /* lock */
   3278       *p++ = 0xF0;
   3279       if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
   3280       /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
   3281          in %rbx.  The new-value register is hardwired to be %rbx
   3282          since dealing with byte integer registers is too much hassle,
   3283          so we force the register operand to %rbx (could equally be
   3284          %rcx or %rdx). */
   3285       rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
   3286       if (i->Ain.ACAS.sz != 8)
   3287          rex = clearWBit(rex);
   3288 
   3289       *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
   3290       *p++ = 0x0F;
   3291       if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
   3292       p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
   3293       goto done;
   3294 
   3295    case Ain_DACAS:
   3296       /* lock */
   3297       *p++ = 0xF0;
   3298       /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
   3299          value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
   3300          aren't encoded in the insn. */
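              /* 0F C7 /1 is cmpxchg8b; keeping REX.W (when .sz == 8, i.e.
                 each half is 8 bytes) turns it into cmpxchg16b. */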
   3301       rex = rexAMode_M_enc(1, i->Ain.DACAS.addr );
   3302       if (i->Ain.DACAS.sz != 8)
   3303          rex = clearWBit(rex);
   3304       *p++ = rex;
   3305       *p++ = 0x0F;
   3306       *p++ = 0xC7;
   3307       p = doAMode_M_enc(p, 1, i->Ain.DACAS.addr);
   3308       goto done;
   3309 
   3310    case Ain_A87Free:
   3311       vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
   3312       for (j = 0; j < i->Ain.A87Free.nregs; j++) {
   3313          p = do_ffree_st(p, 7-j);
   3314       }
   3315       goto done;
   3316 
   3317    case Ain_A87PushPop:
   3318       vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
   3319       if (i->Ain.A87PushPop.isPush) {
   3320          /* Load from memory into %st(0): flds/fldl amode */
   3321          *p++ = clearWBit(
   3322                    rexAMode_M_enc(0, i->Ain.A87PushPop.addr) );
   3323          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
   3324          p = doAMode_M_enc(p, 0/*subopcode*/, i->Ain.A87PushPop.addr);
   3325       } else {
   3326          /* Dump %st(0) to memory: fstps/fstpl amode */
   3327          *p++ = clearWBit(
   3328                    rexAMode_M_enc(3, i->Ain.A87PushPop.addr) );
   3329          *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
   3330          p = doAMode_M_enc(p, 3/*subopcode*/, i->Ain.A87PushPop.addr);
   3331          goto done;
   3332       }
   3333       goto done;
   3334 
   3335    case Ain_A87FpOp:
   3336       switch (i->Ain.A87FpOp.op) {
   3337          case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
   3338          case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
   3339          case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
   3340          case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
   3341          case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
   3342          case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
   3343          case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
   3344          case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
   3345          case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
   3346          case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
   3347          case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
   3348          case Afp_TAN:
   3349             /* fptan pushes 1.0 on the FP stack, except when the
   3350                argument is out of range.  Hence we have to do the
   3351                instruction, then inspect C2 to see if there is an out
   3352                of range condition.  If there is, we skip the fincstp
   3353                that is used by the in-range case to get rid of this
   3354                extra 1.0 value. */
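                    /* (C2 is bit 10 of the x87 status word, hence the
                       testw $0x400 below.) */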
   3355             *p++ = 0xD9; *p++ = 0xF2; // fptan
   3356             *p++ = 0x50;              // pushq %rax
   3357             *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
   3358             *p++ = 0x66; *p++ = 0xA9;
   3359             *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
   3360             *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
   3361             *p++ = 0xD9; *p++ = 0xF7; // fincstp
   3362             *p++ = 0x58;              // after_fincstp: popq %rax
   3363             break;
   3364          default:
   3365             goto bad;
   3366       }
   3367       goto done;
   3368 
   3369    case Ain_A87LdCW:
   3370       *p++ = clearWBit(
   3371                 rexAMode_M_enc(5, i->Ain.A87LdCW.addr) );
   3372       *p++ = 0xD9;
   3373       p = doAMode_M_enc(p, 5/*subopcode*/, i->Ain.A87LdCW.addr);
   3374       goto done;
   3375 
   3376    case Ain_A87StSW:
   3377       *p++ = clearWBit(
   3378                 rexAMode_M_enc(7, i->Ain.A87StSW.addr) );
   3379       *p++ = 0xDD;
   3380       p = doAMode_M_enc(p, 7/*subopcode*/, i->Ain.A87StSW.addr);
   3381       goto done;
   3382 
   3383    case Ain_Store:
   3384       if (i->Ain.Store.sz == 2) {
   3385          /* This just goes to show the craziness of the instruction
   3386             set encoding.  We have to insert two prefix bytes, but be
   3387             careful to avoid a conflict in what the size should be, by
   3388             ensuring that REX.W = 0. */
   3389          *p++ = 0x66; /* override to 16-bits */
   3390          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3391          *p++ = 0x89;
   3392          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3393          goto done;
   3394       }
   3395       if (i->Ain.Store.sz == 4) {
   3396          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3397          *p++ = 0x89;
   3398          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3399          goto done;
   3400       }
   3401       if (i->Ain.Store.sz == 1) {
   3402          /* This is one place where it would be wrong to skip emitting
   3403             a rex byte of 0x40, since the mere presence of rex changes
   3404             the meaning of the byte register access.  Be careful. */
   3405          *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
   3406          *p++ = 0x88;
   3407          p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
   3408          goto done;
   3409       }
   3410       break;
   3411 
   3412    case Ain_LdMXCSR:
   3413       *p++ = clearWBit(rexAMode_M_enc(0, i->Ain.LdMXCSR.addr));
   3414       *p++ = 0x0F;
   3415       *p++ = 0xAE;
   3416       p = doAMode_M_enc(p, 2/*subopcode*/, i->Ain.LdMXCSR.addr);
   3417       goto done;
   3418 
   3419    case Ain_SseUComIS:
   3420       /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
   3421       /* ucomi[sd] %srcL, %srcR */
   3422       if (i->Ain.SseUComIS.sz == 8) {
   3423          *p++ = 0x66;
   3424       } else {
   3425          goto bad;
   3426          vassert(i->Ain.SseUComIS.sz == 4);
   3427       }
   3428       *p++ = clearWBit (
   3429              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseUComIS.srcL),
   3430                                  vregEnc3210(i->Ain.SseUComIS.srcR) ));
   3431       *p++ = 0x0F;
   3432       *p++ = 0x2E;
   3433       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseUComIS.srcL),
   3434                                vregEnc3210(i->Ain.SseUComIS.srcR) );
   3435       /* pushfq */
   3436       *p++ = 0x9C;
   3437       /* popq %dst */
   3438       *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.SseUComIS.dst)));
   3439       *p++ = toUChar(0x58 + iregEnc210(i->Ain.SseUComIS.dst));
   3440       goto done;
   3441 
   3442    case Ain_SseSI2SF:
   3443       /* cvtsi2s[sd] %src, %dst */
   3444       rex = rexAMode_R_enc_reg( vregEnc3210(i->Ain.SseSI2SF.dst),
   3445                                 i->Ain.SseSI2SF.src );
   3446       *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
   3447       *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
   3448       *p++ = 0x0F;
   3449       *p++ = 0x2A;
   3450       p = doAMode_R_enc_reg( p, vregEnc3210(i->Ain.SseSI2SF.dst),
   3451                                 i->Ain.SseSI2SF.src );
   3452       goto done;
   3453 
   3454    case Ain_SseSF2SI:
   3455       /* cvts[sd]2si %src, %dst */
   3456       rex = rexAMode_R_reg_enc( i->Ain.SseSF2SI.dst,
   3457                                 vregEnc3210(i->Ain.SseSF2SI.src) );
   3458       *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
   3459       *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
   3460       *p++ = 0x0F;
   3461       *p++ = 0x2D;
   3462       p = doAMode_R_reg_enc( p, i->Ain.SseSF2SI.dst,
   3463                                 vregEnc3210(i->Ain.SseSF2SI.src) );
   3464       goto done;
   3465 
   3466    case Ain_SseSDSS:
   3467       /* cvtsd2ss/cvtss2sd %src, %dst */
   3468       *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
   3469       *p++ = clearWBit(
   3470               rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseSDSS.dst),
   3471                                   vregEnc3210(i->Ain.SseSDSS.src) ));
   3472       *p++ = 0x0F;
   3473       *p++ = 0x5A;
   3474       p = doAMode_R_enc_enc( p, vregEnc3210(i->Ain.SseSDSS.dst),
   3475                                 vregEnc3210(i->Ain.SseSDSS.src) );
   3476       goto done;
   3477 
   3478    case Ain_SseLdSt:
   3479       if (i->Ain.SseLdSt.sz == 8) {
   3480          *p++ = 0xF2;
   3481       } else
   3482       if (i->Ain.SseLdSt.sz == 4) {
   3483          *p++ = 0xF3;
   3484       } else
   3485       if (i->Ain.SseLdSt.sz != 16) {
   3486          vassert(0);
   3487       }
   3488       *p++ = clearWBit(
   3489              rexAMode_M_enc(vregEnc3210(i->Ain.SseLdSt.reg),
   3490                             i->Ain.SseLdSt.addr));
   3491       *p++ = 0x0F;
   3492       *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
   3493       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdSt.reg),
   3494                            i->Ain.SseLdSt.addr);
   3495       goto done;
   3496 
   3497    case Ain_SseCStore: {
   3498       vassert(i->Ain.SseCStore.cond != Acc_ALWAYS);
   3499 
   3500       /* Use ptmp for backpatching conditional jumps. */
   3501       ptmp = NULL;
   3502 
   3503       /* jmp fwds if !condition */
   3504       *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCStore.cond ^ 1)));
   3505       ptmp = p; /* fill in this bit later */
   3506       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3507 
   3508       /* Now the store. */
   3509       *p++ = clearWBit(
   3510              rexAMode_M_enc(vregEnc3210(i->Ain.SseCStore.src),
   3511                             i->Ain.SseCStore.addr));
   3512       *p++ = 0x0F;
   3513       *p++ = toUChar(0x11);
   3514       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCStore.src),
   3515                            i->Ain.SseCStore.addr);
   3516 
   3517       /* Fix up the conditional branch */
   3518       Int delta = p - ptmp;
   3519       vassert(delta > 0 && delta < 40);
   3520       *ptmp = toUChar(delta-1);
   3521       goto done;
   3522    }
   3523 
   3524    case Ain_SseCLoad: {
   3525       vassert(i->Ain.SseCLoad.cond != Acc_ALWAYS);
   3526 
   3527       /* Use ptmp for backpatching conditional jumps. */
   3528       ptmp = NULL;
   3529 
   3530       /* jmp fwds if !condition */
   3531       *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCLoad.cond ^ 1)));
   3532       ptmp = p; /* fill in this bit later */
   3533       *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
   3534 
   3535       /* Now the load. */
   3536       *p++ = clearWBit(
   3537              rexAMode_M_enc(vregEnc3210(i->Ain.SseCLoad.dst),
   3538                             i->Ain.SseCLoad.addr));
   3539       *p++ = 0x0F;
   3540       *p++ = toUChar(0x10);
   3541       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCLoad.dst),
   3542                            i->Ain.SseCLoad.addr);
   3543 
   3544       /* Fix up the conditional branch */
   3545       Int delta = p - ptmp;
   3546       vassert(delta > 0 && delta < 40);
   3547       *ptmp = toUChar(delta-1);
   3548       goto done;
   3549    }
   3550 
   3551    case Ain_SseLdzLO:
   3552       vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
   3553       /* movs[sd] amode, %xmm-dst */
   3554       *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
   3555       *p++ = clearWBit(
   3556              rexAMode_M_enc(vregEnc3210(i->Ain.SseLdzLO.reg),
   3557                             i->Ain.SseLdzLO.addr));
   3558       *p++ = 0x0F;
   3559       *p++ = 0x10;
   3560       p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdzLO.reg),
   3561                            i->Ain.SseLdzLO.addr);
   3562       goto done;
   3563 
   3564    case Ain_Sse32Fx4:
   3565       xtra = 0;
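              /* xtra doubles as a flag: if bit 8 (0x100) gets set below, the
                 low byte is a CMPPS predicate immediate which must follow
                 the ModRM byte. */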
   3566       *p++ = clearWBit(
   3567              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32Fx4.dst),
   3568                                  vregEnc3210(i->Ain.Sse32Fx4.src) ));
   3569       *p++ = 0x0F;
   3570       switch (i->Ain.Sse32Fx4.op) {
   3571          case Asse_ADDF:   *p++ = 0x58; break;
   3572          case Asse_DIVF:   *p++ = 0x5E; break;
   3573          case Asse_MAXF:   *p++ = 0x5F; break;
   3574          case Asse_MINF:   *p++ = 0x5D; break;
   3575          case Asse_MULF:   *p++ = 0x59; break;
   3576          case Asse_RCPF:   *p++ = 0x53; break;
   3577          case Asse_RSQRTF: *p++ = 0x52; break;
   3578          case Asse_SQRTF:  *p++ = 0x51; break;
   3579          case Asse_SUBF:   *p++ = 0x5C; break;
   3580          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3581          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3582          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3583          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3584          default: goto bad;
   3585       }
   3586       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32Fx4.dst),
   3587                                vregEnc3210(i->Ain.Sse32Fx4.src) );
   3588       if (xtra & 0x100)
   3589          *p++ = toUChar(xtra & 0xFF);
   3590       goto done;
   3591 
   3592    case Ain_Sse64Fx2:
   3593       xtra = 0;
   3594       *p++ = 0x66;
   3595       *p++ = clearWBit(
   3596              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64Fx2.dst),
   3597                                  vregEnc3210(i->Ain.Sse64Fx2.src) ));
   3598       *p++ = 0x0F;
   3599       switch (i->Ain.Sse64Fx2.op) {
   3600          case Asse_ADDF:   *p++ = 0x58; break;
   3601          case Asse_DIVF:   *p++ = 0x5E; break;
   3602          case Asse_MAXF:   *p++ = 0x5F; break;
   3603          case Asse_MINF:   *p++ = 0x5D; break;
   3604          case Asse_MULF:   *p++ = 0x59; break;
   3605          case Asse_SQRTF:  *p++ = 0x51; break;
   3606          case Asse_SUBF:   *p++ = 0x5C; break;
   3607          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3608          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3609          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3610          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3611          default: goto bad;
   3612       }
   3613       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64Fx2.dst),
   3614                                vregEnc3210(i->Ain.Sse64Fx2.src) );
   3615       if (xtra & 0x100)
   3616          *p++ = toUChar(xtra & 0xFF);
   3617       goto done;
   3618 
   3619    case Ain_Sse32FLo:
   3620       xtra = 0;
   3621       *p++ = 0xF3;
   3622       *p++ = clearWBit(
   3623              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32FLo.dst),
   3624                                  vregEnc3210(i->Ain.Sse32FLo.src) ));
   3625       *p++ = 0x0F;
   3626       switch (i->Ain.Sse32FLo.op) {
   3627          case Asse_ADDF:   *p++ = 0x58; break;
   3628          case Asse_DIVF:   *p++ = 0x5E; break;
   3629          case Asse_MAXF:   *p++ = 0x5F; break;
   3630          case Asse_MINF:   *p++ = 0x5D; break;
   3631          case Asse_MULF:   *p++ = 0x59; break;
   3632          case Asse_RCPF:   *p++ = 0x53; break;
   3633          case Asse_RSQRTF: *p++ = 0x52; break;
   3634          case Asse_SQRTF:  *p++ = 0x51; break;
   3635          case Asse_SUBF:   *p++ = 0x5C; break;
   3636          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3637          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3638          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3639          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3640          default: goto bad;
   3641       }
   3642       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32FLo.dst),
   3643                                vregEnc3210(i->Ain.Sse32FLo.src) );
   3644       if (xtra & 0x100)
   3645          *p++ = toUChar(xtra & 0xFF);
   3646       goto done;
   3647 
   3648    case Ain_Sse64FLo:
   3649       xtra = 0;
   3650       *p++ = 0xF2;
   3651       *p++ = clearWBit(
   3652              rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64FLo.dst),
   3653                                  vregEnc3210(i->Ain.Sse64FLo.src) ));
   3654       *p++ = 0x0F;
   3655       switch (i->Ain.Sse64FLo.op) {
   3656          case Asse_ADDF:   *p++ = 0x58; break;
   3657          case Asse_DIVF:   *p++ = 0x5E; break;
   3658          case Asse_MAXF:   *p++ = 0x5F; break;
   3659          case Asse_MINF:   *p++ = 0x5D; break;
   3660          case Asse_MULF:   *p++ = 0x59; break;
   3661          case Asse_SQRTF:  *p++ = 0x51; break;
   3662          case Asse_SUBF:   *p++ = 0x5C; break;
   3663          case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
   3664          case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
   3665          case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
   3666          case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
   3667          default: goto bad;
   3668       }
   3669       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64FLo.dst),
   3670                                vregEnc3210(i->Ain.Sse64FLo.src) );
   3671       if (xtra & 0x100)
   3672          *p++ = toUChar(xtra & 0xFF);
   3673       goto done;
   3674 
   3675    case Ain_SseReRg:
   3676 #     define XX(_n) *p++ = (_n)
   3677 
   3678       rex = clearWBit(
   3679             rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseReRg.dst),
   3680                                 vregEnc3210(i->Ain.SseReRg.src) ));
   3681 
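              /* Prefix order matters for the 0x66-prefixed forms below: the
                 operand-size prefix must come before the REX byte, and REX
                 must immediately precede the 0F escape. */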
   3682       switch (i->Ain.SseReRg.op) {
   3683          case Asse_MOV:     /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
   3684          case Asse_OR:                 XX(rex); XX(0x0F); XX(0x56); break;
   3685          case Asse_XOR:                XX(rex); XX(0x0F); XX(0x57); break;
   3686          case Asse_AND:                XX(rex); XX(0x0F); XX(0x54); break;
   3687          case Asse_ANDN:               XX(rex); XX(0x0F); XX(0x55); break;
   3688          case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
   3689          case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
   3690          case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
   3691          case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
   3692          case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
   3693          case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
   3694          case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
   3695          case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
   3696          case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
   3697          case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
   3698          case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
   3699          case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
   3700          case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
   3701          case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
   3702          case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
   3703          case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
   3704          case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
   3705          case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
   3706          case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
   3707          case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
   3708          case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
   3709          case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
   3710          case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
   3711          case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
   3712          case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
   3713          case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
   3714          case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
   3715          case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
   3716          case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
   3717          case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
   3718          case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
   3719          case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
   3720          case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
   3721          case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
   3722          case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
   3723          case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
   3724          case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
   3725          case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
   3726          case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
   3727          case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
   3728          case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
   3729          case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
   3730          case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
   3731          case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
   3732          case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
   3733          case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
   3734          case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
   3735          case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
   3736          case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
   3737          case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
   3738          default: goto bad;
   3739       }
   3740       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
   3741                                vregEnc3210(i->Ain.SseReRg.src) );
   3742 #     undef XX
   3743       goto done;
   3744 
   3745    case Ain_SseCMov:
   3746       /* jmp fwds if !condition */
   3747       *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
   3748       *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
   3749       ptmp = p;
   3750 
   3751       /* movaps %src, %dst */
   3752       *p++ = clearWBit(
   3753              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseCMov.dst),
   3754                                  vregEnc3210(i->Ain.SseCMov.src) ));
   3755       *p++ = 0x0F;
   3756       *p++ = 0x28;
   3757       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseCMov.dst),
   3758                                vregEnc3210(i->Ain.SseCMov.src) );
   3759 
   3760       /* Fill in the jump offset. */
   3761       *(ptmp-1) = toUChar(p - ptmp);
   3762       goto done;
   3763 
   3764    case Ain_SseShuf:
   3765       *p++ = 0x66;
   3766       *p++ = clearWBit(
   3767              rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseShuf.dst),
   3768                                  vregEnc3210(i->Ain.SseShuf.src) ));
   3769       *p++ = 0x0F;
   3770       *p++ = 0x70;
   3771       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseShuf.dst),
   3772                                vregEnc3210(i->Ain.SseShuf.src) );
   3773       *p++ = (UChar)(i->Ain.SseShuf.order);
   3774       goto done;
   3775 
   3776    //uu case Ain_AvxLdSt: {
   3777    //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
   3778    //uu                           i->Ain.AvxLdSt.addr );
   3779    //uu    p = emitVexPrefix(p, vex);
   3780    //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
   3781    //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
   3782    //uu      goto done;
   3783    //uu }
   3784 
   3785    case Ain_EvCheck: {
   3786       /* We generate:
   3787             (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
   3788             (2 bytes)  jns  nofail     expected taken
   3789             (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
   3790             nofail:
   3791       */
   3792       /* This is heavily asserted re instruction lengths.  It needs to
   3793          be.  If we get given unexpected forms of .amCounter or
   3794          .amFailAddr -- basically, anything that's not of the form
   3795          uimm7(%rbp) -- they are likely to fail. */
   3796       /* Note also that after the decl we must be very careful not to
   3797          read the carry flag, else we get a partial flags stall.
   3798          js/jns avoids that, though. */
   3799       UChar* p0 = p;
   3800       /* ---  decl 8(%rbp) --- */
   3801       /* Need to compute the REX byte for the decl in order to prove
   3802          that we don't need it, since this is a 32-bit inc and all
   3803          registers involved in the amode are < r8.  "1" because
   3804          there's no register in this encoding; instead the register
   3805          field is used as a sub opcode.  The encoding for "decl r/m32"
   3806          is FF /1, hence the "1". */
   3807       rex = clearWBit(rexAMode_M_enc(1, i->Ain.EvCheck.amCounter));
   3808       if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
   3809       *p++ = 0xFF;
   3810       p = doAMode_M_enc(p, 1, i->Ain.EvCheck.amCounter);
   3811       vassert(p - p0 == 3);
   3812       /* --- jns nofail --- */
   3813       *p++ = 0x79;
   3814       *p++ = 0x03; /* need to check this 0x03 after the next insn */
   3815       vassert(p - p0 == 5);
   3816       /* --- jmp* 0(%rbp) --- */
   3817       /* Once again, verify we don't need REX.  The encoding is FF /4.
   3818          We don't need REX.W since by default FF /4 in 64-bit mode
   3819          implies a 64 bit load. */
   3820       rex = clearWBit(rexAMode_M_enc(4, i->Ain.EvCheck.amFailAddr));
   3821       if (rex != 0x40) goto bad;
   3822       *p++ = 0xFF;
   3823       p = doAMode_M_enc(p, 4, i->Ain.EvCheck.amFailAddr);
   3824       vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
   3825       /* And crosscheck .. */
   3826       vassert(evCheckSzB_AMD64() == 8);
   3827       goto done;
   3828    }
   3829 
   3830    case Ain_ProfInc: {
   3831       /* We generate   movabsq $0, %r11
   3832                        incq (%r11)
   3833          in the expectation that a later call to LibVEX_patchProfCtr
   3834          will be used to fill in the immediate field once the right
   3835          value is known.
   3836          49 BB 00 00 00 00 00 00 00 00
   3837          49 FF 03
   3838       */
   3839       *p++ = 0x49; *p++ = 0xBB;
   3840       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
   3841       *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
   3842       *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
   3843       /* Tell the caller .. */
   3844       vassert(!(*is_profInc));
   3845       *is_profInc = True;
   3846       goto done;
   3847    }
   3848 
   3849    default:
   3850       goto bad;
   3851    }
   3852 
   3853   bad:
   3854    ppAMD64Instr(i, mode64);
   3855    vpanic("emit_AMD64Instr");
   3856    /*NOTREACHED*/
   3857 
   3858   done:
   3859    vassert(p - &buf[0] <= 64);
   3860    return p - &buf[0];
   3861 }
   3862 
   3863 
   3864 /* How big is an event check?  See case for Ain_EvCheck in
   3865    emit_AMD64Instr just above.  That crosschecks what this returns, so
   3866    we can tell if we're inconsistent. */
   3867 Int evCheckSzB_AMD64 (void)
   3868 {
   3869    return 8;
   3870 }
   3871 
   3872 
   3873 /* NB: what goes on here has to be very closely coordinated with the
   3874    emitInstr case for XDirect, above. */
   3875 VexInvalRange chainXDirect_AMD64 ( VexEndness endness_host,
   3876                                    void* place_to_chain,
   3877                                    const void* disp_cp_chain_me_EXPECTED,
   3878                                    const void* place_to_jump_to )
   3879 {
   3880    vassert(endness_host == VexEndnessLE);
   3881 
   3882    /* What we're expecting to see is:
   3883         movabsq $disp_cp_chain_me_EXPECTED, %r11
   3884         call *%r11
   3885       viz
   3886         49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
   3887         41 FF D3
   3888    */
   3889    UChar* p = (UChar*)place_to_chain;
   3890    vassert(p[0] == 0x49);
   3891    vassert(p[1] == 0xBB);
   3892    vassert(read_misaligned_ULong_LE(&p[2]) == (Addr)disp_cp_chain_me_EXPECTED);
   3893    vassert(p[10] == 0x41);
   3894    vassert(p[11] == 0xFF);
   3895    vassert(p[12] == 0xD3);
   3896    /* And what we want to change it to is either:
   3897         (general case):
   3898           movabsq $place_to_jump_to, %r11
   3899           jmpq *%r11
   3900         viz
   3901           49 BB <8 bytes value == place_to_jump_to>
   3902           41 FF E3
   3903         So it's the same length (convenient, huh) and we don't
   3904         need to change all the bits.
   3905       ---OR---
   3906         in the case where the displacement falls within 32 bits
   3907           jmpq disp32   where disp32 is relative to the next insn
   3908           ud2; ud2; ud2; ud2
   3909         viz
   3910           E9 <4 bytes == disp32>
   3911           0F 0B 0F 0B 0F 0B 0F 0B
   3912 
   3913       In both cases the replacement has the same length as the original.
   3914       To remain sane & verifiable,
   3915       (1) limit the displacement for the short form to
   3916           (say) +/- one billion, so as to avoid wraparound
   3917           off-by-ones;
   3918       (2) even if the short form is applicable, once every (say)
   3919           1024 times use the long form anyway, so as to maintain
   3920           verifiability
   3921    */
   3922    /* This is the delta we need to put into a JMP d32 insn.  It's
   3923       relative to the start of the next insn, hence the -5.  */
   3924    Long delta   = (Long)((const UChar *)place_to_jump_to - (const UChar*)p) - 5;
   3925    Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
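           /* Illustration only, with hypothetical addresses: if the patch site
              p is at 0x50000000 and place_to_jump_to is 0x50001000, then
              delta = 0x1000 - 5 = 0xFFB, well within range, so the short form
              E9 FB 0F 00 00 followed by four ud2s would normally be used
              (unless the once-every-1024 long-form policy below kicks in). */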
   3926 
   3927    static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
   3928    if (shortOK) {
   3929       shortCTR++; // thread safety bleh
   3930       if (0 == (shortCTR & 0x3FF)) {
   3931          shortOK = False;
   3932          if (0)
   3933             vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
   3934                        "using long jmp\n", shortCTR);
   3935       }
   3936    }
   3937 
   3938    /* And make the modifications. */
   3939    if (shortOK) {
   3940       p[0]  = 0xE9;
   3941       write_misaligned_UInt_LE(&p[1], (UInt)(Int)delta);
   3942       p[5]  = 0x0F; p[6]  = 0x0B;
   3943       p[7]  = 0x0F; p[8]  = 0x0B;
   3944       p[9]  = 0x0F; p[10] = 0x0B;
   3945       p[11] = 0x0F; p[12] = 0x0B;
   3946       /* sanity check on the delta -- top 32 are all 0 or all 1 */
   3947       delta >>= 32;
   3948       vassert(delta == 0LL || delta == -1LL);
   3949    } else {
   3950       /* Minimal modifications from the starting sequence. */
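              /* Only the imm64 operand and the last ModRM byte change: the
                 trailing 41 FF D3 (call *%r11, i.e. FF /2) becomes 41 FF E3
                 (jmpq *%r11, i.e. FF /4) by rewriting D3 to E3. */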
   3951       write_misaligned_ULong_LE(&p[2], (ULong)(Addr)place_to_jump_to);
   3952       p[12] = 0xE3;
   3953    }
   3954    VexInvalRange vir = { (HWord)place_to_chain, 13 };
   3955    return vir;
   3956 }
   3957 
   3958 
   3959 /* NB: what goes on here has to be very closely coordinated with the
   3960    emitInstr case for XDirect, above. */
   3961 VexInvalRange unchainXDirect_AMD64 ( VexEndness endness_host,
   3962                                      void* place_to_unchain,
   3963                                      const void* place_to_jump_to_EXPECTED,
   3964                                      const void* disp_cp_chain_me )
   3965 {
   3966    vassert(endness_host == VexEndnessLE);
   3967 
   3968    /* What we're expecting to see is either:
   3969         (general case)
   3970           movabsq $place_to_jump_to_EXPECTED, %r11
   3971           jmpq *%r11
   3972         viz
   3973           49 BB <8 bytes value == place_to_jump_to_EXPECTED>
   3974           41 FF E3
   3975       ---OR---
   3976         in the case where the displacement falls within 32 bits
   3977           jmpq d32
   3978           ud2; ud2; ud2; ud2
   3979         viz
   3980           E9 <4 bytes == disp32>
   3981           0F 0B 0F 0B 0F 0B 0F 0B
   3982    */
   3983    UChar* p     = (UChar*)place_to_unchain;
   3984    Bool   valid = False;
   3985    if (p[0] == 0x49 && p[1] == 0xBB
   3986        && read_misaligned_ULong_LE(&p[2])
   3987           == (ULong)(Addr)place_to_jump_to_EXPECTED
   3988        && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
   3989       /* it's the long form */
   3990       valid = True;
   3991    }
   3992    else
   3993    if (p[0] == 0xE9
   3994        && p[5]  == 0x0F && p[6]  == 0x0B
   3995        && p[7]  == 0x0F && p[8]  == 0x0B
   3996        && p[9]  == 0x0F && p[10] == 0x0B
   3997        && p[11] == 0x0F && p[12] == 0x0B) {
   3998       /* It's the short form.  Check the offset is right. */
   3999       Int  s32 = (Int)read_misaligned_UInt_LE(&p[1]);
   4000       Long s64 = (Long)s32;
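              /* The branch target is p + 5 + s64: the disp32 is relative to the
                 end of the 5-byte E9 insn, mirroring the -5 adjustment applied
                 in chainXDirect_AMD64 when the short form was created. */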
   4001       if ((UChar*)p + 5 + s64 == place_to_jump_to_EXPECTED) {
   4002          valid = True;
   4003          if (0)
   4004             vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
   4005       }
   4006    }
   4007    vassert(valid);
   4008    /* And what we want to change it to is:
   4009         movabsq $disp_cp_chain_me, %r11
   4010         call *%r11
   4011       viz
   4012         49 BB <8 bytes value == disp_cp_chain_me>
   4013         41 FF D3
   4014       So it's the same length (convenient, huh).
   4015    */
   4016    p[0] = 0x49;
   4017    p[1] = 0xBB;
   4018    write_misaligned_ULong_LE(&p[2], (ULong)(Addr)disp_cp_chain_me);
   4019    p[10] = 0x41;
   4020    p[11] = 0xFF;
   4021    p[12] = 0xD3;
   4022    VexInvalRange vir = { (HWord)place_to_unchain, 13 };
   4023    return vir;
   4024 }
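
        /* Note that chainXDirect_AMD64 and unchainXDirect_AMD64 are inverses: an
           unchain always restores the original 49 BB .. 41 FF D3 call to
           disp_cp_chain_me, regardless of whether the chained form was the long
           (movabsq/jmpq *%r11) or the short (E9 disp32) variant.  Both return a
           13-byte VexInvalRange describing the modified code, which the caller
           may need in order to invalidate any cached copies of it. */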
   4025 
   4026 
   4027 /* Patch the counter address into a profile inc point, as previously
   4028    created by the Ain_ProfInc case for emit_AMD64Instr. */
   4029 VexInvalRange patchProfInc_AMD64 ( VexEndness endness_host,
   4030                                    void*  place_to_patch,
   4031                                    const ULong* location_of_counter )
   4032 {
   4033    vassert(endness_host == VexEndnessLE);
   4034    vassert(sizeof(ULong*) == 8);
   4035    UChar* p = (UChar*)place_to_patch;
   4036    vassert(p[0] == 0x49);
   4037    vassert(p[1] == 0xBB);
   4038    vassert(p[2] == 0x00);
   4039    vassert(p[3] == 0x00);
   4040    vassert(p[4] == 0x00);
   4041    vassert(p[5] == 0x00);
   4042    vassert(p[6] == 0x00);
   4043    vassert(p[7] == 0x00);
   4044    vassert(p[8] == 0x00);
   4045    vassert(p[9] == 0x00);
   4046    vassert(p[10] == 0x49);
   4047    vassert(p[11] == 0xFF);
   4048    vassert(p[12] == 0x03);
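           /* Now patch the imm64 field (bytes 2..9), least significant byte
              first, since AMD64 immediates are stored little-endian.
              Illustration only: a counter at the hypothetical address
              0x00007F123456789A would yield bytes 9A 78 56 34 12 7F 00 00. */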
   4049    ULong imm64 = (ULong)(Addr)location_of_counter;
   4050    p[2] = imm64 & 0xFF; imm64 >>= 8;
   4051    p[3] = imm64 & 0xFF; imm64 >>= 8;
   4052    p[4] = imm64 & 0xFF; imm64 >>= 8;
   4053    p[5] = imm64 & 0xFF; imm64 >>= 8;
   4054    p[6] = imm64 & 0xFF; imm64 >>= 8;
   4055    p[7] = imm64 & 0xFF; imm64 >>= 8;
   4056    p[8] = imm64 & 0xFF; imm64 >>= 8;
   4057    p[9] = imm64 & 0xFF; imm64 >>= 8;
   4058    VexInvalRange vir = { (HWord)place_to_patch, 13 };
   4059    return vir;
   4060 }
   4061 
   4062 
   4063 /*---------------------------------------------------------------*/
   4064 /*--- end                                   host_amd64_defs.c ---*/
   4065 /*---------------------------------------------------------------*/
   4066