Home | History | Annotate | Download | only in x86asm
      1 // Copyright 2014 The Go Authors.  All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Table-driven decoding of x86 instructions.
      6 
      7 package x86asm
      8 
      9 import (
     10 	"encoding/binary"
     11 	"errors"
     12 	"fmt"
     13 	"runtime"
     14 )
     15 
     16 // Set trace to true to cause the decoder to print the PC sequence
     17 // of the executed instruction codes. This is typically only useful
     18 // when you are running a test of a single input case.
     19 const trace = false
     20 
// A decodeOp is a single instruction in the decoder bytecode program.
//
// The decodeOps correspond to consuming and conditionally branching
// on input bytes, consuming additional fields, and then interpreting
// consumed data as instruction arguments. The names of the xRead and xArg
// operations are taken from the Intel manual conventions, for example
// Volume 2, Section 3.1.1, page 487 of
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
//
// The actual decoding program is generated by ../x86map.
//
// The program lives in the decoder table. The interpreter in decode1
// fetches decoder[pc], converts it to a decodeOp, and dispatches on it.
// Operands of an op (jump targets, case counts, byte/target pairs) are
// stored in the table entries that immediately follow the op.
//
// TODO(rsc): We may be able to merge various of the memory operands
// since we don't care about, say, the distinction between m80dec and m80bcd.
// Similarly, mm and mm1 have identical meaning, as do xmm and xmm1.

type decodeOp uint16
     37 
// Opcodes of the decoder bytecode program.
//
// The numeric values are assigned by iota, so declaration order matters.
// NOTE(review): the generated decoder table presumably embeds these
// values; confirm it is regenerated before reordering or inserting ops.
const (
	// Control flow.
	xFail  decodeOp = iota // invalid instruction (return)
	xMatch                 // completed match
	xJump                  // jump to pc

	// Conditional branches.
	xCondByte     // switch on instruction byte value
	xCondSlashR   // read and switch on instruction /r value
	xCondPrefix   // switch on presence of instruction prefix
	xCondIs64     // switch on 64-bit processor mode
	xCondDataSize // switch on operand size
	xCondAddrSize // switch on address size
	xCondIsMem    // switch on memory vs register argument

	xSetOp // set instruction opcode

	// Input: consume ModR/M, immediate, or code-offset bytes.
	xReadSlashR // read /r
	xReadIb     // read ib
	xReadIw     // read iw
	xReadId     // read id
	xReadIo     // read io
	xReadCb     // read cb
	xReadCw     // read cw
	xReadCd     // read cd
	xReadCp     // read cp
	xReadCm     // read cm

	// Arguments: interpret previously consumed data as an operand.
	xArg1            // arg 1
	xArg3            // arg 3
	xArgAL           // arg AL
	xArgAX           // arg AX
	xArgCL           // arg CL
	xArgCR0dashCR7   // arg CR0-CR7
	xArgCS           // arg CS
	xArgDR0dashDR7   // arg DR0-DR7
	xArgDS           // arg DS
	xArgDX           // arg DX
	xArgEAX          // arg EAX
	xArgEDX          // arg EDX
	xArgES           // arg ES
	xArgFS           // arg FS
	xArgGS           // arg GS
	xArgImm16        // arg imm16
	xArgImm32        // arg imm32
	xArgImm64        // arg imm64
	xArgImm8         // arg imm8
	xArgImm8u        // arg imm8 but record as unsigned
	xArgImm16u       // arg imm16 but record as unsigned
	xArgM            // arg m
	xArgM128         // arg m128
	xArgM1428byte    // arg m14/28byte
	xArgM16          // arg m16
	xArgM16and16     // arg m16&16
	xArgM16and32     // arg m16&32
	xArgM16and64     // arg m16&64
	xArgM16colon16   // arg m16:16
	xArgM16colon32   // arg m16:32
	xArgM16colon64   // arg m16:64
	xArgM16int       // arg m16int
	xArgM2byte       // arg m2byte
	xArgM32          // arg m32
	xArgM32and32     // arg m32&32
	xArgM32fp        // arg m32fp
	xArgM32int       // arg m32int
	xArgM512byte     // arg m512byte
	xArgM64          // arg m64
	xArgM64fp        // arg m64fp
	xArgM64int       // arg m64int
	xArgM8           // arg m8
	xArgM80bcd       // arg m80bcd
	xArgM80dec       // arg m80dec
	xArgM80fp        // arg m80fp
	xArgM94108byte   // arg m94/108byte
	xArgMm           // arg mm
	xArgMm1          // arg mm1
	xArgMm2          // arg mm2
	xArgMm2M64       // arg mm2/m64
	xArgMmM32        // arg mm/m32
	xArgMmM64        // arg mm/m64
	xArgMem          // arg mem
	xArgMoffs16      // arg moffs16
	xArgMoffs32      // arg moffs32
	xArgMoffs64      // arg moffs64
	xArgMoffs8       // arg moffs8
	xArgPtr16colon16 // arg ptr16:16
	xArgPtr16colon32 // arg ptr16:32
	xArgR16          // arg r16
	xArgR16op        // arg r16 with +rw in opcode
	xArgR32          // arg r32
	xArgR32M16       // arg r32/m16
	xArgR32M8        // arg r32/m8
	xArgR32op        // arg r32 with +rd in opcode
	xArgR64          // arg r64
	xArgR64M16       // arg r64/m16
	xArgR64op        // arg r64 with +rd in opcode
	xArgR8           // arg r8
	xArgR8op         // arg r8 with +rb in opcode
	xArgRAX          // arg RAX
	xArgRDX          // arg RDX
	xArgRM           // arg r/m
	xArgRM16         // arg r/m16
	xArgRM32         // arg r/m32
	xArgRM64         // arg r/m64
	xArgRM8          // arg r/m8
	xArgReg          // arg reg
	xArgRegM16       // arg reg/m16
	xArgRegM32       // arg reg/m32
	xArgRegM8        // arg reg/m8
	xArgRel16        // arg rel16
	xArgRel32        // arg rel32
	xArgRel8         // arg rel8
	xArgSS           // arg SS
	xArgST           // arg ST, aka ST(0)
	xArgSTi          // arg ST(i) with +i in opcode
	xArgSreg         // arg Sreg
	xArgTR0dashTR7   // arg TR0-TR7
	xArgXmm          // arg xmm
	xArgXMM0         // arg <XMM0>
	xArgXmm1         // arg xmm1
	xArgXmm2         // arg xmm2
	xArgXmm2M128     // arg xmm2/m128
	xArgXmm2M16      // arg xmm2/m16
	xArgXmm2M32      // arg xmm2/m32
	xArgXmm2M64      // arg xmm2/m64
	xArgXmmM128      // arg xmm/m128
	xArgXmmM32       // arg xmm/m32
	xArgXmmM64       // arg xmm/m64
	xArgRmf16        // arg r/m16 but force mod=3
	xArgRmf32        // arg r/m32 but force mod=3
	xArgRmf64        // arg r/m64 but force mod=3
)
    168 
    169 // instPrefix returns an Inst describing just one prefix byte.
    170 // It is only used if there is a prefix followed by an unintelligible
    171 // or invalid instruction byte sequence.
    172 func instPrefix(b byte, mode int) (Inst, error) {
    173 	// When tracing it is useful to see what called instPrefix to report an error.
    174 	if trace {
    175 		_, file, line, _ := runtime.Caller(1)
    176 		fmt.Printf("%s:%d\n", file, line)
    177 	}
    178 	p := Prefix(b)
    179 	switch p {
    180 	case PrefixDataSize:
    181 		if mode == 16 {
    182 			p = PrefixData32
    183 		} else {
    184 			p = PrefixData16
    185 		}
    186 	case PrefixAddrSize:
    187 		if mode == 32 {
    188 			p = PrefixAddr16
    189 		} else {
    190 			p = PrefixAddr32
    191 		}
    192 	}
    193 	// Note: using composite literal with Prefix key confuses 'bundle' tool.
    194 	inst := Inst{Len: 1}
    195 	inst.Prefix = Prefixes{p}
    196 	return inst, nil
    197 }
    198 
    199 // truncated reports a truncated instruction.
    200 // For now we use instPrefix but perhaps later we will return
    201 // a specific error here.
    202 func truncated(src []byte, mode int) (Inst, error) {
    203 	//	return Inst{}, len(src), ErrTruncated
    204 	return instPrefix(src[0], mode) // too long
    205 }
    206 
    207 // These are the errors returned by Decode.
    208 var (
    209 	ErrInvalidMode  = errors.New("invalid x86 mode in Decode")
    210 	ErrTruncated    = errors.New("truncated instruction")
    211 	ErrUnrecognized = errors.New("unrecognized instruction")
    212 )
    213 
// decoderCover records coverage information for which parts
// of the byte code have been executed.
// It is indexed by decoder program counter: decode1 allocates it on first
// use with one entry per element of the decoder table, and sets an entry
// to true each time it fetches the op at that index.
// NOTE(review): it is a package-level slice written on every decode with
// no locking visible here; confirm whether concurrent Decode calls are expected.
// TODO(rsc): This is for testing. Only use this if a flag is given.
var decoderCover []bool
    218 
    219 // Decode decodes the leading bytes in src as a single instruction.
    220 // The mode arguments specifies the assumed processor mode:
    221 // 16, 32, or 64 for 16-, 32-, and 64-bit execution modes.
    222 func Decode(src []byte, mode int) (inst Inst, err error) {
    223 	return decode1(src, mode, false)
    224 }
    225 
    226 // decode1 is the implementation of Decode but takes an extra
    227 // gnuCompat flag to cause it to change its behavior to mimic
    228 // bugs (or at least unique features) of GNU libopcodes as used
    229 // by objdump. We don't believe that logic is the right thing to do
    230 // in general, but when testing against libopcodes it simplifies the
    231 // comparison if we adjust a few small pieces of logic.
    232 // The affected logic is in the conditional branch for "mandatory" prefixes,
    233 // case xCondPrefix.
    234 func decode1(src []byte, mode int, gnuCompat bool) (Inst, error) {
    235 	switch mode {
    236 	case 16, 32, 64:
    237 		// ok
    238 		// TODO(rsc): 64-bit mode not tested, probably not working.
    239 	default:
    240 		return Inst{}, ErrInvalidMode
    241 	}
    242 
    243 	// Maximum instruction size is 15 bytes.
    244 	// If we need to read more, return 'truncated instruction.
    245 	if len(src) > 15 {
    246 		src = src[:15]
    247 	}
    248 
    249 	var (
    250 		// prefix decoding information
    251 		pos           = 0    // position reading src
    252 		nprefix       = 0    // number of prefixes
    253 		lockIndex     = -1   // index of LOCK prefix in src and inst.Prefix
    254 		repIndex      = -1   // index of REP/REPN prefix in src and inst.Prefix
    255 		segIndex      = -1   // index of Group 2 prefix in src and inst.Prefix
    256 		dataSizeIndex = -1   // index of Group 3 prefix in src and inst.Prefix
    257 		addrSizeIndex = -1   // index of Group 4 prefix in src and inst.Prefix
    258 		rex           Prefix // rex byte if present (or 0)
    259 		rexUsed       Prefix // bits used in rex byte
    260 		rexIndex      = -1   // index of rex byte
    261 
    262 		addrMode = mode // address mode (width in bits)
    263 		dataMode = mode // operand mode (width in bits)
    264 
    265 		// decoded ModR/M fields
    266 		haveModrm bool
    267 		modrm     int
    268 		mod       int
    269 		regop     int
    270 		rm        int
    271 
    272 		// if ModR/M is memory reference, Mem form
    273 		mem     Mem
    274 		haveMem bool
    275 
    276 		// decoded SIB fields
    277 		haveSIB bool
    278 		sib     int
    279 		scale   int
    280 		index   int
    281 		base    int
    282 
    283 		// decoded immediate values
    284 		imm  int64
    285 		imm8 int8
    286 		immc int64
    287 
    288 		// output
    289 		opshift int
    290 		inst    Inst
    291 		narg    int // number of arguments written to inst
    292 	)
    293 
    294 	if mode == 64 {
    295 		dataMode = 32
    296 	}
    297 
    298 	// Prefixes are certainly the most complex and underspecified part of
    299 	// decoding x86 instructions. Although the manuals say things like
    300 	// up to four prefixes, one from each group, nearly everyone seems to
    301 	// agree that in practice as many prefixes as possible, including multiple
    302 	// from a particular group or repetitions of a given prefix, can be used on
    303 	// an instruction, provided the total instruction length including prefixes
    304 	// does not exceed the agreed-upon maximum of 15 bytes.
    305 	// Everyone also agrees that if one of these prefixes is the LOCK prefix
    306 	// and the instruction is not one of the instructions that can be used with
    307 	// the LOCK prefix or if the destination is not a memory operand,
    308 	// then the instruction is invalid and produces the #UD exception.
    309 	// However, that is the end of any semblance of agreement.
    310 	//
    311 	// What happens if prefixes are given that conflict with other prefixes?
    312 	// For example, the memory segment overrides CS, DS, ES, FS, GS, SS
    313 	// conflict with each other: only one segment can be in effect.
    314 	// Disassemblers seem to agree that later prefixes take priority over
    315 	// earlier ones. I have not taken the time to write assembly programs
    316 	// to check to see if the hardware agrees.
    317 	//
    318 	// What happens if prefixes are given that have no meaning for the
    319 	// specific instruction to which they are attached? It depends.
    320 	// If they really have no meaning, they are ignored. However, a future
    321 	// processor may assign a different meaning. As a disassembler, we
    322 	// don't really know whether we're seeing a meaningless prefix or one
    323 	// whose meaning we simply haven't been told yet.
    324 	//
    325 	// Combining the two questions, what happens when conflicting
    326 	// extension prefixes are given? No one seems to know for sure.
    327 	// For example, MOVQ is 66 0F D6 /r, MOVDQ2Q is F2 0F D6 /r,
    328 	// and MOVQ2DQ is F3 0F D6 /r. What is '66 F2 F3 0F D6 /r'?
	// Which prefix wins? See the xCondPrefix case for more.
    330 	//
    331 	// Writing assembly test cases to divine which interpretation the
    332 	// CPU uses might clarify the situation, but more likely it would
    333 	// make the situation even less clear.
    334 
    335 	// Read non-REX prefixes.
    336 ReadPrefixes:
    337 	for ; pos < len(src); pos++ {
    338 		p := Prefix(src[pos])
    339 		switch p {
    340 		default:
    341 			nprefix = pos
    342 			break ReadPrefixes
    343 
    344 		// Group 1 - lock and repeat prefixes
    345 		// According to Intel, there should only be one from this set,
    346 		// but according to AMD both can be present.
    347 		case 0xF0:
    348 			if lockIndex >= 0 {
    349 				inst.Prefix[lockIndex] |= PrefixIgnored
    350 			}
    351 			lockIndex = pos
    352 		case 0xF2, 0xF3:
    353 			if repIndex >= 0 {
    354 				inst.Prefix[repIndex] |= PrefixIgnored
    355 			}
    356 			repIndex = pos
    357 
    358 		// Group 2 - segment override / branch hints
    359 		case 0x26, 0x2E, 0x36, 0x3E:
    360 			if mode == 64 {
    361 				p |= PrefixIgnored
    362 				break
    363 			}
    364 			fallthrough
    365 		case 0x64, 0x65:
    366 			if segIndex >= 0 {
    367 				inst.Prefix[segIndex] |= PrefixIgnored
    368 			}
    369 			segIndex = pos
    370 
    371 		// Group 3 - operand size override
    372 		case 0x66:
    373 			if mode == 16 {
    374 				dataMode = 32
    375 				p = PrefixData32
    376 			} else {
    377 				dataMode = 16
    378 				p = PrefixData16
    379 			}
    380 			if dataSizeIndex >= 0 {
    381 				inst.Prefix[dataSizeIndex] |= PrefixIgnored
    382 			}
    383 			dataSizeIndex = pos
    384 
    385 		// Group 4 - address size override
    386 		case 0x67:
    387 			if mode == 32 {
    388 				addrMode = 16
    389 				p = PrefixAddr16
    390 			} else {
    391 				addrMode = 32
    392 				p = PrefixAddr32
    393 			}
    394 			if addrSizeIndex >= 0 {
    395 				inst.Prefix[addrSizeIndex] |= PrefixIgnored
    396 			}
    397 			addrSizeIndex = pos
    398 		}
    399 
    400 		if pos >= len(inst.Prefix) {
    401 			return instPrefix(src[0], mode) // too long
    402 		}
    403 
    404 		inst.Prefix[pos] = p
    405 	}
    406 
    407 	// Read REX prefix.
    408 	if pos < len(src) && mode == 64 && Prefix(src[pos]).IsREX() {
    409 		rex = Prefix(src[pos])
    410 		rexIndex = pos
    411 		if pos >= len(inst.Prefix) {
    412 			return instPrefix(src[0], mode) // too long
    413 		}
    414 		inst.Prefix[pos] = rex
    415 		pos++
    416 		if rex&PrefixREXW != 0 {
    417 			dataMode = 64
    418 			if dataSizeIndex >= 0 {
    419 				inst.Prefix[dataSizeIndex] |= PrefixIgnored
    420 			}
    421 		}
    422 	}
    423 
    424 	// Decode instruction stream, interpreting decoding instructions.
    425 	// opshift gives the shift to use when saving the next
    426 	// opcode byte into inst.Opcode.
    427 	opshift = 24
    428 	if decoderCover == nil {
    429 		decoderCover = make([]bool, len(decoder))
    430 	}
    431 
    432 	// Decode loop, executing decoder program.
    433 	var oldPC, prevPC int
    434 Decode:
    435 	for pc := 1; ; { // TODO uint
    436 		oldPC = prevPC
    437 		prevPC = pc
    438 		if trace {
    439 			println("run", pc)
    440 		}
    441 		x := decoder[pc]
    442 		decoderCover[pc] = true
    443 		pc++
    444 
    445 		// Read and decode ModR/M if needed by opcode.
    446 		switch decodeOp(x) {
    447 		case xCondSlashR, xReadSlashR:
    448 			if haveModrm {
    449 				return Inst{Len: pos}, errInternal
    450 			}
    451 			haveModrm = true
    452 			if pos >= len(src) {
    453 				return truncated(src, mode)
    454 			}
    455 			modrm = int(src[pos])
    456 			pos++
    457 			if opshift >= 0 {
    458 				inst.Opcode |= uint32(modrm) << uint(opshift)
    459 				opshift -= 8
    460 			}
    461 			mod = modrm >> 6
    462 			regop = (modrm >> 3) & 07
    463 			rm = modrm & 07
    464 			if rex&PrefixREXR != 0 {
    465 				rexUsed |= PrefixREXR
    466 				regop |= 8
    467 			}
    468 			if addrMode == 16 {
    469 				// 16-bit modrm form
    470 				if mod != 3 {
    471 					haveMem = true
    472 					mem = addr16[rm]
    473 					if rm == 6 && mod == 0 {
    474 						mem.Base = 0
    475 					}
    476 
    477 					// Consume disp16 if present.
    478 					if mod == 0 && rm == 6 || mod == 2 {
    479 						if pos+2 > len(src) {
    480 							return truncated(src, mode)
    481 						}
    482 						mem.Disp = int64(binary.LittleEndian.Uint16(src[pos:]))
    483 						pos += 2
    484 					}
    485 
    486 					// Consume disp8 if present.
    487 					if mod == 1 {
    488 						if pos >= len(src) {
    489 							return truncated(src, mode)
    490 						}
    491 						mem.Disp = int64(int8(src[pos]))
    492 						pos++
    493 					}
    494 				}
    495 			} else {
    496 				haveMem = mod != 3
    497 
    498 				// 32-bit or 64-bit form
    499 				// Consume SIB encoding if present.
    500 				if rm == 4 && mod != 3 {
    501 					haveSIB = true
    502 					if pos >= len(src) {
    503 						return truncated(src, mode)
    504 					}
    505 					sib = int(src[pos])
    506 					pos++
    507 					if opshift >= 0 {
    508 						inst.Opcode |= uint32(sib) << uint(opshift)
    509 						opshift -= 8
    510 					}
    511 					scale = sib >> 6
    512 					index = (sib >> 3) & 07
    513 					base = sib & 07
    514 					if rex&PrefixREXB != 0 {
    515 						rexUsed |= PrefixREXB
    516 						base |= 8
    517 					}
    518 					if rex&PrefixREXX != 0 {
    519 						rexUsed |= PrefixREXX
    520 						index |= 8
    521 					}
    522 
    523 					mem.Scale = 1 << uint(scale)
    524 					if index == 4 {
    525 						// no mem.Index
    526 					} else {
    527 						mem.Index = baseRegForBits(addrMode) + Reg(index)
    528 					}
    529 					if base&7 == 5 && mod == 0 {
    530 						// no mem.Base
    531 					} else {
    532 						mem.Base = baseRegForBits(addrMode) + Reg(base)
    533 					}
    534 				} else {
    535 					if rex&PrefixREXB != 0 {
    536 						rexUsed |= PrefixREXB
    537 						rm |= 8
    538 					}
    539 					if mod == 0 && rm&7 == 5 || rm&7 == 4 {
    540 						// base omitted
    541 					} else if mod != 3 {
    542 						mem.Base = baseRegForBits(addrMode) + Reg(rm)
    543 					}
    544 				}
    545 
    546 				// Consume disp32 if present.
    547 				if mod == 0 && (rm&7 == 5 || haveSIB && base&7 == 5) || mod == 2 {
    548 					if pos+4 > len(src) {
    549 						return truncated(src, mode)
    550 					}
    551 					mem.Disp = int64(binary.LittleEndian.Uint32(src[pos:]))
    552 					pos += 4
    553 				}
    554 
    555 				// Consume disp8 if present.
    556 				if mod == 1 {
    557 					if pos >= len(src) {
    558 						return truncated(src, mode)
    559 					}
    560 					mem.Disp = int64(int8(src[pos]))
    561 					pos++
    562 				}
    563 
    564 				// In 64-bit, mod=0 rm=5 is PC-relative instead of just disp.
    565 				// See Vol 2A. Table 2-7.
    566 				if mode == 64 && mod == 0 && rm&7 == 5 {
    567 					if addrMode == 32 {
    568 						mem.Base = EIP
    569 					} else {
    570 						mem.Base = RIP
    571 					}
    572 				}
    573 			}
    574 
    575 			if segIndex >= 0 {
    576 				mem.Segment = prefixToSegment(inst.Prefix[segIndex])
    577 			}
    578 		}
    579 
    580 		// Execute single opcode.
    581 		switch decodeOp(x) {
    582 		default:
    583 			println("bad op", x, "at", pc-1, "from", oldPC)
    584 			return Inst{Len: pos}, errInternal
    585 
    586 		case xFail:
    587 			inst.Op = 0
    588 			break Decode
    589 
    590 		case xMatch:
    591 			break Decode
    592 
    593 		case xJump:
    594 			pc = int(decoder[pc])
    595 
    596 		// Conditional branches.
    597 
    598 		case xCondByte:
    599 			if pos >= len(src) {
    600 				return truncated(src, mode)
    601 			}
    602 			b := src[pos]
    603 			n := int(decoder[pc])
    604 			pc++
    605 			for i := 0; i < n; i++ {
    606 				xb, xpc := decoder[pc], int(decoder[pc+1])
    607 				pc += 2
    608 				if b == byte(xb) {
    609 					pc = xpc
    610 					pos++
    611 					if opshift >= 0 {
    612 						inst.Opcode |= uint32(b) << uint(opshift)
    613 						opshift -= 8
    614 					}
    615 					continue Decode
    616 				}
    617 			}
    618 			// xCondByte is the only conditional with a fall through,
    619 			// so that it can be used to pick off special cases before
    620 			// an xCondSlash. If the fallthrough instruction is xFail,
    621 			// advance the position so that the decoded instruction
    622 			// size includes the byte we just compared against.
    623 			if decodeOp(decoder[pc]) == xJump {
    624 				pc = int(decoder[pc+1])
    625 			}
    626 			if decodeOp(decoder[pc]) == xFail {
    627 				pos++
    628 			}
    629 
    630 		case xCondIs64:
    631 			if mode == 64 {
    632 				pc = int(decoder[pc+1])
    633 			} else {
    634 				pc = int(decoder[pc])
    635 			}
    636 
    637 		case xCondIsMem:
    638 			mem := haveMem
    639 			if !haveModrm {
    640 				if pos >= len(src) {
    641 					return instPrefix(src[0], mode) // too long
    642 				}
    643 				mem = src[pos]>>6 != 3
    644 			}
    645 			if mem {
    646 				pc = int(decoder[pc+1])
    647 			} else {
    648 				pc = int(decoder[pc])
    649 			}
    650 
    651 		case xCondDataSize:
    652 			switch dataMode {
    653 			case 16:
    654 				if dataSizeIndex >= 0 {
    655 					inst.Prefix[dataSizeIndex] |= PrefixImplicit
    656 				}
    657 				pc = int(decoder[pc])
    658 			case 32:
    659 				if dataSizeIndex >= 0 {
    660 					inst.Prefix[dataSizeIndex] |= PrefixImplicit
    661 				}
    662 				pc = int(decoder[pc+1])
    663 			case 64:
    664 				rexUsed |= PrefixREXW
    665 				pc = int(decoder[pc+2])
    666 			}
    667 
    668 		case xCondAddrSize:
    669 			switch addrMode {
    670 			case 16:
    671 				if addrSizeIndex >= 0 {
    672 					inst.Prefix[addrSizeIndex] |= PrefixImplicit
    673 				}
    674 				pc = int(decoder[pc])
    675 			case 32:
    676 				if addrSizeIndex >= 0 {
    677 					inst.Prefix[addrSizeIndex] |= PrefixImplicit
    678 				}
    679 				pc = int(decoder[pc+1])
    680 			case 64:
    681 				pc = int(decoder[pc+2])
    682 			}
    683 
    684 		case xCondPrefix:
    685 			// Conditional branch based on presence or absence of prefixes.
    686 			// The conflict cases here are completely undocumented and
    687 			// differ significantly between GNU libopcodes and Intel xed.
    688 			// I have not written assembly code to divine what various CPUs
    689 			// do, but it wouldn't surprise me if they are not consistent either.
    690 			//
    691 			// The basic idea is to switch on the presence of a prefix, so that
    692 			// for example:
    693 			//
    694 			//	xCondPrefix, 4
    695 			//	0xF3, 123,
    696 			//	0xF2, 234,
    697 			//	0x66, 345,
    698 			//	0, 456
    699 			//
    700 			// branch to 123 if the F3 prefix is present, 234 if the F2 prefix
			// is present, 345 if the 66 prefix is present, and 456 otherwise.
    702 			// The prefixes are given in descending order so that the 0 will be last.
    703 			//
    704 			// It is unclear what should happen if multiple conditions are
    705 			// satisfied: what if F2 and F3 are both present, or if 66 and F2
    706 			// are present, or if all three are present? The one chosen becomes
    707 			// part of the opcode and the others do not. Perhaps the answer
    708 			// depends on the specific opcodes in question.
    709 			//
    710 			// The only clear example is that CRC32 is F2 0F 38 F1 /r, and
    711 			// it comes in 16-bit and 32-bit forms based on the 66 prefix,
    712 			// so 66 F2 0F 38 F1 /r should be treated as F2 taking priority,
    713 			// with the 66 being only an operand size override, and probably
    714 			// F2 66 0F 38 F1 /r should be treated the same.
    715 			// Perhaps that rule is specific to the case of CRC32, since no
    716 			// 66 0F 38 F1 instruction is defined (today) (that we know of).
    717 			// However, both libopcodes and xed seem to generalize this
    718 			// example and choose F2/F3 in preference to 66, and we
    719 			// do the same.
    720 			//
    721 			// Next, what if both F2 and F3 are present? Which wins?
    722 			// The Intel xed rule, and ours, is that the one that occurs last wins.
    723 			// The GNU libopcodes rule, which we implement only in gnuCompat mode,
    724 			// is that F3 beats F2 unless F3 has no special meaning, in which
			// case F3 can be a modifier on an F2 special meaning.
    726 			//
    727 			// Concretely,
    728 			//	66 0F D6 /r is MOVQ
    729 			//	F2 0F D6 /r is MOVDQ2Q
    730 			//	F3 0F D6 /r is MOVQ2DQ.
    731 			//
    732 			//	F2 66 0F D6 /r is 66 + MOVDQ2Q always.
    733 			//	66 F2 0F D6 /r is 66 + MOVDQ2Q always.
    734 			//	F3 66 0F D6 /r is 66 + MOVQ2DQ always.
    735 			//	66 F3 0F D6 /r is 66 + MOVQ2DQ always.
    736 			//	F2 F3 0F D6 /r is F2 + MOVQ2DQ always.
    737 			//	F3 F2 0F D6 /r is F3 + MOVQ2DQ in Intel xed, but F2 + MOVQ2DQ in GNU libopcodes.
    738 			//	Adding 66 anywhere in the prefix section of the
    739 			//	last two cases does not change the outcome.
    740 			//
    741 			// Finally, what if there is a variant in which 66 is a mandatory
    742 			// prefix rather than an operand size override, but we know of
    743 			// no corresponding F2/F3 form, and we see both F2/F3 and 66.
    744 			// Does F2/F3 still take priority, so that the result is an unknown
    745 			// instruction, or does the 66 take priority, so that the extended
    746 			// 66 instruction should be interpreted as having a REP/REPN prefix?
    747 			// Intel xed does the former and GNU libopcodes does the latter.
    748 			// We side with Intel xed, unless we are trying to match libopcodes
    749 			// more closely during the comparison-based test suite.
    750 			//
    751 			// In 64-bit mode REX.W is another valid prefix to test for, but
    752 			// there is less ambiguity about that. When present, REX.W is
    753 			// always the first entry in the table.
    754 			n := int(decoder[pc])
    755 			pc++
    756 			sawF3 := false
    757 			for j := 0; j < n; j++ {
    758 				prefix := Prefix(decoder[pc+2*j])
    759 				if prefix.IsREX() {
    760 					rexUsed |= prefix
    761 					if rex&prefix == prefix {
    762 						pc = int(decoder[pc+2*j+1])
    763 						continue Decode
    764 					}
    765 					continue
    766 				}
    767 				ok := false
    768 				if prefix == 0 {
    769 					ok = true
    770 				} else if prefix.IsREX() {
    771 					rexUsed |= prefix
    772 					if rex&prefix == prefix {
    773 						ok = true
    774 					}
    775 				} else {
    776 					if prefix == 0xF3 {
    777 						sawF3 = true
    778 					}
    779 					switch prefix {
    780 					case PrefixLOCK:
    781 						if lockIndex >= 0 {
    782 							inst.Prefix[lockIndex] |= PrefixImplicit
    783 							ok = true
    784 						}
    785 					case PrefixREP, PrefixREPN:
    786 						if repIndex >= 0 && inst.Prefix[repIndex]&0xFF == prefix {
    787 							inst.Prefix[repIndex] |= PrefixImplicit
    788 							ok = true
    789 						}
    790 						if gnuCompat && !ok && prefix == 0xF3 && repIndex >= 0 && (j+1 >= n || decoder[pc+2*(j+1)] != 0xF2) {
    791 							// Check to see if earlier prefix F3 is present.
    792 							for i := repIndex - 1; i >= 0; i-- {
    793 								if inst.Prefix[i]&0xFF == prefix {
    794 									inst.Prefix[i] |= PrefixImplicit
    795 									ok = true
    796 								}
    797 							}
    798 						}
    799 						if gnuCompat && !ok && prefix == 0xF2 && repIndex >= 0 && !sawF3 && inst.Prefix[repIndex]&0xFF == 0xF3 {
    800 							// Check to see if earlier prefix F2 is present.
    801 							for i := repIndex - 1; i >= 0; i-- {
    802 								if inst.Prefix[i]&0xFF == prefix {
    803 									inst.Prefix[i] |= PrefixImplicit
    804 									ok = true
    805 								}
    806 							}
    807 						}
    808 					case PrefixCS, PrefixDS, PrefixES, PrefixFS, PrefixGS, PrefixSS:
    809 						if segIndex >= 0 && inst.Prefix[segIndex]&0xFF == prefix {
    810 							inst.Prefix[segIndex] |= PrefixImplicit
    811 							ok = true
    812 						}
    813 					case PrefixDataSize:
    814 						// Looking for 66 mandatory prefix.
    815 						// The F2/F3 mandatory prefixes take priority when both are present.
    816 						// If we got this far in the xCondPrefix table and an F2/F3 is present,
    817 						// it means the table didn't have any entry for that prefix. But if 66 has
    818 						// special meaning, perhaps F2/F3 have special meaning that we don't know.
    819 						// Intel xed works this way, treating the F2/F3 as inhibiting the 66.
    820 						// GNU libopcodes allows the 66 to match. We do what Intel xed does
    821 						// except in gnuCompat mode.
    822 						if repIndex >= 0 && !gnuCompat {
    823 							inst.Op = 0
    824 							break Decode
    825 						}
    826 						if dataSizeIndex >= 0 {
    827 							inst.Prefix[dataSizeIndex] |= PrefixImplicit
    828 							ok = true
    829 						}
    830 					case PrefixAddrSize:
    831 						if addrSizeIndex >= 0 {
    832 							inst.Prefix[addrSizeIndex] |= PrefixImplicit
    833 							ok = true
    834 						}
    835 					}
    836 				}
    837 				if ok {
    838 					pc = int(decoder[pc+2*j+1])
    839 					continue Decode
    840 				}
    841 			}
    842 			inst.Op = 0
    843 			break Decode
    844 
    845 		case xCondSlashR:
    846 			pc = int(decoder[pc+regop&7])
    847 
    848 		// Input.
    849 
    850 		case xReadSlashR:
    851 			// done above
    852 
    853 		case xReadIb:
    854 			if pos >= len(src) {
    855 				return truncated(src, mode)
    856 			}
    857 			imm8 = int8(src[pos])
    858 			pos++
    859 
    860 		case xReadIw:
    861 			if pos+2 > len(src) {
    862 				return truncated(src, mode)
    863 			}
    864 			imm = int64(binary.LittleEndian.Uint16(src[pos:]))
    865 			pos += 2
    866 
    867 		case xReadId:
    868 			if pos+4 > len(src) {
    869 				return truncated(src, mode)
    870 			}
    871 			imm = int64(binary.LittleEndian.Uint32(src[pos:]))
    872 			pos += 4
    873 
    874 		case xReadIo:
    875 			if pos+8 > len(src) {
    876 				return truncated(src, mode)
    877 			}
    878 			imm = int64(binary.LittleEndian.Uint64(src[pos:]))
    879 			pos += 8
    880 
    881 		case xReadCb:
    882 			if pos >= len(src) {
    883 				return truncated(src, mode)
    884 			}
    885 			immc = int64(src[pos])
    886 			pos++
    887 
    888 		case xReadCw:
    889 			if pos+2 > len(src) {
    890 				return truncated(src, mode)
    891 			}
    892 			immc = int64(binary.LittleEndian.Uint16(src[pos:]))
    893 			pos += 2
    894 
    895 		case xReadCm:
    896 			if addrMode == 16 {
    897 				if pos+2 > len(src) {
    898 					return truncated(src, mode)
    899 				}
    900 				immc = int64(binary.LittleEndian.Uint16(src[pos:]))
    901 				pos += 2
    902 			} else if addrMode == 32 {
    903 				if pos+4 > len(src) {
    904 					return truncated(src, mode)
    905 				}
    906 				immc = int64(binary.LittleEndian.Uint32(src[pos:]))
    907 				pos += 4
    908 			} else {
    909 				if pos+8 > len(src) {
    910 					return truncated(src, mode)
    911 				}
    912 				immc = int64(binary.LittleEndian.Uint64(src[pos:]))
    913 				pos += 8
    914 			}
    915 		case xReadCd:
    916 			if pos+4 > len(src) {
    917 				return truncated(src, mode)
    918 			}
    919 			immc = int64(binary.LittleEndian.Uint32(src[pos:]))
    920 			pos += 4
    921 
    922 		case xReadCp:
    923 			if pos+6 > len(src) {
    924 				return truncated(src, mode)
    925 			}
    926 			w := binary.LittleEndian.Uint32(src[pos:])
    927 			w2 := binary.LittleEndian.Uint16(src[pos+4:])
    928 			immc = int64(w2)<<32 | int64(w)
    929 			pos += 6
    930 
    931 		// Output.
    932 
    933 		case xSetOp:
    934 			inst.Op = Op(decoder[pc])
    935 			pc++
    936 
    937 		case xArg1,
    938 			xArg3,
    939 			xArgAL,
    940 			xArgAX,
    941 			xArgCL,
    942 			xArgCS,
    943 			xArgDS,
    944 			xArgDX,
    945 			xArgEAX,
    946 			xArgEDX,
    947 			xArgES,
    948 			xArgFS,
    949 			xArgGS,
    950 			xArgRAX,
    951 			xArgRDX,
    952 			xArgSS,
    953 			xArgST,
    954 			xArgXMM0:
    955 			inst.Args[narg] = fixedArg[x]
    956 			narg++
    957 
    958 		case xArgImm8:
    959 			inst.Args[narg] = Imm(imm8)
    960 			narg++
    961 
    962 		case xArgImm8u:
    963 			inst.Args[narg] = Imm(uint8(imm8))
    964 			narg++
    965 
    966 		case xArgImm16:
    967 			inst.Args[narg] = Imm(int16(imm))
    968 			narg++
    969 
    970 		case xArgImm16u:
    971 			inst.Args[narg] = Imm(uint16(imm))
    972 			narg++
    973 
    974 		case xArgImm32:
    975 			inst.Args[narg] = Imm(int32(imm))
    976 			narg++
    977 
    978 		case xArgImm64:
    979 			inst.Args[narg] = Imm(imm)
    980 			narg++
    981 
    982 		case xArgM,
    983 			xArgM128,
    984 			xArgM1428byte,
    985 			xArgM16,
    986 			xArgM16and16,
    987 			xArgM16and32,
    988 			xArgM16and64,
    989 			xArgM16colon16,
    990 			xArgM16colon32,
    991 			xArgM16colon64,
    992 			xArgM16int,
    993 			xArgM2byte,
    994 			xArgM32,
    995 			xArgM32and32,
    996 			xArgM32fp,
    997 			xArgM32int,
    998 			xArgM512byte,
    999 			xArgM64,
   1000 			xArgM64fp,
   1001 			xArgM64int,
   1002 			xArgM8,
   1003 			xArgM80bcd,
   1004 			xArgM80dec,
   1005 			xArgM80fp,
   1006 			xArgM94108byte,
   1007 			xArgMem:
   1008 			if !haveMem {
   1009 				inst.Op = 0
   1010 				break Decode
   1011 			}
   1012 			inst.Args[narg] = mem
   1013 			inst.MemBytes = int(memBytes[decodeOp(x)])
   1014 			narg++
   1015 
   1016 		case xArgPtr16colon16:
   1017 			inst.Args[narg] = Imm(immc >> 16)
   1018 			inst.Args[narg+1] = Imm(immc & (1<<16 - 1))
   1019 			narg += 2
   1020 
   1021 		case xArgPtr16colon32:
   1022 			inst.Args[narg] = Imm(immc >> 32)
   1023 			inst.Args[narg+1] = Imm(immc & (1<<32 - 1))
   1024 			narg += 2
   1025 
   1026 		case xArgMoffs8, xArgMoffs16, xArgMoffs32, xArgMoffs64:
   1027 			// TODO(rsc): Can address be 64 bits?
   1028 			mem = Mem{Disp: int64(immc)}
   1029 			if segIndex >= 0 {
   1030 				mem.Segment = prefixToSegment(inst.Prefix[segIndex])
   1031 				inst.Prefix[segIndex] |= PrefixImplicit
   1032 			}
   1033 			inst.Args[narg] = mem
   1034 			inst.MemBytes = int(memBytes[decodeOp(x)])
   1035 			narg++
   1036 
   1037 		case xArgR8, xArgR16, xArgR32, xArgR64, xArgXmm, xArgXmm1, xArgDR0dashDR7:
   1038 			base := baseReg[x]
   1039 			index := Reg(regop)
   1040 			if rex != 0 && base == AL && index >= 4 {
   1041 				rexUsed |= PrefixREX
   1042 				index -= 4
   1043 				base = SPB
   1044 			}
   1045 			inst.Args[narg] = base + index
   1046 			narg++
   1047 
   1048 		case xArgMm, xArgMm1, xArgTR0dashTR7:
   1049 			inst.Args[narg] = baseReg[x] + Reg(regop&7)
   1050 			narg++
   1051 
   1052 		case xArgCR0dashCR7:
   1053 			// AMD documents an extension that the LOCK prefix
   1054 			// can be used in place of a REX prefix in order to access
   1055 			// CR8 from 32-bit mode. The LOCK prefix is allowed in
   1056 			// all modes, provided the corresponding CPUID bit is set.
   1057 			if lockIndex >= 0 {
   1058 				inst.Prefix[lockIndex] |= PrefixImplicit
   1059 				regop += 8
   1060 			}
   1061 			inst.Args[narg] = CR0 + Reg(regop)
   1062 			narg++
   1063 
   1064 		case xArgSreg:
   1065 			regop &= 7
   1066 			if regop >= 6 {
   1067 				inst.Op = 0
   1068 				break Decode
   1069 			}
   1070 			inst.Args[narg] = ES + Reg(regop)
   1071 			narg++
   1072 
   1073 		case xArgRmf16, xArgRmf32, xArgRmf64:
   1074 			base := baseReg[x]
   1075 			index := Reg(modrm & 07)
   1076 			if rex&PrefixREXB != 0 {
   1077 				rexUsed |= PrefixREXB
   1078 				index += 8
   1079 			}
   1080 			inst.Args[narg] = base + index
   1081 			narg++
   1082 
   1083 		case xArgR8op, xArgR16op, xArgR32op, xArgR64op, xArgSTi:
   1084 			n := inst.Opcode >> uint(opshift+8) & 07
   1085 			base := baseReg[x]
   1086 			index := Reg(n)
   1087 			if rex&PrefixREXB != 0 && decodeOp(x) != xArgSTi {
   1088 				rexUsed |= PrefixREXB
   1089 				index += 8
   1090 			}
   1091 			if rex != 0 && base == AL && index >= 4 {
   1092 				rexUsed |= PrefixREX
   1093 				index -= 4
   1094 				base = SPB
   1095 			}
   1096 			inst.Args[narg] = base + index
   1097 			narg++
   1098 
   1099 		case xArgRM8, xArgRM16, xArgRM32, xArgRM64, xArgR32M16, xArgR32M8, xArgR64M16,
   1100 			xArgMmM32, xArgMmM64, xArgMm2M64,
   1101 			xArgXmm2M16, xArgXmm2M32, xArgXmm2M64, xArgXmmM64, xArgXmmM128, xArgXmmM32, xArgXmm2M128:
   1102 			if haveMem {
   1103 				inst.Args[narg] = mem
   1104 				inst.MemBytes = int(memBytes[decodeOp(x)])
   1105 			} else {
   1106 				base := baseReg[x]
   1107 				index := Reg(rm)
   1108 				switch decodeOp(x) {
   1109 				case xArgMmM32, xArgMmM64, xArgMm2M64:
   1110 					// There are only 8 MMX registers, so these ignore the REX.X bit.
   1111 					index &= 7
   1112 				case xArgRM8:
   1113 					if rex != 0 && index >= 4 {
   1114 						rexUsed |= PrefixREX
   1115 						index -= 4
   1116 						base = SPB
   1117 					}
   1118 				}
   1119 				inst.Args[narg] = base + index
   1120 			}
   1121 			narg++
   1122 
   1123 		case xArgMm2: // register only; TODO(rsc): Handle with tag modrm_regonly tag
   1124 			if haveMem {
   1125 				inst.Op = 0
   1126 				break Decode
   1127 			}
   1128 			inst.Args[narg] = baseReg[x] + Reg(rm&7)
   1129 			narg++
   1130 
   1131 		case xArgXmm2: // register only; TODO(rsc): Handle with tag modrm_regonly tag
   1132 			if haveMem {
   1133 				inst.Op = 0
   1134 				break Decode
   1135 			}
   1136 			inst.Args[narg] = baseReg[x] + Reg(rm)
   1137 			narg++
   1138 
   1139 		case xArgRel8:
   1140 			inst.Args[narg] = Rel(int8(immc))
   1141 			narg++
   1142 
   1143 		case xArgRel16:
   1144 			inst.Args[narg] = Rel(int16(immc))
   1145 			narg++
   1146 
   1147 		case xArgRel32:
   1148 			inst.Args[narg] = Rel(int32(immc))
   1149 			narg++
   1150 		}
   1151 	}
   1152 
   1153 	if inst.Op == 0 {
   1154 		// Invalid instruction.
   1155 		if nprefix > 0 {
   1156 			return instPrefix(src[0], mode) // invalid instruction
   1157 		}
   1158 		return Inst{Len: pos}, ErrUnrecognized
   1159 	}
   1160 
   1161 	// Matched! Hooray!
   1162 
   1163 	// 90 decodes as XCHG EAX, EAX but is NOP.
   1164 	// 66 90 decodes as XCHG AX, AX and is NOP too.
   1165 	// 48 90 decodes as XCHG RAX, RAX and is NOP too.
   1166 	// 43 90 decodes as XCHG R8D, EAX and is *not* NOP.
   1167 	// F3 90 decodes as REP XCHG EAX, EAX but is PAUSE.
   1168 	// It's all too special to handle in the decoding tables, at least for now.
   1169 	if inst.Op == XCHG && inst.Opcode>>24 == 0x90 {
   1170 		if inst.Args[0] == RAX || inst.Args[0] == EAX || inst.Args[0] == AX {
   1171 			inst.Op = NOP
   1172 			if dataSizeIndex >= 0 {
   1173 				inst.Prefix[dataSizeIndex] &^= PrefixImplicit
   1174 			}
   1175 			inst.Args[0] = nil
   1176 			inst.Args[1] = nil
   1177 		}
   1178 		if repIndex >= 0 && inst.Prefix[repIndex] == 0xF3 {
   1179 			inst.Prefix[repIndex] |= PrefixImplicit
   1180 			inst.Op = PAUSE
   1181 			inst.Args[0] = nil
   1182 			inst.Args[1] = nil
   1183 		} else if gnuCompat {
   1184 			for i := nprefix - 1; i >= 0; i-- {
   1185 				if inst.Prefix[i]&0xFF == 0xF3 {
   1186 					inst.Prefix[i] |= PrefixImplicit
   1187 					inst.Op = PAUSE
   1188 					inst.Args[0] = nil
   1189 					inst.Args[1] = nil
   1190 					break
   1191 				}
   1192 			}
   1193 		}
   1194 	}
   1195 
   1196 	// defaultSeg returns the default segment for an implicit
   1197 	// memory reference: the final override if present, or else DS.
   1198 	defaultSeg := func() Reg {
   1199 		if segIndex >= 0 {
   1200 			inst.Prefix[segIndex] |= PrefixImplicit
   1201 			return prefixToSegment(inst.Prefix[segIndex])
   1202 		}
   1203 		return DS
   1204 	}
   1205 
   1206 	// Add implicit arguments not present in the tables.
   1207 	// Normally we shy away from making implicit arguments explicit,
   1208 	// following the Intel manuals, but adding the arguments seems
   1209 	// the best way to express the effect of the segment override prefixes.
   1210 	// TODO(rsc): Perhaps add these to the tables and
   1211 	// create bytecode instructions for them.
   1212 	usedAddrSize := false
   1213 	switch inst.Op {
   1214 	case INSB, INSW, INSD:
   1215 		inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
   1216 		inst.Args[1] = DX
   1217 		usedAddrSize = true
   1218 
   1219 	case OUTSB, OUTSW, OUTSD:
   1220 		inst.Args[0] = DX
   1221 		inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
   1222 		usedAddrSize = true
   1223 
   1224 	case MOVSB, MOVSW, MOVSD, MOVSQ:
   1225 		inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
   1226 		inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
   1227 		usedAddrSize = true
   1228 
   1229 	case CMPSB, CMPSW, CMPSD, CMPSQ:
   1230 		inst.Args[0] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
   1231 		inst.Args[1] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
   1232 		usedAddrSize = true
   1233 
   1234 	case LODSB, LODSW, LODSD, LODSQ:
   1235 		switch inst.Op {
   1236 		case LODSB:
   1237 			inst.Args[0] = AL
   1238 		case LODSW:
   1239 			inst.Args[0] = AX
   1240 		case LODSD:
   1241 			inst.Args[0] = EAX
   1242 		case LODSQ:
   1243 			inst.Args[0] = RAX
   1244 		}
   1245 		inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
   1246 		usedAddrSize = true
   1247 
   1248 	case STOSB, STOSW, STOSD, STOSQ:
   1249 		inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
   1250 		switch inst.Op {
   1251 		case STOSB:
   1252 			inst.Args[1] = AL
   1253 		case STOSW:
   1254 			inst.Args[1] = AX
   1255 		case STOSD:
   1256 			inst.Args[1] = EAX
   1257 		case STOSQ:
   1258 			inst.Args[1] = RAX
   1259 		}
   1260 		usedAddrSize = true
   1261 
   1262 	case SCASB, SCASW, SCASD, SCASQ:
   1263 		inst.Args[1] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
   1264 		switch inst.Op {
   1265 		case SCASB:
   1266 			inst.Args[0] = AL
   1267 		case SCASW:
   1268 			inst.Args[0] = AX
   1269 		case SCASD:
   1270 			inst.Args[0] = EAX
   1271 		case SCASQ:
   1272 			inst.Args[0] = RAX
   1273 		}
   1274 		usedAddrSize = true
   1275 
   1276 	case XLATB:
   1277 		inst.Args[0] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + BX - AX}
   1278 		usedAddrSize = true
   1279 	}
   1280 
   1281 	// If we used the address size annotation to construct the
   1282 	// argument list, mark that prefix as implicit: it doesn't need
   1283 	// to be shown when printing the instruction.
   1284 	if haveMem || usedAddrSize {
   1285 		if addrSizeIndex >= 0 {
   1286 			inst.Prefix[addrSizeIndex] |= PrefixImplicit
   1287 		}
   1288 	}
   1289 
   1290 	// Similarly, if there's some memory operand, the segment
   1291 	// will be shown there and doesn't need to be shown as an
   1292 	// explicit prefix.
   1293 	if haveMem {
   1294 		if segIndex >= 0 {
   1295 			inst.Prefix[segIndex] |= PrefixImplicit
   1296 		}
   1297 	}
   1298 
   1299 	// Branch predict prefixes are overloaded segment prefixes,
   1300 	// since segment prefixes don't make sense on conditional jumps.
   1301 	// Rewrite final instance to prediction prefix.
   1302 	// The set of instructions to which the prefixes apply (other than the
   1303 	// Jcc conditional jumps) is not 100% clear from the manuals, but
   1304 	// the disassemblers seem to agree about the LOOP and JCXZ instructions,
   1305 	// so we'll follow along.
   1306 	// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
   1307 	if isCondJmp[inst.Op] || isLoop[inst.Op] || inst.Op == JCXZ || inst.Op == JECXZ || inst.Op == JRCXZ {
   1308 	PredictLoop:
   1309 		for i := nprefix - 1; i >= 0; i-- {
   1310 			p := inst.Prefix[i]
   1311 			switch p & 0xFF {
   1312 			case PrefixCS:
   1313 				inst.Prefix[i] = PrefixPN
   1314 				break PredictLoop
   1315 			case PrefixDS:
   1316 				inst.Prefix[i] = PrefixPT
   1317 				break PredictLoop
   1318 			}
   1319 		}
   1320 	}
   1321 
   1322 	// The BND prefix is part of the Intel Memory Protection Extensions (MPX).
   1323 	// A REPN applied to certain control transfers is a BND prefix to bound
   1324 	// the range of possible destinations. There's surprisingly little documentation
   1325 	// about this, so we just do what libopcodes and xed agree on.
   1326 	// In particular, it's unclear why a REPN applied to LOOP or JCXZ instructions
   1327 	// does not turn into a BND.
   1328 	// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
   1329 	if isCondJmp[inst.Op] || inst.Op == JMP || inst.Op == CALL || inst.Op == RET {
   1330 		for i := nprefix - 1; i >= 0; i-- {
   1331 			p := inst.Prefix[i]
   1332 			if p&^PrefixIgnored == PrefixREPN {
   1333 				inst.Prefix[i] = PrefixBND
   1334 				break
   1335 			}
   1336 		}
   1337 	}
   1338 
   1339 	// The LOCK prefix only applies to certain instructions, and then only
   1340 	// to instances of the instruction with a memory destination.
   1341 	// Other uses of LOCK are invalid and cause a processor exception,
   1342 	// in contrast to the "just ignore it" spirit applied to all other prefixes.
   1343 	// Mark invalid lock prefixes.
   1344 	hasLock := false
   1345 	if lockIndex >= 0 && inst.Prefix[lockIndex]&PrefixImplicit == 0 {
   1346 		switch inst.Op {
   1347 		// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
   1348 		case ADD, ADC, AND, BTC, BTR, BTS, CMPXCHG, CMPXCHG8B, CMPXCHG16B, DEC, INC, NEG, NOT, OR, SBB, SUB, XOR, XADD, XCHG:
   1349 			if isMem(inst.Args[0]) {
   1350 				hasLock = true
   1351 				break
   1352 			}
   1353 			fallthrough
   1354 		default:
   1355 			inst.Prefix[lockIndex] |= PrefixInvalid
   1356 		}
   1357 	}
   1358 
   1359 	// In certain cases, all of which require a memory destination,
   1360 	// the REPN and REP prefixes are interpreted as XACQUIRE and XRELEASE
   1361 	// from the Intel Transactional Synchronization Extensions (TSX).
   1362 	//
   1363 	// The specific rules are:
   1364 	// (1) Any instruction with a valid LOCK prefix can have XACQUIRE or XRELEASE.
   1365 	// (2) Any XCHG, which always has an implicit LOCK, can have XACQUIRE or XRELEASE.
   1366 	// (3) Any 0x88-, 0x89-, 0xC6-, or 0xC7-opcode MOV can have XRELEASE.
   1367 	if isMem(inst.Args[0]) {
   1368 		if inst.Op == XCHG {
   1369 			hasLock = true
   1370 		}
   1371 
   1372 		for i := len(inst.Prefix) - 1; i >= 0; i-- {
   1373 			p := inst.Prefix[i] &^ PrefixIgnored
   1374 			switch p {
   1375 			case PrefixREPN:
   1376 				if hasLock {
   1377 					inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXACQUIRE
   1378 				}
   1379 
   1380 			case PrefixREP:
   1381 				if hasLock {
   1382 					inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXRELEASE
   1383 				}
   1384 
   1385 				if inst.Op == MOV {
   1386 					op := (inst.Opcode >> 24) &^ 1
   1387 					if op == 0x88 || op == 0xC6 {
   1388 						inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXRELEASE
   1389 					}
   1390 				}
   1391 			}
   1392 		}
   1393 	}
   1394 
   1395 	// If REP is used on a non-REP-able instruction, mark the prefix as ignored.
   1396 	if repIndex >= 0 {
   1397 		switch inst.Prefix[repIndex] {
   1398 		case PrefixREP, PrefixREPN:
   1399 			switch inst.Op {
   1400 			// According to the manuals, the REP/REPE prefix applies to all of these,
   1401 			// while the REPN applies only to some of them. However, both libopcodes
   1402 			// and xed show both prefixes explicitly for all instructions, so we do the same.
   1403 			// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
   1404 			case INSB, INSW, INSD,
   1405 				MOVSB, MOVSW, MOVSD, MOVSQ,
   1406 				OUTSB, OUTSW, OUTSD,
   1407 				LODSB, LODSW, LODSD, LODSQ,
   1408 				CMPSB, CMPSW, CMPSD, CMPSQ,
   1409 				SCASB, SCASW, SCASD, SCASQ,
   1410 				STOSB, STOSW, STOSD, STOSQ:
   1411 				// ok
   1412 			default:
   1413 				inst.Prefix[repIndex] |= PrefixIgnored
   1414 			}
   1415 		}
   1416 	}
   1417 
   1418 	// If REX was present, mark implicit if all the 1 bits were consumed.
   1419 	if rexIndex >= 0 {
   1420 		if rexUsed != 0 {
   1421 			rexUsed |= PrefixREX
   1422 		}
   1423 		if rex&^rexUsed == 0 {
   1424 			inst.Prefix[rexIndex] |= PrefixImplicit
   1425 		}
   1426 	}
   1427 
   1428 	inst.DataSize = dataMode
   1429 	inst.AddrSize = addrMode
   1430 	inst.Mode = mode
   1431 	inst.Len = pos
   1432 	return inst, nil
   1433 }
   1434 
   1435 var errInternal = errors.New("internal error")
   1436 
   1437 // addr16 records the eight 16-bit addressing modes.
   1438 var addr16 = [8]Mem{
   1439 	{Base: BX, Scale: 1, Index: SI},
   1440 	{Base: BX, Scale: 1, Index: DI},
   1441 	{Base: BP, Scale: 1, Index: SI},
   1442 	{Base: BP, Scale: 1, Index: DI},
   1443 	{Base: SI},
   1444 	{Base: DI},
   1445 	{Base: BP},
   1446 	{Base: BX},
   1447 }
   1448 
   1449 // baseReg returns the base register for a given register size in bits.
   1450 func baseRegForBits(bits int) Reg {
   1451 	switch bits {
   1452 	case 8:
   1453 		return AL
   1454 	case 16:
   1455 		return AX
   1456 	case 32:
   1457 		return EAX
   1458 	case 64:
   1459 		return RAX
   1460 	}
   1461 	return 0
   1462 }
   1463 
   1464 // baseReg records the base register for argument types that specify
   1465 // a range of registers indexed by op, regop, or rm.
   1466 var baseReg = [...]Reg{
   1467 	xArgDR0dashDR7: DR0,
   1468 	xArgMm1:        M0,
   1469 	xArgMm2:        M0,
   1470 	xArgMm2M64:     M0,
   1471 	xArgMm:         M0,
   1472 	xArgMmM32:      M0,
   1473 	xArgMmM64:      M0,
   1474 	xArgR16:        AX,
   1475 	xArgR16op:      AX,
   1476 	xArgR32:        EAX,
   1477 	xArgR32M16:     EAX,
   1478 	xArgR32M8:      EAX,
   1479 	xArgR32op:      EAX,
   1480 	xArgR64:        RAX,
   1481 	xArgR64M16:     RAX,
   1482 	xArgR64op:      RAX,
   1483 	xArgR8:         AL,
   1484 	xArgR8op:       AL,
   1485 	xArgRM16:       AX,
   1486 	xArgRM32:       EAX,
   1487 	xArgRM64:       RAX,
   1488 	xArgRM8:        AL,
   1489 	xArgRmf16:      AX,
   1490 	xArgRmf32:      EAX,
   1491 	xArgRmf64:      RAX,
   1492 	xArgSTi:        F0,
   1493 	xArgTR0dashTR7: TR0,
   1494 	xArgXmm1:       X0,
   1495 	xArgXmm2:       X0,
   1496 	xArgXmm2M128:   X0,
   1497 	xArgXmm2M16:    X0,
   1498 	xArgXmm2M32:    X0,
   1499 	xArgXmm2M64:    X0,
   1500 	xArgXmm:        X0,
   1501 	xArgXmmM128:    X0,
   1502 	xArgXmmM32:     X0,
   1503 	xArgXmmM64:     X0,
   1504 }
   1505 
   1506 // prefixToSegment returns the segment register
   1507 // corresponding to a particular segment prefix.
   1508 func prefixToSegment(p Prefix) Reg {
   1509 	switch p &^ PrefixImplicit {
   1510 	case PrefixCS:
   1511 		return CS
   1512 	case PrefixDS:
   1513 		return DS
   1514 	case PrefixES:
   1515 		return ES
   1516 	case PrefixFS:
   1517 		return FS
   1518 	case PrefixGS:
   1519 		return GS
   1520 	case PrefixSS:
   1521 		return SS
   1522 	}
   1523 	return 0
   1524 }
   1525 
   1526 // fixedArg records the fixed arguments corresponding to the given bytecodes.
   1527 var fixedArg = [...]Arg{
   1528 	xArg1:    Imm(1),
   1529 	xArg3:    Imm(3),
   1530 	xArgAL:   AL,
   1531 	xArgAX:   AX,
   1532 	xArgDX:   DX,
   1533 	xArgEAX:  EAX,
   1534 	xArgEDX:  EDX,
   1535 	xArgRAX:  RAX,
   1536 	xArgRDX:  RDX,
   1537 	xArgCL:   CL,
   1538 	xArgCS:   CS,
   1539 	xArgDS:   DS,
   1540 	xArgES:   ES,
   1541 	xArgFS:   FS,
   1542 	xArgGS:   GS,
   1543 	xArgSS:   SS,
   1544 	xArgST:   F0,
   1545 	xArgXMM0: X0,
   1546 }
   1547 
   1548 // memBytes records the size of the memory pointed at
   1549 // by a memory argument of the given form.
   1550 var memBytes = [...]int8{
   1551 	xArgM128:       128 / 8,
   1552 	xArgM16:        16 / 8,
   1553 	xArgM16and16:   (16 + 16) / 8,
   1554 	xArgM16colon16: (16 + 16) / 8,
   1555 	xArgM16colon32: (16 + 32) / 8,
   1556 	xArgM16int:     16 / 8,
   1557 	xArgM2byte:     2,
   1558 	xArgM32:        32 / 8,
   1559 	xArgM32and32:   (32 + 32) / 8,
   1560 	xArgM32fp:      32 / 8,
   1561 	xArgM32int:     32 / 8,
   1562 	xArgM64:        64 / 8,
   1563 	xArgM64fp:      64 / 8,
   1564 	xArgM64int:     64 / 8,
   1565 	xArgMm2M64:     64 / 8,
   1566 	xArgMmM32:      32 / 8,
   1567 	xArgMmM64:      64 / 8,
   1568 	xArgMoffs16:    16 / 8,
   1569 	xArgMoffs32:    32 / 8,
   1570 	xArgMoffs64:    64 / 8,
   1571 	xArgMoffs8:     8 / 8,
   1572 	xArgR32M16:     16 / 8,
   1573 	xArgR32M8:      8 / 8,
   1574 	xArgR64M16:     16 / 8,
   1575 	xArgRM16:       16 / 8,
   1576 	xArgRM32:       32 / 8,
   1577 	xArgRM64:       64 / 8,
   1578 	xArgRM8:        8 / 8,
   1579 	xArgXmm2M128:   128 / 8,
   1580 	xArgXmm2M16:    16 / 8,
   1581 	xArgXmm2M32:    32 / 8,
   1582 	xArgXmm2M64:    64 / 8,
   1583 	xArgXmm:        128 / 8,
   1584 	xArgXmmM128:    128 / 8,
   1585 	xArgXmmM32:     32 / 8,
   1586 	xArgXmmM64:     64 / 8,
   1587 }
   1588 
   1589 // isCondJmp records the conditional jumps.
   1590 var isCondJmp = [maxOp + 1]bool{
   1591 	JA:  true,
   1592 	JAE: true,
   1593 	JB:  true,
   1594 	JBE: true,
   1595 	JE:  true,
   1596 	JG:  true,
   1597 	JGE: true,
   1598 	JL:  true,
   1599 	JLE: true,
   1600 	JNE: true,
   1601 	JNO: true,
   1602 	JNP: true,
   1603 	JNS: true,
   1604 	JO:  true,
   1605 	JP:  true,
   1606 	JS:  true,
   1607 }
   1608 
   1609 // isLoop records the loop operators.
   1610 var isLoop = [maxOp + 1]bool{
   1611 	LOOP:   true,
   1612 	LOOPE:  true,
   1613 	LOOPNE: true,
   1614 	JECXZ:  true,
   1615 	JRCXZ:  true,
   1616 }
   1617