Home | History | Annotate | Download | only in x86asm
      1 // Copyright 2014 The Go Authors.  All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Table-driven decoding of x86 instructions.
      6 
      7 package x86asm
      8 
      9 import (
     10 	"encoding/binary"
     11 	"errors"
     12 	"fmt"
     13 	"runtime"
     14 )
     15 
     16 // Set trace to true to cause the decoder to print the PC sequence
     17 // of the executed instruction codes. This is typically only useful
     18 // when you are running a test of a single input case.
     19 const trace = false
     20 
     21 // A decodeOp is a single instruction in the decoder bytecode program.
     22 //
     23 // The decodeOps correspond to consuming and conditionally branching
     24 // on input bytes, consuming additional fields, and then interpreting
     25 // consumed data as instruction arguments. The names of the xRead and xArg
     26 // operations are taken from the Intel manual conventions, for example
     27 // Volume 2, Section 3.1.1, page 487 of
     28 // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
     29 //
     30 // The actual decoding program is generated by ../x86map.
     31 //
     32 // TODO(rsc): We may be able to merge various of the memory operands
     33 // since we don't care about, say, the distinction between m80dec and m80bcd.
     34 // Similarly, mm and mm1 have identical meaning, as do xmm and xmm1.
     35 
     36 type decodeOp uint16
     37 
     38 const ( // values are assigned by iota: inserting or reordering entries renumbers every later op (presumably must match the table generated by ../x86map — confirm)
     39 	xFail  decodeOp = iota // invalid instruction (return)
     40 	xMatch                 // completed match
     41 	xJump                  // jump to pc
     42 
     43 	xCondByte     // switch on instruction byte value
     44 	xCondSlashR   // read and switch on instruction /r value
     45 	xCondPrefix   // switch on presence of instruction prefix
     46 	xCondIs64     // switch on 64-bit processor mode
     47 	xCondDataSize // switch on operand size
     48 	xCondAddrSize // switch on address size
     49 	xCondIsMem    // switch on memory vs register argument
     50 
     51 	xSetOp // set instruction opcode
     52 
     53 	xReadSlashR // read /r
     54 	xReadIb     // read ib
     55 	xReadIw     // read iw
     56 	xReadId     // read id
     57 	xReadIo     // read io
     58 	xReadCb     // read cb
     59 	xReadCw     // read cw
     60 	xReadCd     // read cd
     61 	xReadCp     // read cp
     62 	xReadCm     // read cm
     63 
     64 	xArg1            // arg 1
     65 	xArg3            // arg 3
     66 	xArgAL           // arg AL
     67 	xArgAX           // arg AX
     68 	xArgCL           // arg CL
     69 	xArgCR0dashCR7   // arg CR0-CR7
     70 	xArgCS           // arg CS
     71 	xArgDR0dashDR7   // arg DR0-DR7
     72 	xArgDS           // arg DS
     73 	xArgDX           // arg DX
     74 	xArgEAX          // arg EAX
     75 	xArgEDX          // arg EDX
     76 	xArgES           // arg ES
     77 	xArgFS           // arg FS
     78 	xArgGS           // arg GS
     79 	xArgImm16        // arg imm16
     80 	xArgImm32        // arg imm32
     81 	xArgImm64        // arg imm64
     82 	xArgImm8         // arg imm8
     83 	xArgImm8u        // arg imm8 but record as unsigned
     84 	xArgImm16u       // arg imm16 but record as unsigned
     85 	xArgM            // arg m
     86 	xArgM128         // arg m128
     87 	xArgM256         // arg m256
     88 	xArgM1428byte    // arg m14/28byte
     89 	xArgM16          // arg m16
     90 	xArgM16and16     // arg m16&16
     91 	xArgM16and32     // arg m16&32
     92 	xArgM16and64     // arg m16&64
     93 	xArgM16colon16   // arg m16:16
     94 	xArgM16colon32   // arg m16:32
     95 	xArgM16colon64   // arg m16:64
     96 	xArgM16int       // arg m16int
     97 	xArgM2byte       // arg m2byte
     98 	xArgM32          // arg m32
     99 	xArgM32and32     // arg m32&32
    100 	xArgM32fp        // arg m32fp
    101 	xArgM32int       // arg m32int
    102 	xArgM512byte     // arg m512byte
    103 	xArgM64          // arg m64
    104 	xArgM64fp        // arg m64fp
    105 	xArgM64int       // arg m64int
    106 	xArgM8           // arg m8
    107 	xArgM80bcd       // arg m80bcd
    108 	xArgM80dec       // arg m80dec
    109 	xArgM80fp        // arg m80fp
    110 	xArgM94108byte   // arg m94/108byte
    111 	xArgMm           // arg mm
    112 	xArgMm1          // arg mm1
    113 	xArgMm2          // arg mm2
    114 	xArgMm2M64       // arg mm2/m64
    115 	xArgMmM32        // arg mm/m32
    116 	xArgMmM64        // arg mm/m64
    117 	xArgMem          // arg mem
    118 	xArgMoffs16      // arg moffs16
    119 	xArgMoffs32      // arg moffs32
    120 	xArgMoffs64      // arg moffs64
    121 	xArgMoffs8       // arg moffs8
    122 	xArgPtr16colon16 // arg ptr16:16
    123 	xArgPtr16colon32 // arg ptr16:32
    124 	xArgR16          // arg r16
    125 	xArgR16op        // arg r16 with +rw in opcode
    126 	xArgR32          // arg r32
    127 	xArgR32M16       // arg r32/m16
    128 	xArgR32M8        // arg r32/m8
    129 	xArgR32op        // arg r32 with +rd in opcode
    130 	xArgR64          // arg r64
    131 	xArgR64M16       // arg r64/m16
    132 	xArgR64op        // arg r64 with +rd in opcode
    133 	xArgR8           // arg r8
    134 	xArgR8op         // arg r8 with +rb in opcode
    135 	xArgRAX          // arg RAX
    136 	xArgRDX          // arg RDX
    137 	xArgRM           // arg r/m
    138 	xArgRM16         // arg r/m16
    139 	xArgRM32         // arg r/m32
    140 	xArgRM64         // arg r/m64
    141 	xArgRM8          // arg r/m8
    142 	xArgReg          // arg reg
    143 	xArgRegM16       // arg reg/m16
    144 	xArgRegM32       // arg reg/m32
    145 	xArgRegM8        // arg reg/m8
    146 	xArgRel16        // arg rel16
    147 	xArgRel32        // arg rel32
    148 	xArgRel8         // arg rel8
    149 	xArgSS           // arg SS
    150 	xArgST           // arg ST, aka ST(0)
    151 	xArgSTi          // arg ST(i) with +i in opcode
    152 	xArgSreg         // arg Sreg
    153 	xArgTR0dashTR7   // arg TR0-TR7
    154 	xArgXmm          // arg xmm
    155 	xArgXMM0         // arg <XMM0>
    156 	xArgXmm1         // arg xmm1
    157 	xArgXmm2         // arg xmm2
    158 	xArgXmm2M128     // arg xmm2/m128
    159 	xArgYmm2M256     // arg ymm2/m256
    160 	xArgXmm2M16      // arg xmm2/m16
    161 	xArgXmm2M32      // arg xmm2/m32
    162 	xArgXmm2M64      // arg xmm2/m64
    163 	xArgXmmM128      // arg xmm/m128
    164 	xArgXmmM32       // arg xmm/m32
    165 	xArgXmmM64       // arg xmm/m64
    166 	xArgYmm1         // arg ymm1
    167 	xArgRmf16        // arg r/m16 but force mod=3
    168 	xArgRmf32        // arg r/m32 but force mod=3
    169 	xArgRmf64        // arg r/m64 but force mod=3
    170 )
    171 
    172 // instPrefix returns an Inst describing just one prefix byte.
    173 // It is only used if there is a prefix followed by an unintelligible
    174 // or invalid instruction byte sequence.
    175 func instPrefix(b byte, mode int) (Inst, error) {
    176 	// When tracing it is useful to see what called instPrefix to report an error.
    177 	if trace {
    178 		_, file, line, _ := runtime.Caller(1)
    179 		fmt.Printf("%s:%d\n", file, line)
    180 	}
    181 	p := Prefix(b)
    182 	switch p {
    183 	case PrefixDataSize:
    184 		if mode == 16 {
    185 			p = PrefixData32
    186 		} else {
    187 			p = PrefixData16
    188 		}
    189 	case PrefixAddrSize:
    190 		if mode == 32 {
    191 			p = PrefixAddr16
    192 		} else {
    193 			p = PrefixAddr32
    194 		}
    195 	}
    196 	// Note: using composite literal with Prefix key confuses 'bundle' tool.
    197 	inst := Inst{Len: 1}
    198 	inst.Prefix = Prefixes{p}
    199 	return inst, nil
    200 }
    201 
    202 // truncated reports a truncated instruction.
    203 // For now we use instPrefix but perhaps later we will return
    204 // a specific error here.
    205 func truncated(src []byte, mode int) (Inst, error) {
    206 	//	return Inst{}, len(src), ErrTruncated
    207 	return instPrefix(src[0], mode) // too long
    208 }
    209 
    210 // These are the errors returned by Decode.
    211 var (
    212 	ErrInvalidMode  = errors.New("invalid x86 mode in Decode") // mode is not 16, 32, or 64
    213 	ErrTruncated    = errors.New("truncated instruction")
    214 	ErrUnrecognized = errors.New("unrecognized instruction")
    215 )
    216 
    217 // decoderCover records which words of the decoder byte code have been
    218 // executed: decode1 sets decoderCover[pc] for every pc it runs.
    219 // TODO(rsc): This is for testing. Only use this if a flag is given.
    220 var decoderCover []bool // allocated by decode1 on first use; NOTE(review): no locking visible — confirm concurrent Decode calls are safe
    221 
    222 // Decode decodes the leading bytes in src as a single instruction.
    223 // The mode arguments specifies the assumed processor mode:
    224 // 16, 32, or 64 for 16-, 32-, and 64-bit execution modes.
    225 func Decode(src []byte, mode int) (inst Inst, err error) {
    226 	return decode1(src, mode, false)
    227 }
    228 
    229 // decode1 is the implementation of Decode but takes an extra
    230 // gnuCompat flag to cause it to change its behavior to mimic
    231 // bugs (or at least unique features) of GNU libopcodes as used
    232 // by objdump. We don't believe that logic is the right thing to do
    233 // in general, but when testing against libopcodes it simplifies the
    234 // comparison if we adjust a few small pieces of logic.
    235 // The affected logic is in the conditional branch for "mandatory" prefixes,
    236 // case xCondPrefix.
    237 func decode1(src []byte, mode int, gnuCompat bool) (Inst, error) {
    238 	switch mode {
    239 	case 16, 32, 64:
    240 		// ok
    241 		// TODO(rsc): 64-bit mode not tested, probably not working.
    242 	default:
    243 		return Inst{}, ErrInvalidMode
    244 	}
    245 
    246 	// Maximum instruction size is 15 bytes.
    247 	// If we need to read more, return 'truncated instruction'.
    248 	if len(src) > 15 {
    249 		src = src[:15]
    250 	}
    251 
    252 	var (
    253 		// prefix decoding information
    254 		pos           = 0    // position reading src
    255 		nprefix       = 0    // number of prefixes
    256 		lockIndex     = -1   // index of LOCK prefix in src and inst.Prefix
    257 		repIndex      = -1   // index of REP/REPN prefix in src and inst.Prefix
    258 		segIndex      = -1   // index of Group 2 prefix in src and inst.Prefix
    259 		dataSizeIndex = -1   // index of Group 3 prefix in src and inst.Prefix
    260 		addrSizeIndex = -1   // index of Group 4 prefix in src and inst.Prefix
    261 		rex           Prefix // rex byte if present (or 0)
    262 		rexUsed       Prefix // bits used in rex byte
    263 		rexIndex      = -1   // index of rex byte
    264 		vex           Prefix // use vex encoding
    265 		vexIndex      = -1   // index of vex prefix
    266 
    267 		addrMode = mode // address mode (width in bits)
    268 		dataMode = mode // operand mode (width in bits)
    269 
    270 		// decoded ModR/M fields
    271 		haveModrm bool
    272 		modrm     int
    273 		mod       int
    274 		regop     int
    275 		rm        int
    276 
    277 		// if ModR/M is memory reference, Mem form
    278 		mem     Mem
    279 		haveMem bool
    280 
    281 		// decoded SIB fields
    282 		haveSIB bool
    283 		sib     int
    284 		scale   int
    285 		index   int
    286 		base    int
    287 		displen int
    288 		dispoff int
    289 
    290 		// decoded immediate values
    291 		imm     int64
    292 		imm8    int8
    293 		immc    int64
    294 		immcpos int
    295 
    296 		// output
    297 		opshift int
    298 		inst    Inst
    299 		narg    int // number of arguments written to inst
    300 	)
    301 
    302 	if mode == 64 {
    303 		dataMode = 32
    304 	}
    305 
    306 	// Prefixes are certainly the most complex and underspecified part of
    307 	// decoding x86 instructions. Although the manuals say things like
    308 	// up to four prefixes, one from each group, nearly everyone seems to
    309 	// agree that in practice as many prefixes as possible, including multiple
    310 	// from a particular group or repetitions of a given prefix, can be used on
    311 	// an instruction, provided the total instruction length including prefixes
    312 	// does not exceed the agreed-upon maximum of 15 bytes.
    313 	// Everyone also agrees that if one of these prefixes is the LOCK prefix
    314 	// and the instruction is not one of the instructions that can be used with
    315 	// the LOCK prefix or if the destination is not a memory operand,
    316 	// then the instruction is invalid and produces the #UD exception.
    317 	// However, that is the end of any semblance of agreement.
    318 	//
    319 	// What happens if prefixes are given that conflict with other prefixes?
    320 	// For example, the memory segment overrides CS, DS, ES, FS, GS, SS
    321 	// conflict with each other: only one segment can be in effect.
    322 	// Disassemblers seem to agree that later prefixes take priority over
    323 	// earlier ones. I have not taken the time to write assembly programs
    324 	// to check to see if the hardware agrees.
    325 	//
    326 	// What happens if prefixes are given that have no meaning for the
    327 	// specific instruction to which they are attached? It depends.
    328 	// If they really have no meaning, they are ignored. However, a future
    329 	// processor may assign a different meaning. As a disassembler, we
    330 	// don't really know whether we're seeing a meaningless prefix or one
    331 	// whose meaning we simply haven't been told yet.
    332 	//
    333 	// Combining the two questions, what happens when conflicting
    334 	// extension prefixes are given? No one seems to know for sure.
    335 	// For example, MOVQ is 66 0F D6 /r, MOVDQ2Q is F2 0F D6 /r,
    336 	// and MOVQ2DQ is F3 0F D6 /r. What is '66 F2 F3 0F D6 /r'?
    337 	// Which prefix wins? See the xCondPrefix case for more.
    338 	//
    339 	// Writing assembly test cases to divine which interpretation the
    340 	// CPU uses might clarify the situation, but more likely it would
    341 	// make the situation even less clear.
    342 
    343 	// Read non-REX prefixes.
    344 ReadPrefixes:
    345 	for ; pos < len(src); pos++ {
    346 		p := Prefix(src[pos])
    347 		switch p {
    348 		default:
    349 			nprefix = pos
    350 			break ReadPrefixes
    351 
    352 		// Group 1 - lock and repeat prefixes
    353 		// According to Intel, there should only be one from this set,
    354 		// but according to AMD both can be present.
    355 		case 0xF0:
    356 			if lockIndex >= 0 {
    357 				inst.Prefix[lockIndex] |= PrefixIgnored
    358 			}
    359 			lockIndex = pos
    360 		case 0xF2, 0xF3:
    361 			if repIndex >= 0 {
    362 				inst.Prefix[repIndex] |= PrefixIgnored
    363 			}
    364 			repIndex = pos
    365 
    366 		// Group 2 - segment override / branch hints
    367 		case 0x26, 0x2E, 0x36, 0x3E:
    368 			if mode == 64 {
    369 				p |= PrefixIgnored
    370 				break
    371 			}
    372 			fallthrough
    373 		case 0x64, 0x65:
    374 			if segIndex >= 0 {
    375 				inst.Prefix[segIndex] |= PrefixIgnored
    376 			}
    377 			segIndex = pos
    378 
    379 		// Group 3 - operand size override
    380 		case 0x66:
    381 			if mode == 16 {
    382 				dataMode = 32
    383 				p = PrefixData32
    384 			} else {
    385 				dataMode = 16
    386 				p = PrefixData16
    387 			}
    388 			if dataSizeIndex >= 0 {
    389 				inst.Prefix[dataSizeIndex] |= PrefixIgnored
    390 			}
    391 			dataSizeIndex = pos
    392 
    393 		// Group 4 - address size override
    394 		case 0x67:
    395 			if mode == 32 {
    396 				addrMode = 16
    397 				p = PrefixAddr16
    398 			} else {
    399 				addrMode = 32
    400 				p = PrefixAddr32
    401 			}
    402 			if addrSizeIndex >= 0 {
    403 				inst.Prefix[addrSizeIndex] |= PrefixIgnored
    404 			}
    405 			addrSizeIndex = pos
    406 
    407 		//Group 5 - Vex encoding
    408 		case 0xC5:
    409 			if pos == 0 && (mode == 64 || (mode == 32 && pos+1 < len(src) && src[pos+1]&0xc0 == 0xc0)) {
    410 				vex = p
    411 				vexIndex = pos
    412 				inst.Prefix[pos] = p
    413 				inst.Prefix[pos+1] = Prefix(src[pos+1])
    414 				pos += 1
    415 				continue
    416 			} else {
    417 				nprefix = pos
    418 				break ReadPrefixes
    419 			}
    420 		case 0xC4:
    421 			if pos == 0 && (mode == 64 || (mode == 32 && pos+2 < len(src) && src[pos+1]&0xc0 == 0xc0)) {
    422 				vex = p
    423 				vexIndex = pos
    424 				inst.Prefix[pos] = p
    425 				inst.Prefix[pos+1] = Prefix(src[pos+1])
    426 				inst.Prefix[pos+2] = Prefix(src[pos+2])
    427 				pos += 2
    428 				continue
    429 			} else {
    430 				nprefix = pos
    431 				break ReadPrefixes
    432 			}
    433 		}
    434 
    435 		if pos >= len(inst.Prefix) {
    436 			return instPrefix(src[0], mode) // too long
    437 		}
    438 
    439 		inst.Prefix[pos] = p
    440 	}
    441 
    442 	// Read REX prefix.
    443 	if pos < len(src) && mode == 64 && Prefix(src[pos]).IsREX() && vex == 0 {
    444 		rex = Prefix(src[pos])
    445 		rexIndex = pos
    446 		if pos >= len(inst.Prefix) {
    447 			return instPrefix(src[0], mode) // too long
    448 		}
    449 		inst.Prefix[pos] = rex
    450 		pos++
    451 		if rex&PrefixREXW != 0 {
    452 			dataMode = 64
    453 			if dataSizeIndex >= 0 {
    454 				inst.Prefix[dataSizeIndex] |= PrefixIgnored
    455 			}
    456 		}
    457 	}
    458 
    459 	// Decode instruction stream, interpreting decoding instructions.
    460 	// opshift gives the shift to use when saving the next
    461 	// opcode byte into inst.Opcode.
    462 	opshift = 24
    463 	if decoderCover == nil {
    464 		decoderCover = make([]bool, len(decoder))
    465 	}
    466 
    467 	// Decode loop, executing decoder program.
    468 	var oldPC, prevPC int
    469 Decode:
    470 	for pc := 1; ; { // TODO uint
    471 		oldPC = prevPC
    472 		prevPC = pc
    473 		if trace {
    474 			println("run", pc)
    475 		}
    476 		x := decoder[pc]
    477 		decoderCover[pc] = true
    478 		pc++
    479 
    480 		// Read and decode ModR/M if needed by opcode.
    481 		switch decodeOp(x) {
    482 		case xCondSlashR, xReadSlashR:
    483 			if haveModrm {
    484 				return Inst{Len: pos}, errInternal
    485 			}
    486 			haveModrm = true
    487 			if pos >= len(src) {
    488 				return truncated(src, mode)
    489 			}
    490 			modrm = int(src[pos])
    491 			pos++
    492 			if opshift >= 0 {
    493 				inst.Opcode |= uint32(modrm) << uint(opshift)
    494 				opshift -= 8
    495 			}
    496 			mod = modrm >> 6
    497 			regop = (modrm >> 3) & 07
    498 			rm = modrm & 07
    499 			if rex&PrefixREXR != 0 {
    500 				rexUsed |= PrefixREXR
    501 				regop |= 8
    502 			}
    503 			if addrMode == 16 {
    504 				// 16-bit modrm form
    505 				if mod != 3 {
    506 					haveMem = true
    507 					mem = addr16[rm]
    508 					if rm == 6 && mod == 0 {
    509 						mem.Base = 0
    510 					}
    511 
    512 					// Consume disp16 if present.
    513 					if mod == 0 && rm == 6 || mod == 2 {
    514 						if pos+2 > len(src) {
    515 							return truncated(src, mode)
    516 						}
    517 						mem.Disp = int64(binary.LittleEndian.Uint16(src[pos:]))
    518 						pos += 2
    519 					}
    520 
    521 					// Consume disp8 if present.
    522 					if mod == 1 {
    523 						if pos >= len(src) {
    524 							return truncated(src, mode)
    525 						}
    526 						mem.Disp = int64(int8(src[pos]))
    527 						pos++
    528 					}
    529 				}
    530 			} else {
    531 				haveMem = mod != 3
    532 
    533 				// 32-bit or 64-bit form
    534 				// Consume SIB encoding if present.
    535 				if rm == 4 && mod != 3 {
    536 					haveSIB = true
    537 					if pos >= len(src) {
    538 						return truncated(src, mode)
    539 					}
    540 					sib = int(src[pos])
    541 					pos++
    542 					if opshift >= 0 {
    543 						inst.Opcode |= uint32(sib) << uint(opshift)
    544 						opshift -= 8
    545 					}
    546 					scale = sib >> 6
    547 					index = (sib >> 3) & 07
    548 					base = sib & 07
    549 					if rex&PrefixREXB != 0 || vex == 0xC4 && inst.Prefix[vexIndex+1]&0x20 == 0 {
    550 						rexUsed |= PrefixREXB
    551 						base |= 8
    552 					}
    553 					if rex&PrefixREXX != 0 || vex == 0xC4 && inst.Prefix[vexIndex+1]&0x40 == 0 {
    554 						rexUsed |= PrefixREXX
    555 						index |= 8
    556 					}
    557 
    558 					mem.Scale = 1 << uint(scale)
    559 					if index == 4 {
    560 						// no mem.Index
    561 					} else {
    562 						mem.Index = baseRegForBits(addrMode) + Reg(index)
    563 					}
    564 					if base&7 == 5 && mod == 0 {
    565 						// no mem.Base
    566 					} else {
    567 						mem.Base = baseRegForBits(addrMode) + Reg(base)
    568 					}
    569 				} else {
    570 					if rex&PrefixREXB != 0 {
    571 						rexUsed |= PrefixREXB
    572 						rm |= 8
    573 					}
    574 					if mod == 0 && rm&7 == 5 || rm&7 == 4 {
    575 						// base omitted
    576 					} else if mod != 3 {
    577 						mem.Base = baseRegForBits(addrMode) + Reg(rm)
    578 					}
    579 				}
    580 
    581 				// Consume disp32 if present.
    582 				if mod == 0 && (rm&7 == 5 || haveSIB && base&7 == 5) || mod == 2 {
    583 					if pos+4 > len(src) {
    584 						return truncated(src, mode)
    585 					}
    586 					dispoff = pos
    587 					displen = 4
    588 					mem.Disp = int64(binary.LittleEndian.Uint32(src[pos:]))
    589 					pos += 4
    590 				}
    591 
    592 				// Consume disp8 if present.
    593 				if mod == 1 {
    594 					if pos >= len(src) {
    595 						return truncated(src, mode)
    596 					}
    597 					dispoff = pos
    598 					displen = 1
    599 					mem.Disp = int64(int8(src[pos]))
    600 					pos++
    601 				}
    602 
    603 				// In 64-bit, mod=0 rm=5 is PC-relative instead of just disp.
    604 				// See Vol 2A. Table 2-7.
    605 				if mode == 64 && mod == 0 && rm&7 == 5 {
    606 					if addrMode == 32 {
    607 						mem.Base = EIP
    608 					} else {
    609 						mem.Base = RIP
    610 					}
    611 				}
    612 			}
    613 
    614 			if segIndex >= 0 {
    615 				mem.Segment = prefixToSegment(inst.Prefix[segIndex])
    616 			}
    617 		}
    618 
    619 		// Execute single opcode.
    620 		switch decodeOp(x) {
    621 		default:
    622 			println("bad op", x, "at", pc-1, "from", oldPC)
    623 			return Inst{Len: pos}, errInternal
    624 
    625 		case xFail:
    626 			inst.Op = 0
    627 			break Decode
    628 
    629 		case xMatch:
    630 			break Decode
    631 
    632 		case xJump:
    633 			pc = int(decoder[pc])
    634 
    635 		// Conditional branches.
    636 
    637 		case xCondByte:
    638 			if pos >= len(src) {
    639 				return truncated(src, mode)
    640 			}
    641 			b := src[pos]
    642 			n := int(decoder[pc])
    643 			pc++
    644 			for i := 0; i < n; i++ {
    645 				xb, xpc := decoder[pc], int(decoder[pc+1])
    646 				pc += 2
    647 				if b == byte(xb) {
    648 					pc = xpc
    649 					pos++
    650 					if opshift >= 0 {
    651 						inst.Opcode |= uint32(b) << uint(opshift)
    652 						opshift -= 8
    653 					}
    654 					continue Decode
    655 				}
    656 			}
    657 			// xCondByte is the only conditional with a fall through,
    658 			// so that it can be used to pick off special cases before
    659 			// an xCondSlash. If the fallthrough instruction is xFail,
    660 			// advance the position so that the decoded instruction
    661 			// size includes the byte we just compared against.
    662 			if decodeOp(decoder[pc]) == xJump {
    663 				pc = int(decoder[pc+1])
    664 			}
    665 			if decodeOp(decoder[pc]) == xFail {
    666 				pos++
    667 			}
    668 
    669 		case xCondIs64:
    670 			if mode == 64 {
    671 				pc = int(decoder[pc+1])
    672 			} else {
    673 				pc = int(decoder[pc])
    674 			}
    675 
    676 		case xCondIsMem:
    677 			mem := haveMem
    678 			if !haveModrm {
    679 				if pos >= len(src) {
    680 					return instPrefix(src[0], mode) // too long
    681 				}
    682 				mem = src[pos]>>6 != 3
    683 			}
    684 			if mem {
    685 				pc = int(decoder[pc+1])
    686 			} else {
    687 				pc = int(decoder[pc])
    688 			}
    689 
    690 		case xCondDataSize:
    691 			switch dataMode {
    692 			case 16:
    693 				if dataSizeIndex >= 0 {
    694 					inst.Prefix[dataSizeIndex] |= PrefixImplicit
    695 				}
    696 				pc = int(decoder[pc])
    697 			case 32:
    698 				if dataSizeIndex >= 0 {
    699 					inst.Prefix[dataSizeIndex] |= PrefixImplicit
    700 				}
    701 				pc = int(decoder[pc+1])
    702 			case 64:
    703 				rexUsed |= PrefixREXW
    704 				pc = int(decoder[pc+2])
    705 			}
    706 
    707 		case xCondAddrSize:
    708 			switch addrMode {
    709 			case 16:
    710 				if addrSizeIndex >= 0 {
    711 					inst.Prefix[addrSizeIndex] |= PrefixImplicit
    712 				}
    713 				pc = int(decoder[pc])
    714 			case 32:
    715 				if addrSizeIndex >= 0 {
    716 					inst.Prefix[addrSizeIndex] |= PrefixImplicit
    717 				}
    718 				pc = int(decoder[pc+1])
    719 			case 64:
    720 				pc = int(decoder[pc+2])
    721 			}
    722 
    723 		case xCondPrefix:
    724 			// Conditional branch based on presence or absence of prefixes.
    725 			// The conflict cases here are completely undocumented and
    726 			// differ significantly between GNU libopcodes and Intel xed.
    727 			// I have not written assembly code to divine what various CPUs
    728 			// do, but it wouldn't surprise me if they are not consistent either.
    729 			//
    730 			// The basic idea is to switch on the presence of a prefix, so that
    731 			// for example:
    732 			//
    733 			//	xCondPrefix, 4
    734 			//	0xF3, 123,
    735 			//	0xF2, 234,
    736 			//	0x66, 345,
    737 			//	0, 456
    738 			//
    739 			// branch to 123 if the F3 prefix is present, 234 if the F2 prefix
    740 			// is present, 345 if the 66 prefix is present, and 456 otherwise.
    741 			// The prefixes are given in descending order so that the 0 will be last.
    742 			//
    743 			// It is unclear what should happen if multiple conditions are
    744 			// satisfied: what if F2 and F3 are both present, or if 66 and F2
    745 			// are present, or if all three are present? The one chosen becomes
    746 			// part of the opcode and the others do not. Perhaps the answer
    747 			// depends on the specific opcodes in question.
    748 			//
    749 			// The only clear example is that CRC32 is F2 0F 38 F1 /r, and
    750 			// it comes in 16-bit and 32-bit forms based on the 66 prefix,
    751 			// so 66 F2 0F 38 F1 /r should be treated as F2 taking priority,
    752 			// with the 66 being only an operand size override, and probably
    753 			// F2 66 0F 38 F1 /r should be treated the same.
    754 			// Perhaps that rule is specific to the case of CRC32, since no
    755 			// 66 0F 38 F1 instruction is defined (today) (that we know of).
    756 			// However, both libopcodes and xed seem to generalize this
    757 			// example and choose F2/F3 in preference to 66, and we
    758 			// do the same.
    759 			//
    760 			// Next, what if both F2 and F3 are present? Which wins?
    761 			// The Intel xed rule, and ours, is that the one that occurs last wins.
    762 			// The GNU libopcodes rule, which we implement only in gnuCompat mode,
    763 			// is that F3 beats F2 unless F3 has no special meaning, in which
    764 			// case F3 can be a modifier on an F2 special meaning.
    765 			//
    766 			// Concretely,
    767 			//	66 0F D6 /r is MOVQ
    768 			//	F2 0F D6 /r is MOVDQ2Q
    769 			//	F3 0F D6 /r is MOVQ2DQ.
    770 			//
    771 			//	F2 66 0F D6 /r is 66 + MOVDQ2Q always.
    772 			//	66 F2 0F D6 /r is 66 + MOVDQ2Q always.
    773 			//	F3 66 0F D6 /r is 66 + MOVQ2DQ always.
    774 			//	66 F3 0F D6 /r is 66 + MOVQ2DQ always.
    775 			//	F2 F3 0F D6 /r is F2 + MOVQ2DQ always.
    776 			//	F3 F2 0F D6 /r is F3 + MOVQ2DQ in Intel xed, but F2 + MOVQ2DQ in GNU libopcodes.
    777 			//	Adding 66 anywhere in the prefix section of the
    778 			//	last two cases does not change the outcome.
    779 			//
    780 			// Finally, what if there is a variant in which 66 is a mandatory
    781 			// prefix rather than an operand size override, but we know of
    782 			// no corresponding F2/F3 form, and we see both F2/F3 and 66.
    783 			// Does F2/F3 still take priority, so that the result is an unknown
    784 			// instruction, or does the 66 take priority, so that the extended
    785 			// 66 instruction should be interpreted as having a REP/REPN prefix?
    786 			// Intel xed does the former and GNU libopcodes does the latter.
    787 			// We side with Intel xed, unless we are trying to match libopcodes
    788 			// more closely during the comparison-based test suite.
    789 			//
    790 			// In 64-bit mode REX.W is another valid prefix to test for, but
    791 			// there is less ambiguity about that. When present, REX.W is
    792 			// always the first entry in the table.
    793 			n := int(decoder[pc])
    794 			pc++
    795 			sawF3 := false
    796 			for j := 0; j < n; j++ {
    797 				prefix := Prefix(decoder[pc+2*j])
    798 				if prefix.IsREX() {
    799 					rexUsed |= prefix
    800 					if rex&prefix == prefix {
    801 						pc = int(decoder[pc+2*j+1])
    802 						continue Decode
    803 					}
    804 					continue
    805 				}
    806 				ok := false
    807 				if prefix == 0 {
    808 					ok = true
    809 				} else if prefix.IsREX() {
    810 					rexUsed |= prefix
    811 					if rex&prefix == prefix {
    812 						ok = true
    813 					}
    814 				} else if prefix == 0xC5 || prefix == 0xC4 {
    815 					if vex == prefix {
    816 						ok = true
    817 					}
    818 				} else if vex != 0 && (prefix == 0x0F || prefix == 0x0F38 || prefix == 0x0F3A ||
    819 					prefix == 0x66 || prefix == 0xF2 || prefix == 0xF3) {
    820 					var vexM, vexP Prefix
    821 					if vex == 0xC5 {
    822 						vexM = 1 // 2 byte vex always implies 0F
    823 						vexP = inst.Prefix[vexIndex+1]
    824 					} else {
    825 						vexM = inst.Prefix[vexIndex+1]
    826 						vexP = inst.Prefix[vexIndex+2]
    827 					}
    828 					switch prefix {
    829 					case 0x66:
    830 						ok = vexP&3 == 1
    831 					case 0xF3:
    832 						ok = vexP&3 == 2
    833 					case 0xF2:
    834 						ok = vexP&3 == 3
    835 					case 0x0F:
    836 						ok = vexM&3 == 1
    837 					case 0x0F38:
    838 						ok = vexM&3 == 2
    839 					case 0x0F3A:
    840 						ok = vexM&3 == 3
    841 					}
    842 				} else {
    843 					if prefix == 0xF3 {
    844 						sawF3 = true
    845 					}
    846 					switch prefix {
    847 					case PrefixLOCK:
    848 						if lockIndex >= 0 {
    849 							inst.Prefix[lockIndex] |= PrefixImplicit
    850 							ok = true
    851 						}
    852 					case PrefixREP, PrefixREPN:
    853 						if repIndex >= 0 && inst.Prefix[repIndex]&0xFF == prefix {
    854 							inst.Prefix[repIndex] |= PrefixImplicit
    855 							ok = true
    856 						}
    857 						if gnuCompat && !ok && prefix == 0xF3 && repIndex >= 0 && (j+1 >= n || decoder[pc+2*(j+1)] != 0xF2) {
    858 							// Check to see if earlier prefix F3 is present.
    859 							for i := repIndex - 1; i >= 0; i-- {
    860 								if inst.Prefix[i]&0xFF == prefix {
    861 									inst.Prefix[i] |= PrefixImplicit
    862 									ok = true
    863 								}
    864 							}
    865 						}
    866 						if gnuCompat && !ok && prefix == 0xF2 && repIndex >= 0 && !sawF3 && inst.Prefix[repIndex]&0xFF == 0xF3 {
    867 							// Check to see if earlier prefix F2 is present.
    868 							for i := repIndex - 1; i >= 0; i-- {
    869 								if inst.Prefix[i]&0xFF == prefix {
    870 									inst.Prefix[i] |= PrefixImplicit
    871 									ok = true
    872 								}
    873 							}
    874 						}
    875 					case PrefixCS, PrefixDS, PrefixES, PrefixFS, PrefixGS, PrefixSS:
    876 						if segIndex >= 0 && inst.Prefix[segIndex]&0xFF == prefix {
    877 							inst.Prefix[segIndex] |= PrefixImplicit
    878 							ok = true
    879 						}
    880 					case PrefixDataSize:
    881 						// Looking for 66 mandatory prefix.
    882 						// The F2/F3 mandatory prefixes take priority when both are present.
    883 						// If we got this far in the xCondPrefix table and an F2/F3 is present,
    884 						// it means the table didn't have any entry for that prefix. But if 66 has
    885 						// special meaning, perhaps F2/F3 have special meaning that we don't know.
    886 						// Intel xed works this way, treating the F2/F3 as inhibiting the 66.
    887 						// GNU libopcodes allows the 66 to match. We do what Intel xed does
    888 						// except in gnuCompat mode.
    889 						if repIndex >= 0 && !gnuCompat {
    890 							inst.Op = 0
    891 							break Decode
    892 						}
    893 						if dataSizeIndex >= 0 {
    894 							inst.Prefix[dataSizeIndex] |= PrefixImplicit
    895 							ok = true
    896 						}
    897 					case PrefixAddrSize:
    898 						if addrSizeIndex >= 0 {
    899 							inst.Prefix[addrSizeIndex] |= PrefixImplicit
    900 							ok = true
    901 						}
    902 					}
    903 				}
    904 				if ok {
    905 					pc = int(decoder[pc+2*j+1])
    906 					continue Decode
    907 				}
    908 			}
    909 			inst.Op = 0
    910 			break Decode
    911 
    912 		case xCondSlashR:
    913 			pc = int(decoder[pc+regop&7])
    914 
    915 		// Input.
    916 
    917 		case xReadSlashR:
    918 			// done above
    919 
    920 		case xReadIb:
    921 			if pos >= len(src) {
    922 				return truncated(src, mode)
    923 			}
    924 			imm8 = int8(src[pos])
    925 			pos++
    926 
    927 		case xReadIw:
    928 			if pos+2 > len(src) {
    929 				return truncated(src, mode)
    930 			}
    931 			imm = int64(binary.LittleEndian.Uint16(src[pos:]))
    932 			pos += 2
    933 
    934 		case xReadId:
    935 			if pos+4 > len(src) {
    936 				return truncated(src, mode)
    937 			}
    938 			imm = int64(binary.LittleEndian.Uint32(src[pos:]))
    939 			pos += 4
    940 
    941 		case xReadIo:
    942 			if pos+8 > len(src) {
    943 				return truncated(src, mode)
    944 			}
    945 			imm = int64(binary.LittleEndian.Uint64(src[pos:]))
    946 			pos += 8
    947 
    948 		case xReadCb:
    949 			if pos >= len(src) {
    950 				return truncated(src, mode)
    951 			}
    952 			immcpos = pos
    953 			immc = int64(src[pos])
    954 			pos++
    955 
    956 		case xReadCw:
    957 			if pos+2 > len(src) {
    958 				return truncated(src, mode)
    959 			}
    960 			immcpos = pos
    961 			immc = int64(binary.LittleEndian.Uint16(src[pos:]))
    962 			pos += 2
    963 
    964 		case xReadCm:
    965 			immcpos = pos
    966 			if addrMode == 16 {
    967 				if pos+2 > len(src) {
    968 					return truncated(src, mode)
    969 				}
    970 				immc = int64(binary.LittleEndian.Uint16(src[pos:]))
    971 				pos += 2
    972 			} else if addrMode == 32 {
    973 				if pos+4 > len(src) {
    974 					return truncated(src, mode)
    975 				}
    976 				immc = int64(binary.LittleEndian.Uint32(src[pos:]))
    977 				pos += 4
    978 			} else {
    979 				if pos+8 > len(src) {
    980 					return truncated(src, mode)
    981 				}
    982 				immc = int64(binary.LittleEndian.Uint64(src[pos:]))
    983 				pos += 8
    984 			}
    985 		case xReadCd:
    986 			immcpos = pos
    987 			if pos+4 > len(src) {
    988 				return truncated(src, mode)
    989 			}
    990 			immc = int64(binary.LittleEndian.Uint32(src[pos:]))
    991 			pos += 4
    992 
    993 		case xReadCp:
    994 			immcpos = pos
    995 			if pos+6 > len(src) {
    996 				return truncated(src, mode)
    997 			}
    998 			w := binary.LittleEndian.Uint32(src[pos:])
    999 			w2 := binary.LittleEndian.Uint16(src[pos+4:])
   1000 			immc = int64(w2)<<32 | int64(w)
   1001 			pos += 6
   1002 
   1003 		// Output.
   1004 
   1005 		case xSetOp:
   1006 			inst.Op = Op(decoder[pc])
   1007 			pc++
   1008 
   1009 		case xArg1,
   1010 			xArg3,
   1011 			xArgAL,
   1012 			xArgAX,
   1013 			xArgCL,
   1014 			xArgCS,
   1015 			xArgDS,
   1016 			xArgDX,
   1017 			xArgEAX,
   1018 			xArgEDX,
   1019 			xArgES,
   1020 			xArgFS,
   1021 			xArgGS,
   1022 			xArgRAX,
   1023 			xArgRDX,
   1024 			xArgSS,
   1025 			xArgST,
   1026 			xArgXMM0:
   1027 			inst.Args[narg] = fixedArg[x]
   1028 			narg++
   1029 
   1030 		case xArgImm8:
   1031 			inst.Args[narg] = Imm(imm8)
   1032 			narg++
   1033 
   1034 		case xArgImm8u:
   1035 			inst.Args[narg] = Imm(uint8(imm8))
   1036 			narg++
   1037 
   1038 		case xArgImm16:
   1039 			inst.Args[narg] = Imm(int16(imm))
   1040 			narg++
   1041 
   1042 		case xArgImm16u:
   1043 			inst.Args[narg] = Imm(uint16(imm))
   1044 			narg++
   1045 
   1046 		case xArgImm32:
   1047 			inst.Args[narg] = Imm(int32(imm))
   1048 			narg++
   1049 
   1050 		case xArgImm64:
   1051 			inst.Args[narg] = Imm(imm)
   1052 			narg++
   1053 
   1054 		case xArgM,
   1055 			xArgM128,
   1056 			xArgM256,
   1057 			xArgM1428byte,
   1058 			xArgM16,
   1059 			xArgM16and16,
   1060 			xArgM16and32,
   1061 			xArgM16and64,
   1062 			xArgM16colon16,
   1063 			xArgM16colon32,
   1064 			xArgM16colon64,
   1065 			xArgM16int,
   1066 			xArgM2byte,
   1067 			xArgM32,
   1068 			xArgM32and32,
   1069 			xArgM32fp,
   1070 			xArgM32int,
   1071 			xArgM512byte,
   1072 			xArgM64,
   1073 			xArgM64fp,
   1074 			xArgM64int,
   1075 			xArgM8,
   1076 			xArgM80bcd,
   1077 			xArgM80dec,
   1078 			xArgM80fp,
   1079 			xArgM94108byte,
   1080 			xArgMem:
   1081 			if !haveMem {
   1082 				inst.Op = 0
   1083 				break Decode
   1084 			}
   1085 			inst.Args[narg] = mem
   1086 			inst.MemBytes = int(memBytes[decodeOp(x)])
   1087 			if mem.Base == RIP {
   1088 				inst.PCRel = displen
   1089 				inst.PCRelOff = dispoff
   1090 			}
   1091 			narg++
   1092 
   1093 		case xArgPtr16colon16:
   1094 			inst.Args[narg] = Imm(immc >> 16)
   1095 			inst.Args[narg+1] = Imm(immc & (1<<16 - 1))
   1096 			narg += 2
   1097 
   1098 		case xArgPtr16colon32:
   1099 			inst.Args[narg] = Imm(immc >> 32)
   1100 			inst.Args[narg+1] = Imm(immc & (1<<32 - 1))
   1101 			narg += 2
   1102 
   1103 		case xArgMoffs8, xArgMoffs16, xArgMoffs32, xArgMoffs64:
   1104 			// TODO(rsc): Can address be 64 bits?
   1105 			mem = Mem{Disp: int64(immc)}
   1106 			if segIndex >= 0 {
   1107 				mem.Segment = prefixToSegment(inst.Prefix[segIndex])
   1108 				inst.Prefix[segIndex] |= PrefixImplicit
   1109 			}
   1110 			inst.Args[narg] = mem
   1111 			inst.MemBytes = int(memBytes[decodeOp(x)])
   1112 			if mem.Base == RIP {
   1113 				inst.PCRel = displen
   1114 				inst.PCRelOff = dispoff
   1115 			}
   1116 			narg++
   1117 
   1118 		case xArgYmm1:
   1119 			base := baseReg[x]
   1120 			index := Reg(regop)
   1121 			if inst.Prefix[vexIndex+1]&0x80 == 0 {
   1122 				index += 8
   1123 			}
   1124 			inst.Args[narg] = base + index
   1125 			narg++
   1126 
   1127 		case xArgR8, xArgR16, xArgR32, xArgR64, xArgXmm, xArgXmm1, xArgDR0dashDR7:
   1128 			base := baseReg[x]
   1129 			index := Reg(regop)
   1130 			if rex != 0 && base == AL && index >= 4 {
   1131 				rexUsed |= PrefixREX
   1132 				index -= 4
   1133 				base = SPB
   1134 			}
   1135 			inst.Args[narg] = base + index
   1136 			narg++
   1137 
   1138 		case xArgMm, xArgMm1, xArgTR0dashTR7:
   1139 			inst.Args[narg] = baseReg[x] + Reg(regop&7)
   1140 			narg++
   1141 
   1142 		case xArgCR0dashCR7:
   1143 			// AMD documents an extension that the LOCK prefix
   1144 			// can be used in place of a REX prefix in order to access
   1145 			// CR8 from 32-bit mode. The LOCK prefix is allowed in
   1146 			// all modes, provided the corresponding CPUID bit is set.
   1147 			if lockIndex >= 0 {
   1148 				inst.Prefix[lockIndex] |= PrefixImplicit
   1149 				regop += 8
   1150 			}
   1151 			inst.Args[narg] = CR0 + Reg(regop)
   1152 			narg++
   1153 
   1154 		case xArgSreg:
   1155 			regop &= 7
   1156 			if regop >= 6 {
   1157 				inst.Op = 0
   1158 				break Decode
   1159 			}
   1160 			inst.Args[narg] = ES + Reg(regop)
   1161 			narg++
   1162 
   1163 		case xArgRmf16, xArgRmf32, xArgRmf64:
   1164 			base := baseReg[x]
   1165 			index := Reg(modrm & 07)
   1166 			if rex&PrefixREXB != 0 {
   1167 				rexUsed |= PrefixREXB
   1168 				index += 8
   1169 			}
   1170 			inst.Args[narg] = base + index
   1171 			narg++
   1172 
   1173 		case xArgR8op, xArgR16op, xArgR32op, xArgR64op, xArgSTi:
   1174 			n := inst.Opcode >> uint(opshift+8) & 07
   1175 			base := baseReg[x]
   1176 			index := Reg(n)
   1177 			if rex&PrefixREXB != 0 && decodeOp(x) != xArgSTi {
   1178 				rexUsed |= PrefixREXB
   1179 				index += 8
   1180 			}
   1181 			if rex != 0 && base == AL && index >= 4 {
   1182 				rexUsed |= PrefixREX
   1183 				index -= 4
   1184 				base = SPB
   1185 			}
   1186 			inst.Args[narg] = base + index
   1187 			narg++
   1188 		case xArgRM8, xArgRM16, xArgRM32, xArgRM64, xArgR32M16, xArgR32M8, xArgR64M16,
   1189 			xArgMmM32, xArgMmM64, xArgMm2M64,
   1190 			xArgXmm2M16, xArgXmm2M32, xArgXmm2M64, xArgXmmM64, xArgXmmM128, xArgXmmM32, xArgXmm2M128,
   1191 			xArgYmm2M256:
   1192 			if haveMem {
   1193 				inst.Args[narg] = mem
   1194 				inst.MemBytes = int(memBytes[decodeOp(x)])
   1195 				if mem.Base == RIP {
   1196 					inst.PCRel = displen
   1197 					inst.PCRelOff = dispoff
   1198 				}
   1199 			} else {
   1200 				base := baseReg[x]
   1201 				index := Reg(rm)
   1202 				switch decodeOp(x) {
   1203 				case xArgMmM32, xArgMmM64, xArgMm2M64:
   1204 					// There are only 8 MMX registers, so these ignore the REX.X bit.
   1205 					index &= 7
   1206 				case xArgRM8:
   1207 					if rex != 0 && index >= 4 {
   1208 						rexUsed |= PrefixREX
   1209 						index -= 4
   1210 						base = SPB
   1211 					}
   1212 				case xArgYmm2M256:
   1213 					if vex == 0xC4 && inst.Prefix[vexIndex+1]&0x40 == 0x40 {
   1214 						index += 8
   1215 					}
   1216 				}
   1217 				inst.Args[narg] = base + index
   1218 			}
   1219 			narg++
   1220 
   1221 		case xArgMm2: // register only; TODO(rsc): Handle with tag modrm_regonly tag
   1222 			if haveMem {
   1223 				inst.Op = 0
   1224 				break Decode
   1225 			}
   1226 			inst.Args[narg] = baseReg[x] + Reg(rm&7)
   1227 			narg++
   1228 
   1229 		case xArgXmm2: // register only; TODO(rsc): Handle with tag modrm_regonly tag
   1230 			if haveMem {
   1231 				inst.Op = 0
   1232 				break Decode
   1233 			}
   1234 			inst.Args[narg] = baseReg[x] + Reg(rm)
   1235 			narg++
   1236 
   1237 		case xArgRel8:
   1238 			inst.PCRelOff = immcpos
   1239 			inst.PCRel = 1
   1240 			inst.Args[narg] = Rel(int8(immc))
   1241 			narg++
   1242 
   1243 		case xArgRel16:
   1244 			inst.PCRelOff = immcpos
   1245 			inst.PCRel = 2
   1246 			inst.Args[narg] = Rel(int16(immc))
   1247 			narg++
   1248 
   1249 		case xArgRel32:
   1250 			inst.PCRelOff = immcpos
   1251 			inst.PCRel = 4
   1252 			inst.Args[narg] = Rel(int32(immc))
   1253 			narg++
   1254 		}
   1255 	}
   1256 
   1257 	if inst.Op == 0 {
   1258 		// Invalid instruction.
   1259 		if nprefix > 0 {
   1260 			return instPrefix(src[0], mode) // invalid instruction
   1261 		}
   1262 		return Inst{Len: pos}, ErrUnrecognized
   1263 	}
   1264 
   1265 	// Matched! Hooray!
   1266 
   1267 	// 90 decodes as XCHG EAX, EAX but is NOP.
   1268 	// 66 90 decodes as XCHG AX, AX and is NOP too.
   1269 	// 48 90 decodes as XCHG RAX, RAX and is NOP too.
   1270 	// 43 90 decodes as XCHG R8D, EAX and is *not* NOP.
   1271 	// F3 90 decodes as REP XCHG EAX, EAX but is PAUSE.
   1272 	// It's all too special to handle in the decoding tables, at least for now.
   1273 	if inst.Op == XCHG && inst.Opcode>>24 == 0x90 {
   1274 		if inst.Args[0] == RAX || inst.Args[0] == EAX || inst.Args[0] == AX {
   1275 			inst.Op = NOP
   1276 			if dataSizeIndex >= 0 {
   1277 				inst.Prefix[dataSizeIndex] &^= PrefixImplicit
   1278 			}
   1279 			inst.Args[0] = nil
   1280 			inst.Args[1] = nil
   1281 		}
   1282 		if repIndex >= 0 && inst.Prefix[repIndex] == 0xF3 {
   1283 			inst.Prefix[repIndex] |= PrefixImplicit
   1284 			inst.Op = PAUSE
   1285 			inst.Args[0] = nil
   1286 			inst.Args[1] = nil
   1287 		} else if gnuCompat {
   1288 			for i := nprefix - 1; i >= 0; i-- {
   1289 				if inst.Prefix[i]&0xFF == 0xF3 {
   1290 					inst.Prefix[i] |= PrefixImplicit
   1291 					inst.Op = PAUSE
   1292 					inst.Args[0] = nil
   1293 					inst.Args[1] = nil
   1294 					break
   1295 				}
   1296 			}
   1297 		}
   1298 	}
   1299 
   1300 	// defaultSeg returns the default segment for an implicit
   1301 	// memory reference: the final override if present, or else DS.
   1302 	defaultSeg := func() Reg {
   1303 		if segIndex >= 0 {
   1304 			inst.Prefix[segIndex] |= PrefixImplicit
   1305 			return prefixToSegment(inst.Prefix[segIndex])
   1306 		}
   1307 		return DS
   1308 	}
   1309 
   1310 	// Add implicit arguments not present in the tables.
   1311 	// Normally we shy away from making implicit arguments explicit,
   1312 	// following the Intel manuals, but adding the arguments seems
   1313 	// the best way to express the effect of the segment override prefixes.
   1314 	// TODO(rsc): Perhaps add these to the tables and
   1315 	// create bytecode instructions for them.
   1316 	usedAddrSize := false
   1317 	switch inst.Op {
   1318 	case INSB, INSW, INSD:
   1319 		inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
   1320 		inst.Args[1] = DX
   1321 		usedAddrSize = true
   1322 
   1323 	case OUTSB, OUTSW, OUTSD:
   1324 		inst.Args[0] = DX
   1325 		inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
   1326 		usedAddrSize = true
   1327 
   1328 	case MOVSB, MOVSW, MOVSD, MOVSQ:
   1329 		inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
   1330 		inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
   1331 		usedAddrSize = true
   1332 
   1333 	case CMPSB, CMPSW, CMPSD, CMPSQ:
   1334 		inst.Args[0] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
   1335 		inst.Args[1] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
   1336 		usedAddrSize = true
   1337 
   1338 	case LODSB, LODSW, LODSD, LODSQ:
   1339 		switch inst.Op {
   1340 		case LODSB:
   1341 			inst.Args[0] = AL
   1342 		case LODSW:
   1343 			inst.Args[0] = AX
   1344 		case LODSD:
   1345 			inst.Args[0] = EAX
   1346 		case LODSQ:
   1347 			inst.Args[0] = RAX
   1348 		}
   1349 		inst.Args[1] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + SI - AX}
   1350 		usedAddrSize = true
   1351 
   1352 	case STOSB, STOSW, STOSD, STOSQ:
   1353 		inst.Args[0] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
   1354 		switch inst.Op {
   1355 		case STOSB:
   1356 			inst.Args[1] = AL
   1357 		case STOSW:
   1358 			inst.Args[1] = AX
   1359 		case STOSD:
   1360 			inst.Args[1] = EAX
   1361 		case STOSQ:
   1362 			inst.Args[1] = RAX
   1363 		}
   1364 		usedAddrSize = true
   1365 
   1366 	case SCASB, SCASW, SCASD, SCASQ:
   1367 		inst.Args[1] = Mem{Segment: ES, Base: baseRegForBits(addrMode) + DI - AX}
   1368 		switch inst.Op {
   1369 		case SCASB:
   1370 			inst.Args[0] = AL
   1371 		case SCASW:
   1372 			inst.Args[0] = AX
   1373 		case SCASD:
   1374 			inst.Args[0] = EAX
   1375 		case SCASQ:
   1376 			inst.Args[0] = RAX
   1377 		}
   1378 		usedAddrSize = true
   1379 
   1380 	case XLATB:
   1381 		inst.Args[0] = Mem{Segment: defaultSeg(), Base: baseRegForBits(addrMode) + BX - AX}
   1382 		usedAddrSize = true
   1383 	}
   1384 
   1385 	// If we used the address size annotation to construct the
   1386 	// argument list, mark that prefix as implicit: it doesn't need
   1387 	// to be shown when printing the instruction.
   1388 	if haveMem || usedAddrSize {
   1389 		if addrSizeIndex >= 0 {
   1390 			inst.Prefix[addrSizeIndex] |= PrefixImplicit
   1391 		}
   1392 	}
   1393 
   1394 	// Similarly, if there's some memory operand, the segment
   1395 	// will be shown there and doesn't need to be shown as an
   1396 	// explicit prefix.
   1397 	if haveMem {
   1398 		if segIndex >= 0 {
   1399 			inst.Prefix[segIndex] |= PrefixImplicit
   1400 		}
   1401 	}
   1402 
   1403 	// Branch predict prefixes are overloaded segment prefixes,
   1404 	// since segment prefixes don't make sense on conditional jumps.
   1405 	// Rewrite final instance to prediction prefix.
   1406 	// The set of instructions to which the prefixes apply (other than the
   1407 	// Jcc conditional jumps) is not 100% clear from the manuals, but
   1408 	// the disassemblers seem to agree about the LOOP and JCXZ instructions,
   1409 	// so we'll follow along.
   1410 	// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
   1411 	if isCondJmp[inst.Op] || isLoop[inst.Op] || inst.Op == JCXZ || inst.Op == JECXZ || inst.Op == JRCXZ {
   1412 	PredictLoop:
   1413 		for i := nprefix - 1; i >= 0; i-- {
   1414 			p := inst.Prefix[i]
   1415 			switch p & 0xFF {
   1416 			case PrefixCS:
   1417 				inst.Prefix[i] = PrefixPN
   1418 				break PredictLoop
   1419 			case PrefixDS:
   1420 				inst.Prefix[i] = PrefixPT
   1421 				break PredictLoop
   1422 			}
   1423 		}
   1424 	}
   1425 
   1426 	// The BND prefix is part of the Intel Memory Protection Extensions (MPX).
   1427 	// A REPN applied to certain control transfers is a BND prefix to bound
   1428 	// the range of possible destinations. There's surprisingly little documentation
   1429 	// about this, so we just do what libopcodes and xed agree on.
   1430 	// In particular, it's unclear why a REPN applied to LOOP or JCXZ instructions
   1431 	// does not turn into a BND.
   1432 	// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
   1433 	if isCondJmp[inst.Op] || inst.Op == JMP || inst.Op == CALL || inst.Op == RET {
   1434 		for i := nprefix - 1; i >= 0; i-- {
   1435 			p := inst.Prefix[i]
   1436 			if p&^PrefixIgnored == PrefixREPN {
   1437 				inst.Prefix[i] = PrefixBND
   1438 				break
   1439 			}
   1440 		}
   1441 	}
   1442 
   1443 	// The LOCK prefix only applies to certain instructions, and then only
   1444 	// to instances of the instruction with a memory destination.
   1445 	// Other uses of LOCK are invalid and cause a processor exception,
   1446 	// in contrast to the "just ignore it" spirit applied to all other prefixes.
   1447 	// Mark invalid lock prefixes.
   1448 	hasLock := false
   1449 	if lockIndex >= 0 && inst.Prefix[lockIndex]&PrefixImplicit == 0 {
   1450 		switch inst.Op {
   1451 		// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
   1452 		case ADD, ADC, AND, BTC, BTR, BTS, CMPXCHG, CMPXCHG8B, CMPXCHG16B, DEC, INC, NEG, NOT, OR, SBB, SUB, XOR, XADD, XCHG:
   1453 			if isMem(inst.Args[0]) {
   1454 				hasLock = true
   1455 				break
   1456 			}
   1457 			fallthrough
   1458 		default:
   1459 			inst.Prefix[lockIndex] |= PrefixInvalid
   1460 		}
   1461 	}
   1462 
   1463 	// In certain cases, all of which require a memory destination,
   1464 	// the REPN and REP prefixes are interpreted as XACQUIRE and XRELEASE
   1465 	// from the Intel Transactional Synchronization Extensions (TSX).
   1466 	//
   1467 	// The specific rules are:
   1468 	// (1) Any instruction with a valid LOCK prefix can have XACQUIRE or XRELEASE.
   1469 	// (2) Any XCHG, which always has an implicit LOCK, can have XACQUIRE or XRELEASE.
   1470 	// (3) Any 0x88-, 0x89-, 0xC6-, or 0xC7-opcode MOV can have XRELEASE.
   1471 	if isMem(inst.Args[0]) {
   1472 		if inst.Op == XCHG {
   1473 			hasLock = true
   1474 		}
   1475 
   1476 		for i := len(inst.Prefix) - 1; i >= 0; i-- {
   1477 			p := inst.Prefix[i] &^ PrefixIgnored
   1478 			switch p {
   1479 			case PrefixREPN:
   1480 				if hasLock {
   1481 					inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXACQUIRE
   1482 				}
   1483 
   1484 			case PrefixREP:
   1485 				if hasLock {
   1486 					inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXRELEASE
   1487 				}
   1488 
   1489 				if inst.Op == MOV {
   1490 					op := (inst.Opcode >> 24) &^ 1
   1491 					if op == 0x88 || op == 0xC6 {
   1492 						inst.Prefix[i] = inst.Prefix[i]&PrefixIgnored | PrefixXRELEASE
   1493 					}
   1494 				}
   1495 			}
   1496 		}
   1497 	}
   1498 
   1499 	// If REP is used on a non-REP-able instruction, mark the prefix as ignored.
   1500 	if repIndex >= 0 {
   1501 		switch inst.Prefix[repIndex] {
   1502 		case PrefixREP, PrefixREPN:
   1503 			switch inst.Op {
   1504 			// According to the manuals, the REP/REPE prefix applies to all of these,
   1505 			// while the REPN applies only to some of them. However, both libopcodes
   1506 			// and xed show both prefixes explicitly for all instructions, so we do the same.
   1507 			// TODO(rsc): Perhaps this instruction class should be derived from the CSV.
   1508 			case INSB, INSW, INSD,
   1509 				MOVSB, MOVSW, MOVSD, MOVSQ,
   1510 				OUTSB, OUTSW, OUTSD,
   1511 				LODSB, LODSW, LODSD, LODSQ,
   1512 				CMPSB, CMPSW, CMPSD, CMPSQ,
   1513 				SCASB, SCASW, SCASD, SCASQ,
   1514 				STOSB, STOSW, STOSD, STOSQ:
   1515 				// ok
   1516 			default:
   1517 				inst.Prefix[repIndex] |= PrefixIgnored
   1518 			}
   1519 		}
   1520 	}
   1521 
   1522 	// If REX was present, mark implicit if all the 1 bits were consumed.
   1523 	if rexIndex >= 0 {
   1524 		if rexUsed != 0 {
   1525 			rexUsed |= PrefixREX
   1526 		}
   1527 		if rex&^rexUsed == 0 {
   1528 			inst.Prefix[rexIndex] |= PrefixImplicit
   1529 		}
   1530 	}
   1531 
   1532 	inst.DataSize = dataMode
   1533 	inst.AddrSize = addrMode
   1534 	inst.Mode = mode
   1535 	inst.Len = pos
   1536 	return inst, nil
   1537 }
   1538 
   // errInternal is a package-level error value whose message is
   // "internal error".
   var (
   	errInternal = errors.New("internal error")
   )
   1540 
   1541 // addr16 records the eight 16-bit addressing modes.
   1542 var addr16 = [8]Mem{
   1543 	{Base: BX, Scale: 1, Index: SI},
   1544 	{Base: BX, Scale: 1, Index: DI},
   1545 	{Base: BP, Scale: 1, Index: SI},
   1546 	{Base: BP, Scale: 1, Index: DI},
   1547 	{Base: SI},
   1548 	{Base: DI},
   1549 	{Base: BP},
   1550 	{Base: BX},
   1551 }
   1552 
   1553 // baseReg returns the base register for a given register size in bits.
   1554 func baseRegForBits(bits int) Reg {
   1555 	switch bits {
   1556 	case 8:
   1557 		return AL
   1558 	case 16:
   1559 		return AX
   1560 	case 32:
   1561 		return EAX
   1562 	case 64:
   1563 		return RAX
   1564 	}
   1565 	return 0
   1566 }
   1567 
   1568 // baseReg records the base register for argument types that specify
   1569 // a range of registers indexed by op, regop, or rm.
   1570 var baseReg = [...]Reg{
   1571 	xArgDR0dashDR7: DR0,
   1572 	xArgMm1:        M0,
   1573 	xArgMm2:        M0,
   1574 	xArgMm2M64:     M0,
   1575 	xArgMm:         M0,
   1576 	xArgMmM32:      M0,
   1577 	xArgMmM64:      M0,
   1578 	xArgR16:        AX,
   1579 	xArgR16op:      AX,
   1580 	xArgR32:        EAX,
   1581 	xArgR32M16:     EAX,
   1582 	xArgR32M8:      EAX,
   1583 	xArgR32op:      EAX,
   1584 	xArgR64:        RAX,
   1585 	xArgR64M16:     RAX,
   1586 	xArgR64op:      RAX,
   1587 	xArgR8:         AL,
   1588 	xArgR8op:       AL,
   1589 	xArgRM16:       AX,
   1590 	xArgRM32:       EAX,
   1591 	xArgRM64:       RAX,
   1592 	xArgRM8:        AL,
   1593 	xArgRmf16:      AX,
   1594 	xArgRmf32:      EAX,
   1595 	xArgRmf64:      RAX,
   1596 	xArgSTi:        F0,
   1597 	xArgTR0dashTR7: TR0,
   1598 	xArgXmm1:       X0,
   1599 	xArgYmm1:       X0,
   1600 	xArgXmm2:       X0,
   1601 	xArgXmm2M128:   X0,
   1602 	xArgYmm2M256:   X0,
   1603 	xArgXmm2M16:    X0,
   1604 	xArgXmm2M32:    X0,
   1605 	xArgXmm2M64:    X0,
   1606 	xArgXmm:        X0,
   1607 	xArgXmmM128:    X0,
   1608 	xArgXmmM32:     X0,
   1609 	xArgXmmM64:     X0,
   1610 }
   1611 
   1612 // prefixToSegment returns the segment register
   1613 // corresponding to a particular segment prefix.
   1614 func prefixToSegment(p Prefix) Reg {
   1615 	switch p &^ PrefixImplicit {
   1616 	case PrefixCS:
   1617 		return CS
   1618 	case PrefixDS:
   1619 		return DS
   1620 	case PrefixES:
   1621 		return ES
   1622 	case PrefixFS:
   1623 		return FS
   1624 	case PrefixGS:
   1625 		return GS
   1626 	case PrefixSS:
   1627 		return SS
   1628 	}
   1629 	return 0
   1630 }
   1631 
   1632 // fixedArg records the fixed arguments corresponding to the given bytecodes.
   1633 var fixedArg = [...]Arg{
   1634 	xArg1:    Imm(1),
   1635 	xArg3:    Imm(3),
   1636 	xArgAL:   AL,
   1637 	xArgAX:   AX,
   1638 	xArgDX:   DX,
   1639 	xArgEAX:  EAX,
   1640 	xArgEDX:  EDX,
   1641 	xArgRAX:  RAX,
   1642 	xArgRDX:  RDX,
   1643 	xArgCL:   CL,
   1644 	xArgCS:   CS,
   1645 	xArgDS:   DS,
   1646 	xArgES:   ES,
   1647 	xArgFS:   FS,
   1648 	xArgGS:   GS,
   1649 	xArgSS:   SS,
   1650 	xArgST:   F0,
   1651 	xArgXMM0: X0,
   1652 }
   1653 
   1654 // memBytes records the size of the memory pointed at
   1655 // by a memory argument of the given form.
   1656 var memBytes = [...]int8{
   1657 	xArgM128:       128 / 8,
   1658 	xArgM256:       256 / 8,
   1659 	xArgM16:        16 / 8,
   1660 	xArgM16and16:   (16 + 16) / 8,
   1661 	xArgM16colon16: (16 + 16) / 8,
   1662 	xArgM16colon32: (16 + 32) / 8,
   1663 	xArgM16int:     16 / 8,
   1664 	xArgM2byte:     2,
   1665 	xArgM32:        32 / 8,
   1666 	xArgM32and32:   (32 + 32) / 8,
   1667 	xArgM32fp:      32 / 8,
   1668 	xArgM32int:     32 / 8,
   1669 	xArgM64:        64 / 8,
   1670 	xArgM64fp:      64 / 8,
   1671 	xArgM64int:     64 / 8,
   1672 	xArgMm2M64:     64 / 8,
   1673 	xArgMmM32:      32 / 8,
   1674 	xArgMmM64:      64 / 8,
   1675 	xArgMoffs16:    16 / 8,
   1676 	xArgMoffs32:    32 / 8,
   1677 	xArgMoffs64:    64 / 8,
   1678 	xArgMoffs8:     8 / 8,
   1679 	xArgR32M16:     16 / 8,
   1680 	xArgR32M8:      8 / 8,
   1681 	xArgR64M16:     16 / 8,
   1682 	xArgRM16:       16 / 8,
   1683 	xArgRM32:       32 / 8,
   1684 	xArgRM64:       64 / 8,
   1685 	xArgRM8:        8 / 8,
   1686 	xArgXmm2M128:   128 / 8,
   1687 	xArgYmm2M256:   256 / 8,
   1688 	xArgXmm2M16:    16 / 8,
   1689 	xArgXmm2M32:    32 / 8,
   1690 	xArgXmm2M64:    64 / 8,
   1691 	xArgXmm:        128 / 8,
   1692 	xArgXmmM128:    128 / 8,
   1693 	xArgXmmM32:     32 / 8,
   1694 	xArgXmmM64:     64 / 8,
   1695 }
   1696 
   1697 // isCondJmp records the conditional jumps.
   1698 var isCondJmp = [maxOp + 1]bool{
   1699 	JA:  true,
   1700 	JAE: true,
   1701 	JB:  true,
   1702 	JBE: true,
   1703 	JE:  true,
   1704 	JG:  true,
   1705 	JGE: true,
   1706 	JL:  true,
   1707 	JLE: true,
   1708 	JNE: true,
   1709 	JNO: true,
   1710 	JNP: true,
   1711 	JNS: true,
   1712 	JO:  true,
   1713 	JP:  true,
   1714 	JS:  true,
   1715 }
   1716 
   1717 // isLoop records the loop operators.
   1718 var isLoop = [maxOp + 1]bool{
   1719 	LOOP:   true,
   1720 	LOOPE:  true,
   1721 	LOOPNE: true,
   1722 	JECXZ:  true,
   1723 	JRCXZ:  true,
   1724 }
   1725