      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package amd64
      6 
      7 import (
      8 	"fmt"
      9 	"math"
     10 
     11 	"cmd/compile/internal/gc"
     12 	"cmd/compile/internal/ssa"
     13 	"cmd/compile/internal/types"
     14 	"cmd/internal/obj"
     15 	"cmd/internal/obj/x86"
     16 )
     17 
      18 // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
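         // The mark is read back in ssaGenValue: a marked MOVLconst/MOVQconst of
         // zero must be emitted as a real MOV rather than the usual XORL, since
         // XOR would clobber the live flags.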
     19 func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {
     20 	flive := b.FlagsLiveAtEnd
     21 	if b.Control != nil && b.Control.Type.IsFlags() {
     22 		flive = true
     23 	}
     24 	for i := len(b.Values) - 1; i >= 0; i-- {
     25 		v := b.Values[i]
     26 		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
     27 			// The "mark" is any non-nil Aux value.
     28 			v.Aux = v
     29 		}
     30 		if v.Type.IsFlags() {
     31 			flive = false
     32 		}
     33 		for _, a := range v.Args {
     34 			if a.Type.IsFlags() {
     35 				flive = true
     36 			}
     37 		}
     38 	}
     39 }
     40 
     41 // loadByType returns the load instruction of the given type.
     42 func loadByType(t *types.Type) obj.As {
     43 	// Avoid partial register write
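         	// Loads of 1 or 2 bytes are zero-extended to 32 bits (MOVBLZX/MOVWLZX)
         	// so the destination register is always written in full.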
     44 	if !t.IsFloat() && t.Size() <= 2 {
     45 		if t.Size() == 1 {
     46 			return x86.AMOVBLZX
     47 		} else {
     48 			return x86.AMOVWLZX
     49 		}
     50 	}
     51 	// Otherwise, there's no difference between load and store opcodes.
     52 	return storeByType(t)
     53 }
     54 
     55 // storeByType returns the store instruction of the given type.
     56 func storeByType(t *types.Type) obj.As {
     57 	width := t.Size()
     58 	if t.IsFloat() {
     59 		switch width {
     60 		case 4:
     61 			return x86.AMOVSS
     62 		case 8:
     63 			return x86.AMOVSD
     64 		}
     65 	} else {
     66 		switch width {
     67 		case 1:
     68 			return x86.AMOVB
     69 		case 2:
     70 			return x86.AMOVW
     71 		case 4:
     72 			return x86.AMOVL
     73 		case 8:
     74 			return x86.AMOVQ
     75 		}
     76 	}
     77 	panic("bad store type")
     78 }
     79 
     80 // moveByType returns the reg->reg move instruction of the given type.
     81 func moveByType(t *types.Type) obj.As {
     82 	if t.IsFloat() {
      83 		// Moving the whole SSE2 register is faster
      84 		// than moving just the correct low portion of it.
      85 		// There is no xmm->xmm move with a 1-byte opcode,
      86 		// so use MOVUPS, which has a 2-byte opcode.
     87 		return x86.AMOVUPS
     88 	} else {
     89 		switch t.Size() {
     90 		case 1:
     91 			// Avoids partial register write
     92 			return x86.AMOVL
     93 		case 2:
     94 			return x86.AMOVL
     95 		case 4:
     96 			return x86.AMOVL
     97 		case 8:
     98 			return x86.AMOVQ
     99 		case 16:
    100 			return x86.AMOVUPS // int128s are in SSE registers
    101 		default:
    102 			panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t))
    103 		}
    104 	}
    105 }
    106 
    107 // opregreg emits instructions for
    108 //     dest := dest(To) op src(From)
    109 // and also returns the created obj.Prog so it
    110 // may be further adjusted (offset, scale, etc).
    111 func opregreg(s *gc.SSAGenState, op obj.As, dest, src int16) *obj.Prog {
    112 	p := s.Prog(op)
    113 	p.From.Type = obj.TYPE_REG
    114 	p.To.Type = obj.TYPE_REG
    115 	p.To.Reg = dest
    116 	p.From.Reg = src
    117 	return p
    118 }
    119 
     120 // DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ;
     121 // see runtime/mkduff.go.
    122 func duffStart(size int64) int64 {
    123 	x, _ := duff(size)
    124 	return x
    125 }
    126 func duffAdj(size int64) int64 {
    127 	_, x := duff(size)
    128 	return x
    129 }
    130 
    131 // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
    132 // required to use the duffzero mechanism for a block of the given size.
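         // For example, a 64-byte block uses exactly one 4-MOVUPS block (steps=4,
         // blocks=1), so off = dzBlockSize*(dzBlocks-1) and adj = 0. A 96-byte block
         // needs one full block plus two extra MOVUPSs, so off is reduced by a
         // further dzLeaqSize+2*dzMovSize and adj becomes -2*dzClearStep.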
    133 func duff(size int64) (int64, int64) {
    134 	if size < 32 || size > 1024 || size%dzClearStep != 0 {
    135 		panic("bad duffzero size")
    136 	}
    137 	steps := size / dzClearStep
    138 	blocks := steps / dzBlockLen
    139 	steps %= dzBlockLen
    140 	off := dzBlockSize * (dzBlocks - blocks)
    141 	var adj int64
    142 	if steps != 0 {
    143 		off -= dzLeaqSize
    144 		off -= dzMovSize * steps
    145 		adj -= dzClearStep * (dzBlockLen - steps)
    146 	}
    147 	return off, adj
    148 }
    149 
    150 func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
    151 	switch v.Op {
    152 	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
    153 		r := v.Reg()
    154 		r1 := v.Args[0].Reg()
    155 		r2 := v.Args[1].Reg()
    156 		switch {
    157 		case r == r1:
    158 			p := s.Prog(v.Op.Asm())
    159 			p.From.Type = obj.TYPE_REG
    160 			p.From.Reg = r2
    161 			p.To.Type = obj.TYPE_REG
    162 			p.To.Reg = r
    163 		case r == r2:
    164 			p := s.Prog(v.Op.Asm())
    165 			p.From.Type = obj.TYPE_REG
    166 			p.From.Reg = r1
    167 			p.To.Type = obj.TYPE_REG
    168 			p.To.Reg = r
    169 		default:
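         			// The output is in neither input register, so use LEAQ/LEAL
         			// (r1)(r2*1), which is effectively a three-operand add.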
    170 			var asm obj.As
    171 			if v.Op == ssa.OpAMD64ADDQ {
    172 				asm = x86.ALEAQ
    173 			} else {
    174 				asm = x86.ALEAL
    175 			}
    176 			p := s.Prog(asm)
    177 			p.From.Type = obj.TYPE_MEM
    178 			p.From.Reg = r1
    179 			p.From.Scale = 1
    180 			p.From.Index = r2
    181 			p.To.Type = obj.TYPE_REG
    182 			p.To.Reg = r
    183 		}
    184 	// 2-address opcode arithmetic
    185 	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
    186 		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
    187 		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
    188 		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
    189 		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
    190 		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
    191 		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
    192 		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
    193 		ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
    194 		ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
    195 		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
    196 		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
    197 		ssa.OpAMD64PXOR:
    198 		r := v.Reg()
    199 		if r != v.Args[0].Reg() {
    200 			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    201 		}
    202 		opregreg(s, v.Op.Asm(), r, v.Args[1].Reg())
    203 
    204 	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
    205 		// Arg[0] (the dividend) is in AX.
    206 		// Arg[1] (the divisor) can be in any other register.
    207 		// Result[0] (the quotient) is in AX.
    208 		// Result[1] (the remainder) is in DX.
    209 		r := v.Args[1].Reg()
    210 
    211 		// Zero extend dividend.
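         		// The divide instruction takes a double-width dividend in DX:AX,
         		// so clearing DX zero-extends the value in AX.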
    212 		c := s.Prog(x86.AXORL)
    213 		c.From.Type = obj.TYPE_REG
    214 		c.From.Reg = x86.REG_DX
    215 		c.To.Type = obj.TYPE_REG
    216 		c.To.Reg = x86.REG_DX
    217 
    218 		// Issue divide.
    219 		p := s.Prog(v.Op.Asm())
    220 		p.From.Type = obj.TYPE_REG
    221 		p.From.Reg = r
    222 
    223 	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
    224 		// Arg[0] (the dividend) is in AX.
    225 		// Arg[1] (the divisor) can be in any other register.
    226 		// Result[0] (the quotient) is in AX.
    227 		// Result[1] (the remainder) is in DX.
    228 		r := v.Args[1].Reg()
    229 
    230 		// CPU faults upon signed overflow, which occurs when the most
    231 		// negative int is divided by -1. Handle divide by -1 as a special case.
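         		// The emitted sequence is:
         		//	CMP  divisor, $-1
         		//	JEQ  fixup
         		//	sign-extend; DIV divisor
         		//	JMP  done
         		// fixup: NEG AX; XOR DX, DX
         		// done: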
    232 		var c *obj.Prog
    233 		switch v.Op {
    234 		case ssa.OpAMD64DIVQ:
    235 			c = s.Prog(x86.ACMPQ)
    236 		case ssa.OpAMD64DIVL:
    237 			c = s.Prog(x86.ACMPL)
    238 		case ssa.OpAMD64DIVW:
    239 			c = s.Prog(x86.ACMPW)
    240 		}
    241 		c.From.Type = obj.TYPE_REG
    242 		c.From.Reg = r
    243 		c.To.Type = obj.TYPE_CONST
    244 		c.To.Offset = -1
    245 		j1 := s.Prog(x86.AJEQ)
    246 		j1.To.Type = obj.TYPE_BRANCH
    247 
    248 		// Sign extend dividend.
    249 		switch v.Op {
    250 		case ssa.OpAMD64DIVQ:
    251 			s.Prog(x86.ACQO)
    252 		case ssa.OpAMD64DIVL:
    253 			s.Prog(x86.ACDQ)
    254 		case ssa.OpAMD64DIVW:
    255 			s.Prog(x86.ACWD)
    256 		}
    257 
    258 		// Issue divide.
    259 		p := s.Prog(v.Op.Asm())
    260 		p.From.Type = obj.TYPE_REG
    261 		p.From.Reg = r
    262 
    263 		// Skip over -1 fixup code.
    264 		j2 := s.Prog(obj.AJMP)
    265 		j2.To.Type = obj.TYPE_BRANCH
    266 
    267 		// Issue -1 fixup code.
    268 		// n / -1 = -n
    269 		n1 := s.Prog(x86.ANEGQ)
    270 		n1.To.Type = obj.TYPE_REG
    271 		n1.To.Reg = x86.REG_AX
    272 
    273 		// n % -1 == 0
    274 		n2 := s.Prog(x86.AXORL)
    275 		n2.From.Type = obj.TYPE_REG
    276 		n2.From.Reg = x86.REG_DX
    277 		n2.To.Type = obj.TYPE_REG
    278 		n2.To.Reg = x86.REG_DX
    279 
    280 		// TODO(khr): issue only the -1 fixup code we need.
    281 		// For instance, if only the quotient is used, no point in zeroing the remainder.
    282 
    283 		j1.To.Val = n1
    284 		j2.To.Val = s.Pc()
    285 
    286 	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
     287 		// The frontend rewrites constant division by 8/16/32-bit integers
     288 		// into HMUL by a constant;
     289 		// SSA rewrites generate the 64-bit versions.
    290 
    291 		// Arg[0] is already in AX as it's the only register we allow
    292 		// and DX is the only output we care about (the high bits)
    293 		p := s.Prog(v.Op.Asm())
    294 		p.From.Type = obj.TYPE_REG
    295 		p.From.Reg = v.Args[1].Reg()
    296 
    297 		// IMULB puts the high portion in AH instead of DL,
    298 		// so move it to DL for consistency
    299 		if v.Type.Size() == 1 {
    300 			m := s.Prog(x86.AMOVB)
    301 			m.From.Type = obj.TYPE_REG
    302 			m.From.Reg = x86.REG_AH
    303 			m.To.Type = obj.TYPE_REG
    304 			m.To.Reg = x86.REG_DX
    305 		}
    306 
    307 	case ssa.OpAMD64MULQU2:
     308 		// Arg[0] is already in AX, as it's the only register we allow;
     309 		// results: hi in DX, lo in AX.
    310 		p := s.Prog(v.Op.Asm())
    311 		p.From.Type = obj.TYPE_REG
    312 		p.From.Reg = v.Args[1].Reg()
    313 
    314 	case ssa.OpAMD64DIVQU2:
     315 		// Arg[0] and Arg[1] are already in DX and AX, as they're the only registers we allow;
     316 		// results: quotient in AX, remainder in DX.
    317 		p := s.Prog(v.Op.Asm())
    318 		p.From.Type = obj.TYPE_REG
    319 		p.From.Reg = v.Args[2].Reg()
    320 
    321 	case ssa.OpAMD64AVGQU:
    322 		// compute (x+y)/2 unsigned.
    323 		// Do a 64-bit add, the overflow goes into the carry.
    324 		// Shift right once and pull the carry back into the 63rd bit.
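         		// For example, with x = y = 1<<63 the ADDQ produces 0 with the carry
         		// set, and RCRQ $1 rotates the carry back into bit 63, giving 1<<63.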
    325 		r := v.Reg()
    326 		if r != v.Args[0].Reg() {
    327 			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    328 		}
    329 		p := s.Prog(x86.AADDQ)
    330 		p.From.Type = obj.TYPE_REG
    331 		p.To.Type = obj.TYPE_REG
    332 		p.To.Reg = r
    333 		p.From.Reg = v.Args[1].Reg()
    334 		p = s.Prog(x86.ARCRQ)
    335 		p.From.Type = obj.TYPE_CONST
    336 		p.From.Offset = 1
    337 		p.To.Type = obj.TYPE_REG
    338 		p.To.Reg = r
    339 
    340 	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
    341 		r := v.Reg()
    342 		a := v.Args[0].Reg()
    343 		if r == a {
    344 			if v.AuxInt == 1 {
    345 				var asm obj.As
     346 				// The software optimization manual recommends add $1,reg.
     347 				// But inc/dec is 1 byte smaller. ICC always uses inc;
     348 				// Clang/GCC choose depending on flags, but prefer add.
     349 				// Experiments show that inc/dec is both a little faster
     350 				// and makes the binary a little smaller.
    351 				if v.Op == ssa.OpAMD64ADDQconst {
    352 					asm = x86.AINCQ
    353 				} else {
    354 					asm = x86.AINCL
    355 				}
    356 				p := s.Prog(asm)
    357 				p.To.Type = obj.TYPE_REG
    358 				p.To.Reg = r
    359 				return
    360 			}
    361 			if v.AuxInt == -1 {
    362 				var asm obj.As
    363 				if v.Op == ssa.OpAMD64ADDQconst {
    364 					asm = x86.ADECQ
    365 				} else {
    366 					asm = x86.ADECL
    367 				}
    368 				p := s.Prog(asm)
    369 				p.To.Type = obj.TYPE_REG
    370 				p.To.Reg = r
    371 				return
    372 			}
    373 			p := s.Prog(v.Op.Asm())
    374 			p.From.Type = obj.TYPE_CONST
    375 			p.From.Offset = v.AuxInt
    376 			p.To.Type = obj.TYPE_REG
    377 			p.To.Reg = r
    378 			return
    379 		}
    380 		var asm obj.As
    381 		if v.Op == ssa.OpAMD64ADDQconst {
    382 			asm = x86.ALEAQ
    383 		} else {
    384 			asm = x86.ALEAL
    385 		}
    386 		p := s.Prog(asm)
    387 		p.From.Type = obj.TYPE_MEM
    388 		p.From.Reg = a
    389 		p.From.Offset = v.AuxInt
    390 		p.To.Type = obj.TYPE_REG
    391 		p.To.Reg = r
    392 
    393 	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ:
    394 		r := v.Reg()
    395 		if r != v.Args[0].Reg() {
    396 			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    397 		}
    398 		p := s.Prog(v.Op.Asm())
    399 		p.From.Type = obj.TYPE_REG
    400 		p.From.Reg = v.Args[1].Reg()
    401 		p.To.Type = obj.TYPE_REG
    402 		p.To.Reg = r
    403 
    404 	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
    405 		r := v.Reg()
    406 		if r != v.Args[0].Reg() {
    407 			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    408 		}
    409 		p := s.Prog(v.Op.Asm())
    410 		p.From.Type = obj.TYPE_CONST
    411 		p.From.Offset = v.AuxInt
    412 		p.To.Type = obj.TYPE_REG
    413 		p.To.Reg = r
    414 		// TODO: Teach doasm to compile the three-address multiply imul $c, r1, r2
    415 		// then we don't need to use resultInArg0 for these ops.
    416 		//p.From3 = new(obj.Addr)
    417 		//p.From3.Type = obj.TYPE_REG
    418 		//p.From3.Reg = v.Args[0].Reg()
    419 
    420 	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
    421 		ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst,
    422 		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
    423 		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
    424 		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
    425 		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
    426 		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
    427 		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
    428 		r := v.Reg()
    429 		if r != v.Args[0].Reg() {
    430 			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    431 		}
    432 		p := s.Prog(v.Op.Asm())
    433 		p.From.Type = obj.TYPE_CONST
    434 		p.From.Offset = v.AuxInt
    435 		p.To.Type = obj.TYPE_REG
    436 		p.To.Reg = r
    437 	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
    438 		r := v.Reg()
    439 		p := s.Prog(v.Op.Asm())
    440 		p.From.Type = obj.TYPE_REG
    441 		p.From.Reg = r
    442 		p.To.Type = obj.TYPE_REG
    443 		p.To.Reg = r
    444 	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
    445 		r := v.Args[0].Reg()
    446 		i := v.Args[1].Reg()
    447 		p := s.Prog(x86.ALEAQ)
    448 		switch v.Op {
    449 		case ssa.OpAMD64LEAQ1:
    450 			p.From.Scale = 1
    451 			if i == x86.REG_SP {
    452 				r, i = i, r
    453 			}
    454 		case ssa.OpAMD64LEAQ2:
    455 			p.From.Scale = 2
    456 		case ssa.OpAMD64LEAQ4:
    457 			p.From.Scale = 4
    458 		case ssa.OpAMD64LEAQ8:
    459 			p.From.Scale = 8
    460 		}
    461 		p.From.Type = obj.TYPE_MEM
    462 		p.From.Reg = r
    463 		p.From.Index = i
    464 		gc.AddAux(&p.From, v)
    465 		p.To.Type = obj.TYPE_REG
    466 		p.To.Reg = v.Reg()
    467 	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL:
    468 		p := s.Prog(v.Op.Asm())
    469 		p.From.Type = obj.TYPE_MEM
    470 		p.From.Reg = v.Args[0].Reg()
    471 		gc.AddAux(&p.From, v)
    472 		p.To.Type = obj.TYPE_REG
    473 		p.To.Reg = v.Reg()
    474 	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
    475 		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
    476 		ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
    477 		opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
    478 	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
     479 		// The Go assembler has swapped operands for UCOMISx relative to CMP,
     480 		// so we must account for that here.
    481 		opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
    482 	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
    483 		p := s.Prog(v.Op.Asm())
    484 		p.From.Type = obj.TYPE_REG
    485 		p.From.Reg = v.Args[0].Reg()
    486 		p.To.Type = obj.TYPE_CONST
    487 		p.To.Offset = v.AuxInt
    488 	case ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
    489 		ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst:
    490 		p := s.Prog(v.Op.Asm())
    491 		p.From.Type = obj.TYPE_CONST
    492 		p.From.Offset = v.AuxInt
    493 		p.To.Type = obj.TYPE_REG
    494 		p.To.Reg = v.Args[0].Reg()
    495 	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
    496 		x := v.Reg()
    497 
    498 		// If flags aren't live (indicated by v.Aux == nil),
    499 		// then we can rewrite MOV $0, AX into XOR AX, AX.
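         		// (ssaMarkMoves sets v.Aux when flags are live across this value.)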
    500 		if v.AuxInt == 0 && v.Aux == nil {
    501 			p := s.Prog(x86.AXORL)
    502 			p.From.Type = obj.TYPE_REG
    503 			p.From.Reg = x
    504 			p.To.Type = obj.TYPE_REG
    505 			p.To.Reg = x
    506 			break
    507 		}
    508 
    509 		asm := v.Op.Asm()
    510 		// Use MOVL to move a small constant into a register
    511 		// when the constant is positive and fits into 32 bits.
    512 		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
     513 			// The upper 32 bits are zeroed automatically when using MOVL.
    514 			asm = x86.AMOVL
    515 		}
    516 		p := s.Prog(asm)
    517 		p.From.Type = obj.TYPE_CONST
    518 		p.From.Offset = v.AuxInt
    519 		p.To.Type = obj.TYPE_REG
    520 		p.To.Reg = x
    521 	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
    522 		x := v.Reg()
    523 		p := s.Prog(v.Op.Asm())
    524 		p.From.Type = obj.TYPE_FCONST
    525 		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
    526 		p.To.Type = obj.TYPE_REG
    527 		p.To.Reg = x
    528 	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload:
    529 		p := s.Prog(v.Op.Asm())
    530 		p.From.Type = obj.TYPE_MEM
    531 		p.From.Reg = v.Args[0].Reg()
    532 		gc.AddAux(&p.From, v)
    533 		p.To.Type = obj.TYPE_REG
    534 		p.To.Reg = v.Reg()
    535 	case ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8:
    536 		p := s.Prog(v.Op.Asm())
    537 		p.From.Type = obj.TYPE_MEM
    538 		p.From.Reg = v.Args[0].Reg()
    539 		gc.AddAux(&p.From, v)
    540 		p.From.Scale = 8
    541 		p.From.Index = v.Args[1].Reg()
    542 		p.To.Type = obj.TYPE_REG
    543 		p.To.Reg = v.Reg()
    544 	case ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4:
    545 		p := s.Prog(v.Op.Asm())
    546 		p.From.Type = obj.TYPE_MEM
    547 		p.From.Reg = v.Args[0].Reg()
    548 		gc.AddAux(&p.From, v)
    549 		p.From.Scale = 4
    550 		p.From.Index = v.Args[1].Reg()
    551 		p.To.Type = obj.TYPE_REG
    552 		p.To.Reg = v.Reg()
    553 	case ssa.OpAMD64MOVWloadidx2:
    554 		p := s.Prog(v.Op.Asm())
    555 		p.From.Type = obj.TYPE_MEM
    556 		p.From.Reg = v.Args[0].Reg()
    557 		gc.AddAux(&p.From, v)
    558 		p.From.Scale = 2
    559 		p.From.Index = v.Args[1].Reg()
    560 		p.To.Type = obj.TYPE_REG
    561 		p.To.Reg = v.Reg()
    562 	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1:
    563 		r := v.Args[0].Reg()
    564 		i := v.Args[1].Reg()
    565 		if i == x86.REG_SP {
    566 			r, i = i, r
    567 		}
    568 		p := s.Prog(v.Op.Asm())
    569 		p.From.Type = obj.TYPE_MEM
    570 		p.From.Reg = r
    571 		p.From.Scale = 1
    572 		p.From.Index = i
    573 		gc.AddAux(&p.From, v)
    574 		p.To.Type = obj.TYPE_REG
    575 		p.To.Reg = v.Reg()
    576 	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore:
    577 		p := s.Prog(v.Op.Asm())
    578 		p.From.Type = obj.TYPE_REG
    579 		p.From.Reg = v.Args[1].Reg()
    580 		p.To.Type = obj.TYPE_MEM
    581 		p.To.Reg = v.Args[0].Reg()
    582 		gc.AddAux(&p.To, v)
    583 	case ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8:
    584 		p := s.Prog(v.Op.Asm())
    585 		p.From.Type = obj.TYPE_REG
    586 		p.From.Reg = v.Args[2].Reg()
    587 		p.To.Type = obj.TYPE_MEM
    588 		p.To.Reg = v.Args[0].Reg()
    589 		p.To.Scale = 8
    590 		p.To.Index = v.Args[1].Reg()
    591 		gc.AddAux(&p.To, v)
    592 	case ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4:
    593 		p := s.Prog(v.Op.Asm())
    594 		p.From.Type = obj.TYPE_REG
    595 		p.From.Reg = v.Args[2].Reg()
    596 		p.To.Type = obj.TYPE_MEM
    597 		p.To.Reg = v.Args[0].Reg()
    598 		p.To.Scale = 4
    599 		p.To.Index = v.Args[1].Reg()
    600 		gc.AddAux(&p.To, v)
    601 	case ssa.OpAMD64MOVWstoreidx2:
    602 		p := s.Prog(v.Op.Asm())
    603 		p.From.Type = obj.TYPE_REG
    604 		p.From.Reg = v.Args[2].Reg()
    605 		p.To.Type = obj.TYPE_MEM
    606 		p.To.Reg = v.Args[0].Reg()
    607 		p.To.Scale = 2
    608 		p.To.Index = v.Args[1].Reg()
    609 		gc.AddAux(&p.To, v)
    610 	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1:
    611 		r := v.Args[0].Reg()
    612 		i := v.Args[1].Reg()
    613 		if i == x86.REG_SP {
    614 			r, i = i, r
    615 		}
    616 		p := s.Prog(v.Op.Asm())
    617 		p.From.Type = obj.TYPE_REG
    618 		p.From.Reg = v.Args[2].Reg()
    619 		p.To.Type = obj.TYPE_MEM
    620 		p.To.Reg = r
    621 		p.To.Scale = 1
    622 		p.To.Index = i
    623 		gc.AddAux(&p.To, v)
    624 	case ssa.OpAMD64ADDQconstmem, ssa.OpAMD64ADDLconstmem:
    625 		sc := v.AuxValAndOff()
    626 		off := sc.Off()
    627 		val := sc.Val()
    628 		if val == 1 {
    629 			var asm obj.As
    630 			if v.Op == ssa.OpAMD64ADDQconstmem {
    631 				asm = x86.AINCQ
    632 			} else {
    633 				asm = x86.AINCL
    634 			}
    635 			p := s.Prog(asm)
    636 			p.To.Type = obj.TYPE_MEM
    637 			p.To.Reg = v.Args[0].Reg()
    638 			gc.AddAux2(&p.To, v, off)
    639 		} else {
    640 			p := s.Prog(v.Op.Asm())
    641 			p.From.Type = obj.TYPE_CONST
    642 			p.From.Offset = val
    643 			p.To.Type = obj.TYPE_MEM
    644 			p.To.Reg = v.Args[0].Reg()
    645 			gc.AddAux2(&p.To, v, off)
    646 		}
    647 	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
    648 		p := s.Prog(v.Op.Asm())
    649 		p.From.Type = obj.TYPE_CONST
    650 		sc := v.AuxValAndOff()
    651 		p.From.Offset = sc.Val()
    652 		p.To.Type = obj.TYPE_MEM
    653 		p.To.Reg = v.Args[0].Reg()
    654 		gc.AddAux2(&p.To, v, sc.Off())
    655 	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1:
    656 		p := s.Prog(v.Op.Asm())
    657 		p.From.Type = obj.TYPE_CONST
    658 		sc := v.AuxValAndOff()
    659 		p.From.Offset = sc.Val()
    660 		r := v.Args[0].Reg()
    661 		i := v.Args[1].Reg()
    662 		switch v.Op {
    663 		case ssa.OpAMD64MOVBstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx1:
    664 			p.To.Scale = 1
    665 			if i == x86.REG_SP {
    666 				r, i = i, r
    667 			}
    668 		case ssa.OpAMD64MOVWstoreconstidx2:
    669 			p.To.Scale = 2
    670 		case ssa.OpAMD64MOVLstoreconstidx4:
    671 			p.To.Scale = 4
    672 		case ssa.OpAMD64MOVQstoreconstidx8:
    673 			p.To.Scale = 8
    674 		}
    675 		p.To.Type = obj.TYPE_MEM
    676 		p.To.Reg = r
    677 		p.To.Index = i
    678 		gc.AddAux2(&p.To, v, sc.Off())
    679 	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
    680 		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
    681 		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
    682 		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
    683 	case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
    684 		r := v.Reg()
    685 		// Break false dependency on destination register.
    686 		opregreg(s, x86.AXORPS, r, r)
    687 		opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
    688 	case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
    689 		p := s.Prog(x86.AMOVQ)
    690 		p.From.Type = obj.TYPE_REG
    691 		p.From.Reg = v.Args[0].Reg()
    692 		p.To.Type = obj.TYPE_REG
    693 		p.To.Reg = v.Reg()
    694 	case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
    695 		p := s.Prog(x86.AMOVL)
    696 		p.From.Type = obj.TYPE_REG
    697 		p.From.Reg = v.Args[0].Reg()
    698 		p.To.Type = obj.TYPE_REG
    699 		p.To.Reg = v.Reg()
    700 	case ssa.OpAMD64ADDQmem, ssa.OpAMD64ADDLmem, ssa.OpAMD64SUBQmem, ssa.OpAMD64SUBLmem,
    701 		ssa.OpAMD64ANDQmem, ssa.OpAMD64ANDLmem, ssa.OpAMD64ORQmem, ssa.OpAMD64ORLmem,
    702 		ssa.OpAMD64XORQmem, ssa.OpAMD64XORLmem, ssa.OpAMD64ADDSDmem, ssa.OpAMD64ADDSSmem,
    703 		ssa.OpAMD64SUBSDmem, ssa.OpAMD64SUBSSmem, ssa.OpAMD64MULSDmem, ssa.OpAMD64MULSSmem:
    704 		p := s.Prog(v.Op.Asm())
    705 		p.From.Type = obj.TYPE_MEM
    706 		p.From.Reg = v.Args[1].Reg()
    707 		gc.AddAux(&p.From, v)
    708 		p.To.Type = obj.TYPE_REG
    709 		p.To.Reg = v.Reg()
    710 		if v.Reg() != v.Args[0].Reg() {
    711 			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    712 		}
    713 	case ssa.OpAMD64DUFFZERO:
    714 		off := duffStart(v.AuxInt)
    715 		adj := duffAdj(v.AuxInt)
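         		// adj <= 0: when the size needs a partial block, the entry point falls
         		// in the middle of a 4-MOVUPS block whose store offsets are near the end
         		// of that block, so DI is first backed up by -adj bytes so those stores
         		// land at the start of the region being cleared.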
    716 		var p *obj.Prog
    717 		if adj != 0 {
    718 			p = s.Prog(x86.ALEAQ)
    719 			p.From.Type = obj.TYPE_MEM
    720 			p.From.Offset = adj
    721 			p.From.Reg = x86.REG_DI
    722 			p.To.Type = obj.TYPE_REG
    723 			p.To.Reg = x86.REG_DI
    724 		}
    725 		p = s.Prog(obj.ADUFFZERO)
    726 		p.To.Type = obj.TYPE_ADDR
    727 		p.To.Sym = gc.Duffzero
    728 		p.To.Offset = off
    729 	case ssa.OpAMD64MOVOconst:
    730 		if v.AuxInt != 0 {
    731 			v.Fatalf("MOVOconst can only do constant=0")
    732 		}
    733 		r := v.Reg()
    734 		opregreg(s, x86.AXORPS, r, r)
    735 	case ssa.OpAMD64DUFFCOPY:
    736 		p := s.Prog(obj.ADUFFCOPY)
    737 		p.To.Type = obj.TYPE_ADDR
    738 		p.To.Sym = gc.Duffcopy
    739 		p.To.Offset = v.AuxInt
    740 
    741 	case ssa.OpAMD64MOVQconvert, ssa.OpAMD64MOVLconvert:
    742 		if v.Args[0].Reg() != v.Reg() {
    743 			v.Fatalf("MOVXconvert should be a no-op")
    744 		}
    745 	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
    746 		if v.Type.IsMemory() {
    747 			return
    748 		}
    749 		x := v.Args[0].Reg()
    750 		y := v.Reg()
    751 		if x != y {
    752 			opregreg(s, moveByType(v.Type), y, x)
    753 		}
    754 	case ssa.OpLoadReg:
    755 		if v.Type.IsFlags() {
    756 			v.Fatalf("load flags not implemented: %v", v.LongString())
    757 			return
    758 		}
    759 		p := s.Prog(loadByType(v.Type))
    760 		gc.AddrAuto(&p.From, v.Args[0])
    761 		p.To.Type = obj.TYPE_REG
    762 		p.To.Reg = v.Reg()
    763 
    764 	case ssa.OpStoreReg:
    765 		if v.Type.IsFlags() {
    766 			v.Fatalf("store flags not implemented: %v", v.LongString())
    767 			return
    768 		}
    769 		p := s.Prog(storeByType(v.Type))
    770 		p.From.Type = obj.TYPE_REG
    771 		p.From.Reg = v.Args[0].Reg()
    772 		gc.AddrAuto(&p.To, v)
    773 	case ssa.OpAMD64LoweredGetClosurePtr:
    774 		// Closure pointer is DX.
    775 		gc.CheckLoweredGetClosurePtr(v)
    776 	case ssa.OpAMD64LoweredGetG:
    777 		r := v.Reg()
    778 		// See the comments in cmd/internal/obj/x86/obj6.go
    779 		// near CanUse1InsnTLS for a detailed explanation of these instructions.
    780 		if x86.CanUse1InsnTLS(gc.Ctxt) {
    781 			// MOVQ (TLS), r
    782 			p := s.Prog(x86.AMOVQ)
    783 			p.From.Type = obj.TYPE_MEM
    784 			p.From.Reg = x86.REG_TLS
    785 			p.To.Type = obj.TYPE_REG
    786 			p.To.Reg = r
    787 		} else {
    788 			// MOVQ TLS, r
    789 			// MOVQ (r)(TLS*1), r
    790 			p := s.Prog(x86.AMOVQ)
    791 			p.From.Type = obj.TYPE_REG
    792 			p.From.Reg = x86.REG_TLS
    793 			p.To.Type = obj.TYPE_REG
    794 			p.To.Reg = r
    795 			q := s.Prog(x86.AMOVQ)
    796 			q.From.Type = obj.TYPE_MEM
    797 			q.From.Reg = r
    798 			q.From.Index = x86.REG_TLS
    799 			q.From.Scale = 1
    800 			q.To.Type = obj.TYPE_REG
    801 			q.To.Reg = r
    802 		}
    803 	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
    804 		s.Call(v)
    805 
    806 	case ssa.OpAMD64LoweredGetCallerPC:
    807 		p := s.Prog(x86.AMOVQ)
    808 		p.From.Type = obj.TYPE_MEM
    809 		p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
    810 		p.From.Name = obj.NAME_PARAM
    811 		p.To.Type = obj.TYPE_REG
    812 		p.To.Reg = v.Reg()
    813 
    814 	case ssa.OpAMD64LoweredGetCallerSP:
    815 		// caller's SP is the address of the first arg
    816 		mov := x86.AMOVQ
    817 		if gc.Widthptr == 4 {
    818 			mov = x86.AMOVL
    819 		}
    820 		p := s.Prog(mov)
    821 		p.From.Type = obj.TYPE_ADDR
    822 		p.From.Offset = -gc.Ctxt.FixedFrameSize() // 0 on amd64, just to be consistent with other architectures
    823 		p.From.Name = obj.NAME_PARAM
    824 		p.To.Type = obj.TYPE_REG
    825 		p.To.Reg = v.Reg()
    826 
    827 	case ssa.OpAMD64LoweredWB:
    828 		p := s.Prog(obj.ACALL)
    829 		p.To.Type = obj.TYPE_MEM
    830 		p.To.Name = obj.NAME_EXTERN
    831 		p.To.Sym = v.Aux.(*obj.LSym)
    832 
    833 	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
    834 		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
    835 		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
    836 		r := v.Reg()
    837 		if r != v.Args[0].Reg() {
    838 			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    839 		}
    840 		p := s.Prog(v.Op.Asm())
    841 		p.To.Type = obj.TYPE_REG
    842 		p.To.Reg = r
    843 	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRQ, ssa.OpAMD64BSRL:
    844 		p := s.Prog(v.Op.Asm())
    845 		p.From.Type = obj.TYPE_REG
    846 		p.From.Reg = v.Args[0].Reg()
    847 		p.To.Type = obj.TYPE_REG
    848 		p.To.Reg = v.Reg0()
    849 	case ssa.OpAMD64SQRTSD:
    850 		p := s.Prog(v.Op.Asm())
    851 		p.From.Type = obj.TYPE_REG
    852 		p.From.Reg = v.Args[0].Reg()
    853 		p.To.Type = obj.TYPE_REG
    854 		p.To.Reg = v.Reg()
    855 	case ssa.OpAMD64ROUNDSD:
    856 		p := s.Prog(v.Op.Asm())
    857 		val := v.AuxInt
    858 		// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
    859 		if val != 0 && val != 1 && val != 2 && val != 3 {
    860 			v.Fatalf("Invalid rounding mode")
    861 		}
    862 		p.From.Offset = val
    863 		p.From.Type = obj.TYPE_CONST
    864 		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
    865 		p.To.Type = obj.TYPE_REG
    866 		p.To.Reg = v.Reg()
    867 	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
    868 		if v.Args[0].Reg() != v.Reg() {
    869 			// POPCNT on Intel has a false dependency on the destination register.
    870 			// Zero the destination to break the dependency.
    871 			p := s.Prog(x86.AMOVQ)
    872 			p.From.Type = obj.TYPE_CONST
    873 			p.From.Offset = 0
    874 			p.To.Type = obj.TYPE_REG
    875 			p.To.Reg = v.Reg()
    876 		}
    877 		p := s.Prog(v.Op.Asm())
    878 		p.From.Type = obj.TYPE_REG
    879 		p.From.Reg = v.Args[0].Reg()
    880 		p.To.Type = obj.TYPE_REG
    881 		p.To.Reg = v.Reg()
    882 
    883 	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
    884 		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
    885 		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
    886 		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
    887 		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
    888 		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
    889 		ssa.OpAMD64SETA, ssa.OpAMD64SETAE:
    890 		p := s.Prog(v.Op.Asm())
    891 		p.To.Type = obj.TYPE_REG
    892 		p.To.Reg = v.Reg()
    893 
    894 	case ssa.OpAMD64SETEQmem, ssa.OpAMD64SETNEmem,
    895 		ssa.OpAMD64SETLmem, ssa.OpAMD64SETLEmem,
    896 		ssa.OpAMD64SETGmem, ssa.OpAMD64SETGEmem,
    897 		ssa.OpAMD64SETBmem, ssa.OpAMD64SETBEmem,
    898 		ssa.OpAMD64SETAmem, ssa.OpAMD64SETAEmem:
    899 		p := s.Prog(v.Op.Asm())
    900 		p.To.Type = obj.TYPE_MEM
    901 		p.To.Reg = v.Args[0].Reg()
    902 		gc.AddAux(&p.To, v)
    903 
    904 	case ssa.OpAMD64SETNEF:
    905 		p := s.Prog(v.Op.Asm())
    906 		p.To.Type = obj.TYPE_REG
    907 		p.To.Reg = v.Reg()
    908 		q := s.Prog(x86.ASETPS)
    909 		q.To.Type = obj.TYPE_REG
    910 		q.To.Reg = x86.REG_AX
     911 		// ORL avoids a partial register write and is smaller than ORQ; it is what the old compiler used.
    912 		opregreg(s, x86.AORL, v.Reg(), x86.REG_AX)
    913 
    914 	case ssa.OpAMD64SETEQF:
    915 		p := s.Prog(v.Op.Asm())
    916 		p.To.Type = obj.TYPE_REG
    917 		p.To.Reg = v.Reg()
    918 		q := s.Prog(x86.ASETPC)
    919 		q.To.Type = obj.TYPE_REG
    920 		q.To.Reg = x86.REG_AX
     921 		// ANDL avoids a partial register write and is smaller than ANDQ; it is what the old compiler used.
    922 		opregreg(s, x86.AANDL, v.Reg(), x86.REG_AX)
    923 
    924 	case ssa.OpAMD64InvertFlags:
    925 		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
    926 	case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
    927 		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
    928 	case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
    929 		v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
    930 	case ssa.OpAMD64REPSTOSQ:
    931 		s.Prog(x86.AREP)
    932 		s.Prog(x86.ASTOSQ)
    933 	case ssa.OpAMD64REPMOVSQ:
    934 		s.Prog(x86.AREP)
    935 		s.Prog(x86.AMOVSQ)
    936 	case ssa.OpAMD64LoweredNilCheck:
    937 		// Issue a load which will fault if the input is nil.
    938 		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
    939 		// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
     940 		// but it doesn't have a false dependency on AX.
     941 		// Or maybe allocate an output register and use MOVL (reg),reg2?
    942 		// That trades clobbering flags for clobbering a register.
    943 		p := s.Prog(x86.ATESTB)
    944 		p.From.Type = obj.TYPE_REG
    945 		p.From.Reg = x86.REG_AX
    946 		p.To.Type = obj.TYPE_MEM
    947 		p.To.Reg = v.Args[0].Reg()
    948 		gc.AddAux(&p.To, v)
    949 		if gc.Debug_checknil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
    950 			gc.Warnl(v.Pos, "generated nil check")
    951 		}
    952 	case ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
    953 		p := s.Prog(v.Op.Asm())
    954 		p.From.Type = obj.TYPE_MEM
    955 		p.From.Reg = v.Args[0].Reg()
    956 		gc.AddAux(&p.From, v)
    957 		p.To.Type = obj.TYPE_REG
    958 		p.To.Reg = v.Reg0()
    959 	case ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
    960 		r := v.Reg0()
    961 		if r != v.Args[0].Reg() {
    962 			v.Fatalf("input[0] and output[0] not in same register %s", v.LongString())
    963 		}
    964 		p := s.Prog(v.Op.Asm())
    965 		p.From.Type = obj.TYPE_REG
    966 		p.From.Reg = r
    967 		p.To.Type = obj.TYPE_MEM
    968 		p.To.Reg = v.Args[1].Reg()
    969 		gc.AddAux(&p.To, v)
    970 	case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
    971 		r := v.Reg0()
    972 		if r != v.Args[0].Reg() {
    973 			v.Fatalf("input[0] and output[0] not in same register %s", v.LongString())
    974 		}
    975 		s.Prog(x86.ALOCK)
    976 		p := s.Prog(v.Op.Asm())
    977 		p.From.Type = obj.TYPE_REG
    978 		p.From.Reg = r
    979 		p.To.Type = obj.TYPE_MEM
    980 		p.To.Reg = v.Args[1].Reg()
    981 		gc.AddAux(&p.To, v)
    982 	case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
    983 		if v.Args[1].Reg() != x86.REG_AX {
    984 			v.Fatalf("input[1] not in AX %s", v.LongString())
    985 		}
    986 		s.Prog(x86.ALOCK)
    987 		p := s.Prog(v.Op.Asm())
    988 		p.From.Type = obj.TYPE_REG
    989 		p.From.Reg = v.Args[2].Reg()
    990 		p.To.Type = obj.TYPE_MEM
    991 		p.To.Reg = v.Args[0].Reg()
    992 		gc.AddAux(&p.To, v)
    993 		p = s.Prog(x86.ASETEQ)
    994 		p.To.Type = obj.TYPE_REG
    995 		p.To.Reg = v.Reg0()
    996 	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ORBlock:
    997 		s.Prog(x86.ALOCK)
    998 		p := s.Prog(v.Op.Asm())
    999 		p.From.Type = obj.TYPE_REG
   1000 		p.From.Reg = v.Args[1].Reg()
   1001 		p.To.Type = obj.TYPE_MEM
   1002 		p.To.Reg = v.Args[0].Reg()
   1003 		gc.AddAux(&p.To, v)
   1004 	case ssa.OpClobber:
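         		// Overwrite a stack slot the compiler considers dead with 0xdeaddead,
         		// using two 32-bit stores (the second at offset +4), so stale uses of
         		// the slot are easier to spot.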
   1005 		p := s.Prog(x86.AMOVL)
   1006 		p.From.Type = obj.TYPE_CONST
   1007 		p.From.Offset = 0xdeaddead
   1008 		p.To.Type = obj.TYPE_MEM
   1009 		p.To.Reg = x86.REG_SP
   1010 		gc.AddAux(&p.To, v)
   1011 		p = s.Prog(x86.AMOVL)
   1012 		p.From.Type = obj.TYPE_CONST
   1013 		p.From.Offset = 0xdeaddead
   1014 		p.To.Type = obj.TYPE_MEM
   1015 		p.To.Reg = x86.REG_SP
   1016 		gc.AddAux(&p.To, v)
   1017 		p.To.Offset += 4
   1018 	default:
   1019 		v.Fatalf("genValue not implemented: %s", v.LongString())
   1020 	}
   1021 }
   1022 
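         // blockJump gives, for each conditional block kind, the branch that jumps
         // to Succs[0] (asm) and the inverted branch that jumps to Succs[1] when
         // Succs[0] is the fallthrough (invasm); see ssaGenBlock below.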
   1023 var blockJump = [...]struct {
   1024 	asm, invasm obj.As
   1025 }{
   1026 	ssa.BlockAMD64EQ:  {x86.AJEQ, x86.AJNE},
   1027 	ssa.BlockAMD64NE:  {x86.AJNE, x86.AJEQ},
   1028 	ssa.BlockAMD64LT:  {x86.AJLT, x86.AJGE},
   1029 	ssa.BlockAMD64GE:  {x86.AJGE, x86.AJLT},
   1030 	ssa.BlockAMD64LE:  {x86.AJLE, x86.AJGT},
   1031 	ssa.BlockAMD64GT:  {x86.AJGT, x86.AJLE},
   1032 	ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
   1033 	ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
   1034 	ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
   1035 	ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
   1036 	ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
   1037 	ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
   1038 }
   1039 
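         // After a UCOMISx, an unordered result (a NaN operand) sets PF, so a
         // floating-point equality branch must also check that parity is clear and
         // an inequality branch that it is set; eqfJumps and nefJumps encode the
         // extra parity jump for each fallthrough arrangement.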
   1040 var eqfJumps = [2][2]gc.FloatingEQNEJump{
   1041 	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
   1042 	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
   1043 }
   1044 var nefJumps = [2][2]gc.FloatingEQNEJump{
   1045 	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
   1046 	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
   1047 }
   1048 
   1049 func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
   1050 	switch b.Kind {
   1051 	case ssa.BlockPlain:
   1052 		if b.Succs[0].Block() != next {
   1053 			p := s.Prog(obj.AJMP)
   1054 			p.To.Type = obj.TYPE_BRANCH
   1055 			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
   1056 		}
   1057 	case ssa.BlockDefer:
    1058 		// defer returns in AX:
    1059 		// 0 if we should continue executing,
    1060 		// 1 if we should jump to the deferreturn call.
   1061 		p := s.Prog(x86.ATESTL)
   1062 		p.From.Type = obj.TYPE_REG
   1063 		p.From.Reg = x86.REG_AX
   1064 		p.To.Type = obj.TYPE_REG
   1065 		p.To.Reg = x86.REG_AX
   1066 		p = s.Prog(x86.AJNE)
   1067 		p.To.Type = obj.TYPE_BRANCH
   1068 		s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
   1069 		if b.Succs[0].Block() != next {
   1070 			p := s.Prog(obj.AJMP)
   1071 			p.To.Type = obj.TYPE_BRANCH
   1072 			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
   1073 		}
   1074 	case ssa.BlockExit:
   1075 		s.Prog(obj.AUNDEF) // tell plive.go that we never reach here
   1076 	case ssa.BlockRet:
   1077 		s.Prog(obj.ARET)
   1078 	case ssa.BlockRetJmp:
   1079 		p := s.Prog(obj.AJMP)
   1080 		p.To.Type = obj.TYPE_MEM
   1081 		p.To.Name = obj.NAME_EXTERN
   1082 		p.To.Sym = b.Aux.(*obj.LSym)
   1083 
   1084 	case ssa.BlockAMD64EQF:
   1085 		s.FPJump(b, next, &eqfJumps)
   1086 
   1087 	case ssa.BlockAMD64NEF:
   1088 		s.FPJump(b, next, &nefJumps)
   1089 
   1090 	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
   1091 		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
   1092 		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
   1093 		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
   1094 		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
   1095 		jmp := blockJump[b.Kind]
   1096 		var p *obj.Prog
   1097 		switch next {
   1098 		case b.Succs[0].Block():
   1099 			p = s.Prog(jmp.invasm)
   1100 			p.To.Type = obj.TYPE_BRANCH
   1101 			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
   1102 		case b.Succs[1].Block():
   1103 			p = s.Prog(jmp.asm)
   1104 			p.To.Type = obj.TYPE_BRANCH
   1105 			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
   1106 		default:
   1107 			p = s.Prog(jmp.asm)
   1108 			p.To.Type = obj.TYPE_BRANCH
   1109 			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
   1110 			q := s.Prog(obj.AJMP)
   1111 			q.To.Type = obj.TYPE_BRANCH
   1112 			s.Branches = append(s.Branches, gc.Branch{P: q, B: b.Succs[1].Block()})
   1113 		}
   1114 
   1115 	default:
   1116 		b.Fatalf("branch not implemented: %s. Control: %s", b.LongString(), b.Control.LongString())
   1117 	}
   1118 }
   1119