Home | History | Annotate | Download | only in runtime
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "go_asm.h"
      6 #include "go_tls.h"
      7 #include "funcdata.h"
      8 #include "textflag.h"
      9 
     10 TEXT runtimert0_go(SB),NOSPLIT,$0
        	// Runtime bootstrap. In order: saves argc/argv, gives g0 provisional
        	// stack bounds, probes CPUID, sets up TLS (through _cgo_init when it
        	// is present, otherwise through ldt0setup), links g0 and m0, runs
        	// check/args/osinit/schedinit, queues the function referenced by
        	// mainPC with newproc and finally calls mstart.
     11 	// copy arguments forward on an even stack
     12 	MOVL	argc+0(FP), AX
     13 	MOVL	argv+4(FP), BX
     14 	SUBL	$128, SP		// plenty of scratch
     15 	ANDL	$~15, SP		// clear the low 4 bits: SP is now 16-byte aligned
     16 	MOVL	AX, 120(SP)		// save argc, argv away
     17 	MOVL	BX, 124(SP)
     18 
     19 	// set default stack bounds.
     20 	// _cgo_init may update stackguard.
        	// BX = SP - 64KB + 104 is used for both stack guards and stack.lo;
        	// the current SP becomes stack.hi.
     21 	MOVL	$runtimeg0(SB), BP
     22 	LEAL	(-64*1024+104)(SP), BX
     23 	MOVL	BX, g_stackguard0(BP)
     24 	MOVL	BX, g_stackguard1(BP)
     25 	MOVL	BX, (g_stack+stack_lo)(BP)
     26 	MOVL	SP, (g_stack+stack_hi)(BP)
     27 
     28 	// find out information about the processor we're on
        	// CPUID leaf 0: AX = highest supported leaf, BX/DX/CX = vendor string.
        	// AX == 0 means leaf 1 is unavailable, so the feature words are skipped.
     29 	MOVL	$0, AX
     30 	CPUID
     31 	CMPL	AX, $0
     32 	JE	nocpuinfo
     33 
     34 	// Figure out how to serialize RDTSC.
     35 	// On Intel processors LFENCE is enough. AMD requires MFENCE.
     36 	// Don't know about the rest, so let's do MFENCE.
     37 	CMPL	BX, $0x756E6547  // "Genu"
     38 	JNE	notintel
     39 	CMPL	DX, $0x49656E69  // "ineI"
     40 	JNE	notintel
     41 	CMPL	CX, $0x6C65746E  // "ntel"
     42 	JNE	notintel
     43 	MOVB	$1, runtimelfenceBeforeRdtsc(SB)
     44 notintel:
     45 
        	// CPUID leaf 1: keep the CX and DX feature words for later checks.
     46 	MOVL	$1, AX
     47 	CPUID
     48 	MOVL	CX, runtimecpuid_ecx(SB)
     49 	MOVL	DX, runtimecpuid_edx(SB)
     50 nocpuinfo:
     51 
     52 	// if there is an _cgo_init, call it to let it
     53 	// initialize and to set up GS.  if not,
     54 	// we set up GS ourselves.
     55 	MOVL	_cgo_init(SB), AX
     56 	TESTL	AX, AX
     57 	JZ	needtls
        	// Arguments for _cgo_init: 0(SP) = g0 (still in BP), 4(SP) = setg_gcc.
     58 	MOVL	$setg_gcc<>(SB), BX
     59 	MOVL	BX, 4(SP)
     60 	MOVL	BP, 0(SP)
     61 	CALL	AX
     62 
     63 	// update stackguard after _cgo_init
        	// Both guards are recomputed as stack.lo + _StackGuard.
     64 	MOVL	$runtimeg0(SB), CX
     65 	MOVL	(g_stack+stack_lo)(CX), AX
     66 	ADDL	$const__StackGuard, AX
     67 	MOVL	AX, g_stackguard0(CX)
     68 	MOVL	AX, g_stackguard1(CX)
     69 
     70 	// skip runtimeldt0setup(SB) and tls test after _cgo_init for non-windows
     71 	CMPL runtimeiswindows(SB), $0
     72 	JEQ ok
     73 needtls:
     74 	// skip runtimeldt0setup(SB) and tls test on Plan 9 in all cases
     75 	CMPL	runtimeisplan9(SB), $1
     76 	JEQ	ok
     77 
     78 	// set up %gs
     79 	CALL	runtimeldt0setup(SB)
     80 
     81 	// store through it, to make sure it works
        	// Write a marker through the TLS g slot and read it back from tls0;
        	// on a mismatch the store to absolute address 0 below faults.
     82 	get_tls(BX)
     83 	MOVL	$0x123, g(BX)
     84 	MOVL	runtimetls0(SB), AX
     85 	CMPL	AX, $0x123
     86 	JEQ	ok
     87 	MOVL	AX, 0	// abort
     88 ok:
     89 	// set up m and g "registers"
     90 	get_tls(BX)
     91 	LEAL	runtimeg0(SB), CX
     92 	MOVL	CX, g(BX)
     93 	LEAL	runtimem0(SB), AX
     94 
     95 	// save m->g0 = g0
     96 	MOVL	CX, m_g0(AX)
     97 	// save g0->m = m0
     98 	MOVL	AX, g_m(CX)
     99 
    100 	CALL	runtimeemptyfunc(SB)	// fault if stack check is wrong
    101 
    102 	// convention is D is always cleared
    103 	CLD
    104 
    105 	CALL	runtimecheck(SB)
    106 
    107 	// saved argc, argv
        	// Reload them from the scratch slots and pass them to args at 0(SP)/4(SP).
    108 	MOVL	120(SP), AX
    109 	MOVL	AX, 0(SP)
    110 	MOVL	124(SP), AX
    111 	MOVL	AX, 4(SP)
    112 	CALL	runtimeargs(SB)
    113 	CALL	runtimeosinit(SB)
    114 	CALL	runtimeschedinit(SB)
    115 
    116 	// create a new goroutine to start program
        	// The two pushes are newproc's arguments; the two pops discard them.
    117 	PUSHL	$runtimemainPC(SB)	// entry
    118 	PUSHL	$0	// arg size
    119 	CALL	runtimenewproc(SB)
    120 	POPL	AX
    121 	POPL	AX
    122 
    123 	// start this M
    124 	CALL	runtimemstart(SB)
    125 
        	// Only reached if mstart returns: trap.
    126 	INT $3
    127 	RET
    128 
    129 DATA	runtimemainPC+0(SB)/4,$runtimemain(SB)
        // mainPC is a 4-byte read-only word initialised with the address of
        // main; rt0_go pushes the address of this word as newproc's entry argument.
    130 GLOBL	runtimemainPC(SB),RODATA,$4
    131 
    132 TEXT runtimebreakpoint(SB),NOSPLIT,$0-0
        	// Executes a breakpoint trap (INT 3) and returns if execution resumes.
    133 	INT $3
    134 	RET
    135 
    136 TEXT runtimeasminit(SB),NOSPLIT,$0-0
    137 	// Linux and MinGW start the FPU in extended double precision.
    138 	// Other operating systems use double precision.
    139 	// Change to double precision to match them,
    140 	// and to match other hardware that only has double.
        	// The control word 0x27F is pushed so FLDCW can load it from memory,
        	// then popped again to restore SP.
    141 	PUSHL $0x27F
    142 	FLDCW	0(SP)
    143 	POPL AX
    144 	RET
    145 
    146 /*
    147  *  go-routine
    148  */
    149 
    150 // void gosave(Gobuf*)
    151 // save state in Gobuf; setjmp
        // Stores the caller's SP, the return PC and the current g into *buf
        // and zeroes buf.ret and buf.ctxt.
    152 TEXT runtimegosave(SB), NOSPLIT, $0-4
    153 	MOVL	buf+0(FP), AX		// gobuf
    154 	LEAL	buf+0(FP), BX		// caller's SP
    155 	MOVL	BX, gobuf_sp(AX)
    156 	MOVL	0(SP), BX		// caller's PC
    157 	MOVL	BX, gobuf_pc(AX)
    158 	MOVL	$0, gobuf_ret(AX)
    159 	MOVL	$0, gobuf_ctxt(AX)
    160 	get_tls(CX)
    161 	MOVL	g(CX), BX		// BX = current g
    162 	MOVL	BX, gobuf_g(AX)
    163 	RET
    164 
    165 // void gogo(Gobuf*)
    166 // restore state from Gobuf; longjmp
        // Installs buf.g as the current g, loads SP, AX (ret) and DX (ctxt)
        // from the buffer, clears those three fields and jumps to buf.pc.
        // Does not return to its caller.
    167 TEXT runtimegogo(SB), NOSPLIT, $0-4
    168 	MOVL	buf+0(FP), BX		// gobuf
    169 	MOVL	gobuf_g(BX), DX
    170 	MOVL	0(DX), CX		// make sure g != nil
    171 	get_tls(CX)
    172 	MOVL	DX, g(CX)
    173 	MOVL	gobuf_sp(BX), SP	// restore SP
    174 	MOVL	gobuf_ret(BX), AX
    175 	MOVL	gobuf_ctxt(BX), DX
    176 	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
    177 	MOVL	$0, gobuf_ret(BX)
    178 	MOVL	$0, gobuf_ctxt(BX)
    179 	MOVL	gobuf_pc(BX), BX	// BX = resume PC (the gobuf pointer is no longer needed)
    180 	JMP	BX
    181 
    182 // func mcall(fn func(*g))
    183 // Switch to m->g0's stack, call fn(g).
    184 // Fn must never return.  It should gogo(&g->sched)
    185 // to keep running g.
        // The caller's PC, SP and g are saved in g->sched before the switch.
        // Calling mcall while already on g0 diverts to badmcall; a return
        // from fn diverts to badmcall2.
    186 TEXT runtimemcall(SB), NOSPLIT, $0-4
    187 	MOVL	fn+0(FP), DI
    188 
    189 	get_tls(CX)
    190 	MOVL	g(CX), AX	// save state in g->sched
    191 	MOVL	0(SP), BX	// caller's PC
    192 	MOVL	BX, (g_sched+gobuf_pc)(AX)
    193 	LEAL	fn+0(FP), BX	// caller's SP
    194 	MOVL	BX, (g_sched+gobuf_sp)(AX)
    195 	MOVL	AX, (g_sched+gobuf_g)(AX)
    196 
    197 	// switch to m->g0 & its stack, call fn
    198 	MOVL	g(CX), BX
    199 	MOVL	g_m(BX), BX
    200 	MOVL	m_g0(BX), SI
    201 	CMPL	SI, AX	// if g == m->g0 call badmcall
    202 	JNE	3(PC)	// skip the two-instruction badmcall jump below
    203 	MOVL	$runtimebadmcall(SB), AX
    204 	JMP	AX
    205 	MOVL	SI, g(CX)	// g = m->g0
    206 	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
    207 	PUSHL	AX	// argument to fn: the g we switched away from
    208 	MOVL	DI, DX	// DX = closure pointer for fn
    209 	MOVL	0(DI), DI	// DI = code pointer stored in the first word of fn
    210 	CALL	DI
    211 	POPL	AX
    212 	MOVL	$runtimebadmcall2(SB), AX
    213 	JMP	AX
    214 	RET
    215 
    216 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
    217 // of the G stack.  We need to distinguish the routine that
    218 // lives at the bottom of the G stack from the one that lives
    219 // at the top of the system stack because the one at the top of
    220 // the system stack terminates the stack walk (see topofstack()).
        // Its address is stored as g->sched.pc by systemstack; the body is
        // a bare RET.
    221 TEXT runtimesystemstack_switch(SB), NOSPLIT, $0-0
    222 	RET
    223 
    224 // func systemstack(fn func())
        // Runs fn on the g0 stack. If the current g is already gsignal or g0,
        // fn is called directly on the current stack. If it is m->curg, the
        // goroutine state is saved in g->sched, execution moves to g0's
        // stack for the call, and then moves back. Any other g is reported
        // through badsystemstack.
    225 TEXT runtimesystemstack(SB), NOSPLIT, $0-4
    226 	MOVL	fn+0(FP), DI	// DI = fn
    227 	get_tls(CX)
    228 	MOVL	g(CX), AX	// AX = g
    229 	MOVL	g_m(AX), BX	// BX = m
    230 
    231 	MOVL	m_gsignal(BX), DX	// DX = gsignal
    232 	CMPL	AX, DX
    233 	JEQ	noswitch
    234 
    235 	MOVL	m_g0(BX), DX	// DX = g0
    236 	CMPL	AX, DX
    237 	JEQ	noswitch
    238 
    239 	MOVL	m_curg(BX), BP
    240 	CMPL	AX, BP
    241 	JEQ	switch
    242 
    243 	// Bad: g is not gsignal, not g0, not curg. What is it?
    244 	// Hide call from linker nosplit analysis.
    245 	MOVL	$runtimebadsystemstack(SB), AX
    246 	CALL	AX
        	// badsystemstack is not expected to return. Trap here instead of
        	// falling through into the switch path, where AX (now holding a
        	// code address rather than g) would be used as the g pointer.
        	INT	$3
    247 
    248 switch:
    249 	// save our state in g->sched.  Pretend to
    250 	// be systemstack_switch if the G stack is scanned.
    251 	MOVL	$runtimesystemstack_switch(SB), (g_sched+gobuf_pc)(AX)
    252 	MOVL	SP, (g_sched+gobuf_sp)(AX)
    253 	MOVL	AX, (g_sched+gobuf_g)(AX)
    254 
    255 	// switch to g0
    256 	MOVL	DX, g(CX)
    257 	MOVL	(g_sched+gobuf_sp)(DX), BX
    258 	// make it look like mstart called systemstack on g0, to stop traceback
    259 	SUBL	$4, BX
    260 	MOVL	$runtimemstart(SB), DX
    261 	MOVL	DX, 0(BX)
    262 	MOVL	BX, SP
    263 
    264 	// call target function
    265 	MOVL	DI, DX	// DX = closure pointer for fn
    266 	MOVL	0(DI), DI	// DI = code pointer stored in the first word of fn
    267 	CALL	DI
    268 
    269 	// switch back to g
        	// g is reloaded from m->curg, and SP from the value saved in
        	// g->sched.sp above; the saved sp is then cleared.
    270 	get_tls(CX)
    271 	MOVL	g(CX), AX
    272 	MOVL	g_m(AX), BX
    273 	MOVL	m_curg(BX), AX
    274 	MOVL	AX, g(CX)
    275 	MOVL	(g_sched+gobuf_sp)(AX), SP
    276 	MOVL	$0, (g_sched+gobuf_sp)(AX)
    277 	RET
    278 
    279 noswitch:
    280 	// already on system stack, just call directly
    281 	MOVL	DI, DX
    282 	MOVL	0(DI), DI
    283 	CALL	DI
    284 	RET
    285 
    286 /*
    287  * support for morestack
    288  */
    289 
    290 // Called during function prolog when more stack is needed.
    291 //
    292 // The traceback routines see morestack on a g0 as being
    293 // the top of a stack (for example, morestack calling newstack
    294 // calling the scheduler calling newm calling gc), so we must
    295 // record an argument size. For that purpose, it has no arguments.
        //
        // Saves f's caller in m->morebuf and f's own context (including the
        // closure register DX) in g->sched, then switches to g0's stack and
        // calls newstack. Traps if the current g is g0 or gsignal.
    296 TEXT runtimemorestack(SB),NOSPLIT,$0-0
    297 	// Cannot grow scheduler stack (m->g0).
    298 	get_tls(CX)
    299 	MOVL	g(CX), BX
    300 	MOVL	g_m(BX), BX
    301 	MOVL	m_g0(BX), SI
    302 	CMPL	g(CX), SI
    303 	JNE	2(PC)
    304 	INT	$3
    305 
    306 	// Cannot grow signal stack.
    307 	MOVL	m_gsignal(BX), SI
    308 	CMPL	g(CX), SI
    309 	JNE	2(PC)
    310 	INT	$3
    311 
    312 	// Called from f.
    313 	// Set m->morebuf to f's caller.
    314 	MOVL	4(SP), DI	// f's caller's PC
    315 	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
    316 	LEAL	8(SP), CX	// f's caller's SP
    317 	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
    318 	get_tls(CX)	// CX was used as scratch above; reload the TLS base
    319 	MOVL	g(CX), SI
    320 	MOVL	SI, (m_morebuf+gobuf_g)(BX)
    321 
    322 	// Set g->sched to context in f.
    323 	MOVL	0(SP), AX	// f's PC
    324 	MOVL	AX, (g_sched+gobuf_pc)(SI)
    325 	MOVL	SI, (g_sched+gobuf_g)(SI)
    326 	LEAL	4(SP), AX	// f's SP
    327 	MOVL	AX, (g_sched+gobuf_sp)(SI)
    328 	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
    329 
    330 	// Call newstack on m->g0's stack.
    331 	MOVL	m_g0(BX), BP
    332 	MOVL	BP, g(CX)
    333 	MOVL	(g_sched+gobuf_sp)(BP), AX
    334 	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
    335 	MOVL	AX, SP
    336 	CALL	runtimenewstack(SB)
    337 	MOVL	$0, 0x1003	// crash if newstack returns
    338 	RET
    339 
    340 TEXT runtimemorestack_noctxt(SB),NOSPLIT,$0-0
        	// Entry used when there is no closure context: zero DX (which
        	// morestack stores as g->sched.ctxt) and tail-jump to morestack.
    341 	MOVL	$0, DX
    342 	JMP runtimemorestack(SB)
    343 
    344 TEXT runtimestackBarrier(SB),NOSPLIT,$0
    345 	// We came here via a RET to an overwritten return PC.
    346 	// AX may be live. Other registers are available.
    347 
    348 	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
    349 	get_tls(CX)
    350 	MOVL	g(CX), CX	// CX = g
    351 	MOVL	(g_stkbar+slice_array)(CX), DX	// DX = base of the g.stkbar array
    352 	MOVL	g_stkbarPos(CX), BX
    353 	IMULL	$stkbar__size, BX	// Too big for SIB.
    354 	MOVL	stkbar_savedLRVal(DX)(BX*1), BX
    355 	// Record that this stack barrier was hit.
    356 	ADDL	$1, g_stkbarPos(CX)
    357 	// Jump to the original return PC.
    358 	JMP	BX
    359 
    360 // reflectcall: call a function with the given argument list
    361 // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
    362 // we don't have variable-sized frames, so we use a small number
    363 // of constant-sized-frame functions to encode a few bits of size in the pc.
    364 // Caution: ugly multiline assembly macros in your future!
    365 
        // DISPATCH(NAME, MAXSIZE) expects the argument size in CX. When
        // CX <= MAXSIZE (unsigned compare) it jumps to NAME through AX;
        // otherwise JA 3(PC) skips the two-instruction jump and execution
        // continues with whatever follows the macro.
    366 #define DISPATCH(NAME,MAXSIZE)		\
    367 	CMPL	CX, $MAXSIZE;		\
    368 	JA	3(PC);			\
    369 	MOVL	$NAME(SB), AX;		\
    370 	JMP	AX
    371 // Note: can't just "JMP NAME(SB)" - bad inlining results.
    372 
    373 TEXT reflectcall(SB), NOSPLIT, $0-0
        	// Forwarding stub: a single tail jump.
        	// NOTE(review): as written, this symbol has the same name as the
        	// function defined immediately below and jumps to itself. The
        	// package separator in both names looks to have been lost when the
        	// file was extracted -- confirm the intended names against the
        	// original source.
    374 	JMP	reflectcall(SB)
    375 
    376 TEXT reflectcall(SB), NOSPLIT, $0-20
        	// Loads argsize into CX and walks the DISPATCH ladder, jumping to
        	// the first callN whose frame size is >= argsize. Sizes above
        	// 1073741824 fall off the end and go to badreflectcall.
    377 	MOVL	argsize+12(FP), CX
    378 	DISPATCH(runtimecall16, 16)
    379 	DISPATCH(runtimecall32, 32)
    380 	DISPATCH(runtimecall64, 64)
    381 	DISPATCH(runtimecall128, 128)
    382 	DISPATCH(runtimecall256, 256)
    383 	DISPATCH(runtimecall512, 512)
    384 	DISPATCH(runtimecall1024, 1024)
    385 	DISPATCH(runtimecall2048, 2048)
    386 	DISPATCH(runtimecall4096, 4096)
    387 	DISPATCH(runtimecall8192, 8192)
    388 	DISPATCH(runtimecall16384, 16384)
    389 	DISPATCH(runtimecall32768, 32768)
    390 	DISPATCH(runtimecall65536, 65536)
    391 	DISPATCH(runtimecall131072, 131072)
    392 	DISPATCH(runtimecall262144, 262144)
    393 	DISPATCH(runtimecall524288, 524288)
    394 	DISPATCH(runtimecall1048576, 1048576)
    395 	DISPATCH(runtimecall2097152, 2097152)
    396 	DISPATCH(runtimecall4194304, 4194304)
    397 	DISPATCH(runtimecall8388608, 8388608)
    398 	DISPATCH(runtimecall16777216, 16777216)
    399 	DISPATCH(runtimecall33554432, 33554432)
    400 	DISPATCH(runtimecall67108864, 67108864)
    401 	DISPATCH(runtimecall134217728, 134217728)
    402 	DISPATCH(runtimecall268435456, 268435456)
    403 	DISPATCH(runtimecall536870912, 536870912)
    404 	DISPATCH(runtimecall1073741824, 1073741824)
    405 	MOVL	$runtimebadreflectcall(SB), AX
    406 	JMP	AX
    407 
    408 #define CALLFN(NAME,MAXSIZE)			\
    409 TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
    410 	NO_LOCAL_POINTERS;			\
    411 	/* copy argsize bytes from argptr to the bottom of this frame */		\
    412 	MOVL	argptr+8(FP), SI;		\
    413 	MOVL	argsize+12(FP), CX;		\
    414 	MOVL	SP, DI;				\
    415 	REP;MOVSB;				\
    416 	/* call function: DX = f (closure), AX = code pointer at *f */			\
    417 	MOVL	f+4(FP), DX;			\
    418 	MOVL	(DX), AX; 			\
    419 	PCDATA  $PCDATA_StackMapIndex, $0;	\
    420 	CALL	AX;				\
    421 	/* copy return values back: bytes [retoffset, argsize) from frame to argptr */		\
    422 	MOVL	argptr+8(FP), DI;		\
    423 	MOVL	argsize+12(FP), CX;		\
    424 	MOVL	retoffset+16(FP), BX;		\
    425 	MOVL	SP, SI;				\
    426 	ADDL	BX, DI;				\
    427 	ADDL	BX, SI;				\
    428 	SUBL	BX, CX;				\
    429 	REP;MOVSB;				\
    430 	/* execute write barrier updates: callwritebarrier(argtype, argptr, argsize, retoffset) */	\
    431 	MOVL	argtype+0(FP), DX;		\
    432 	MOVL	argptr+8(FP), DI;		\
    433 	MOVL	argsize+12(FP), CX;		\
    434 	MOVL	retoffset+16(FP), BX;		\
    435 	MOVL	DX, 0(SP);			\
    436 	MOVL	DI, 4(SP);			\
    437 	MOVL	CX, 8(SP);			\
    438 	MOVL	BX, 12(SP);			\
    439 	CALL	runtimecallwritebarrier(SB);	\
    440 	RET
    441 
    442 CALLFN(call16, 16)
        // One fixed-frame call wrapper per power of two from 16 bytes up to 1GB.
        // NOTE(review): the DISPATCH ladder in reflectcall names these targets
        // with a "runtime" prefix (e.g. runtimecall16); the spellings differ
        // in this copy of the file -- confirm against the original source.
    443 CALLFN(call32, 32)
    444 CALLFN(call64, 64)
    445 CALLFN(call128, 128)
    446 CALLFN(call256, 256)
    447 CALLFN(call512, 512)
    448 CALLFN(call1024, 1024)
    449 CALLFN(call2048, 2048)
    450 CALLFN(call4096, 4096)
    451 CALLFN(call8192, 8192)
    452 CALLFN(call16384, 16384)
    453 CALLFN(call32768, 32768)
    454 CALLFN(call65536, 65536)
    455 CALLFN(call131072, 131072)
    456 CALLFN(call262144, 262144)
    457 CALLFN(call524288, 524288)
    458 CALLFN(call1048576, 1048576)
    459 CALLFN(call2097152, 2097152)
    460 CALLFN(call4194304, 4194304)
    461 CALLFN(call8388608, 8388608)
    462 CALLFN(call16777216, 16777216)
    463 CALLFN(call33554432, 33554432)
    464 CALLFN(call67108864, 67108864)
    465 CALLFN(call134217728, 134217728)
    466 CALLFN(call268435456, 268435456)
    467 CALLFN(call536870912, 536870912)
    468 CALLFN(call1073741824, 1073741824)
    469 
    470 // bool cas(int32 *val, int32 old, int32 new)
    471 // Atomically:
    472 //	if(*val == old){
    473 //		*val = new;
    474 //		return 1;
    475 //	}else
    476 //		return 0;
    477 TEXT runtimecas(SB), NOSPLIT, $0-13
    478 	MOVL	ptr+0(FP), BX
    479 	MOVL	old+4(FP), AX	// CMPXCHG compares the memory operand against AX
    480 	MOVL	new+8(FP), CX
    481 	LOCK
    482 	CMPXCHGL	CX, 0(BX)
    483 	SETEQ	ret+12(FP)	// ZF is set when the exchange happened
    484 	RET
    485 
    486 TEXT runtimecasuintptr(SB), NOSPLIT, $0-13
        	// uintptr variant: same frame layout as cas, so tail-jump to it.
    487 	JMP	runtimecas(SB)
    488 
    489 TEXT runtimeatomicloaduintptr(SB), NOSPLIT, $0-8
        	// uintptr variant: tail-jump to atomicload (defined outside this chunk).
    490 	JMP	runtimeatomicload(SB)
    491 
    492 TEXT runtimeatomicloaduint(SB), NOSPLIT, $0-8
        	// uint variant: tail-jump to atomicload (defined outside this chunk).
    493 	JMP	runtimeatomicload(SB)
    494 
    495 TEXT runtimeatomicstoreuintptr(SB), NOSPLIT, $0-8
        	// uintptr variant: same frame layout as atomicstore, so tail-jump to it.
    496 	JMP	runtimeatomicstore(SB)
    497 
    498 // bool runtimecas64(uint64 *val, uint64 old, uint64 new)
    499 // Atomically:
    500 //	if(*val == old){
    501 //		*val = new;
    502 //		return 1;
    503 //	} else {
    504 //		return 0;
    505 //	}
    506 TEXT runtimecas64(SB), NOSPLIT, $0-21
    507 	MOVL	ptr+0(FP), BP
        	// CMPXCHG8B compares the memory operand with DX:AX and, when equal,
        	// stores CX:BX.
    508 	MOVL	old_lo+4(FP), AX
    509 	MOVL	old_hi+8(FP), DX
    510 	MOVL	new_lo+12(FP), BX
    511 	MOVL	new_hi+16(FP), CX
    512 	LOCK
    513 	CMPXCHG8B	0(BP)
    514 	SETEQ	ret+20(FP)
    515 	RET
    516 
    517 // bool casp(void **p, void *old, void *new)
    518 // Atomically:
    519 //	if(*p == old){
    520 //		*p = new;
    521 //		return 1;
    522 //	}else
    523 //		return 0;
        // Same instruction sequence as cas, applied to a pointer-sized word.
    524 TEXT runtimecasp1(SB), NOSPLIT, $0-13
    525 	MOVL	ptr+0(FP), BX
    526 	MOVL	old+4(FP), AX
    527 	MOVL	new+8(FP), CX
    528 	LOCK
    529 	CMPXCHGL	CX, 0(BX)
    530 	SETEQ	ret+12(FP)
    531 	RET
    532 
    533 // uint32 xadd(uint32 volatile *val, int32 delta)
    534 // Atomically:
    535 //	*val += delta;
    536 //	return *val;
    537 TEXT runtimexadd(SB), NOSPLIT, $0-12
    538 	MOVL	ptr+0(FP), BX
    539 	MOVL	delta+4(FP), AX
    540 	MOVL	AX, CX		// keep a copy of delta
    541 	LOCK
    542 	XADDL	AX, 0(BX)	// AX receives the old *val
    543 	ADDL	CX, AX		// old + delta = the new value
    544 	MOVL	AX, ret+8(FP)
    545 	RET
    546 
    547 TEXT runtimexchg(SB), NOSPLIT, $0-12
        	// Atomically stores new into *ptr and returns the previous value.
        	// XCHG with a memory operand is locked by the processor, so no
        	// LOCK prefix is written.
    548 	MOVL	ptr+0(FP), BX
    549 	MOVL	new+4(FP), AX
    550 	XCHGL	AX, 0(BX)
    551 	MOVL	AX, ret+8(FP)
    552 	RET
    553 
    554 TEXT runtimexchgp1(SB), NOSPLIT, $0-12
        	// Same instruction sequence as xchg: swap new into *ptr and
        	// return the previous contents.
    555 	MOVL	ptr+0(FP), BX
    556 	MOVL	new+4(FP), AX
    557 	XCHGL	AX, 0(BX)
    558 	MOVL	AX, ret+8(FP)
    559 	RET
    560 
    561 TEXT runtimexchguintptr(SB), NOSPLIT, $0-12
        	// uintptr variant: same frame layout as xchg, so tail-jump to it.
    562 	JMP	runtimexchg(SB)
    563 
    564 TEXT runtimeprocyield(SB),NOSPLIT,$0-0
        	// Executes PAUSE `cycles` times.
        	// NOTE(review): the frame is declared $0-0 although a 4-byte
        	// argument is read below -- confirm the intended argument size.
        	// The counter is decremented before it is tested, so cycles == 0
        	// wraps around instead of returning immediately.
    565 	MOVL	cycles+0(FP), AX
    566 again:
    567 	PAUSE
    568 	SUBL	$1, AX
    569 	JNZ	again
    570 	RET
    571 
    572 TEXT runtimeatomicstorep1(SB), NOSPLIT, $0-8
        	// Stores val into *ptr with XCHG; the old value left in AX is discarded.
    573 	MOVL	ptr+0(FP), BX
    574 	MOVL	val+4(FP), AX
    575 	XCHGL	AX, 0(BX)
    576 	RET
    577 
    578 TEXT runtimeatomicstore(SB), NOSPLIT, $0-8
        	// Stores val into *ptr with XCHG; the old value left in AX is discarded.
    579 	MOVL	ptr+0(FP), BX
    580 	MOVL	val+4(FP), AX
    581 	XCHGL	AX, 0(BX)
    582 	RET
    583 
    584 // uint64 atomicload64(uint64 volatile* addr);
        // Loads 8 bytes from addr with a single MMX MOVQ and writes them to
        // the result slot. A pointer that is not 8-byte aligned is turned
        // into a deliberate fault.
    585 TEXT runtimeatomicload64(SB), NOSPLIT, $0-12
    586 	MOVL	ptr+0(FP), AX
    587 	TESTL	$7, AX
    588 	JZ	2(PC)	// aligned: skip the crashing load
    589 	MOVL	0, AX // crash with nil ptr deref
    590 	LEAL	ret_lo+4(FP), BX	// BX = address of the 64-bit result
    591 	// MOVQ (%EAX), %MM0
    592 	BYTE $0x0f; BYTE $0x6f; BYTE $0x00
    593 	// MOVQ %MM0, 0(%EBX)
    594 	BYTE $0x0f; BYTE $0x7f; BYTE $0x03
    595 	// EMMS
    596 	BYTE $0x0F; BYTE $0x77
    597 	RET
    598 
    599 // void runtimeatomicstore64(uint64 volatile* addr, uint64 v);
        // Stores v to addr with a single MMX MOVQ, then issues a locked
        // no-op as a fence. A pointer that is not 8-byte aligned is turned
        // into a deliberate fault.
    600 TEXT runtimeatomicstore64(SB), NOSPLIT, $0-12
    601 	MOVL	ptr+0(FP), AX
    602 	TESTL	$7, AX
    603 	JZ	2(PC)	// aligned: skip the crashing load
    604 	MOVL	0, AX // crash with nil ptr deref
    605 	// MOVQ and EMMS were introduced on the Pentium MMX.
    606 	// MOVQ 0x8(%ESP), %MM0
    607 	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
    608 	// MOVQ %MM0, (%EAX)
    609 	BYTE $0x0f; BYTE $0x7f; BYTE $0x00
    610 	// EMMS
    611 	BYTE $0x0F; BYTE $0x77
    612 	// This is essentially a no-op, but it provides required memory fencing.
    613 	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
        	// Adds zero to the word at the top of the stack under LOCK.
    614 	MOVL	$0, AX
    615 	LOCK
    616 	XADDL	AX, (SP)
    617 	RET
    618 
    619 // void	runtimeatomicor8(byte volatile*, byte);
        // Atomically performs *ptr |= val on a single byte.
    620 TEXT runtimeatomicor8(SB), NOSPLIT, $0-5
    621 	MOVL	ptr+0(FP), AX
    622 	MOVB	val+4(FP), BX
    623 	LOCK
    624 	ORB	BX, (AX)
    625 	RET
    626 
    627 // void	runtimeatomicand8(byte volatile*, byte);
        // Atomically performs *ptr &= val on a single byte.
    628 TEXT runtimeatomicand8(SB), NOSPLIT, $0-5
    629 	MOVL	ptr+0(FP), AX
    630 	MOVB	val+4(FP), BX
    631 	LOCK
    632 	ANDB	BX, (AX)
    633 	RET
    634 
    635 TEXT publicationBarrier(SB),NOSPLIT,$0-0
    636 	// Stores are already ordered on x86, so this is just a
    637 	// compile barrier.
        	// The body is therefore a bare RET with no fence instruction.
    638 	RET
    639 
    640 // void jmpdefer(fn, sp);
    641 // called from deferreturn.
    642 // 1. pop the caller
    643 // 2. sub 5 bytes from the callers return
    644 // 3. jmp to the argument
        // Rewinding the return PC by 5 bytes makes the deferred function
        // return to the CALL instruction itself (see the inline comments),
        // so the caller's call is executed again afterwards.
    645 TEXT runtimejmpdefer(SB), NOSPLIT, $0-8
    646 	MOVL	fv+0(FP), DX	// fn
    647 	MOVL	argp+4(FP), BX	// caller sp
    648 	LEAL	-4(BX), SP	// caller sp after CALL
    649 	SUBL	$5, (SP)	// return to CALL again
    650 	MOVL	0(DX), BX	// BX = code pointer stored in the first word of fn
    651 	JMP	BX	// but first run the deferred function
    652 
    653 // Save state of caller into g->sched.
        // File-local helper. Records the caller's SP and return PC in the
        // current g's sched buffer and zeroes sched.ret and sched.ctxt.
        // AX and BX are saved and restored around the body.
    654 TEXT gosave<>(SB),NOSPLIT,$0
    655 	PUSHL	AX
    656 	PUSHL	BX
    657 	get_tls(BX)
    658 	MOVL	g(BX), BX
    659 	LEAL	arg+0(FP), AX	// AX = caller's SP (address just above the return PC)
    660 	MOVL	AX, (g_sched+gobuf_sp)(BX)
    661 	MOVL	-4(AX), AX	// AX = return PC stored just below that address
    662 	MOVL	AX, (g_sched+gobuf_pc)(BX)
    663 	MOVL	$0, (g_sched+gobuf_ret)(BX)
    664 	MOVL	$0, (g_sched+gobuf_ctxt)(BX)
    665 	POPL	BX
    666 	POPL	AX
    667 	RET
    668 
    669 // func asmcgocall(fn, arg unsafe.Pointer) int32
    670 // Call fn(arg) on the scheduler stack,
    671 // aligned appropriately for the gcc ABI.
    672 // See cgocall.go for more details.
    673 TEXT asmcgocall(SB),NOSPLIT,$0-12
    674 	MOVL	fn+0(FP), AX
    675 	MOVL	arg+4(FP), BX
    676 
    677 	MOVL	SP, DX	// DX = SP on entry, used below to compute the stack depth
    678 
    679 	// Figure out if we need to switch to m->g0 stack.
    680 	// We get called to create new OS threads too, and those
    681 	// come in on the m->g0 stack already.
    682 	get_tls(CX)
    683 	MOVL	g(CX), BP
    684 	MOVL	g_m(BP), BP
    685 	MOVL	m_g0(BP), SI
    686 	MOVL	g(CX), DI
    687 	CMPL	SI, DI
    688 	JEQ	4(PC)	// already on g0: skip the save and the two-instruction switch
    689 	CALL	gosave<>(SB)
    690 	MOVL	SI, g(CX)
    691 	MOVL	(g_sched+gobuf_sp)(SI), SP
    692 
    693 	// Now on a scheduling stack (a pthread-created stack).
    694 	SUBL	$32, SP
    695 	ANDL	$~15, SP	// alignment, perhaps unnecessary
    696 	MOVL	DI, 8(SP)	// save g
    697 	MOVL	(g_stack+stack_hi)(DI), DI
    698 	SUBL	DX, DI	// DI = stack.hi - entry SP
    699 	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
    700 	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
    701 	CALL	AX
    702 
    703 	// Restore registers, g, stack pointer.
        	// SP is rebuilt as stack.hi minus the saved depth, using the
        	// current value of stack.hi.
    704 	get_tls(CX)
    705 	MOVL	8(SP), DI
    706 	MOVL	(g_stack+stack_hi)(DI), SI
    707 	SUBL	4(SP), SI
    708 	MOVL	DI, g(CX)
    709 	MOVL	SI, SP
    710 
    711 	MOVL	AX, ret+8(FP)
    712 	RET
    713 
    714 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
    715 // Turn the fn into a Go func (by taking its address) and call
    716 // cgocallback_gofunc.
    717 TEXT runtimecgocallback(SB),NOSPLIT,$12-12
    718 	LEAL	fn+0(FP), AX	// address of the fn argument slot
    719 	MOVL	AX, 0(SP)
    720 	MOVL	frame+4(FP), AX
    721 	MOVL	AX, 4(SP)
    722 	MOVL	framesize+8(FP), AX
    723 	MOVL	AX, 8(SP)
        	// The call is made indirectly through AX.
    724 	MOVL	$runtimecgocallback_gofunc(SB), AX
    725 	CALL	AX
    726 	RET
    727 
    728 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
    729 // See cgocall.go for more details.
        // Outline: obtain an m if the thread has none (needm), save g0's
        // sched.sp, switch to m->curg's stack, call cgocallbackg, restore
        // curg's sched pc/sp, switch back to g0, and release a borrowed m
        // with dropm.
    730 TEXT cgocallback_gofunc(SB),NOSPLIT,$12-12
    731 	NO_LOCAL_POINTERS
    732 
    733 	// If g is nil, Go did not create the current thread.
    734 	// Call needm to obtain one for temporary use.
    735 	// In this case, we're running on the thread stack, so there's
    736 	// lots of space, but the linker doesn't know. Hide the call from
    737 	// the linker analysis by using an indirect call through AX.
    738 	get_tls(CX)
    739 #ifdef GOOS_windows
        	// On Windows a zero TLS base skips the g load below, leaving BP = 0.
    740 	MOVL	$0, BP
    741 	CMPL	CX, $0
    742 	JEQ	2(PC) // TODO
    743 #endif
    744 	MOVL	g(CX), BP
    745 	CMPL	BP, $0
    746 	JEQ	needm
    747 	MOVL	g_m(BP), BP
    748 	MOVL	BP, DX // saved copy of oldm
    749 	JMP	havem
    750 needm:
        	// 0(SP) is zeroed before the call and read back into DX afterwards;
        	// DX is later tested against 0 to decide whether to call dropm.
    751 	MOVL	$0, 0(SP)
    752 	MOVL	$runtimeneedm(SB), AX
    753 	CALL	AX
    754 	MOVL	0(SP), DX
    755 	get_tls(CX)
    756 	MOVL	g(CX), BP
    757 	MOVL	g_m(BP), BP
    758 
    759 	// Set m->sched.sp = SP, so that if a panic happens
    760 	// during the function we are about to execute, it will
    761 	// have a valid SP to run on the g0 stack.
    762 	// The next few lines (after the havem label)
    763 	// will save this SP onto the stack and then write
    764 	// the same SP back to m->sched.sp. That seems redundant,
    765 	// but if an unrecovered panic happens, unwindm will
    766 	// restore the g->sched.sp from the stack location
    767 	// and then systemstack will try to use it. If we don't set it here,
    768 	// that restored SP will be uninitialized (typically 0) and
    769 	// will not be usable.
    770 	MOVL	m_g0(BP), SI
    771 	MOVL	SP, (g_sched+gobuf_sp)(SI)
    772 
    773 havem:
    774 	// Now there's a valid m, and we're running on its m->g0.
    775 	// Save current m->g0->sched.sp on stack and then set it to SP.
    776 	// Save current sp in m->g0->sched.sp in preparation for
    777 	// switch back to m->curg stack.
    778 	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
    779 	MOVL	m_g0(BP), SI
    780 	MOVL	(g_sched+gobuf_sp)(SI), AX
    781 	MOVL	AX, 0(SP)
    782 	MOVL	SP, (g_sched+gobuf_sp)(SI)
    783 
    784 	// Switch to m->curg stack and call runtime.cgocallbackg.
    785 	// Because we are taking over the execution of m->curg
    786 	// but *not* resuming what had been running, we need to
    787 	// save that information (m->curg->sched) so we can restore it.
    788 	// We can restore m->curg->sched.sp easily, because calling
    789 	// runtime.cgocallbackg leaves SP unchanged upon return.
    790 	// To save m->curg->sched.pc, we push it onto the stack.
    791 	// This has the added benefit that it looks to the traceback
    792 	// routine like cgocallbackg is going to return to that
    793 	// PC (because the frame we allocate below has the same
    794 	// size as cgocallback_gofunc's frame declared above)
    795 	// so that the traceback will seamlessly trace back into
    796 	// the earlier calls.
    797 	//
    798 	// In the new goroutine, 0(SP) holds the saved oldm (DX) register.
    799 	// 4(SP) and 8(SP) are unused.
    800 	MOVL	m_curg(BP), SI
    801 	MOVL	SI, g(CX)
    802 	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
    803 	MOVL	(g_sched+gobuf_pc)(SI), BP
    804 	MOVL	BP, -4(DI)	// saved sched.pc sits where a return PC would be
    805 	LEAL	-(4+12)(DI), SP	// 4 bytes for that PC plus a 12-byte frame
    806 	MOVL	DX, 0(SP)
    807 	CALL	runtimecgocallbackg(SB)
    808 	MOVL	0(SP), DX
    809 
    810 	// Restore g->sched (== m->curg->sched) from saved values.
    811 	get_tls(CX)
    812 	MOVL	g(CX), SI
    813 	MOVL	12(SP), BP	// the sched.pc stored at -4(DI) above
    814 	MOVL	BP, (g_sched+gobuf_pc)(SI)
    815 	LEAL	(12+4)(SP), DI	// the original sched.sp
    816 	MOVL	DI, (g_sched+gobuf_sp)(SI)
    817 
    818 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
    819 	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
    820 	// so we do not have to restore it.)
    821 	MOVL	g(CX), BP
    822 	MOVL	g_m(BP), BP
    823 	MOVL	m_g0(BP), SI
    824 	MOVL	SI, g(CX)
    825 	MOVL	(g_sched+gobuf_sp)(SI), SP
    826 	MOVL	0(SP), AX
    827 	MOVL	AX, (g_sched+gobuf_sp)(SI)
    828 
    829 	// If the m on entry was nil, we called needm above to borrow an m
    830 	// for the duration of the call. Since the call is over, return it with dropm.
    831 	CMPL	DX, $0
    832 	JNE 3(PC)	// had our own m: skip the two-instruction dropm call
    833 	MOVL	$runtimedropm(SB), AX
    834 	CALL	AX
    835 
    836 	// Done!
    837 	RET
    838 
    839 // void setg(G*); set g. for use by needm.
        // On Windows the slot at 0x14(FS) is also updated: it is cleared for
        // a nil g (returning early), otherwise pointed at the m's tls array.
    840 TEXT runtimesetg(SB), NOSPLIT, $0-4
    841 	MOVL	gg+0(FP), BX
    842 #ifdef GOOS_windows
    843 	CMPL	BX, $0
    844 	JNE	settls
    845 	MOVL	$0, 0x14(FS)
    846 	RET
    847 settls:
    848 	MOVL	g_m(BX), AX
    849 	LEAL	m_tls(AX), AX
    850 	MOVL	AX, 0x14(FS)
    851 #endif
    852 	get_tls(CX)
    853 	MOVL	BX, g(CX)
    854 	RET
    855 
    856 // void setg_gcc(G*); set g. for use by gcc
        // File-local; rt0_go passes its address to _cgo_init. Only AX and DX
        // are used.
    857 TEXT setg_gcc<>(SB), NOSPLIT, $0
    858 	get_tls(AX)
    859 	MOVL	gg+0(FP), DX
    860 	MOVL	DX, g(AX)
    861 	RET
    862 
    863 // check that SP is in range [g->stack.lo, g->stack.hi)
        // Traps with INT 3 when the check fails. As coded, both comparisons
        // are strict and unsigned: the check passes only when
        // stack.lo < SP < stack.hi, so SP == stack.lo also traps.
    864 TEXT runtimestackcheck(SB), NOSPLIT, $0-0
    865 	get_tls(CX)
    866 	MOVL	g(CX), AX
    867 	CMPL	(g_stack+stack_hi)(AX), SP
    868 	JHI	2(PC)	// stack.hi > SP: fine, skip the trap
    869 	INT	$3
    870 	CMPL	SP, (g_stack+stack_lo)(AX)
    871 	JHI	2(PC)	// SP > stack.lo: fine, skip the trap
    872 	INT	$3
    873 	RET
    874 
    875 TEXT runtimegetcallerpc(SB),NOSPLIT,$4-8
        	// Returns the return PC stored just below the address in argp.
        	// If that slot holds stackBarrierPC, the value produced by
        	// nextBarrierPC is returned instead.
    876 	MOVL	argp+0(FP),AX		// addr of first arg
    877 	MOVL	-4(AX),AX		// get calling pc
    878 	CMPL	AX, runtimestackBarrierPC(SB)
    879 	JNE	nobar
    880 	// Get original return PC.
    881 	CALL	runtimenextBarrierPC(SB)
    882 	MOVL	0(SP), AX	// result of nextBarrierPC
    883 nobar:
    884 	MOVL	AX, ret+4(FP)
    885 	RET
    886 
    887 TEXT runtimesetcallerpc(SB),NOSPLIT,$4-8
        	// Overwrites the return PC stored just below the address in argp
        	// with pc. If that slot currently holds stackBarrierPC, pc is
        	// passed to setNextBarrierPC instead and the slot is left alone.
    888 	MOVL	argp+0(FP),AX		// addr of first arg
    889 	MOVL	pc+4(FP), BX
    890 	MOVL	-4(AX), CX
    891 	CMPL	CX, runtimestackBarrierPC(SB)
    892 	JEQ	setbar
    893 	MOVL	BX, -4(AX)		// set calling pc
    894 	RET
    895 setbar:
    896 	// Set the stack barrier return PC.
    897 	MOVL	BX, 0(SP)
    898 	CALL	runtimesetNextBarrierPC(SB)
    899 	RET
    900 
    901 TEXT runtimegetcallersp(SB), NOSPLIT, $0-8
        	// Returns argp unchanged.
    902 	MOVL	argp+0(FP), AX
    903 	MOVL	AX, ret+4(FP)
    904 	RET
    905 
    906 // func cputicks() int64
        // Reads the time-stamp counter with RDTSC. When bit 26 of cpuid_edx
        // (SSE2) is set, a fence is issued first: LFENCE if
        // lfenceBeforeRdtsc is 1, otherwise MFENCE.
    907 TEXT runtimecputicks(SB),NOSPLIT,$0-8
    908 	TESTL	$0x4000000, runtimecpuid_edx(SB) // no sse2, no mfence
    909 	JEQ	done
    910 	CMPB	runtimelfenceBeforeRdtsc(SB), $1
    911 	JNE	mfence
    912 	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
    913 	JMP	done
    914 mfence:
    915 	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
    916 done:
    917 	RDTSC
        	// RDTSC leaves the counter in DX:AX.
    918 	MOVL	AX, ret_lo+0(FP)
    919 	MOVL	DX, ret_hi+4(FP)
    920 	RET
    921 
    922 TEXT runtimeldt0setup(SB),NOSPLIT,$16-0
    923 	// set up ldt 7 to point at tls0
    924 	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
    925 	// the entry number is just a hint.  setldt will set up GS with what it used.
        	// Arguments to setldt: entry number, base address, size.
    926 	MOVL	$7, 0(SP)
    927 	LEAL	runtimetls0(SB), AX
    928 	MOVL	AX, 4(SP)
    929 	MOVL	$32, 8(SP)	// sizeof(tls array)
    930 	CALL	runtimesetldt(SB)
    931 	RET
    932 
    933 TEXT runtimeemptyfunc(SB),0,$0-0
        	// Does nothing. Declared with flags 0 rather than NOSPLIT; rt0_go
        	// calls it with the comment "fault if stack check is wrong".
    934 	RET
    935 
    936 TEXT runtimeabort(SB),NOSPLIT,$0-0
        	// Stops the program with a breakpoint trap. If execution is resumed
        	// after the trap (for example by a debugger), spin here so that
        	// control never runs off the end into whatever code follows.
    937 	INT $0x3
        loop:
        	JMP	loop
    938 
    939 // memhash_varlen(p unsafe.Pointer, h seed) uintptr
    940 // redirects to memhash(p, h, size) using the size
    941 // stored in the closure.
    942 TEXT runtimememhash_varlen(SB),NOSPLIT,$16-12
    943 	GO_ARGS
    944 	NO_LOCAL_POINTERS
    945 	MOVL	p+0(FP), AX
    946 	MOVL	h+4(FP), BX
    947 	MOVL	4(DX), CX	// size: second word of the closure pointed to by DX
    948 	MOVL	AX, 0(SP)
    949 	MOVL	BX, 4(SP)
    950 	MOVL	CX, 8(SP)
    951 	CALL	runtimememhash(SB)
    952 	MOVL	12(SP), AX	// memhash's result follows its three arguments
    953 	MOVL	AX, ret+8(FP)
    954 	RET
    955 
    956 // hash function using AES hardware instructions
        // Loads the register arguments of aeshashbody (AX = data, CX = size,
        // DX = address of the result slot) and tail-jumps to it.
    957 TEXT runtimeaeshash(SB),NOSPLIT,$0-16
    958 	MOVL	p+0(FP), AX	// ptr to data
    959 	MOVL	s+8(FP), CX	// size
    960 	LEAL	ret+12(FP), DX
    961 	JMP	runtimeaeshashbody(SB)
    962 
    963 TEXT runtimeaeshashstr(SB),NOSPLIT,$0-12
        	// String variant of aeshash: p points at a string header, from which
        	// the data pointer and length are loaded before tail-jumping to
        	// aeshashbody with DX = address of the result slot.
    964 	MOVL	p+0(FP), AX	// ptr to string object
    965 	MOVL	4(AX), CX	// length of string
    966 	MOVL	(AX), AX	// string data
    967 	LEAL	ret+8(FP), DX
    968 	JMP	runtimeaeshashbody(SB)
    969 
    970 // AX: data
    971 // CX: length
    972 // DX: address to put return value
        // Reached by JMP from aeshash and aeshashstr, so h+4(FP) below still
        // refers to the seed argument of whichever wrapper jumped here.
        // The hash stored to (DX) is the low 32 bits of the final XMM state.
        // X6 holds the seed mixed with the length, X7 the first 16 bytes of
        // aeskeysched; the code then dispatches on the length.
    973 TEXT runtimeaeshashbody(SB),NOSPLIT,$0-0
    974 	MOVL	h+4(FP), X6	// seed to the low 32 bits of xmm6
    975 	PINSRD	$2, CX, X6	// size to dword 2, the bottom of xmm6's high 64 bits
    976 	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
    977 	MOVO	runtimeaeskeysched(SB), X7
    978 	CMPL	CX, $16
    979 	JB	aes0to15
    980 	JE	aes16
    981 	CMPL	CX, $32
    982 	JBE	aes17to32
    983 	CMPL	CX, $64
    984 	JBE	aes33to64
    985 	JMP	aes65plus
    986 
    987 aes0to15:
    988 	TESTL	CX, CX
    989 	JE	aes0
    990 
        	// Bits 4-11 of AX+16 are all zero exactly when the low 12 bits of the
        	// original address are 0xff0-0xfff, i.e. the data starts in the last
        	// 16 bytes of a 4096-byte page.
    991 	ADDL	$16, AX
    992 	TESTW	$0xff0, AX
    993 	JE	endofpage
    994 
    995 	// 16 bytes loaded at this address won't cross
    996 	// a page boundary, so we can load it directly.
    997 	MOVOU	-16(AX), X0	// -16 undoes the ADDL above: load from the original pointer
    998 	ADDL	CX, CX	// 2*len, scaled by 8 below = 16*len: byte offset of table entry len
    999 	PAND	masks<>(SB)(CX*8), X0	// keep only the low len bytes
   1000 
   1001 	// scramble 3 times
   1002 	AESENC	X6, X0
   1003 	AESENC	X7, X0
   1004 	AESENC	X7, X0
   1005 	MOVL	X0, (DX)
   1006 	RET
   1007 
   1008 endofpage:
   1009 	// low 12 bits of the address are 1111 1111 xxxx.  Might be up against
   1010 	// a page boundary, so load ending at last byte.
   1011 	// Then shift bytes down using pshufb.
   1012 	MOVOU	-32(AX)(CX*1), X0	// AX is ptr+16 here, so this loads ptr+len-16 .. ptr+len-1
   1013 	ADDL	CX, CX
   1014 	PSHUFB	shifts<>(SB)(CX*8), X0	// move the top len bytes to the bottom, zero the rest
   1015 	AESENC	X6, X0
   1016 	AESENC	X7, X0
   1017 	AESENC	X7, X0
   1018 	MOVL	X0, (DX)
   1019 	RET
   1020 
   1021 aes0:
   1022 	// return input seed
   1023 	MOVL	h+4(FP), AX
   1024 	MOVL	AX, (DX)
   1025 	RET
   1026 
   1027 aes16:
        	// exactly 16 bytes: one unaligned load, then the same 3 scrambles
   1028 	MOVOU	(AX), X0
   1029 	AESENC	X6, X0
   1030 	AESENC	X7, X0
   1031 	AESENC	X7, X0
   1032 	MOVL	X0, (DX)
   1033 	RET
   1034 
   1035 
   1036 aes17to32:
   1037 	// load data to be hashed
        	// (first 16 bytes and last 16 bytes; the two loads may overlap)
   1038 	MOVOU	(AX), X0
   1039 	MOVOU	-16(AX)(CX*1), X1
   1040 
   1041 	// scramble 3 times
        	// (the first round uses a different key per lane: X6 for X0,
        	// aeskeysched+16 for X1)
   1042 	AESENC	X6, X0
   1043 	AESENC	runtimeaeskeysched+16(SB), X1
   1044 	AESENC	X7, X0
   1045 	AESENC	X7, X1
   1046 	AESENC	X7, X0
   1047 	AESENC	X7, X1
   1048 
   1049 	// combine results
   1050 	PXOR	X1, X0
   1051 	MOVL	X0, (DX)
   1052 	RET
   1053 
   1054 aes33to64:
        	// load the first 32 and the last 32 bytes (may overlap), run each
        	// 16-byte lane through 3 rounds, then xor the four lanes together.
   1055 	MOVOU	(AX), X0
   1056 	MOVOU	16(AX), X1
   1057 	MOVOU	-32(AX)(CX*1), X2
   1058 	MOVOU	-16(AX)(CX*1), X3
   1059 
   1060 	AESENC	X6, X0
   1061 	AESENC	runtimeaeskeysched+16(SB), X1
   1062 	AESENC	runtimeaeskeysched+32(SB), X2
   1063 	AESENC	runtimeaeskeysched+48(SB), X3
   1064 	AESENC	X7, X0
   1065 	AESENC	X7, X1
   1066 	AESENC	X7, X2
   1067 	AESENC	X7, X3
   1068 	AESENC	X7, X0
   1069 	AESENC	X7, X1
   1070 	AESENC	X7, X2
   1071 	AESENC	X7, X3
   1072 
   1073 	PXOR	X2, X0
   1074 	PXOR	X3, X1
   1075 	PXOR	X1, X0
   1076 	MOVL	X0, (DX)
   1077 	RET
   1078 
   1079 aes65plus:
   1080 	// start with last (possibly overlapping) block
   1081 	MOVOU	-64(AX)(CX*1), X0
   1082 	MOVOU	-48(AX)(CX*1), X1
   1083 	MOVOU	-32(AX)(CX*1), X2
   1084 	MOVOU	-16(AX)(CX*1), X3
   1085 
   1086 	// scramble state once
   1087 	AESENC	X6, X0
   1088 	AESENC	runtimeaeskeysched+16(SB), X1
   1089 	AESENC	runtimeaeskeysched+32(SB), X2
   1090 	AESENC	runtimeaeskeysched+48(SB), X3
   1091 
   1092 	// compute number of remaining 64-byte blocks
        	// CX = (len-1)/64, at least 1 here because len >= 65
   1093 	DECL	CX
   1094 	SHRL	$6, CX
   1095 
   1096 aesloop:
   1097 	// scramble state, xor in a block
        	// (each 16-byte chunk of the block is used as the AESENC round key)
   1098 	MOVOU	(AX), X4
   1099 	MOVOU	16(AX), X5
   1100 	AESENC	X4, X0
   1101 	AESENC	X5, X1
   1102 	MOVOU	32(AX), X4
   1103 	MOVOU	48(AX), X5
   1104 	AESENC	X4, X2
   1105 	AESENC	X5, X3
   1106 
   1107 	// scramble state
   1108 	AESENC	X7, X0
   1109 	AESENC	X7, X1
   1110 	AESENC	X7, X2
   1111 	AESENC	X7, X3
   1112 
   1113 	ADDL	$64, AX
   1114 	DECL	CX
   1115 	JNE	aesloop
   1116 
   1117 	// 2 more scrambles to finish
   1118 	AESENC	X7, X0
   1119 	AESENC	X7, X1
   1120 	AESENC	X7, X2
   1121 	AESENC	X7, X3
   1122 	AESENC	X7, X0
   1123 	AESENC	X7, X1
   1124 	AESENC	X7, X2
   1125 	AESENC	X7, X3
   1126 
        	// fold the four lanes into X0
   1127 	PXOR	X2, X0
   1128 	PXOR	X3, X1
   1129 	PXOR	X1, X0
   1130 	MOVL	X0, (DX)
   1131 	RET
   1132 
   1133 TEXT runtimeaeshash32(SB),NOSPLIT,$0-12
        	// AES hash of the 4 bytes at p with seed h: seed in dword 0 and data
        	// in dword 1 of X0, three AESENC rounds keyed from aeskeysched, and
        	// the low 32 bits of the result returned.
   1134 	MOVL	p+0(FP), AX	// ptr to data
   1135 	MOVL	h+4(FP), X0	// seed
   1136 	PINSRD	$1, (AX), X0	// data
   1137 	AESENC	runtimeaeskeysched+0(SB), X0
   1138 	AESENC	runtimeaeskeysched+16(SB), X0
   1139 	AESENC	runtimeaeskeysched+32(SB), X0
   1140 	MOVL	X0, ret+8(FP)
   1141 	RET
   1142 
   1143 TEXT runtimeaeshash64(SB),NOSPLIT,$0-12
        	// AES hash of the 8 bytes at p with seed h: data in the low 64 bits
        	// and seed in dword 2 of X0, three AESENC rounds keyed from
        	// aeskeysched, and the low 32 bits of the result returned.
   1144 	MOVL	p+0(FP), AX	// ptr to data
   1145 	MOVQ	(AX), X0	// data
   1146 	PINSRD	$2, h+4(FP), X0	// seed
   1147 	AESENC	runtimeaeskeysched+0(SB), X0
   1148 	AESENC	runtimeaeskeysched+16(SB), X0
   1149 	AESENC	runtimeaeskeysched+32(SB), X0
   1150 	MOVL	X0, ret+8(FP)
   1151 	RET
   1152 
   1153 // simple mask to get rid of data in the high part of the register.
        // 16 entries of 16 bytes each: entry i (at offset 16*i) has its low i
        // bytes set to 0xff and the remaining bytes zero, for i = 0..15.
        // aeshashbody uses entry `length` as a PAND memory operand.
        // NOTE(review): a PAND memory operand must be 16-byte aligned; nothing
        // here requests that alignment explicitly - confirm the linker provides it.
   1154 DATA masks<>+0x00(SB)/4, $0x00000000
   1155 DATA masks<>+0x04(SB)/4, $0x00000000
   1156 DATA masks<>+0x08(SB)/4, $0x00000000
   1157 DATA masks<>+0x0c(SB)/4, $0x00000000
   1158 
   1159 DATA masks<>+0x10(SB)/4, $0x000000ff
   1160 DATA masks<>+0x14(SB)/4, $0x00000000
   1161 DATA masks<>+0x18(SB)/4, $0x00000000
   1162 DATA masks<>+0x1c(SB)/4, $0x00000000
   1163 
   1164 DATA masks<>+0x20(SB)/4, $0x0000ffff
   1165 DATA masks<>+0x24(SB)/4, $0x00000000
   1166 DATA masks<>+0x28(SB)/4, $0x00000000
   1167 DATA masks<>+0x2c(SB)/4, $0x00000000
   1168 
   1169 DATA masks<>+0x30(SB)/4, $0x00ffffff
   1170 DATA masks<>+0x34(SB)/4, $0x00000000
   1171 DATA masks<>+0x38(SB)/4, $0x00000000
   1172 DATA masks<>+0x3c(SB)/4, $0x00000000
   1173 
   1174 DATA masks<>+0x40(SB)/4, $0xffffffff
   1175 DATA masks<>+0x44(SB)/4, $0x00000000
   1176 DATA masks<>+0x48(SB)/4, $0x00000000
   1177 DATA masks<>+0x4c(SB)/4, $0x00000000
   1178 
   1179 DATA masks<>+0x50(SB)/4, $0xffffffff
   1180 DATA masks<>+0x54(SB)/4, $0x000000ff
   1181 DATA masks<>+0x58(SB)/4, $0x00000000
   1182 DATA masks<>+0x5c(SB)/4, $0x00000000
   1183 
   1184 DATA masks<>+0x60(SB)/4, $0xffffffff
   1185 DATA masks<>+0x64(SB)/4, $0x0000ffff
   1186 DATA masks<>+0x68(SB)/4, $0x00000000
   1187 DATA masks<>+0x6c(SB)/4, $0x00000000
   1188 
   1189 DATA masks<>+0x70(SB)/4, $0xffffffff
   1190 DATA masks<>+0x74(SB)/4, $0x00ffffff
   1191 DATA masks<>+0x78(SB)/4, $0x00000000
   1192 DATA masks<>+0x7c(SB)/4, $0x00000000
   1193 
   1194 DATA masks<>+0x80(SB)/4, $0xffffffff
   1195 DATA masks<>+0x84(SB)/4, $0xffffffff
   1196 DATA masks<>+0x88(SB)/4, $0x00000000
   1197 DATA masks<>+0x8c(SB)/4, $0x00000000
   1198 
   1199 DATA masks<>+0x90(SB)/4, $0xffffffff
   1200 DATA masks<>+0x94(SB)/4, $0xffffffff
   1201 DATA masks<>+0x98(SB)/4, $0x000000ff
   1202 DATA masks<>+0x9c(SB)/4, $0x00000000
   1203 
   1204 DATA masks<>+0xa0(SB)/4, $0xffffffff
   1205 DATA masks<>+0xa4(SB)/4, $0xffffffff
   1206 DATA masks<>+0xa8(SB)/4, $0x0000ffff
   1207 DATA masks<>+0xac(SB)/4, $0x00000000
   1208 
   1209 DATA masks<>+0xb0(SB)/4, $0xffffffff
   1210 DATA masks<>+0xb4(SB)/4, $0xffffffff
   1211 DATA masks<>+0xb8(SB)/4, $0x00ffffff
   1212 DATA masks<>+0xbc(SB)/4, $0x00000000
   1213 
   1214 DATA masks<>+0xc0(SB)/4, $0xffffffff
   1215 DATA masks<>+0xc4(SB)/4, $0xffffffff
   1216 DATA masks<>+0xc8(SB)/4, $0xffffffff
   1217 DATA masks<>+0xcc(SB)/4, $0x00000000
   1218 
   1219 DATA masks<>+0xd0(SB)/4, $0xffffffff
   1220 DATA masks<>+0xd4(SB)/4, $0xffffffff
   1221 DATA masks<>+0xd8(SB)/4, $0xffffffff
   1222 DATA masks<>+0xdc(SB)/4, $0x000000ff
   1223 
   1224 DATA masks<>+0xe0(SB)/4, $0xffffffff
   1225 DATA masks<>+0xe4(SB)/4, $0xffffffff
   1226 DATA masks<>+0xe8(SB)/4, $0xffffffff
   1227 DATA masks<>+0xec(SB)/4, $0x0000ffff
   1228 
   1229 DATA masks<>+0xf0(SB)/4, $0xffffffff
   1230 DATA masks<>+0xf4(SB)/4, $0xffffffff
   1231 DATA masks<>+0xf8(SB)/4, $0xffffffff
   1232 DATA masks<>+0xfc(SB)/4, $0x00ffffff
   1233 
   1234 GLOBL masks<>(SB),RODATA,$256
   1235 
   1236 // these are arguments to pshufb.  They move data down from
   1237 // the high bytes of the register to the low bytes of the register.
   1238 // index is how many bytes to move.
        // 16 entries of 16 bytes each: entry i (at offset 16*i) selects source
        // bytes 16-i .. 15 into destination bytes 0 .. i-1; every other control
        // byte is 0xff, which makes pshufb write zero to that destination byte.
        // NOTE(review): a PSHUFB memory operand must be 16-byte aligned; nothing
        // here requests that alignment explicitly - confirm the linker provides it.
   1239 DATA shifts<>+0x00(SB)/4, $0x00000000
   1240 DATA shifts<>+0x04(SB)/4, $0x00000000
   1241 DATA shifts<>+0x08(SB)/4, $0x00000000
   1242 DATA shifts<>+0x0c(SB)/4, $0x00000000
   1243 
   1244 DATA shifts<>+0x10(SB)/4, $0xffffff0f
   1245 DATA shifts<>+0x14(SB)/4, $0xffffffff
   1246 DATA shifts<>+0x18(SB)/4, $0xffffffff
   1247 DATA shifts<>+0x1c(SB)/4, $0xffffffff
   1248 
   1249 DATA shifts<>+0x20(SB)/4, $0xffff0f0e
   1250 DATA shifts<>+0x24(SB)/4, $0xffffffff
   1251 DATA shifts<>+0x28(SB)/4, $0xffffffff
   1252 DATA shifts<>+0x2c(SB)/4, $0xffffffff
   1253 
   1254 DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
   1255 DATA shifts<>+0x34(SB)/4, $0xffffffff
   1256 DATA shifts<>+0x38(SB)/4, $0xffffffff
   1257 DATA shifts<>+0x3c(SB)/4, $0xffffffff
   1258 
   1259 DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
   1260 DATA shifts<>+0x44(SB)/4, $0xffffffff
   1261 DATA shifts<>+0x48(SB)/4, $0xffffffff
   1262 DATA shifts<>+0x4c(SB)/4, $0xffffffff
   1263 
   1264 DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
   1265 DATA shifts<>+0x54(SB)/4, $0xffffff0f
   1266 DATA shifts<>+0x58(SB)/4, $0xffffffff
   1267 DATA shifts<>+0x5c(SB)/4, $0xffffffff
   1268 
   1269 DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
   1270 DATA shifts<>+0x64(SB)/4, $0xffff0f0e
   1271 DATA shifts<>+0x68(SB)/4, $0xffffffff
   1272 DATA shifts<>+0x6c(SB)/4, $0xffffffff
   1273 
   1274 DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
   1275 DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
   1276 DATA shifts<>+0x78(SB)/4, $0xffffffff
   1277 DATA shifts<>+0x7c(SB)/4, $0xffffffff
   1278 
   1279 DATA shifts<>+0x80(SB)/4, $0x0b0a0908
   1280 DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
   1281 DATA shifts<>+0x88(SB)/4, $0xffffffff
   1282 DATA shifts<>+0x8c(SB)/4, $0xffffffff
   1283 
   1284 DATA shifts<>+0x90(SB)/4, $0x0a090807
   1285 DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
   1286 DATA shifts<>+0x98(SB)/4, $0xffffff0f
   1287 DATA shifts<>+0x9c(SB)/4, $0xffffffff
   1288 
   1289 DATA shifts<>+0xa0(SB)/4, $0x09080706
   1290 DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
   1291 DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
   1292 DATA shifts<>+0xac(SB)/4, $0xffffffff
   1293 
   1294 DATA shifts<>+0xb0(SB)/4, $0x08070605
   1295 DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
   1296 DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
   1297 DATA shifts<>+0xbc(SB)/4, $0xffffffff
   1298 
   1299 DATA shifts<>+0xc0(SB)/4, $0x07060504
   1300 DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
   1301 DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
   1302 DATA shifts<>+0xcc(SB)/4, $0xffffffff
   1303 
   1304 DATA shifts<>+0xd0(SB)/4, $0x06050403
   1305 DATA shifts<>+0xd4(SB)/4, $0x0a090807
   1306 DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
   1307 DATA shifts<>+0xdc(SB)/4, $0xffffff0f
   1308 
   1309 DATA shifts<>+0xe0(SB)/4, $0x05040302
   1310 DATA shifts<>+0xe4(SB)/4, $0x09080706
   1311 DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
   1312 DATA shifts<>+0xec(SB)/4, $0xffff0f0e
   1313 
   1314 DATA shifts<>+0xf0(SB)/4, $0x04030201
   1315 DATA shifts<>+0xf4(SB)/4, $0x08070605
   1316 DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
   1317 DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
   1318 
   1319 GLOBL shifts<>(SB),RODATA,$256
   1320 
   1321 TEXT runtimememeq(SB),NOSPLIT,$0-13
        	// Reports whether the size bytes at a and b are equal.  Loads the
        	// registers memeqbody expects and tail-jumps to it; memeqbody writes
        	// the boolean result through AX.
   1322 	MOVL	a+0(FP), SI
   1323 	MOVL	b+4(FP), DI
   1324 	MOVL	size+8(FP), BX
   1325 	LEAL	ret+12(FP), AX	// address of the result byte
   1326 	JMP	runtimememeqbody(SB)
   1327 
   1328 // memequal_varlen(a, b unsafe.Pointer) bool
        // Like memeq, but the byte count is read from the closure addressed by
        // DX instead of being passed as an argument.
   1329 TEXT runtimememequal_varlen(SB),NOSPLIT,$0-9
   1330 	MOVL    a+0(FP), SI
   1331 	MOVL    b+4(FP), DI
   1332 	CMPL    SI, DI
   1333 	JEQ     eq	// same pointer: equal without looking at memory
   1334 	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
   1335 	LEAL	ret+8(FP), AX	// address of the result byte for memeqbody
   1336 	JMP	runtimememeqbody(SB)
   1337 eq:
   1338 	MOVB    $1, ret+8(FP)
   1339 	RET
   1340 
   1341 // eqstring tests whether two strings are equal.
   1342 // The compiler guarantees that strings passed
   1343 // to eqstring have equal length.
   1344 // See runtime_test.go:eqstring_generic for
   1345 // equivalent Go code.
        // Only s1's length is read, relying on the guarantee above.
   1346 TEXT runtimeeqstring(SB),NOSPLIT,$0-17
   1347 	MOVL	s1str+0(FP), SI
   1348 	MOVL	s2str+8(FP), DI
   1349 	CMPL	SI, DI
   1350 	JEQ	same	// same data pointer (and equal lengths): equal
   1351 	MOVL	s1len+4(FP), BX
   1352 	LEAL	v+16(FP), AX	// address of the result byte for memeqbody
   1353 	JMP	runtimememeqbody(SB)
   1354 same:
   1355 	MOVB	$1, v+16(FP)
   1356 	RET
   1357 
   1358 TEXT bytesEqual(SB),NOSPLIT,$0-25
        	// Reports whether two byte slices have the same length and contents.
        	// Different lengths return false immediately; otherwise memeqbody
        	// compares a_len bytes and writes the result through AX.
   1359 	MOVL	a_len+4(FP), BX
   1360 	MOVL	b_len+16(FP), CX
   1361 	CMPL	BX, CX
   1362 	JNE	eqret
   1363 	MOVL	a+0(FP), SI
   1364 	MOVL	b+12(FP), DI
   1365 	LEAL	ret+24(FP), AX
   1366 	JMP	runtimememeqbody(SB)
   1367 eqret:
        	// lengths differ
   1368 	MOVB	$0, ret+24(FP)
   1369 	RET
   1370 
   1371 // a in SI
   1372 // b in DI
   1373 // count in BX
   1374 // address of result byte in AX
        // Writes 1 to (AX) if the count bytes at a and b are equal, 0 otherwise.
        // Entered by JMP from memeq, memequal_varlen, eqstring and bytesEqual,
        // so it has no frame and no arguments of its own.
   1375 TEXT runtimememeqbody(SB),NOSPLIT,$0-0
   1376 	CMPL	BX, $4
   1377 	JB	small
   1378 
   1379 	// 64 bytes at a time using xmm registers
   1380 hugeloop:
   1381 	CMPL	BX, $64
   1382 	JB	bigloop
   1383 	TESTL	$0x4000000, runtimecpuid_edx(SB) // check for sse2
   1384 	JE	bigloop
   1385 	MOVOU	(SI), X0
   1386 	MOVOU	(DI), X1
   1387 	MOVOU	16(SI), X2
   1388 	MOVOU	16(DI), X3
   1389 	MOVOU	32(SI), X4
   1390 	MOVOU	32(DI), X5
   1391 	MOVOU	48(SI), X6
   1392 	MOVOU	48(DI), X7
   1393 	PCMPEQB	X1, X0
   1394 	PCMPEQB	X3, X2
   1395 	PCMPEQB	X5, X4
   1396 	PCMPEQB	X7, X6
   1397 	PAND	X2, X0
   1398 	PAND	X6, X4
   1399 	PAND	X4, X0	// X0 byte is 0xff only where all four 16-byte compares matched
   1400 	PMOVMSKB X0, DX
   1401 	ADDL	$64, SI
   1402 	ADDL	$64, DI
   1403 	SUBL	$64, BX
   1404 	CMPL	DX, $0xffff	// all 16 mask bits set: the whole 64-byte block was equal
   1405 	JEQ	hugeloop
   1406 	MOVB	$0, (AX)
   1407 	RET
   1408 
   1409 	// 4 bytes at a time using 32-bit register
   1410 bigloop:
   1411 	CMPL	BX, $4
   1412 	JBE	leftover
   1413 	MOVL	(SI), CX
   1414 	MOVL	(DI), DX
   1415 	ADDL	$4, SI
   1416 	ADDL	$4, DI
   1417 	SUBL	$4, BX
   1418 	CMPL	CX, DX
   1419 	JEQ	bigloop
   1420 	MOVB	$0, (AX)
   1421 	RET
   1422 
   1423 	// remaining 0-4 bytes
        	// Compare the 4 bytes that end at the last byte.  When fewer than 4
        	// remain this re-reads bytes already found equal, which is safe
        	// because this path is only reached with an original count >= 4.
   1424 leftover:
   1425 	MOVL	-4(SI)(BX*1), CX
   1426 	MOVL	-4(DI)(BX*1), DX
   1427 	CMPL	CX, DX
   1428 	SETEQ	(AX)
   1429 	RET
   1430 
   1431 small:
        	// count is 0-3 here
   1432 	CMPL	BX, $0
   1433 	JEQ	equal	// zero bytes: ZF is set by the CMPL, so SETEQ at equal stores 1
   1434 
        	// CX = -8*count.  Shift counts use only the low 5 bits, so shifting
        	// by CX shifts by 32-8*count bits.
   1435 	LEAL	0(BX*8), CX
   1436 	NEGL	CX
   1437 
        	// Low address byte <= 0xfc: a 4-byte load at SI stays inside the
        	// current 256-byte block, hence inside the page.
   1438 	MOVL	SI, DX
   1439 	CMPB	DX, $0xfc
   1440 	JA	si_high
   1441 
   1442 	// load at SI won't cross a page boundary.
   1443 	MOVL	(SI), SI
   1444 	JMP	si_finish
   1445 si_high:
   1446 	// address ends in 111111xx.  Load up to bytes we want, move to correct position.
   1447 	MOVL	-4(SI)(BX*1), SI
   1448 	SHRL	CX, SI
   1449 si_finish:
   1450 
   1451 	// same for DI.
   1452 	MOVL	DI, DX
   1453 	CMPB	DX, $0xfc
   1454 	JA	di_high
   1455 	MOVL	(DI), DI
   1456 	JMP	di_finish
   1457 di_high:
   1458 	MOVL	-4(DI)(BX*1), DI
   1459 	SHRL	CX, DI
   1460 di_finish:
   1461 
        	// The difference of the two words, shifted left to discard the bytes
        	// above count, is zero exactly when the low count bytes match.
   1462 	SUBL	SI, DI
   1463 	SHLL	CX, DI
   1464 equal:
   1465 	SETEQ	(AX)
   1466 	RET
   1467 
   1468 TEXT runtimecmpstring(SB),NOSPLIT,$0-20
        	// Three-way comparison of two strings.  Loads both (base, len) pairs
        	// and the address of the result word into the registers cmpbody
        	// expects, then tail-jumps to it.
   1469 	MOVL	s1_base+0(FP), SI
   1470 	MOVL	s1_len+4(FP), BX
   1471 	MOVL	s2_base+8(FP), DI
   1472 	MOVL	s2_len+12(FP), DX
   1473 	LEAL	ret+16(FP), AX
   1474 	JMP	runtimecmpbody(SB)
   1475 
   1476 TEXT bytesCompare(SB),NOSPLIT,$0-28
        	// Three-way comparison of two byte slices via cmpbody.
        	// Offsets 0/4 are the first slice's data pointer and length, 12/16
        	// the second's; the words at 8 and 20 are not read (presumably the
        	// slice capacities - TODO confirm against the Go declaration).
   1477 	MOVL	s1+0(FP), SI	// first slice: data pointer
   1478 	MOVL	s1+4(FP), BX	// first slice: length
   1479 	MOVL	s2+12(FP), DI	// second slice: data pointer
   1480 	MOVL	s2+16(FP), DX	// second slice: length
   1481 	LEAL	ret+24(FP), AX
   1482 	JMP	runtimecmpbody(SB)
   1483 
   1484 TEXT bytesIndexByte(SB),NOSPLIT,$0-20
        	// Returns the index of the first byte in s equal to c, or -1 if there
        	// is none.
        	// NOTE(review): when s_len is 0 the SCASB never executes and ZF is
        	// whatever the caller left behind.  Both branches then produce -1
        	// (on the "found" path DI == SI, so DI-SI-1 == -1), so the result is
        	// still correct.
   1485 	MOVL	s+0(FP), SI
   1486 	MOVL	s_len+4(FP), CX
   1487 	MOVB	c+12(FP), AL
   1488 	MOVL	SI, DI
   1489 	CLD; REPN; SCASB	// scan forward from DI, at most CX bytes, until one equals AL
   1490 	JZ 3(PC)	// ZF set: a match stopped the scan; skip the not-found return
   1491 	MOVL	$-1, ret+16(FP)
   1492 	RET
   1493 	SUBL	SI, DI	// DI is one past the matching byte
   1494 	SUBL	$1, DI
   1495 	MOVL	DI, ret+16(FP)
   1496 	RET
   1497 
   1498 TEXT stringsIndexByte(SB),NOSPLIT,$0-16
        	// Returns the index of the first byte in string s equal to c, or -1
        	// if there is none.
        	// NOTE(review): when s_len is 0 the SCASB never executes and ZF is
        	// whatever the caller left behind.  Both branches then produce -1
        	// (on the "found" path DI == SI, so DI-SI-1 == -1), so the result is
        	// still correct.
   1499 	MOVL	s+0(FP), SI
   1500 	MOVL	s_len+4(FP), CX
   1501 	MOVB	c+8(FP), AL
   1502 	MOVL	SI, DI
   1503 	CLD; REPN; SCASB	// scan forward from DI, at most CX bytes, until one equals AL
   1504 	JZ 3(PC)	// ZF set: a match stopped the scan; skip the not-found return
   1505 	MOVL	$-1, ret+12(FP)
   1506 	RET
   1507 	SUBL	SI, DI	// DI is one past the matching byte
   1508 	SUBL	$1, DI
   1509 	MOVL	DI, ret+12(FP)
   1510 	RET
   1511 
   1512 // input:
   1513 //   SI = a
   1514 //   DI = b
   1515 //   BX = alen
   1516 //   DX = blen
   1517 //   AX = address of return word (set to 1/0/-1)
        // Three-way comparison of a and b: +1 if a > b, 0 if equal, -1 if a < b.
        // The first min(alen, blen) bytes are compared as unsigned bytes; if they
        // all match, the lengths decide.  Entered by JMP from cmpstring and
        // bytesCompare, so it has no frame and no arguments of its own.
   1518 TEXT runtimecmpbody(SB),NOSPLIT,$0-0
   1519 	MOVL	DX, BP
   1520 	SUBL	BX, DX // DX = blen-alen
        	// Branch instead of CMOVLGT: CMOV is an optional CPU feature, and
        	// this function already treats SSE2 as optional, so it must not
        	// depend unconditionally on newer instructions.
   1521 	JLE	2(PC)
        	MOVL	BX, BP // BP = min(alen, blen)
   1522 	CMPL	SI, DI
   1523 	JEQ	allsame	// same pointer: only the lengths can differ
   1524 	CMPL	BP, $4
   1525 	JB	small
   1526 	TESTL	$0x4000000, runtimecpuid_edx(SB) // check for sse2
   1527 	JE	mediumloop
        	// 16 bytes at a time using xmm registers
   1528 largeloop:
   1529 	CMPL	BP, $16
   1530 	JB	mediumloop
   1531 	MOVOU	(SI), X0
   1532 	MOVOU	(DI), X1
   1533 	PCMPEQB X0, X1
   1534 	PMOVMSKB X1, BX
   1535 	XORL	$0xffff, BX	// convert EQ to NE
   1536 	JNE	diff16	// branch if at least one byte is not equal
   1537 	ADDL	$16, SI
   1538 	ADDL	$16, DI
   1539 	SUBL	$16, BP
   1540 	JMP	largeloop
   1541 
   1542 diff16:
   1543 	BSFL	BX, BX	// index of first byte that differs
   1544 	XORL	DX, DX
   1545 	MOVB	(SI)(BX*1), CX
   1546 	CMPB	CX, (DI)(BX*1)
   1547 	SETHI	DX	// 1 if a's byte is above b's (unsigned)
   1548 	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
   1549 	MOVL	DX, (AX)
   1550 	RET
   1551 
        	// 4 bytes at a time using 32-bit registers
   1552 mediumloop:
   1553 	CMPL	BP, $4
   1554 	JBE	_0through4
   1555 	MOVL	(SI), BX
   1556 	MOVL	(DI), CX
   1557 	CMPL	BX, CX
   1558 	JNE	diff4
   1559 	ADDL	$4, SI
   1560 	ADDL	$4, DI
   1561 	SUBL	$4, BP
   1562 	JMP	mediumloop
   1563 
        	// Compare the 4 bytes ending at the last common byte.  When fewer
        	// than 4 remain this re-reads bytes already found equal, which is
        	// safe because this path is only reached with at least 4 common bytes.
   1564 _0through4:
   1565 	MOVL	-4(SI)(BP*1), BX
   1566 	MOVL	-4(DI)(BP*1), CX
   1567 	CMPL	BX, CX
   1568 	JEQ	allsame
   1569 
   1570 diff4:
   1571 	BSWAPL	BX	// reverse order of bytes
   1572 	BSWAPL	CX
   1573 	XORL	BX, CX	// find bit differences
   1574 	BSRL	CX, CX	// index of highest bit difference
   1575 	SHRL	CX, BX	// move a's bit to bottom
   1576 	ANDL	$1, BX	// mask bit
   1577 	LEAL	-1(BX*2), BX // 1/0 => +1/-1
   1578 	MOVL	BX, (AX)
   1579 	RET
   1580 
   1581 	// 0-3 bytes in common
   1582 small:
        	// CX = -8*BP.  Shift counts use only the low 5 bits, so shifting by
        	// CX shifts by 32-8*BP bits.  NEGL sets ZF when BP is 0.
   1583 	LEAL	(BP*8), CX
   1584 	NEGL	CX
   1585 	JEQ	allsame
   1586 
   1587 	// load si
        	// Low address byte <= 0xfc: a 4-byte load stays inside the current
        	// 256-byte block.  Otherwise load the 4 bytes ending at the last
        	// wanted byte and shift them down.
   1588 	CMPB	SI, $0xfc
   1589 	JA	si_high
   1590 	MOVL	(SI), SI
   1591 	JMP	si_finish
   1592 si_high:
   1593 	MOVL	-4(SI)(BP*1), SI
   1594 	SHRL	CX, SI
   1595 si_finish:
   1596 	SHLL	CX, SI	// push the wanted bytes to the top, dropping the rest
   1597 
   1598 	// same for di
   1599 	CMPB	DI, $0xfc
   1600 	JA	di_high
   1601 	MOVL	(DI), DI
   1602 	JMP	di_finish
   1603 di_high:
   1604 	MOVL	-4(DI)(BP*1), DI
   1605 	SHRL	CX, DI
   1606 di_finish:
   1607 	SHLL	CX, DI
   1608 
   1609 	BSWAPL	SI	// reverse order of bytes
   1610 	BSWAPL	DI
   1611 	XORL	SI, DI	// find bit differences
   1612 	JEQ	allsame
   1613 	BSRL	DI, CX	// index of highest bit difference
   1614 	SHRL	CX, SI	// move a's bit to bottom
   1615 	ANDL	$1, SI	// mask bit
   1616 	LEAL	-1(SI*2), BX // 1/0 => +1/-1
   1617 	MOVL	BX, (AX)
   1618 	RET
   1619 
   1620 	// all the bytes in common are the same, so we just need
   1621 	// to compare the lengths.
   1622 allsame:
   1623 	XORL	BX, BX
   1624 	XORL	CX, CX
   1625 	TESTL	DX, DX	// DX = blen-alen
   1626 	SETLT	BX	// 1 if alen > blen
   1627 	SETEQ	CX	// 1 if alen == blen
   1628 	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
   1629 	MOVL	BX, (AX)
   1630 	RET
   1631 
   1632 TEXT runtimefastrand1(SB), NOSPLIT, $0-4
        	// Advances the per-m pseudo-random state m_fastrand and returns the
        	// new value: x += x; if the doubled value has its top bit set,
        	// x ^= 0x88888eef.
   1633 	get_tls(CX)
   1634 	MOVL	g(CX), AX
   1635 	MOVL	g_m(AX), AX
   1636 	MOVL	m_fastrand(AX), DX
   1637 	ADDL	DX, DX
   1638 	MOVL	DX, BX	// BX = doubled value without the xor
   1639 	XORL	$0x88888eef, DX
        	// The constant has its top bit set, so the xor result is negative
        	// exactly when the doubled value was not; in that case the xor must
        	// not apply and the plain doubled value is restored.
        	// A branch is used instead of CMOVLMI because CMOV is an optional
        	// CPU feature on 386-class processors.
   1640 	JPL	2(PC)
        	MOVL	BX, DX
   1641 	MOVL	DX, m_fastrand(AX)
   1642 	MOVL	DX, ret+0(FP)
   1643 	RET
   1644 
   1645 TEXT runtimereturn0(SB), NOSPLIT, $0
        	// Sets AX to 0 and returns.  The result is left in the register, not
        	// written to a stack result slot.
   1646 	MOVL	$0, AX
   1647 	RET
   1648 
   1649 // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
   1650 // Must obey the gcc calling convention.
        // The result is returned in AX; the instructions visible here write only
        // AX and CX (NOTE(review): assumes the get_tls macro touches nothing
        // besides its CX operand - confirm in go_tls.h).
   1651 TEXT _cgo_topofstack(SB),NOSPLIT,$0
   1652 	get_tls(CX)
   1653 	MOVL	g(CX), AX	// AX = g
   1654 	MOVL	g_m(AX), AX	// AX = g->m
   1655 	MOVL	m_curg(AX), AX	// AX = g->m->curg
   1656 	MOVL	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
   1657 	RET
   1658 
   1659 // The top-most function running on a goroutine
   1660 // returns to goexit+PCQuantum.
        // The leading NOP is the padding that such a return skips, so execution
        // resumes at the CALL (presumably PCQuantum is 1 byte on 386 - TODO confirm).
        // There is deliberately no RET: goexit1 does not return.
   1661 TEXT runtimegoexit(SB),NOSPLIT,$0-0
   1662 	BYTE	$0x90	// NOP
   1663 	CALL	runtimegoexit1(SB)	// does not return
   1664 	// traceback from goexit1 must hit code range of goexit
        	// (the trailing NOP keeps the CALL's return address inside this function)
   1665 	BYTE	$0x90	// NOP
   1666 
   1667 TEXT runtimeprefetcht0(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHT0 hint for the memory at addr.  Returns nothing.
   1668 	MOVL	addr+0(FP), AX
   1669 	PREFETCHT0	(AX)
   1670 	RET
   1671 
   1672 TEXT runtimeprefetcht1(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHT1 hint for the memory at addr.  Returns nothing.
   1673 	MOVL	addr+0(FP), AX
   1674 	PREFETCHT1	(AX)
   1675 	RET
   1676 
   1677 
   1678 TEXT runtimeprefetcht2(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHT2 hint for the memory at addr.  Returns nothing.
   1679 	MOVL	addr+0(FP), AX
   1680 	PREFETCHT2	(AX)
   1681 	RET
   1682 
   1683 TEXT runtimeprefetchnta(SB),NOSPLIT,$0-4
        	// Issues a PREFETCHNTA (non-temporal) hint for the memory at addr.
        	// Returns nothing.
   1684 	MOVL	addr+0(FP), AX
   1685 	PREFETCHNTA	(AX)
   1686 	RET
   1687