Home | History | Annotate | Download | only in runtime
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "go_asm.h"
      6 #include "go_tls.h"
      7 #include "funcdata.h"
      8 #include "textflag.h"
      9 
     10 // _rt0_386 is common startup code for most 386 systems when using
     11 // internal linking. This is the entry point for the program from the
     12 // kernel for an ordinary -buildmode=exe program. The stack holds the
     13 // number of arguments and the C-style argv.
     14 TEXT _rt0_386(SB),NOSPLIT,$8
     15 	MOVL	8(SP), AX	// argc
     16 	LEAL	12(SP), BX	// argv
     17 	MOVL	AX, 0(SP)
     18 	MOVL	BX, 4(SP)
     19 	JMP	runtimert0_go(SB)
     20 
      21 // _rt0_386_lib is common startup code for most 386 systems when
      22 // using -buildmode=c-archive or -buildmode=c-shared. The linker will
      23 // arrange to invoke this function as a global constructor (for
      24 // c-archive) or when the shared library is loaded (for c-shared).
      25 // We expect argc and argv to be passed on the stack following the
      26 // usual C ABI.
         //
         // Sequence: save BP/BX/SI/DI, record argc/argv in file-local globals,
         // run libpreinit synchronously, then start _rt0_386_lib_go on a new
         // thread (through _cgo_sys_thread_create when that pointer is non-zero,
         // otherwise through newosproc0), restore the saved registers and return.
      27 TEXT _rt0_386_lib(SB),NOSPLIT,$0
      28 	PUSHL	BP
      29 	MOVL	SP, BP	// BP = frame base used to reach the incoming arguments
      30 	PUSHL	BX
      31 	PUSHL	SI
      32 	PUSHL	DI
      33 
      34 	MOVL	8(BP), AX	// argc
      35 	MOVL	AX, _rt0_386_lib_argc<>(SB)
      36 	MOVL	12(BP), AX	// argv
      37 	MOVL	AX, _rt0_386_lib_argv<>(SB)
      38 
      39 	// Synchronous initialization.
      40 	CALL	runtimelibpreinit(SB)
      41 
      42 	SUBL	$8, SP	// room for two 4-byte outgoing arguments
      43 
      44 	// Create a new thread to do the runtime initialization.
      45 	MOVL	_cgo_sys_thread_create(SB), AX
      46 	TESTL	AX, AX
      47 	JZ	nocgo	// pointer is zero: fall back to newosproc0
      48 
      49 	// Align stack to call C function.
      50 	// We moved SP to BP above, but BP was clobbered by the libpreinit call.
      51 	MOVL	SP, BP	// remember the unaligned SP so it can be restored after the call
      52 	ANDL	$~15, SP
      53 
      54 	MOVL	$_rt0_386_lib_go(SB), BX
      55 	MOVL	BX, 0(SP)	// arg 0: address of _rt0_386_lib_go
      56 	MOVL	$0, 4(SP)	// arg 1: zero
      57 
      58 	CALL	AX
      59 
      60 	MOVL	BP, SP	// undo the alignment adjustment
      61 
      62 	JMP	restore
      63 
      64 nocgo:
      65 	MOVL	$0x800000, 0(SP)                    // stacksize = 8192KB
      66 	MOVL	$_rt0_386_lib_go(SB), AX
      67 	MOVL	AX, 4(SP)                           // fn
      68 	CALL	runtimenewosproc0(SB)
      69 
      70 restore:
      71 	ADDL	$8, SP	// release the two argument slots reserved above
      72 	POPL	DI
      73 	POPL	SI
      74 	POPL	BX
      75 	POPL	BP
      76 	RET
     77 
      78 // _rt0_386_lib_go initializes the Go runtime.
      79 // This is started in a separate thread by _rt0_386_lib.
         // It copies the argc/argv values saved by _rt0_386_lib into its 8-byte
         // frame and jumps to rt0_go, which reads them from 0(SP) and 4(SP).
      80 TEXT _rt0_386_lib_go(SB),NOSPLIT,$8
      81 	MOVL	_rt0_386_lib_argc<>(SB), AX
      82 	MOVL	AX, 0(SP)	// frame word 0 = argc
      83 	MOVL	_rt0_386_lib_argv<>(SB), AX
      84 	MOVL	AX, 4(SP)	// frame word 1 = argv
      85 	JMP	runtimert0_go(SB)
     86 
      87 DATA _rt0_386_lib_argc<>(SB)/4, $0	// argc stored by _rt0_386_lib; initial value 0
      88 GLOBL _rt0_386_lib_argc<>(SB),NOPTR, $4	// file-local (<>) 4-byte symbol
      89 DATA _rt0_386_lib_argv<>(SB)/4, $0	// argv stored by _rt0_386_lib; initial value 0
      90 GLOBL _rt0_386_lib_argv<>(SB),NOPTR, $4	// file-local (<>) 4-byte symbol
     91 
      92 TEXT runtimert0_go(SB),NOSPLIT,$0
         	// Common bootstrap. Entered by JMP with argc at 0(SP) and argv at 4(SP).
         	// Steps, in order: build an aligned scratch area, set provisional g0 stack
         	// bounds, probe CPU features via CPUID, set up TLS (through _cgo_init or
         	// ldt0setup), link g0 and m0, run check/args/osinit/schedinit, queue the
         	// function named by mainPC with newproc, and call mstart.
      93 	// Copy arguments forward on an even stack.
      94 	// Users of this function jump to it, they don't call it.
      95 	MOVL	0(SP), AX
      96 	MOVL	4(SP), BX
      97 	SUBL	$128, SP		// plenty of scratch
      98 	ANDL	$~15, SP
      99 	MOVL	AX, 120(SP)		// save argc, argv away
     100 	MOVL	BX, 124(SP)
     101 
     102 	// set default stack bounds.
     103 	// _cgo_init may update stackguard.
     104 	MOVL	$runtimeg0(SB), BP
     105 	LEAL	(-64*1024+104)(SP), BX	// provisional low bound: SP - 64KB + 104
     106 	MOVL	BX, g_stackguard0(BP)
     107 	MOVL	BX, g_stackguard1(BP)
     108 	MOVL	BX, (g_stack+stack_lo)(BP)
     109 	MOVL	SP, (g_stack+stack_hi)(BP)
     110 
     111 	// find out information about the processor we're on
     112 #ifdef GOOS_nacl // NaCl doesn't like PUSHFL/POPFL
     113 	JMP 	has_cpuid
     114 #else
     115 	// first see if CPUID instruction is supported.
     116 	PUSHFL
     117 	PUSHFL
     118 	XORL	$(1<<21), 0(SP) // flip ID bit
     119 	POPFL
     120 	PUSHFL
     121 	POPL	AX
     122 	XORL	0(SP), AX	// AX = bits that differ from the original EFLAGS
     123 	POPFL	// restore EFLAGS
     124 	TESTL	$(1<<21), AX	// did the ID bit actually change?
     125 	JNE 	has_cpuid
     126 #endif
     127 
     128 bad_proc: // show that the program requires MMX.
     129 	MOVL	$2, 0(SP)	// first argument to write: 2
     130 	MOVL	$bad_proc_msg<>(SB), 4(SP)
     131 	MOVL	$0x3d, 8(SP)	// size of bad_proc_msg
     132 	CALL	runtimewrite(SB)
     133 	MOVL	$1, 0(SP)	// exit code 1
     134 	CALL	runtimeexit(SB)
     135 	INT	$3
     136 
     137 has_cpuid:
     138 	MOVL	$0, AX
     139 	CPUID
     140 	MOVL	AX, SI	// SI = EAX from leaf 0; compared against 7 at eax7 below
     141 	CMPL	AX, $0
     142 	JE	nocpuinfo
     143 
     144 	// Figure out how to serialize RDTSC.
     145 	// On Intel processors LFENCE is enough. AMD requires MFENCE.
     146 	// Don't know about the rest, so let's do MFENCE.
     147 	CMPL	BX, $0x756E6547  // "Genu"
     148 	JNE	notintel
     149 	CMPL	DX, $0x49656E69  // "ineI"
     150 	JNE	notintel
     151 	CMPL	CX, $0x6C65746E  // "ntel"
     152 	JNE	notintel
     153 	MOVB	$1, runtimeisIntel(SB)
     154 	MOVB	$1, runtimelfenceBeforeRdtsc(SB)
     155 notintel:
     156 
     157 	// Load EAX=1 cpuid flags
     158 	MOVL	$1, AX
     159 	CPUID
     160 	MOVL	CX, DI // Move to global variable clobbers CX when generating PIC
     161 	MOVL	AX, runtimeprocessorVersionInfo(SB)
     162 
     163 	// Check for MMX support
     164 	TESTL	$(1<<23), DX // MMX
     165 	JZ	bad_proc
     166 
     167 	TESTL	$(1<<26), DX // SSE2
     168 	SETNE	runtimesupport_sse2(SB)
     169 
     170 	TESTL	$(1<<9), DI // SSSE3
     171 	SETNE	runtimesupport_ssse3(SB)
     172 
     173 	TESTL	$(1<<19), DI // SSE4.1
     174 	SETNE	runtimesupport_sse41(SB)
     175 
     176 	TESTL	$(1<<20), DI // SSE4.2
     177 	SETNE	runtimesupport_sse42(SB)
     178 
     179 	TESTL	$(1<<23), DI // POPCNT
     180 	SETNE	runtimesupport_popcnt(SB)
     181 
     182 	TESTL	$(1<<25), DI // AES
     183 	SETNE	runtimesupport_aes(SB)
     184 
     185 	TESTL	$(1<<27), DI // OSXSAVE
     186 	SETNE	runtimesupport_osxsave(SB)
     187 
     188 	// If OS support for XMM and YMM is not present
     189 	// support_avx will be set back to false later.
     190 	TESTL	$(1<<28), DI // AVX
     191 	SETNE	runtimesupport_avx(SB)
     192 
     193 eax7:
     194 	// Load EAX=7/ECX=0 cpuid flags
     195 	CMPL	SI, $7
     196 	JLT	osavx	// leaf 7 not reported: skip the leaf-7 feature bits
     197 	MOVL	$7, AX
     198 	MOVL	$0, CX
     199 	CPUID
     200 
     201 	TESTL	$(1<<3), BX // BMI1
     202 	SETNE	runtimesupport_bmi1(SB)
     203 
     204 	// If OS support for XMM and YMM is not present
     205 	// support_avx2 will be set back to false later.
     206 	TESTL	$(1<<5), BX	// AVX2 (result stored in support_avx2)
     207 	SETNE	runtimesupport_avx2(SB)
     208 
     209 	TESTL	$(1<<8), BX // BMI2
     210 	SETNE	runtimesupport_bmi2(SB)
     211 
     212 	TESTL	$(1<<9), BX // ERMS
     213 	SETNE	runtimesupport_erms(SB)
     214 
     215 osavx:
     216 	// nacl does not support XGETBV to test
     217 	// for XMM and YMM OS support.
     218 #ifndef GOOS_nacl
     219 	CMPB	runtimesupport_osxsave(SB), $1
     220 	JNE	noavx
     221 	MOVL	$0, CX
     222 	// For XGETBV, OSXSAVE bit is required and sufficient
     223 	XGETBV
     224 	ANDL	$6, AX
     225 	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
     226 	JE nocpuinfo	// both bits set: keep support_avx/support_avx2 as probed
     227 #endif
     228 noavx:
     229 	MOVB $0, runtimesupport_avx(SB)
     230 	MOVB $0, runtimesupport_avx2(SB)
     231 
     232 nocpuinfo:
     233 	// if there is an _cgo_init, call it to let it
     234 	// initialize and to set up GS.  if not,
     235 	// we set up GS ourselves.
     236 	MOVL	_cgo_init(SB), AX
     237 	TESTL	AX, AX
     238 	JZ	needtls
     239 	MOVL	$setg_gcc<>(SB), BX
     240 	MOVL	BX, 4(SP)	// arg 1: address of setg_gcc
     241 	MOVL	BP, 0(SP)	// arg 0: BP, loaded with $g0 above
     242 	CALL	AX
     243 
     244 	// update stackguard after _cgo_init
     245 	MOVL	$runtimeg0(SB), CX
     246 	MOVL	(g_stack+stack_lo)(CX), AX
     247 	ADDL	$const__StackGuard, AX
     248 	MOVL	AX, g_stackguard0(CX)
     249 	MOVL	AX, g_stackguard1(CX)
     250 
     251 #ifndef GOOS_windows
     252 	// skip runtimeldt0setup(SB) and tls test after _cgo_init for non-windows
     253 	JMP ok
     254 #endif
     255 needtls:
     256 #ifdef GOOS_plan9
     257 	// skip runtimeldt0setup(SB) and tls test on Plan 9 in all cases
     258 	JMP	ok
     259 #endif
     260 
     261 	// set up %gs
     262 	CALL	runtimeldt0setup(SB)
     263 
     264 	// store through it, to make sure it works
     265 	get_tls(BX)
     266 	MOVL	$0x123, g(BX)
     267 	MOVL	runtimem0+m_tls(SB), AX	// read the same slot back directly from m0.tls
     268 	CMPL	AX, $0x123
     269 	JEQ	ok
     270 	MOVL	AX, 0	// abort
     271 ok:
     272 	// set up m and g "registers"
     273 	get_tls(BX)
     274 	LEAL	runtimeg0(SB), DX
     275 	MOVL	DX, g(BX)
     276 	LEAL	runtimem0(SB), AX
     277 
     278 	// save m->g0 = g0
     279 	MOVL	DX, m_g0(AX)
     280 	// save g0->m = m0
     281 	MOVL	AX, g_m(DX)
     282 
     283 	CALL	runtimeemptyfunc(SB)	// fault if stack check is wrong
     284 
     285 	// convention is D is always cleared
     286 	CLD
     287 
     288 	CALL	runtimecheck(SB)
     289 
     290 	// saved argc, argv
     291 	MOVL	120(SP), AX
     292 	MOVL	AX, 0(SP)
     293 	MOVL	124(SP), AX
     294 	MOVL	AX, 4(SP)
     295 	CALL	runtimeargs(SB)
     296 	CALL	runtimeosinit(SB)
     297 	CALL	runtimeschedinit(SB)
     298 
     299 	// create a new goroutine to start program
     300 	PUSHL	$runtimemainPC(SB)	// entry
     301 	PUSHL	$0	// arg size
     302 	CALL	runtimenewproc(SB)
     303 	POPL	AX	// discard the two pushed arguments
     304 	POPL	AX
     305 
     306 	// start this M
     307 	CALL	runtimemstart(SB)
     308 
     309 	INT $3	// trap if mstart returns
     310 	RET
    311 
     312 DATA	bad_proc_msg<>+0x00(SB)/8, $"This pro"	// message passed to write by rt0_go's bad_proc path
     313 DATA	bad_proc_msg<>+0x08(SB)/8, $"gram can"
     314 DATA	bad_proc_msg<>+0x10(SB)/8, $" only be"
     315 DATA	bad_proc_msg<>+0x18(SB)/8, $" run on "
     316 DATA	bad_proc_msg<>+0x20(SB)/8, $"processo"
     317 DATA	bad_proc_msg<>+0x28(SB)/8, $"rs with "
     318 DATA	bad_proc_msg<>+0x30(SB)/8, $"MMX supp"
     319 DATA	bad_proc_msg<>+0x38(SB)/4, $"ort."
     320 DATA	bad_proc_msg<>+0x3c(SB)/1, $0xa	// trailing newline
     321 GLOBL	bad_proc_msg<>(SB), RODATA, $0x3d	// 0x3d = 61 bytes in total, read-only, file-local
    322 
     323 DATA	runtimemainPC+0(SB)/4,$runtimemain(SB)	// 4-byte word holding the address of runtimemain
     324 GLOBL	runtimemainPC(SB),RODATA,$4	// read-only; rt0_go pushes $runtimemainPC as newproc's entry
    325 
     326 TEXT runtimebreakpoint(SB),NOSPLIT,$0-0	// raise a breakpoint trap, then return to the caller
     327 	INT $3
     328 	RET
    329 
     330 TEXT runtimeasminit(SB),NOSPLIT,$0-0	// set the x87 FPU control word; no arguments, no results
     331 	// Linux and MinGW start the FPU in extended double precision.
     332 	// Other operating systems use double precision.
     333 	// Change to double precision to match them,
     334 	// and to match other hardware that only has double.
     335 	FLDCW	runtimecontrolWord64(SB)	// load the control word stored in controlWord64
     336 	RET
    337 
    338 /*
    339  *  go-routine
    340  */
    341 
    342 // void gosave(Gobuf*)
    343 // save state in Gobuf; setjmp
    344 TEXT runtimegosave(SB), NOSPLIT, $0-4
    345 	MOVL	buf+0(FP), AX		// gobuf
    346 	LEAL	buf+0(FP), BX		// caller's SP
    347 	MOVL	BX, gobuf_sp(AX)
    348 	MOVL	0(SP), BX		// caller's PC
    349 	MOVL	BX, gobuf_pc(AX)
    350 	MOVL	$0, gobuf_ret(AX)
    351 	// Assert ctxt is zero. See func save.
    352 	MOVL	gobuf_ctxt(AX), BX
    353 	TESTL	BX, BX
    354 	JZ	2(PC)
    355 	CALL	runtimebadctxt(SB)
    356 	get_tls(CX)
    357 	MOVL	g(CX), BX
    358 	MOVL	BX, gobuf_g(AX)
    359 	RET
    360 
     361 // void gogo(Gobuf*)
     362 // restore state from Gobuf; longjmp
         // Installs buf->g as the current g, loads SP, AX (ret) and DX (ctxt) from
         // the Gobuf, zeroes those three Gobuf fields, then jumps to buf->pc.
     363 TEXT runtimegogo(SB), NOSPLIT, $8-4
     364 	MOVL	buf+0(FP), BX		// gobuf
     365 	MOVL	gobuf_g(BX), DX
     366 	MOVL	0(DX), CX		// make sure g != nil
     367 	get_tls(CX)
     368 	MOVL	DX, g(CX)		// current g = buf->g
     369 	MOVL	gobuf_sp(BX), SP	// restore SP
     370 	MOVL	gobuf_ret(BX), AX
     371 	MOVL	gobuf_ctxt(BX), DX
     372 	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
     373 	MOVL	$0, gobuf_ret(BX)
     374 	MOVL	$0, gobuf_ctxt(BX)
     375 	MOVL	gobuf_pc(BX), BX	// BX is reused for the target PC
     376 	JMP	BX
    377 
    378 // func mcall(fn func(*g))
    379 // Switch to m->g0's stack, call fn(g).
    380 // Fn must never return. It should gogo(&g->sched)
    381 // to keep running g.
    382 TEXT runtimemcall(SB), NOSPLIT, $0-4
    383 	MOVL	fn+0(FP), DI
    384 
    385 	get_tls(DX)
    386 	MOVL	g(DX), AX	// save state in g->sched
    387 	MOVL	0(SP), BX	// caller's PC
    388 	MOVL	BX, (g_sched+gobuf_pc)(AX)
    389 	LEAL	fn+0(FP), BX	// caller's SP
    390 	MOVL	BX, (g_sched+gobuf_sp)(AX)
    391 	MOVL	AX, (g_sched+gobuf_g)(AX)
    392 
    393 	// switch to m->g0 & its stack, call fn
    394 	MOVL	g(DX), BX
    395 	MOVL	g_m(BX), BX
    396 	MOVL	m_g0(BX), SI
    397 	CMPL	SI, AX	// if g == m->g0 call badmcall
    398 	JNE	3(PC)
    399 	MOVL	$runtimebadmcall(SB), AX
    400 	JMP	AX
    401 	MOVL	SI, g(DX)	// g = m->g0
    402 	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
    403 	PUSHL	AX
    404 	MOVL	DI, DX
    405 	MOVL	0(DI), DI
    406 	CALL	DI
    407 	POPL	AX
    408 	MOVL	$runtimebadmcall2(SB), AX
    409 	JMP	AX
    410 	RET
    411 
     412 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
     413 // of the G stack. We need to distinguish the routine that
     414 // lives at the bottom of the G stack from the one that lives
     415 // at the top of the system stack because the one at the top of
     416 // the system stack terminates the stack walk (see topofstack()).
         // Only its address is used: systemstack stores it as g->sched.pc.
     417 TEXT runtimesystemstack_switch(SB), NOSPLIT, $0-0
     418 	RET
    419 
    420 // func systemstack(fn func())
    421 TEXT runtimesystemstack(SB), NOSPLIT, $0-4
    422 	MOVL	fn+0(FP), DI	// DI = fn
    423 	get_tls(CX)
    424 	MOVL	g(CX), AX	// AX = g
    425 	MOVL	g_m(AX), BX	// BX = m
    426 
    427 	MOVL	m_gsignal(BX), DX	// DX = gsignal
    428 	CMPL	AX, DX
    429 	JEQ	noswitch
    430 
    431 	MOVL	m_g0(BX), DX	// DX = g0
    432 	CMPL	AX, DX
    433 	JEQ	noswitch
    434 
    435 	MOVL	m_curg(BX), BP
    436 	CMPL	AX, BP
    437 	JEQ	switch
    438 
    439 	// Bad: g is not gsignal, not g0, not curg. What is it?
    440 	// Hide call from linker nosplit analysis.
    441 	MOVL	$runtimebadsystemstack(SB), AX
    442 	CALL	AX
    443 
    444 switch:
    445 	// save our state in g->sched. Pretend to
    446 	// be systemstack_switch if the G stack is scanned.
    447 	MOVL	$runtimesystemstack_switch(SB), (g_sched+gobuf_pc)(AX)
    448 	MOVL	SP, (g_sched+gobuf_sp)(AX)
    449 	MOVL	AX, (g_sched+gobuf_g)(AX)
    450 
    451 	// switch to g0
    452 	get_tls(CX)
    453 	MOVL	DX, g(CX)
    454 	MOVL	(g_sched+gobuf_sp)(DX), BX
    455 	// make it look like mstart called systemstack on g0, to stop traceback
    456 	SUBL	$4, BX
    457 	MOVL	$runtimemstart(SB), DX
    458 	MOVL	DX, 0(BX)
    459 	MOVL	BX, SP
    460 
    461 	// call target function
    462 	MOVL	DI, DX
    463 	MOVL	0(DI), DI
    464 	CALL	DI
    465 
    466 	// switch back to g
    467 	get_tls(CX)
    468 	MOVL	g(CX), AX
    469 	MOVL	g_m(AX), BX
    470 	MOVL	m_curg(BX), AX
    471 	MOVL	AX, g(CX)
    472 	MOVL	(g_sched+gobuf_sp)(AX), SP
    473 	MOVL	$0, (g_sched+gobuf_sp)(AX)
    474 	RET
    475 
    476 noswitch:
    477 	// already on system stack; tail call the function
    478 	// Using a tail call here cleans up tracebacks since we won't stop
    479 	// at an intermediate systemstack.
    480 	MOVL	DI, DX
    481 	MOVL	0(DI), DI
    482 	JMP	DI
    483 
    484 /*
    485  * support for morestack
    486  */
    487 
     488 // Called during function prolog when more stack is needed.
     489 //
     490 // The traceback routines see morestack on a g0 as being
     491 // the top of a stack (for example, morestack calling newstack
     492 // calling the scheduler calling newm calling gc), so we must
     493 // record an argument size. For that purpose, it has no arguments.
         //
         // On entry 0(SP) is the return address into f (the function whose prolog
         // called morestack), 4(SP) is f's own return address, and DX is stored as
         // g->sched.ctxt.
     494 TEXT runtimemorestack(SB),NOSPLIT,$0-0
     495 	// Cannot grow scheduler stack (m->g0).
     496 	get_tls(CX)
     497 	MOVL	g(CX), BX
     498 	MOVL	g_m(BX), BX	// BX = m for the rest of the function
     499 	MOVL	m_g0(BX), SI
     500 	CMPL	g(CX), SI
     501 	JNE	3(PC)	// not on g0: skip the CALL and INT below
     502 	CALL	runtimebadmorestackg0(SB)
     503 	INT	$3
     504 
     505 	// Cannot grow signal stack.
     506 	MOVL	m_gsignal(BX), SI
     507 	CMPL	g(CX), SI
     508 	JNE	3(PC)	// not on gsignal: skip the CALL and INT below
     509 	CALL	runtimebadmorestackgsignal(SB)
     510 	INT	$3
     511 
     512 	// Called from f.
     513 	// Set m->morebuf to f's caller.
     514 	MOVL	4(SP), DI	// f's caller's PC
     515 	MOVL	DI, (m_morebuf+gobuf_pc)(BX)
     516 	LEAL	8(SP), CX	// f's caller's SP
     517 	MOVL	CX, (m_morebuf+gobuf_sp)(BX)
     518 	get_tls(CX)
     519 	MOVL	g(CX), SI
     520 	MOVL	SI, (m_morebuf+gobuf_g)(BX)
     521 
     522 	// Set g->sched to context in f.
     523 	MOVL	0(SP), AX	// f's PC
     524 	MOVL	AX, (g_sched+gobuf_pc)(SI)
     525 	MOVL	SI, (g_sched+gobuf_g)(SI)
     526 	LEAL	4(SP), AX	// f's SP
     527 	MOVL	AX, (g_sched+gobuf_sp)(SI)
     528 	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
     529 
     530 	// Call newstack on m->g0's stack.
     531 	MOVL	m_g0(BX), BP
     532 	MOVL	BP, g(CX)	// current g = m->g0
     533 	MOVL	(g_sched+gobuf_sp)(BP), AX
     534 	MOVL	-4(AX), BX	// fault if CALL would, before smashing SP
     535 	MOVL	AX, SP
     536 	CALL	runtimenewstack(SB)
     537 	MOVL	$0, 0x1003	// crash if newstack returns
     538 	RET
    539 
     540 TEXT runtimemorestack_noctxt(SB),NOSPLIT,$0-0	// morestack entry that passes a zero context
     541 	MOVL	$0, DX	// morestack stores DX into g->sched.ctxt
     542 	JMP runtimemorestack(SB)
    543 
     544 // reflectcall: call a function with the given argument list
     545 // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
     546 // we don't have variable-sized frames, so we use a small number
     547 // of constant-sized-frame functions to encode a few bits of size in the pc.
     548 // Caution: ugly multiline assembly macros in your future!
     549 
         // DISPATCH(NAME, MAXSIZE): if CX <= MAXSIZE (unsigned compare), jump to NAME
         // through AX; otherwise skip those two instructions and fall through to
         // whatever follows the macro. Clobbers AX when the jump is taken.
     550 #define DISPATCH(NAME,MAXSIZE)		\
     551 	CMPL	CX, $MAXSIZE;		\
     552 	JA	3(PC);			\
     553 	MOVL	$NAME(SB), AX;		\
     554 	JMP	AX
     555 // Note: can't just "JMP NAME(SB)" - bad inlining results.
    556 
    557 TEXT reflectcall(SB), NOSPLIT, $0-0
    558 	JMP	reflectcall(SB)
    559 
     560 TEXT reflectcall(SB), NOSPLIT, $0-20
         	// Size dispatcher: loads argsize into CX and jumps to the first callNN
         	// target whose frame size is >= argsize (sizes double from 16 up to
         	// 1073741824). Larger sizes go to badreflectcall.
         	// NOTE(review): the targets are spelled runtimecallNN here while the
         	// CALLFN instantiations in this file define callNN; this looks like a
         	// lost package separator in the symbol names. Confirm against upstream.
     561 	MOVL	argsize+12(FP), CX
     562 	DISPATCH(runtimecall16, 16)
     563 	DISPATCH(runtimecall32, 32)
     564 	DISPATCH(runtimecall64, 64)
     565 	DISPATCH(runtimecall128, 128)
     566 	DISPATCH(runtimecall256, 256)
     567 	DISPATCH(runtimecall512, 512)
     568 	DISPATCH(runtimecall1024, 1024)
     569 	DISPATCH(runtimecall2048, 2048)
     570 	DISPATCH(runtimecall4096, 4096)
     571 	DISPATCH(runtimecall8192, 8192)
     572 	DISPATCH(runtimecall16384, 16384)
     573 	DISPATCH(runtimecall32768, 32768)
     574 	DISPATCH(runtimecall65536, 65536)
     575 	DISPATCH(runtimecall131072, 131072)
     576 	DISPATCH(runtimecall262144, 262144)
     577 	DISPATCH(runtimecall524288, 524288)
     578 	DISPATCH(runtimecall1048576, 1048576)
     579 	DISPATCH(runtimecall2097152, 2097152)
     580 	DISPATCH(runtimecall4194304, 4194304)
     581 	DISPATCH(runtimecall8388608, 8388608)
     582 	DISPATCH(runtimecall16777216, 16777216)
     583 	DISPATCH(runtimecall33554432, 33554432)
     584 	DISPATCH(runtimecall67108864, 67108864)
     585 	DISPATCH(runtimecall134217728, 134217728)
     586 	DISPATCH(runtimecall268435456, 268435456)
     587 	DISPATCH(runtimecall536870912, 536870912)
     588 	DISPATCH(runtimecall1073741824, 1073741824)
     589 	MOVL	$runtimebadreflectcall(SB), AX	// argsize exceeds every frame size above
     590 	JMP	AX
    591 
     592 #define CALLFN(NAME,MAXSIZE)			\
     593 TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
     594 	NO_LOCAL_POINTERS;			\
     595 	/* copy argsize bytes from argptr into this frame, starting at SP */		\
     596 	MOVL	argptr+8(FP), SI;		\
     597 	MOVL	argsize+12(FP), CX;		\
     598 	MOVL	SP, DI;				\
     599 	REP;MOVSB;				\
     600 	/* call the code pointer stored at f, with DX = f */			\
     601 	MOVL	f+4(FP), DX;			\
     602 	MOVL	(DX), AX; 			\
     603 	PCDATA  $PCDATA_StackMapIndex, $0;	\
     604 	CALL	AX;				\
     605 	/* pass callRet the bytes past retoffset: DX=argtype, DI=dst in argptr, SI=src in frame, CX=argsize-retoffset */		\
     606 	MOVL	argtype+0(FP), DX;		\
     607 	MOVL	argptr+8(FP), DI;		\
     608 	MOVL	argsize+12(FP), CX;		\
     609 	MOVL	retoffset+16(FP), BX;		\
     610 	MOVL	SP, SI;				\
     611 	ADDL	BX, DI;				\
     612 	ADDL	BX, SI;				\
     613 	SUBL	BX, CX;				\
     614 	CALL	callRet<>(SB);			\
     615 	RET
    616 
     617 // callRet copies return values back at the end of call*. This is a
     618 // separate function so it can allocate stack space for the arguments
     619 // to reflectcallmove. It does not follow the Go ABI; it expects its
     620 // arguments in registers.
         // Register inputs, in the order they are passed on: DX, DI, SI, CX
         // (set by CALLFN to argtype, destination, source and byte count).
     621 TEXT callRet<>(SB), NOSPLIT, $16-0
     622 	MOVL	DX, 0(SP)	// arg 0
     623 	MOVL	DI, 4(SP)	// arg 1
     624 	MOVL	SI, 8(SP)	// arg 2
     625 	MOVL	CX, 12(SP)	// arg 3
     626 	CALL	runtimereflectcallmove(SB)
     627 	RET
    628 
         // One CALLFN instance per frame size, doubling from 16 bytes to 1073741824
         // bytes; the sizes match the DISPATCH list in reflectcall.
     629 CALLFN(call16, 16)
     630 CALLFN(call32, 32)
     631 CALLFN(call64, 64)
     632 CALLFN(call128, 128)
     633 CALLFN(call256, 256)
     634 CALLFN(call512, 512)
     635 CALLFN(call1024, 1024)
     636 CALLFN(call2048, 2048)
     637 CALLFN(call4096, 4096)
     638 CALLFN(call8192, 8192)
     639 CALLFN(call16384, 16384)
     640 CALLFN(call32768, 32768)
     641 CALLFN(call65536, 65536)
     642 CALLFN(call131072, 131072)
     643 CALLFN(call262144, 262144)
     644 CALLFN(call524288, 524288)
     645 CALLFN(call1048576, 1048576)
     646 CALLFN(call2097152, 2097152)
     647 CALLFN(call4194304, 4194304)
     648 CALLFN(call8388608, 8388608)
     649 CALLFN(call16777216, 16777216)
     650 CALLFN(call33554432, 33554432)
     651 CALLFN(call67108864, 67108864)
     652 CALLFN(call134217728, 134217728)
     653 CALLFN(call268435456, 268435456)
     654 CALLFN(call536870912, 536870912)
     655 CALLFN(call1073741824, 1073741824)
    656 
    657 TEXT runtimeprocyield(SB),NOSPLIT,$0-0
    658 	MOVL	cycles+0(FP), AX
    659 again:
    660 	PAUSE
    661 	SUBL	$1, AX
    662 	JNZ	again
    663 	RET
    664 
     665 TEXT publicationBarrier(SB),NOSPLIT,$0-0	// no arguments, no results; the body is a bare RET
     666 	// Stores are already ordered on x86, so this is just a
     667 	// compile barrier.
     668 	RET
    669 
     670 // void jmpdefer(fn, sp);
     671 // called from deferreturn.
     672 // 1. pop the caller
     673 // 2. sub 5 bytes (the length of CALL & a 32 bit displacement) from the callers
     674 //    return (when building for shared libraries, subtract 16 bytes -- 5 bytes
     675 //    for CALL & displacement to call __x86.get_pc_thunk.cx, 6 bytes for the
     676 //    LEAL to load the offset into BX, and finally 5 for the call & displacement)
     677 // 3. jmp to the argument
         // On the jump, DX holds fv and BX holds the code pointer loaded from 0(fv).
     678 TEXT runtimejmpdefer(SB), NOSPLIT, $0-8
     679 	MOVL	fv+0(FP), DX	// fn
     680 	MOVL	argp+4(FP), BX	// caller sp
     681 	LEAL	-4(BX), SP	// caller sp after CALL
     682 #ifdef GOBUILDMODE_shared
     683 	SUBL	$16, (SP)	// return to CALL again
     684 #else
     685 	SUBL	$5, (SP)	// return to CALL again
     686 #endif
     687 	MOVL	0(DX), BX	// BX = code pointer stored at fv
     688 	JMP	BX	// but first run the deferred function
    689 
    690 // Save state of caller into g->sched.
    691 TEXT gosave<>(SB),NOSPLIT,$0
    692 	PUSHL	AX
    693 	PUSHL	BX
    694 	get_tls(BX)
    695 	MOVL	g(BX), BX
    696 	LEAL	arg+0(FP), AX
    697 	MOVL	AX, (g_sched+gobuf_sp)(BX)
    698 	MOVL	-4(AX), AX
    699 	MOVL	AX, (g_sched+gobuf_pc)(BX)
    700 	MOVL	$0, (g_sched+gobuf_ret)(BX)
    701 	// Assert ctxt is zero. See func save.
    702 	MOVL	(g_sched+gobuf_ctxt)(BX), AX
    703 	TESTL	AX, AX
    704 	JZ	2(PC)
    705 	CALL	runtimebadctxt(SB)
    706 	POPL	BX
    707 	POPL	AX
    708 	RET
    709 
     710 // func asmcgocall(fn, arg unsafe.Pointer) int32
     711 // Call fn(arg) on the scheduler stack,
     712 // aligned appropriately for the gcc ABI.
     713 // See cgocall.go for more details.
         // The value fn leaves in AX is stored as the int32 result.
     714 TEXT asmcgocall(SB),NOSPLIT,$0-12
     715 	MOVL	fn+0(FP), AX
     716 	MOVL	arg+4(FP), BX
     717 
     718 	MOVL	SP, DX	// DX = SP on entry, used below to compute the stack depth
     719 
     720 	// Figure out if we need to switch to m->g0 stack.
     721 	// We get called to create new OS threads too, and those
     722 	// come in on the m->g0 stack already.
     723 	get_tls(CX)
     724 	MOVL	g(CX), BP
     725 	MOVL	g_m(BP), BP
     726 	MOVL	m_g0(BP), SI	// SI = m->g0
     727 	MOVL	g(CX), DI	// DI = current g
     728 	CMPL	SI, DI
     729 	JEQ	noswitch
     730 	CALL	gosave<>(SB)
     731 	get_tls(CX)
     732 	MOVL	SI, g(CX)	// current g = m->g0
     733 	MOVL	(g_sched+gobuf_sp)(SI), SP
     734 
     735 noswitch:
     736 	// Now on a scheduling stack (a pthread-created stack).
     737 	SUBL	$32, SP
     738 	ANDL	$~15, SP	// alignment, perhaps unnecessary
     739 	MOVL	DI, 8(SP)	// save g
     740 	MOVL	(g_stack+stack_hi)(DI), DI
     741 	SUBL	DX, DI	// DI = g->stack.hi - entry SP
     742 	MOVL	DI, 4(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
     743 	MOVL	BX, 0(SP)	// first argument in x86-32 ABI
     744 	CALL	AX
     745 
     746 	// Restore registers, g, stack pointer.
     747 	get_tls(CX)
     748 	MOVL	8(SP), DI	// DI = saved g
     749 	MOVL	(g_stack+stack_hi)(DI), SI
     750 	SUBL	4(SP), SI	// SI = g->stack.hi - saved depth = original SP
     751 	MOVL	DI, g(CX)
     752 	MOVL	SI, SP
     753 
     754 	MOVL	AX, ret+8(FP)
     755 	RET
    756 
    757 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
    758 // Turn the fn into a Go func (by taking its address) and call
    759 // cgocallback_gofunc.
    760 TEXT runtimecgocallback(SB),NOSPLIT,$16-16
    761 	LEAL	fn+0(FP), AX
    762 	MOVL	AX, 0(SP)
    763 	MOVL	frame+4(FP), AX
    764 	MOVL	AX, 4(SP)
    765 	MOVL	framesize+8(FP), AX
    766 	MOVL	AX, 8(SP)
    767 	MOVL	ctxt+12(FP), AX
    768 	MOVL	AX, 12(SP)
    769 	MOVL	$runtimecgocallback_gofunc(SB), AX
    770 	CALL	AX
    771 	RET
    772 
     773 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
     774 // See cgocall.go for more details.
         // Outline: obtain an m if the thread has no g (needm), save g0's sched.sp,
         // switch to m->curg's stack, call cgocallbackg, restore curg's and g0's
         // saved state, and release a borrowed m with dropm. DX carries the m seen
         // on entry (zero when one had to be borrowed).
     775 TEXT cgocallback_gofunc(SB),NOSPLIT,$12-16
     776 	NO_LOCAL_POINTERS
     777 
     778 	// If g is nil, Go did not create the current thread.
     779 	// Call needm to obtain one for temporary use.
     780 	// In this case, we're running on the thread stack, so there's
     781 	// lots of space, but the linker doesn't know. Hide the call from
     782 	// the linker analysis by using an indirect call through AX.
     783 	get_tls(CX)
     784 #ifdef GOOS_windows
     785 	MOVL	$0, BP
     786 	CMPL	CX, $0
     787 	JEQ	2(PC) // TODO
     788 #endif
     789 	MOVL	g(CX), BP
     790 	CMPL	BP, $0
     791 	JEQ	needm
     792 	MOVL	g_m(BP), BP
     793 	MOVL	BP, DX // saved copy of oldm
     794 	JMP	havem
     795 needm:
     796 	MOVL	$0, 0(SP)
     797 	MOVL	$runtimeneedm(SB), AX
     798 	CALL	AX
     799 	MOVL	0(SP), DX	// DX = word at 0(SP) after needm (zeroed just before the call)
     800 	get_tls(CX)
     801 	MOVL	g(CX), BP
     802 	MOVL	g_m(BP), BP
     803 
     804 	// Set m->sched.sp = SP, so that if a panic happens
     805 	// during the function we are about to execute, it will
     806 	// have a valid SP to run on the g0 stack.
     807 	// The next few lines (after the havem label)
     808 	// will save this SP onto the stack and then write
     809 	// the same SP back to m->sched.sp. That seems redundant,
     810 	// but if an unrecovered panic happens, unwindm will
     811 	// restore the g->sched.sp from the stack location
     812 	// and then systemstack will try to use it. If we don't set it here,
     813 	// that restored SP will be uninitialized (typically 0) and
     814 	// will not be usable.
     815 	MOVL	m_g0(BP), SI
     816 	MOVL	SP, (g_sched+gobuf_sp)(SI)
     817 
     818 havem:
     819 	// Now there's a valid m, and we're running on its m->g0.
     820 	// Save current m->g0->sched.sp on stack and then set it to SP.
     821 	// Save current sp in m->g0->sched.sp in preparation for
     822 	// switch back to m->curg stack.
     823 	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
     824 	MOVL	m_g0(BP), SI
     825 	MOVL	(g_sched+gobuf_sp)(SI), AX
     826 	MOVL	AX, 0(SP)
     827 	MOVL	SP, (g_sched+gobuf_sp)(SI)
     828 
     829 	// Switch to m->curg stack and call runtime.cgocallbackg.
     830 	// Because we are taking over the execution of m->curg
     831 	// but *not* resuming what had been running, we need to
     832 	// save that information (m->curg->sched) so we can restore it.
     833 	// We can restore m->curg->sched.sp easily, because calling
     834 	// runtime.cgocallbackg leaves SP unchanged upon return.
     835 	// To save m->curg->sched.pc, we push it onto the stack.
     836 	// This has the added benefit that it looks to the traceback
     837 	// routine like cgocallbackg is going to return to that
     838 	// PC (because the frame we allocate below has the same
     839 	// size as cgocallback_gofunc's frame declared above)
     840 	// so that the traceback will seamlessly trace back into
     841 	// the earlier calls.
     842 	//
     843 	// In the new goroutine, 4(SP) holds the saved oldm (DX) register.
     844 	// 8(SP) is unused.
     845 	MOVL	m_curg(BP), SI
     846 	MOVL	SI, g(CX)
     847 	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
     848 	MOVL	(g_sched+gobuf_pc)(SI), BP
     849 	MOVL	BP, -4(DI)	// saved curg PC sits where a return address would be
     850 	MOVL	ctxt+12(FP), CX
     851 	LEAL	-(4+12)(DI), SP	// 4 bytes for the saved PC plus a 12-byte frame
     852 	MOVL	DX, 4(SP)
     853 	MOVL	CX, 0(SP)	// argument for cgocallbackg: ctxt
     854 	CALL	runtimecgocallbackg(SB)
     855 	MOVL	4(SP), DX	// reload oldm saved before the call
     856 
     857 	// Restore g->sched (== m->curg->sched) from saved values.
     858 	get_tls(CX)
     859 	MOVL	g(CX), SI
     860 	MOVL	12(SP), BP	// the PC stored at -4(DI) above
     861 	MOVL	BP, (g_sched+gobuf_pc)(SI)
     862 	LEAL	(12+4)(SP), DI
     863 	MOVL	DI, (g_sched+gobuf_sp)(SI)
     864 
     865 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
     866 	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
     867 	// so we do not have to restore it.)
     868 	MOVL	g(CX), BP
     869 	MOVL	g_m(BP), BP
     870 	MOVL	m_g0(BP), SI
     871 	MOVL	SI, g(CX)
     872 	MOVL	(g_sched+gobuf_sp)(SI), SP
     873 	MOVL	0(SP), AX	// the old g0 sched.sp saved at havem
     874 	MOVL	AX, (g_sched+gobuf_sp)(SI)
     875 
     876 	// If the m on entry was nil, we called needm above to borrow an m
     877 	// for the duration of the call. Since the call is over, return it with dropm.
     878 	CMPL	DX, $0
     879 	JNE 3(PC)	// had an m on entry: skip the dropm call
     880 	MOVL	$runtimedropm(SB), AX
     881 	CALL	AX
     882 
     883 	// Done!
     884 	RET
    885 
     886 // void setg(G*); set g. for use by needm.
         // Stores gg into the TLS g slot. On Windows it first updates 0x14(FS):
         // zero when gg is nil (and returns immediately), else the address of
         // gg->m->tls.
     887 TEXT runtimesetg(SB), NOSPLIT, $0-4
     888 	MOVL	gg+0(FP), BX
     889 #ifdef GOOS_windows
     890 	CMPL	BX, $0
     891 	JNE	settls
     892 	MOVL	$0, 0x14(FS)
     893 	RET
     894 settls:
     895 	MOVL	g_m(BX), AX
     896 	LEAL	m_tls(AX), AX	// AX = &gg->m->tls
     897 	MOVL	AX, 0x14(FS)
     898 #endif
     899 	get_tls(CX)
     900 	MOVL	BX, g(CX)
     901 	RET
    902 
     903 // void setg_gcc(G*); set g. for use by gcc
         // Stores its single stack argument into the TLS g slot. rt0_go passes this
         // function's address to _cgo_init. Uses AX and DX as scratch.
     904 TEXT setg_gcc<>(SB), NOSPLIT, $0
     905 	get_tls(AX)
     906 	MOVL	gg+0(FP), DX
     907 	MOVL	DX, g(AX)
     908 	RET
    909 
    910 // check that SP is in range [g->stack.lo, g->stack.hi)
    911 TEXT runtimestackcheck(SB), NOSPLIT, $0-0
    912 	get_tls(CX)
    913 	MOVL	g(CX), AX
    914 	CMPL	(g_stack+stack_hi)(AX), SP
    915 	JHI	2(PC)
    916 	INT	$3
    917 	CMPL	SP, (g_stack+stack_lo)(AX)
    918 	JHI	2(PC)
    919 	INT	$3
    920 	RET
    921 
    922 // func cputicks() int64
    923 TEXT runtimecputicks(SB),NOSPLIT,$0-8
    924 	CMPB	runtimesupport_sse2(SB), $1
    925 	JNE	done
    926 	CMPB	runtimelfenceBeforeRdtsc(SB), $1
    927 	JNE	mfence
    928 	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
    929 	JMP	done
    930 mfence:
    931 	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
    932 done:
    933 	RDTSC
    934 	MOVL	AX, ret_lo+0(FP)
    935 	MOVL	DX, ret_hi+4(FP)
    936 	RET
    937 
     937 TEXT runtimeldt0setup(SB),NOSPLIT,$16-0	// calls setldt(7, &m0.tls, 32); no arguments or results of its own
     938 	// set up ldt 7 to point at m0.tls
     939 	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
     940 	// the entry number is just a hint.  setldt will set up GS with what it used.
     941 	MOVL	$7, 0(SP)	// arg 0: requested entry number
     942 	LEAL	runtimem0+m_tls(SB), AX
     943 	MOVL	AX, 4(SP)	// arg 1: address of m0.tls
     944 	MOVL	$32, 8(SP)	// sizeof(tls array)
     945 	CALL	runtimesetldt(SB)
     946 	RET
    948 
     948 TEXT runtimeemptyfunc(SB),0,$0-0	// flags are 0 (not NOSPLIT); rt0_go calls this to "fault if stack check is wrong"
     949 	RET
    951 
     951 // hash function using AES hardware instructions
         // Loads the data pointer, the size and the address of the result slot into
         // AX, BX and DX, then jumps to aeshashbody, which takes its inputs there.
     952 TEXT runtimeaeshash(SB),NOSPLIT,$0-16
     953 	MOVL	p+0(FP), AX	// ptr to data
     954 	MOVL	s+8(FP), BX	// size
     955 	LEAL	ret+12(FP), DX	// address where the result is stored
     956 	JMP	runtimeaeshashbody(SB)
    958 
     958 TEXT runtimeaeshashstr(SB),NOSPLIT,$0-12	// string variant: unpacks data pointer and length, then shares aeshashbody
     959 	MOVL	p+0(FP), AX	// ptr to string object
     960 	MOVL	4(AX), BX	// length of string
     961 	MOVL	(AX), AX	// string data
     962 	LEAL	ret+8(FP), DX	// address where the result is stored
     963 	JMP	runtimeaeshashbody(SB)
    965 
    966 // AX: data
    967 // BX: length
    968 // DX: address to put return value (the low 32 bits of the final state are stored there)
    969 TEXT runtimeaeshashbody(SB),NOSPLIT,$0-0	// entered by JMP from aeshash/aeshashstr, so h+4(FP) reads the seed from that frame
    970 	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
    971 	PINSRW	$4, BX, X0	            // 16 bits of length
    972 	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
    973 	MOVO	X0, X1                      // save unscrambled seed
    974 	PXOR	runtimeaeskeysched(SB), X0 // xor in per-process seed
    975 	AESENC	X0, X0                      // scramble seed
    976 
    977 	CMPL	BX, $16	// dispatch on length (unsigned)
    978 	JB	aes0to15
    979 	JE	aes16
    980 	CMPL	BX, $32
    981 	JBE	aes17to32
    982 	CMPL	BX, $64
    983 	JBE	aes33to64
    984 	JMP	aes65plus
    985 
    986 aes0to15:
    987 	TESTL	BX, BX
    988 	JE	aes0
    989 
    990 	ADDL	$16, AX	// AX = data+16; the loads below compensate with negative offsets
    991 	TESTW	$0xff0, AX	// bits 4-11 of data+16 are zero only if they were all ones in data
    992 	JE	endofpage
    993 
    994 	// 16 bytes loaded at this address won't cross
    995 	// a page boundary, so we can load it directly.
    996 	MOVOU	-16(AX), X1	// 16 bytes starting at data
    997 	ADDL	BX, BX	// BX = 2*length, so BX*8 below is length*16 (one 16-byte table entry per length)
    998 	PAND	masks<>(SB)(BX*8), X1	// keep only the low `length` bytes
    999 
   1000 final1:
   1001 	AESENC	X0, X1  // scramble input, xor in seed
   1002 	AESENC	X1, X1  // scramble combo 2 times
   1003 	AESENC	X1, X1
   1004 	MOVL	X1, (DX)
   1005 	RET
   1006 
   1007 endofpage:
   1008 	// address ends in 1111xxxx. Might be up against
   1009 	// a page boundary, so load ending at last byte.
   1010 	// Then shift bytes down using pshufb.
   1011 	MOVOU	-32(AX)(BX*1), X1	// AX is data+16 here, so this loads the 16 bytes ending at data+length
   1012 	ADDL	BX, BX	// BX = 2*length, so BX*8 below is length*16
   1013 	PSHUFB	shifts<>(SB)(BX*8), X1	// move the top `length` bytes to the bottom, zero the rest
   1014 	JMP	final1
   1015 
   1016 aes0:
   1017 	// Return scrambled input seed
   1018 	AESENC	X0, X0
   1019 	MOVL	X0, (DX)
   1020 	RET
   1021 
   1022 aes16:
   1023 	MOVOU	(AX), X1	// exactly one 16-byte block
   1024 	JMP	final1
   1025 
   1026 aes17to32:
   1027 	// make second starting seed
   1028 	PXOR	runtimeaeskeysched+16(SB), X1
   1029 	AESENC	X1, X1
   1030 
   1031 	// load data to be hashed
   1032 	MOVOU	(AX), X2	// first 16 bytes
   1033 	MOVOU	-16(AX)(BX*1), X3	// last 16 bytes (overlaps the first load when length < 32)
   1034 
   1035 	// scramble 3 times
   1036 	AESENC	X0, X2
   1037 	AESENC	X1, X3
   1038 	AESENC	X2, X2
   1039 	AESENC	X3, X3
   1040 	AESENC	X2, X2
   1041 	AESENC	X3, X3
   1042 
   1043 	// combine results
   1044 	PXOR	X3, X2
   1045 	MOVL	X2, (DX)
   1046 	RET
   1047 
   1048 aes33to64:
   1049 	// make 3 more starting seeds
   1050 	MOVO	X1, X2
   1051 	MOVO	X1, X3
   1052 	PXOR	runtimeaeskeysched+16(SB), X1
   1053 	PXOR	runtimeaeskeysched+32(SB), X2
   1054 	PXOR	runtimeaeskeysched+48(SB), X3
   1055 	AESENC	X1, X1
   1056 	AESENC	X2, X2
   1057 	AESENC	X3, X3
   1058 
   1059 	MOVOU	(AX), X4	// first 32 bytes ...
   1060 	MOVOU	16(AX), X5
   1061 	MOVOU	-32(AX)(BX*1), X6	// ... and last 32 bytes (overlapping when length < 64)
   1062 	MOVOU	-16(AX)(BX*1), X7
   1063 
   1064 	AESENC	X0, X4
   1065 	AESENC	X1, X5
   1066 	AESENC	X2, X6
   1067 	AESENC	X3, X7
   1068 
   1069 	AESENC	X4, X4
   1070 	AESENC	X5, X5
   1071 	AESENC	X6, X6
   1072 	AESENC	X7, X7
   1073 
   1074 	AESENC	X4, X4
   1075 	AESENC	X5, X5
   1076 	AESENC	X6, X6
   1077 	AESENC	X7, X7
   1078 
   1079 	PXOR	X6, X4	// fold the four lanes into X4
   1080 	PXOR	X7, X5
   1081 	PXOR	X5, X4
   1082 	MOVL	X4, (DX)
   1083 	RET
   1084 
   1085 aes65plus:
   1086 	// make 3 more starting seeds
   1087 	MOVO	X1, X2
   1088 	MOVO	X1, X3
   1089 	PXOR	runtimeaeskeysched+16(SB), X1
   1090 	PXOR	runtimeaeskeysched+32(SB), X2
   1091 	PXOR	runtimeaeskeysched+48(SB), X3
   1092 	AESENC	X1, X1
   1093 	AESENC	X2, X2
   1094 	AESENC	X3, X3
   1095 
   1096 	// start with last (possibly overlapping) block
   1097 	MOVOU	-64(AX)(BX*1), X4
   1098 	MOVOU	-48(AX)(BX*1), X5
   1099 	MOVOU	-32(AX)(BX*1), X6
   1100 	MOVOU	-16(AX)(BX*1), X7
   1101 
   1102 	// scramble state once
   1103 	AESENC	X0, X4
   1104 	AESENC	X1, X5
   1105 	AESENC	X2, X6
   1106 	AESENC	X3, X7
   1107 
   1108 	// compute number of remaining 64-byte blocks
   1109 	DECL	BX
   1110 	SHRL	$6, BX	// BX = (length-1)/64, at least 1 since length > 64 on this path
   1111 
   1112 aesloop:
   1113 	// scramble state, xor in a block
   1114 	MOVOU	(AX), X0	// X0-X3 are reused for data from here on; the seeds are no longer needed
   1115 	MOVOU	16(AX), X1
   1116 	MOVOU	32(AX), X2
   1117 	MOVOU	48(AX), X3
   1118 	AESENC	X0, X4
   1119 	AESENC	X1, X5
   1120 	AESENC	X2, X6
   1121 	AESENC	X3, X7
   1122 
   1123 	// scramble state
   1124 	AESENC	X4, X4
   1125 	AESENC	X5, X5
   1126 	AESENC	X6, X6
   1127 	AESENC	X7, X7
   1128 
   1129 	ADDL	$64, AX	// advance to the next 64-byte block
   1130 	DECL	BX
   1131 	JNE	aesloop
   1132 
   1133 	// 2 more scrambles to finish
   1134 	AESENC	X4, X4
   1135 	AESENC	X5, X5
   1136 	AESENC	X6, X6
   1137 	AESENC	X7, X7
   1138 
   1139 	AESENC	X4, X4
   1140 	AESENC	X5, X5
   1141 	AESENC	X6, X6
   1142 	AESENC	X7, X7
   1143 
   1144 	PXOR	X6, X4	// fold the four lanes into X4
   1145 	PXOR	X7, X5
   1146 	PXOR	X5, X4
   1147 	MOVL	X4, (DX)
   1148 	RET
   1149 
   1150 TEXT runtimeaeshash32(SB),NOSPLIT,$0-12	// hashes the 4 bytes at p with seed h; 32-bit result
   1151 	MOVL	p+0(FP), AX	// ptr to data
   1152 	MOVL	h+4(FP), X0	// seed
   1153 	PINSRD	$1, (AX), X0	// data
   1154 	AESENC	runtimeaeskeysched+0(SB), X0	// three AES rounds keyed by consecutive 16-byte slices of aeskeysched
   1155 	AESENC	runtimeaeskeysched+16(SB), X0
   1156 	AESENC	runtimeaeskeysched+32(SB), X0
   1157 	MOVL	X0, ret+8(FP)	// return the low 32 bits
   1158 	RET
   1159 
   1159 
   1160 TEXT runtimeaeshash64(SB),NOSPLIT,$0-12	// hashes the 8 bytes at p with seed h; 32-bit result
   1161 	MOVL	p+0(FP), AX	// ptr to data
   1162 	MOVQ	(AX), X0	// data
   1163 	PINSRD	$2, h+4(FP), X0	// seed
   1164 	AESENC	runtimeaeskeysched+0(SB), X0	// three AES rounds keyed by consecutive 16-byte slices of aeskeysched
   1165 	AESENC	runtimeaeskeysched+16(SB), X0
   1166 	AESENC	runtimeaeskeysched+32(SB), X0
   1167 	MOVL	X0, ret+8(FP)	// return the low 32 bits
   1168 	RET
   1169 
   1169 
   1170 // masks<>: 16 entries of 16 bytes. Entry n (at offset n*16) keeps the low n bytes of a 16-byte register and clears the rest.
   1171 DATA masks<>+0x00(SB)/4, $0x00000000	// entry 0: keep nothing
   1172 DATA masks<>+0x04(SB)/4, $0x00000000
   1173 DATA masks<>+0x08(SB)/4, $0x00000000
   1174 DATA masks<>+0x0c(SB)/4, $0x00000000
   1175 
   1176 DATA masks<>+0x10(SB)/4, $0x000000ff	// entry 1: keep low 1 byte
   1177 DATA masks<>+0x14(SB)/4, $0x00000000
   1178 DATA masks<>+0x18(SB)/4, $0x00000000
   1179 DATA masks<>+0x1c(SB)/4, $0x00000000
   1180 
   1181 DATA masks<>+0x20(SB)/4, $0x0000ffff	// entry 2: keep low 2 bytes
   1182 DATA masks<>+0x24(SB)/4, $0x00000000
   1183 DATA masks<>+0x28(SB)/4, $0x00000000
   1184 DATA masks<>+0x2c(SB)/4, $0x00000000
   1185 
   1186 DATA masks<>+0x30(SB)/4, $0x00ffffff	// entry 3: keep low 3 bytes
   1187 DATA masks<>+0x34(SB)/4, $0x00000000
   1188 DATA masks<>+0x38(SB)/4, $0x00000000
   1189 DATA masks<>+0x3c(SB)/4, $0x00000000
   1190 
   1191 DATA masks<>+0x40(SB)/4, $0xffffffff	// entry 4: keep low 4 bytes
   1192 DATA masks<>+0x44(SB)/4, $0x00000000
   1193 DATA masks<>+0x48(SB)/4, $0x00000000
   1194 DATA masks<>+0x4c(SB)/4, $0x00000000
   1195 
   1196 DATA masks<>+0x50(SB)/4, $0xffffffff	// entry 5: keep low 5 bytes
   1197 DATA masks<>+0x54(SB)/4, $0x000000ff
   1198 DATA masks<>+0x58(SB)/4, $0x00000000
   1199 DATA masks<>+0x5c(SB)/4, $0x00000000
   1200 
   1201 DATA masks<>+0x60(SB)/4, $0xffffffff	// entry 6: keep low 6 bytes
   1202 DATA masks<>+0x64(SB)/4, $0x0000ffff
   1203 DATA masks<>+0x68(SB)/4, $0x00000000
   1204 DATA masks<>+0x6c(SB)/4, $0x00000000
   1205 
   1206 DATA masks<>+0x70(SB)/4, $0xffffffff	// entry 7: keep low 7 bytes
   1207 DATA masks<>+0x74(SB)/4, $0x00ffffff
   1208 DATA masks<>+0x78(SB)/4, $0x00000000
   1209 DATA masks<>+0x7c(SB)/4, $0x00000000
   1210 
   1211 DATA masks<>+0x80(SB)/4, $0xffffffff	// entry 8: keep low 8 bytes
   1212 DATA masks<>+0x84(SB)/4, $0xffffffff
   1213 DATA masks<>+0x88(SB)/4, $0x00000000
   1214 DATA masks<>+0x8c(SB)/4, $0x00000000
   1215 
   1216 DATA masks<>+0x90(SB)/4, $0xffffffff	// entry 9: keep low 9 bytes
   1217 DATA masks<>+0x94(SB)/4, $0xffffffff
   1218 DATA masks<>+0x98(SB)/4, $0x000000ff
   1219 DATA masks<>+0x9c(SB)/4, $0x00000000
   1220 
   1221 DATA masks<>+0xa0(SB)/4, $0xffffffff	// entry 10: keep low 10 bytes
   1222 DATA masks<>+0xa4(SB)/4, $0xffffffff
   1223 DATA masks<>+0xa8(SB)/4, $0x0000ffff
   1224 DATA masks<>+0xac(SB)/4, $0x00000000
   1225 
   1226 DATA masks<>+0xb0(SB)/4, $0xffffffff	// entry 11: keep low 11 bytes
   1227 DATA masks<>+0xb4(SB)/4, $0xffffffff
   1228 DATA masks<>+0xb8(SB)/4, $0x00ffffff
   1229 DATA masks<>+0xbc(SB)/4, $0x00000000
   1230 
   1231 DATA masks<>+0xc0(SB)/4, $0xffffffff	// entry 12: keep low 12 bytes
   1232 DATA masks<>+0xc4(SB)/4, $0xffffffff
   1233 DATA masks<>+0xc8(SB)/4, $0xffffffff
   1234 DATA masks<>+0xcc(SB)/4, $0x00000000
   1235 
   1236 DATA masks<>+0xd0(SB)/4, $0xffffffff	// entry 13: keep low 13 bytes
   1237 DATA masks<>+0xd4(SB)/4, $0xffffffff
   1238 DATA masks<>+0xd8(SB)/4, $0xffffffff
   1239 DATA masks<>+0xdc(SB)/4, $0x000000ff
   1240 
   1241 DATA masks<>+0xe0(SB)/4, $0xffffffff	// entry 14: keep low 14 bytes
   1242 DATA masks<>+0xe4(SB)/4, $0xffffffff
   1243 DATA masks<>+0xe8(SB)/4, $0xffffffff
   1244 DATA masks<>+0xec(SB)/4, $0x0000ffff
   1245 
   1246 DATA masks<>+0xf0(SB)/4, $0xffffffff	// entry 15: keep low 15 bytes
   1247 DATA masks<>+0xf4(SB)/4, $0xffffffff
   1248 DATA masks<>+0xf8(SB)/4, $0xffffffff
   1249 DATA masks<>+0xfc(SB)/4, $0x00ffffff
   1250 
   1251 GLOBL masks<>(SB),RODATA,$256	// 16 entries x 16 bytes, read-only
   1252 
   1253 // these are arguments to pshufb. They move data down from
   1254 // the high bytes of the register to the low bytes of the register.
   1255 // index is how many bytes to move. Control bytes of 0xff (high bit set) make pshufb write zero to that position.
   1256 DATA shifts<>+0x00(SB)/4, $0x00000000	// entry 0: all-zero control
   1257 DATA shifts<>+0x04(SB)/4, $0x00000000
   1258 DATA shifts<>+0x08(SB)/4, $0x00000000
   1259 DATA shifts<>+0x0c(SB)/4, $0x00000000
   1260 
   1261 DATA shifts<>+0x10(SB)/4, $0xffffff0f	// entry 1: source byte 15 -> byte 0
   1262 DATA shifts<>+0x14(SB)/4, $0xffffffff
   1263 DATA shifts<>+0x18(SB)/4, $0xffffffff
   1264 DATA shifts<>+0x1c(SB)/4, $0xffffffff
   1265 
   1266 DATA shifts<>+0x20(SB)/4, $0xffff0f0e	// entry 2: source bytes 14-15 -> bytes 0-1
   1267 DATA shifts<>+0x24(SB)/4, $0xffffffff
   1268 DATA shifts<>+0x28(SB)/4, $0xffffffff
   1269 DATA shifts<>+0x2c(SB)/4, $0xffffffff
   1270 
   1271 DATA shifts<>+0x30(SB)/4, $0xff0f0e0d	// entry 3: source bytes 13-15 -> bytes 0-2
   1272 DATA shifts<>+0x34(SB)/4, $0xffffffff
   1273 DATA shifts<>+0x38(SB)/4, $0xffffffff
   1274 DATA shifts<>+0x3c(SB)/4, $0xffffffff
   1275 
   1276 DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c	// entry 4: source bytes 12-15 -> bytes 0-3
   1277 DATA shifts<>+0x44(SB)/4, $0xffffffff
   1278 DATA shifts<>+0x48(SB)/4, $0xffffffff
   1279 DATA shifts<>+0x4c(SB)/4, $0xffffffff
   1280 
   1281 DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b	// entry 5: source bytes 11-15 -> bytes 0-4
   1282 DATA shifts<>+0x54(SB)/4, $0xffffff0f
   1283 DATA shifts<>+0x58(SB)/4, $0xffffffff
   1284 DATA shifts<>+0x5c(SB)/4, $0xffffffff
   1285 
   1286 DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a	// entry 6: source bytes 10-15 -> bytes 0-5
   1287 DATA shifts<>+0x64(SB)/4, $0xffff0f0e
   1288 DATA shifts<>+0x68(SB)/4, $0xffffffff
   1289 DATA shifts<>+0x6c(SB)/4, $0xffffffff
   1290 
   1291 DATA shifts<>+0x70(SB)/4, $0x0c0b0a09	// entry 7: source bytes 9-15 -> bytes 0-6
   1292 DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
   1293 DATA shifts<>+0x78(SB)/4, $0xffffffff
   1294 DATA shifts<>+0x7c(SB)/4, $0xffffffff
   1295 
   1296 DATA shifts<>+0x80(SB)/4, $0x0b0a0908	// entry 8: source bytes 8-15 -> bytes 0-7
   1297 DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
   1298 DATA shifts<>+0x88(SB)/4, $0xffffffff
   1299 DATA shifts<>+0x8c(SB)/4, $0xffffffff
   1300 
   1301 DATA shifts<>+0x90(SB)/4, $0x0a090807	// entry 9: source bytes 7-15 -> bytes 0-8
   1302 DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
   1303 DATA shifts<>+0x98(SB)/4, $0xffffff0f
   1304 DATA shifts<>+0x9c(SB)/4, $0xffffffff
   1305 
   1306 DATA shifts<>+0xa0(SB)/4, $0x09080706	// entry 10: source bytes 6-15 -> bytes 0-9
   1307 DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
   1308 DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
   1309 DATA shifts<>+0xac(SB)/4, $0xffffffff
   1310 
   1311 DATA shifts<>+0xb0(SB)/4, $0x08070605	// entry 11: source bytes 5-15 -> bytes 0-10
   1312 DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
   1313 DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
   1314 DATA shifts<>+0xbc(SB)/4, $0xffffffff
   1315 
   1316 DATA shifts<>+0xc0(SB)/4, $0x07060504	// entry 12: source bytes 4-15 -> bytes 0-11
   1317 DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
   1318 DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
   1319 DATA shifts<>+0xcc(SB)/4, $0xffffffff
   1320 
   1321 DATA shifts<>+0xd0(SB)/4, $0x06050403	// entry 13: source bytes 3-15 -> bytes 0-12
   1322 DATA shifts<>+0xd4(SB)/4, $0x0a090807
   1323 DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
   1324 DATA shifts<>+0xdc(SB)/4, $0xffffff0f
   1325 
   1326 DATA shifts<>+0xe0(SB)/4, $0x05040302	// entry 14: source bytes 2-15 -> bytes 0-13
   1327 DATA shifts<>+0xe4(SB)/4, $0x09080706
   1328 DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
   1329 DATA shifts<>+0xec(SB)/4, $0xffff0f0e
   1330 
   1331 DATA shifts<>+0xf0(SB)/4, $0x04030201	// entry 15: source bytes 1-15 -> bytes 0-14
   1332 DATA shifts<>+0xf4(SB)/4, $0x08070605
   1333 DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
   1334 DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
   1335 
   1336 GLOBL shifts<>(SB),RODATA,$256	// 16 entries x 16 bytes, read-only
   1337 
   1338 TEXT checkASM(SB),NOSPLIT,$0-1	// returns a single byte: 1 if both tables are 16-byte aligned, else 0
   1339 	// check that masks<>(SB) and shifts<>(SB) are aligned to 16 bytes
   1340 	MOVL	$masks<>(SB), AX
   1341 	MOVL	$shifts<>(SB), BX
   1342 	ORL	BX, AX	// OR the two addresses so one test covers both
   1343 	TESTL	$15, AX	// ZF set iff the low 4 bits of both addresses are zero
   1344 	SETEQ	ret+0(FP)
   1345 	RET
   1346 
   1347 // memequal(p, q unsafe.Pointer, size uintptr) bool -- reports whether the size bytes at p and at q are equal.
   1348 TEXT runtimememequal(SB),NOSPLIT,$0-13
   1349 	MOVL	a+0(FP), SI
   1350 	MOVL	b+4(FP), DI
   1351 	CMPL	SI, DI
   1352 	JEQ	eq	// same pointer: equal without looking at the bytes
   1353 	MOVL	size+8(FP), BX
   1354 	LEAL	ret+12(FP), AX	// AX = address of the result byte, as memeqbody expects
   1355 	JMP	runtimememeqbody(SB)	// tail jump; memeqbody stores the result and returns to our caller
   1356 eq:
   1357 	MOVB    $1, ret+12(FP)
   1358 	RET
   1359 
   1360 // memequal_varlen(a, b unsafe.Pointer) bool -- like memequal, but the size is read from 4(DX) instead of an argument.
   1361 TEXT runtimememequal_varlen(SB),NOSPLIT,$0-9
   1362 	MOVL    a+0(FP), SI
   1363 	MOVL    b+4(FP), DI
   1364 	CMPL    SI, DI
   1365 	JEQ     eq	// same pointer: equal without looking at the bytes
   1366 	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
   1367 	LEAL	ret+8(FP), AX	// AX = address of the result byte, as memeqbody expects
   1368 	JMP	runtimememeqbody(SB)	// tail jump; memeqbody stores the result and returns to our caller
   1369 eq:
   1370 	MOVB    $1, ret+8(FP)
   1371 	RET
   1372 
   1373 TEXT bytesEqual(SB),NOSPLIT,$0-25	// two slice headers (ptr, len, cap) at 0(FP) and 12(FP); bool result at 24(FP)
   1374 	MOVL	a_len+4(FP), BX
   1375 	MOVL	b_len+16(FP), CX
   1376 	CMPL	BX, CX
   1377 	JNE	eqret	// different lengths: not equal
   1378 	MOVL	a+0(FP), SI
   1379 	MOVL	b+12(FP), DI
   1380 	LEAL	ret+24(FP), AX	// AX = address of the result byte, as memeqbody expects
   1381 	JMP	runtimememeqbody(SB)	// tail jump with SI=a, DI=b, BX=common length
   1382 eqret:
   1383 	MOVB	$0, ret+24(FP)
   1384 	RET
   1385 
   1386 // a in SI
   1387 // b in DI
   1388 // count in BX
   1389 // address of result byte in AX (set to 1 if the count bytes are equal, 0 otherwise)
   1390 TEXT runtimememeqbody(SB),NOSPLIT,$0-0
   1391 	CMPL	BX, $4
   1392 	JB	small	// fewer than 4 bytes: handled with a single masked 32-bit compare
   1393 
   1394 	// 64 bytes at a time using xmm registers
   1395 hugeloop:
   1396 	CMPL	BX, $64
   1397 	JB	bigloop
   1398 	CMPB	runtimesupport_sse2(SB), $1
   1399 	JNE	bigloop	// no SSE2: fall back to the 4-byte loop
   1400 	MOVOU	(SI), X0
   1401 	MOVOU	(DI), X1
   1402 	MOVOU	16(SI), X2
   1403 	MOVOU	16(DI), X3
   1404 	MOVOU	32(SI), X4
   1405 	MOVOU	32(DI), X5
   1406 	MOVOU	48(SI), X6
   1407 	MOVOU	48(DI), X7
   1408 	PCMPEQB	X1, X0	// each result byte is 0xff where the inputs match, 0x00 where they differ
   1409 	PCMPEQB	X3, X2
   1410 	PCMPEQB	X5, X4
   1411 	PCMPEQB	X7, X6
   1412 	PAND	X2, X0	// AND the four compare results together
   1413 	PAND	X6, X4
   1414 	PAND	X4, X0
   1415 	PMOVMSKB X0, DX	// DX = one bit per byte; 0xffff means all 64 bytes matched
   1416 	ADDL	$64, SI
   1417 	ADDL	$64, DI
   1418 	SUBL	$64, BX
   1419 	CMPL	DX, $0xffff
   1420 	JEQ	hugeloop
   1421 	MOVB	$0, (AX)
   1422 	RET
   1423 
   1424 	// 4 bytes at a time using 32-bit register
   1425 bigloop:
   1426 	CMPL	BX, $4
   1427 	JBE	leftover
   1428 	MOVL	(SI), CX
   1429 	MOVL	(DI), DX
   1430 	ADDL	$4, SI
   1431 	ADDL	$4, DI
   1432 	SUBL	$4, BX
   1433 	CMPL	CX, DX
   1434 	JEQ	bigloop
   1435 	MOVB	$0, (AX)
   1436 	RET
   1437 
   1438 	// remaining 0-4 bytes
   1439 leftover:
   1440 	MOVL	-4(SI)(BX*1), CX	// the 4 bytes ending at the end of a; may overlap bytes already compared
   1441 	MOVL	-4(DI)(BX*1), DX
   1442 	CMPL	CX, DX
   1443 	SETEQ	(AX)
   1444 	RET
   1445 
   1446 small:
   1447 	CMPL	BX, $0
   1448 	JEQ	equal	// zero length: ZF is set by the CMPL, so SETEQ at equal stores 1
   1449 
   1450 	LEAL	0(BX*8), CX
   1451 	NEGL	CX	// CX = -8*count; as a 32-bit shift count (taken mod 32) this is 32-8*count
   1452 
   1453 	MOVL	SI, DX
   1454 	CMPB	DX, $0xfc	// low address byte above 0xfc: a 4-byte load here could run into the next page
   1455 	JA	si_high
   1456 
   1457 	// load at SI won't cross a page boundary.
   1458 	MOVL	(SI), SI	// wanted bytes are the low count bytes; the high bytes are garbage
   1459 	JMP	si_finish
   1460 si_high:
   1461 	// address ends in 111111xx. Load up to bytes we want, move to correct position.
   1462 	MOVL	-4(SI)(BX*1), SI
   1463 	SHRL	CX, SI
   1464 si_finish:
   1465 
   1466 	// same for DI.
   1467 	MOVL	DI, DX
   1468 	CMPB	DX, $0xfc
   1469 	JA	di_high
   1470 	MOVL	(DI), DI
   1471 	JMP	di_finish
   1472 di_high:
   1473 	MOVL	-4(DI)(BX*1), DI
   1474 	SHRL	CX, DI
   1475 di_finish:
   1476 
   1477 	SUBL	SI, DI
   1478 	SHLL	CX, DI	// shift out the unwanted high bytes; ZF is set iff the low count bytes were equal
   1479 equal:
   1480 	SETEQ	(AX)
   1481 	RET
   1482 
   1483 TEXT runtimecmpstring(SB),NOSPLIT,$0-20	// compares two strings; the 1/0/-1 result is stored at ret+16(FP) by cmpbody
   1484 	MOVL	s1_base+0(FP), SI
   1485 	MOVL	s1_len+4(FP), BX
   1486 	MOVL	s2_base+8(FP), DI
   1487 	MOVL	s2_len+12(FP), DX
   1488 	LEAL	ret+16(FP), AX	// AX = address of the result word, as cmpbody expects
   1489 	JMP	runtimecmpbody(SB)	// tail jump; cmpbody returns to our caller
   1490 
   1491 TEXT bytesCompare(SB),NOSPLIT,$0-28	// compares two byte slices; the 1/0/-1 result is stored at ret+24(FP) by cmpbody
   1492 	MOVL	s1+0(FP), SI	// s1 data pointer
   1493 	MOVL	s1+4(FP), BX	// s1 length (second word of the slice header)
   1494 	MOVL	s2+12(FP), DI	// s2 data pointer
   1495 	MOVL	s2+16(FP), DX	// s2 length
   1496 	LEAL	ret+24(FP), AX	// AX = address of the result word, as cmpbody expects
   1497 	JMP	runtimecmpbody(SB)	// tail jump; cmpbody returns to our caller
   1498 
   1499 TEXT bytesIndexByte(SB),NOSPLIT,$0-20	// returns the index of the first byte equal to c in s, or -1 if there is none
   1500 	MOVL	s+0(FP), SI
   1501 	MOVL	s_len+4(FP), CX
   1502 	MOVB	c+12(FP), AL
   1503 	MOVL	SI, DI
   1504 	CLD; REPN; SCASB	// scan forward from DI for AL, at most CX bytes
   1505 	JZ 3(PC)	// ZF set: skip the "not found" return below
   1506 	MOVL	$-1, ret+16(FP)
   1507 	RET
   1508 	SUBL	SI, DI	// DI was advanced one past the byte last examined
   1509 	SUBL	$1, DI	// so index = DI - SI - 1 (this is also -1 when nothing was scanned)
   1510 	MOVL	DI, ret+16(FP)
   1511 	RET
   1512 
   1513 TEXT stringsIndexByte(SB),NOSPLIT,$0-16	// returns the index of the first byte equal to c in s, or -1 if there is none
   1514 	MOVL	s+0(FP), SI
   1515 	MOVL	s_len+4(FP), CX
   1516 	MOVB	c+8(FP), AL
   1517 	MOVL	SI, DI
   1518 	CLD; REPN; SCASB	// scan forward from DI for AL, at most CX bytes
   1519 	JZ 3(PC)	// ZF set: skip the "not found" return below
   1520 	MOVL	$-1, ret+12(FP)
   1521 	RET
   1522 	SUBL	SI, DI	// DI was advanced one past the byte last examined
   1523 	SUBL	$1, DI	// so index = DI - SI - 1 (this is also -1 when nothing was scanned)
   1524 	MOVL	DI, ret+12(FP)
   1525 	RET
   1526 
   1527 // input:
   1528 //   SI = a
   1529 //   DI = b
   1530 //   BX = alen
   1531 //   DX = blen
   1532 //   AX = address of return word (set to 1/0/-1)
   1533 TEXT runtimecmpbody(SB),NOSPLIT,$0-0
   1534 	MOVL	DX, BP
   1535 	SUBL	BX, DX // DX = blen-alen
   1536 	JLE	2(PC)	// blen <= alen (signed): BP already holds the minimum
   1537 	MOVL	BX, BP // BP = min(alen, blen)
   1538 	CMPL	SI, DI
   1539 	JEQ	allsame	// same pointer: only the lengths can differ
   1540 	CMPL	BP, $4
   1541 	JB	small
   1542 	CMPB	runtimesupport_sse2(SB), $1
   1543 	JNE	mediumloop	// no SSE2: compare 4 bytes at a time
   1544 largeloop:
   1545 	CMPL	BP, $16
   1546 	JB	mediumloop
   1547 	MOVOU	(SI), X0
   1548 	MOVOU	(DI), X1
   1549 	PCMPEQB X0, X1
   1550 	PMOVMSKB X1, BX	// one bit per byte, set where the bytes are equal
   1551 	XORL	$0xffff, BX	// convert EQ to NE
   1552 	JNE	diff16	// branch if at least one byte is not equal
   1553 	ADDL	$16, SI
   1554 	ADDL	$16, DI
   1555 	SUBL	$16, BP
   1556 	JMP	largeloop
   1557 
   1558 diff16:
   1559 	BSFL	BX, BX	// index of first byte that differs
   1560 	XORL	DX, DX
   1561 	MOVB	(SI)(BX*1), CX
   1562 	CMPB	CX, (DI)(BX*1)
   1563 	SETHI	DX	// DX = 1 if a's byte is higher (unsigned) than b's, else 0
   1564 	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
   1565 	MOVL	DX, (AX)
   1566 	RET
   1567 
   1568 mediumloop:
   1569 	CMPL	BP, $4
   1570 	JBE	_0through4
   1571 	MOVL	(SI), BX
   1572 	MOVL	(DI), CX
   1573 	CMPL	BX, CX
   1574 	JNE	diff4
   1575 	ADDL	$4, SI
   1576 	ADDL	$4, DI
   1577 	SUBL	$4, BP
   1578 	JMP	mediumloop
   1579 
   1580 _0through4:
   1581 	MOVL	-4(SI)(BP*1), BX	// the last 4 common bytes; may overlap bytes already compared
   1582 	MOVL	-4(DI)(BP*1), CX
   1583 	CMPL	BX, CX
   1584 	JEQ	allsame
   1585 
   1586 diff4:
   1587 	BSWAPL	BX	// reverse order of bytes
   1588 	BSWAPL	CX
   1589 	XORL	BX, CX	// find bit differences
   1590 	BSRL	CX, CX	// index of highest bit difference
   1591 	SHRL	CX, BX	// move a's bit to bottom
   1592 	ANDL	$1, BX	// mask bit
   1593 	LEAL	-1(BX*2), BX // 1/0 => +1/-1
   1594 	MOVL	BX, (AX)
   1595 	RET
   1596 
   1597 	// 0-3 bytes in common
   1598 small:
   1599 	LEAL	(BP*8), CX
   1600 	NEGL	CX	// CX = -8*BP; as a 32-bit shift count (taken mod 32) this is 32-8*BP
   1601 	JEQ	allsame	// ZF from NEGL: no bytes in common, so only the lengths matter
   1602 
   1603 	// load si
   1604 	CMPB	SI, $0xfc	// low address byte above 0xfc: a 4-byte load here could run into the next page
   1605 	JA	si_high
   1606 	MOVL	(SI), SI
   1607 	JMP	si_finish
   1608 si_high:
   1609 	MOVL	-4(SI)(BP*1), SI	// load the 4 bytes ending at the last wanted byte
   1610 	SHRL	CX, SI	// bring the wanted bytes down to the bottom
   1611 si_finish:
   1612 	SHLL	CX, SI	// move the wanted bytes to the top, discarding the unwanted ones
   1613 
   1614 	// same for di
   1615 	CMPB	DI, $0xfc
   1616 	JA	di_high
   1617 	MOVL	(DI), DI
   1618 	JMP	di_finish
   1619 di_high:
   1620 	MOVL	-4(DI)(BP*1), DI
   1621 	SHRL	CX, DI
   1622 di_finish:
   1623 	SHLL	CX, DI
   1624 
   1625 	BSWAPL	SI	// reverse order of bytes
   1626 	BSWAPL	DI
   1627 	XORL	SI, DI	// find bit differences
   1628 	JEQ	allsame
   1629 	BSRL	DI, CX	// index of highest bit difference
   1630 	SHRL	CX, SI	// move a's bit to bottom
   1631 	ANDL	$1, SI	// mask bit
   1632 	LEAL	-1(SI*2), BX // 1/0 => +1/-1
   1633 	MOVL	BX, (AX)
   1634 	RET
   1635 
   1636 	// all the bytes in common are the same, so we just need
   1637 	// to compare the lengths.
   1638 allsame:
   1639 	XORL	BX, BX
   1640 	XORL	CX, CX
   1641 	TESTL	DX, DX	// DX = blen-alen, computed at entry
   1642 	SETLT	BX	// 1 if alen > blen
   1643 	SETEQ	CX	// 1 if alen == blen
   1644 	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
   1645 	MOVL	BX, (AX)
   1646 	RET
   1647 
   1648 TEXT runtimereturn0(SB), NOSPLIT, $0	// sets AX to 0 and returns; the result is left in the register, not on the stack
   1649 	MOVL	$0, AX
   1650 	RET
   1651 
   1652 // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
   1653 // Must obey the gcc calling convention; the result is left in AX, and only AX and CX are written.
   1654 TEXT _cgo_topofstack(SB),NOSPLIT,$0
   1655 	get_tls(CX)
   1656 	MOVL	g(CX), AX	// AX = g
   1657 	MOVL	g_m(AX), AX	// AX = g->m
   1658 	MOVL	m_curg(AX), AX	// AX = g->m->curg
   1659 	MOVL	(g_stack+stack_hi)(AX), AX	// AX = g->m->curg.stack.hi
   1660 	RET
   1661 
   1662 // The top-most function running on a goroutine
   1663 // returns to goexit+PCQuantum. The leading NOP is what that return address skips over.
   1664 TEXT runtimegoexit(SB),NOSPLIT,$0-0
   1665 	BYTE	$0x90	// NOP
   1666 	CALL	runtimegoexit1(SB)	// does not return
   1667 	// traceback from goexit1 must hit code range of goexit
   1668 	BYTE	$0x90	// NOP
   1669 
   1670 // Add a module's moduledata to the linked list of moduledata objects. This
   1671 // is called from .init_array by a function generated in the linker and so
   1672 // follows the platform ABI wrt register preservation -- it only touches AX,
   1673 // CX (implicitly) and DX, but it does not follow the ABI wrt arguments:
   1674 // instead the pointer to the moduledata is passed in AX.
   1675 TEXT runtimeaddmoduledata(SB),NOSPLIT,$0-0
   1676        MOVL    runtimelastmoduledatap(SB), DX	// DX = current tail of the list
   1677        MOVL    AX, moduledata_next(DX)	// tail->next = new moduledata
   1678        MOVL    AX, runtimelastmoduledatap(SB)	// the new moduledata becomes the tail
   1679        RET
   1680 
   1681 TEXT runtimeuint32tofloat64(SB),NOSPLIT,$8-12	// converts the uint32 argument a to a float64 result via the x87 unit
   1682 	MOVL	a+0(FP), AX
   1683 	MOVL	AX, 0(SP)	// low word of an 8-byte integer on the stack
   1684 	MOVL	$0, 4(SP)	// high word = 0, so the 64-bit value is a zero-extended
   1685 	FMOVV	0(SP), F0	// load that 64-bit integer onto the x87 stack
   1686 	FMOVDP	F0, ret+4(FP)	// store it as a float64 and pop
   1687 	RET
   1688 
   1689 TEXT runtimefloat64touint32(SB),NOSPLIT,$12-12	// converts the float64 argument a to a uint32 result via the x87 unit
   1690 	FMOVD	a+0(FP), F0
   1691 	FSTCW	0(SP)	// save the current x87 control word
   1692 	FLDCW	runtimecontrolWord64trunc(SB)	// NOTE(review): presumably selects truncating rounding -- confirm where controlWord64trunc is defined
   1693 	FMOVVP	F0, 4(SP)	// store as a 64-bit integer at 4(SP) and pop
   1694 	FLDCW	0(SP)	// restore the saved control word
   1695 	MOVL	4(SP), AX	// low 32 bits of the 64-bit integer
   1696 	MOVL	AX, ret+8(FP)
   1697 	RET
   1698