// Go runtime asm_amd64.s (runtime package), amd64, Plan 9 assembler syntax.
// NOTE(review): this copy was captured from a code-browser view; the original
// navigation breadcrumb and embedded line-number column are capture artifacts.
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "go_asm.h"
      6 #include "go_tls.h"
      7 #include "funcdata.h"
      8 #include "textflag.h"
      9 
      10 TEXT runtimert0_go(SB),NOSPLIT,$0
	// Program bootstrap: set up g0's stack bounds from the OS stack, probe
	// CPUID (Intel vendor string => LFENCE-before-RDTSC), call _cgo_init if
	// present, install TLS, wire m0<->g0, run args/osinit/schedinit, then
	// queue runtime.main via newproc and enter the scheduler with mstart.
	// NOTE(review): symbol names in this capture appear to have lost the
	// middle-dot rune '·' (e.g. runtime·g0 -> runtimeg0); confirm against
	// upstream asm_amd64.s before assembling.
      11 	// copy arguments forward on an even stack
      12 	MOVQ	DI, AX		// argc
      13 	MOVQ	SI, BX		// argv
      14 	SUBQ	$(4*8+7), SP		// 2args 2auto
      15 	ANDQ	$~15, SP
      16 	MOVQ	AX, 16(SP)
      17 	MOVQ	BX, 24(SP)
      18 
      19 	// create istack out of the given (operating system) stack.
      20 	// _cgo_init may update stackguard.
      21 	MOVQ	$runtimeg0(SB), DI
	// Assume 64K of OS stack; 104 bytes is the guard slop below stack_lo.
      22 	LEAQ	(-64*1024+104)(SP), BX
      23 	MOVQ	BX, g_stackguard0(DI)
      24 	MOVQ	BX, g_stackguard1(DI)
      25 	MOVQ	BX, (g_stack+stack_lo)(DI)
      26 	MOVQ	SP, (g_stack+stack_hi)(DI)
      27 
      28 	// find out information about the processor we're on
      29 	MOVQ	$0, AX
      30 	CPUID
      31 	CMPQ	AX, $0
      32 	JE	nocpuinfo
      33 
      34 	// Figure out how to serialize RDTSC.
      35 	// On Intel processors LFENCE is enough. AMD requires MFENCE.
      36 	// Don't know about the rest, so let's do MFENCE.
	// CPUID(0) returns the vendor string "GenuineIntel" in BX,DX,CX.
      37 	CMPL	BX, $0x756E6547  // "Genu"
      38 	JNE	notintel
      39 	CMPL	DX, $0x49656E69  // "ineI"
      40 	JNE	notintel
      41 	CMPL	CX, $0x6C65746E  // "ntel"
      42 	JNE	notintel
      43 	MOVB	$1, runtimelfenceBeforeRdtsc(SB)
      44 notintel:
      45 
	// CPUID(1): record feature bits for the runtime (cpuid_ecx/cpuid_edx).
      46 	MOVQ	$1, AX
      47 	CPUID
      48 	MOVL	CX, runtimecpuid_ecx(SB)
      49 	MOVL	DX, runtimecpuid_edx(SB)
      50 nocpuinfo:
      51 
      52 	// if there is an _cgo_init, call it.
      53 	MOVQ	_cgo_init(SB), AX
      54 	TESTQ	AX, AX
      55 	JZ	needtls
      56 	// g0 already in DI
      57 	MOVQ	DI, CX	// Win64 uses CX for first parameter
      58 	MOVQ	$setg_gcc<>(SB), SI
      59 	CALL	AX
      60 
      61 	// update stackguard after _cgo_init
      62 	MOVQ	$runtimeg0(SB), CX
      63 	MOVQ	(g_stack+stack_lo)(CX), AX
      64 	ADDQ	$const__StackGuard, AX
      65 	MOVQ	AX, g_stackguard0(CX)
      66 	MOVQ	AX, g_stackguard1(CX)
      67 
	// On Windows, _cgo_init already set up TLS; skip the manual setup.
      68 	CMPL	runtimeiswindows(SB), $0
      69 	JEQ ok
      70 needtls:
      71 	// skip TLS setup on Plan 9
      72 	CMPL	runtimeisplan9(SB), $1
      73 	JEQ ok
      74 	// skip TLS setup on Solaris
      75 	CMPL	runtimeissolaris(SB), $1
      76 	JEQ ok
      77 
      78 	LEAQ	runtimetls0(SB), DI
      79 	CALL	runtimesettls(SB)
      80 
      81 	// store through it, to make sure it works
      82 	get_tls(BX)
      83 	MOVQ	$0x123, g(BX)
      84 	MOVQ	runtimetls0(SB), AX
      85 	CMPQ	AX, $0x123
      86 	JEQ 2(PC)
	// Deliberate fault: store to address 0 if the TLS round-trip failed.
      87 	MOVL	AX, 0	// abort
      88 ok:
      89 	// set the per-goroutine and per-mach "registers"
      90 	get_tls(BX)
      91 	LEAQ	runtimeg0(SB), CX
      92 	MOVQ	CX, g(BX)
      93 	LEAQ	runtimem0(SB), AX
      94 
      95 	// save m->g0 = g0
      96 	MOVQ	CX, m_g0(AX)
      97 	// save m0 to g0->m
      98 	MOVQ	AX, g_m(CX)
      99 
     100 	CLD				// convention is D is always left cleared
     101 	CALL	runtimecheck(SB)
     102 
     103 	MOVL	16(SP), AX		// copy argc
     104 	MOVL	AX, 0(SP)
     105 	MOVQ	24(SP), AX		// copy argv
     106 	MOVQ	AX, 8(SP)
     107 	CALL	runtimeargs(SB)
     108 	CALL	runtimeosinit(SB)
     109 	CALL	runtimeschedinit(SB)
     110 
     111 	// create a new goroutine to start program
     112 	MOVQ	$runtimemainPC(SB), AX		// entry
     113 	PUSHQ	AX
     114 	PUSHQ	$0			// arg size
     115 	CALL	runtimenewproc(SB)
     116 	POPQ	AX
     117 	POPQ	AX
     118 
     119 	// start this M
     120 	CALL	runtimemstart(SB)
     121 
	// mstart must never return; fault loudly if it does.
     122 	MOVL	$0xf1, 0xf1  // crash
     123 	RET
     124 
	// mainPC is a function value for runtime.main, used by newproc above.
     125 DATA	runtimemainPC+0(SB)/8,$runtimemain(SB)
     126 GLOBL	runtimemainPC(SB),RODATA,$8
    127 
     128 TEXT runtimebreakpoint(SB),NOSPLIT,$0-0
	// Debugger trap: raw INT3 opcode (0xcc).
     129 	BYTE	$0xcc
     130 	RET
    131 
     132 TEXT runtimeasminit(SB),NOSPLIT,$0-0
	// amd64 has no per-thread assembly initialization to do.
     133 	// No per-thread init.
     134 	RET
    135 
    136 /*
    137  *  go-routine
    138  */
    139 
     140 // void gosave(Gobuf*)
     141 // save state in Gobuf; setjmp
     142 TEXT runtimegosave(SB), NOSPLIT, $0-8
	// Records caller's SP, PC, g, and BP into *buf; zeroes ret/ctxt so the
	// buf holds no stale pointers. Scratch: AX, BX, CX.
     143 	MOVQ	buf+0(FP), AX		// gobuf
     144 	LEAQ	buf+0(FP), BX		// caller's SP
     145 	MOVQ	BX, gobuf_sp(AX)
     146 	MOVQ	0(SP), BX		// caller's PC
     147 	MOVQ	BX, gobuf_pc(AX)
     148 	MOVQ	$0, gobuf_ret(AX)
     149 	MOVQ	$0, gobuf_ctxt(AX)
     150 	MOVQ	BP, gobuf_bp(AX)
     151 	get_tls(CX)
     152 	MOVQ	g(CX), BX
     153 	MOVQ	BX, gobuf_g(AX)
     154 	RET
    155 
     156 // void gogo(Gobuf*)
     157 // restore state from Gobuf; longjmp
     158 TEXT runtimegogo(SB), NOSPLIT, $0-8
	// Installs buf's g into TLS, restores SP/ret/ctxt/BP, zeroes the buf's
	// pointer-bearing slots (so the GC doesn't retain them), then jumps to
	// the saved PC. Never returns to the caller.
     159 	MOVQ	buf+0(FP), BX		// gobuf
     160 	MOVQ	gobuf_g(BX), DX
     161 	MOVQ	0(DX), CX		// make sure g != nil
     162 	get_tls(CX)
     163 	MOVQ	DX, g(CX)
     164 	MOVQ	gobuf_sp(BX), SP	// restore SP
     165 	MOVQ	gobuf_ret(BX), AX
     166 	MOVQ	gobuf_ctxt(BX), DX
     167 	MOVQ	gobuf_bp(BX), BP
     168 	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
     169 	MOVQ	$0, gobuf_ret(BX)
     170 	MOVQ	$0, gobuf_ctxt(BX)
     171 	MOVQ	$0, gobuf_bp(BX)
     172 	MOVQ	gobuf_pc(BX), BX
     173 	JMP	BX
    174 
     175 // func mcall(fn func(*g))
     176 // Switch to m->g0's stack, call fn(g).
     177 // Fn must never return.  It should gogo(&g->sched)
     178 // to keep running g.
     179 TEXT runtimemcall(SB), NOSPLIT, $0-8
     180 	MOVQ	fn+0(FP), DI
     181 
     182 	get_tls(CX)
     183 	MOVQ	g(CX), AX	// save state in g->sched
     184 	MOVQ	0(SP), BX	// caller's PC
     185 	MOVQ	BX, (g_sched+gobuf_pc)(AX)
     186 	LEAQ	fn+0(FP), BX	// caller's SP
     187 	MOVQ	BX, (g_sched+gobuf_sp)(AX)
     188 	MOVQ	AX, (g_sched+gobuf_g)(AX)
     189 	MOVQ	BP, (g_sched+gobuf_bp)(AX)
     190 
     191 	// switch to m->g0 & its stack, call fn
     192 	MOVQ	g(CX), BX
     193 	MOVQ	g_m(BX), BX
     194 	MOVQ	m_g0(BX), SI
	// Calling mcall while already on g0 is a fatal error: badmcall throws.
     195 	CMPQ	SI, AX	// if g == m->g0 call badmcall
     196 	JNE	3(PC)
     197 	MOVQ	$runtimebadmcall(SB), AX
     198 	JMP	AX
     199 	MOVQ	SI, g(CX)	// g = m->g0
     200 	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	// Push the old g as fn's argument (fn takes *g on the stack).
     201 	PUSHQ	AX
     202 	MOVQ	DI, DX
     203 	MOVQ	0(DI), DI
     204 	CALL	DI
	// fn must not return; if it does, badmcall2 throws.
     205 	POPQ	AX
     206 	MOVQ	$runtimebadmcall2(SB), AX
     207 	JMP	AX
     208 	RET
    209 
     210 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
     211 // of the G stack.  We need to distinguish the routine that
     212 // lives at the bottom of the G stack from the one that lives
     213 // at the top of the system stack because the one at the top of
     214 // the system stack terminates the stack walk (see topofstack()).
     215 TEXT runtimesystemstack_switch(SB), NOSPLIT, $0-0
	// Intentionally empty: only its PC value matters (stored in g->sched.pc
	// by systemstack below as a traceback marker).
     216 	RET
    217 
     218 // func systemstack(fn func())
     219 TEXT runtimesystemstack(SB), NOSPLIT, $0-8
	// Runs fn on the system (g0) stack, then switches back to the original
	// goroutine. If already on g0 or gsignal, calls fn directly (noswitch).
	// Register roles: DI = fn (closure ptr), AX = current g, BX = m,
	// DX = target g, CX = TLS base.
     220 	MOVQ	fn+0(FP), DI	// DI = fn
     221 	get_tls(CX)
     222 	MOVQ	g(CX), AX	// AX = g
     223 	MOVQ	g_m(AX), BX	// BX = m
     224 
     225 	MOVQ	m_gsignal(BX), DX	// DX = gsignal
     226 	CMPQ	AX, DX
     227 	JEQ	noswitch
     228 
     229 	MOVQ	m_g0(BX), DX	// DX = g0
     230 	CMPQ	AX, DX
     231 	JEQ	noswitch
     232 
     233 	MOVQ	m_curg(BX), R8
     234 	CMPQ	AX, R8
     235 	JEQ	switch
     236 
     237 	// Bad: g is not gsignal, not g0, not curg. What is it?
     238 	MOVQ	$runtimebadsystemstack(SB), AX
     239 	CALL	AX
     240 
     241 switch:
     242 	// save our state in g->sched.  Pretend to
     243 	// be systemstack_switch if the G stack is scanned.
     244 	MOVQ	$runtimesystemstack_switch(SB), SI
     245 	MOVQ	SI, (g_sched+gobuf_pc)(AX)
     246 	MOVQ	SP, (g_sched+gobuf_sp)(AX)
     247 	MOVQ	AX, (g_sched+gobuf_g)(AX)
     248 	MOVQ	BP, (g_sched+gobuf_bp)(AX)
     249 
     250 	// switch to g0
     251 	MOVQ	DX, g(CX)
     252 	MOVQ	(g_sched+gobuf_sp)(DX), BX
     253 	// make it look like mstart called systemstack on g0, to stop traceback
     254 	SUBQ	$8, BX
     255 	MOVQ	$runtimemstart(SB), DX
     256 	MOVQ	DX, 0(BX)
     257 	MOVQ	BX, SP
     258 
     259 	// call target function
     260 	MOVQ	DI, DX
     261 	MOVQ	0(DI), DI
     262 	CALL	DI
     263 
     264 	// switch back to g
	// Reload everything from TLS: fn may have grown stacks or migrated state.
     265 	get_tls(CX)
     266 	MOVQ	g(CX), AX
     267 	MOVQ	g_m(AX), BX
     268 	MOVQ	m_curg(BX), AX
     269 	MOVQ	AX, g(CX)
     270 	MOVQ	(g_sched+gobuf_sp)(AX), SP
     271 	MOVQ	$0, (g_sched+gobuf_sp)(AX)
     272 	RET
     273 
     274 noswitch:
     275 	// already on m stack, just call directly
     276 	MOVQ	DI, DX
     277 	MOVQ	0(DI), DI
     278 	CALL	DI
     279 	RET
    280 
    281 /*
    282  * support for morestack
    283  */
    284 
     285 // Called during function prolog when more stack is needed.
     286 //
     287 // The traceback routines see morestack on a g0 as being
     288 // the top of a stack (for example, morestack calling newstack
     289 // calling the scheduler calling newm calling gc), so we must
     290 // record an argument size. For that purpose, it has no arguments.
     291 TEXT runtimemorestack(SB),NOSPLIT,$0-0
	// On entry (per Go's stack-split convention): 0(SP) = f's PC,
	// 8(SP) = f's caller's PC, DX = f's closure context (ctxt).
     292 	// Cannot grow scheduler stack (m->g0).
     293 	get_tls(CX)
     294 	MOVQ	g(CX), BX
     295 	MOVQ	g_m(BX), BX
     296 	MOVQ	m_g0(BX), SI
     297 	CMPQ	g(CX), SI
     298 	JNE	2(PC)
     299 	INT	$3
     300 
     301 	// Cannot grow signal stack (m->gsignal).
     302 	MOVQ	m_gsignal(BX), SI
     303 	CMPQ	g(CX), SI
     304 	JNE	2(PC)
     305 	INT	$3
     306 
     307 	// Called from f.
     308 	// Set m->morebuf to f's caller.
     309 	MOVQ	8(SP), AX	// f's caller's PC
     310 	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
     311 	LEAQ	16(SP), AX	// f's caller's SP
     312 	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
     313 	get_tls(CX)
     314 	MOVQ	g(CX), SI
     315 	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
     316 
     317 	// Set g->sched to context in f.
     318 	MOVQ	0(SP), AX // f's PC
     319 	MOVQ	AX, (g_sched+gobuf_pc)(SI)
     320 	MOVQ	SI, (g_sched+gobuf_g)(SI)
     321 	LEAQ	8(SP), AX // f's SP
     322 	MOVQ	AX, (g_sched+gobuf_sp)(SI)
     323 	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
     324 	MOVQ	BP, (g_sched+gobuf_bp)(SI)
     325 
     326 	// Call newstack on m->g0's stack.
     327 	MOVQ	m_g0(BX), BX
     328 	MOVQ	BX, g(CX)
     329 	MOVQ	(g_sched+gobuf_sp)(BX), SP
     330 	CALL	runtimenewstack(SB)
	// newstack restarts f on the grown stack and never returns here.
     331 	MOVQ	$0, 0x1003	// crash if newstack returns
     332 	RET
    333 
     334 // morestack but not preserving ctxt.
     335 TEXT runtimemorestack_noctxt(SB),NOSPLIT,$0
	// Zero DX (the ctxt register) so morestack records a nil context.
     336 	MOVL	$0, DX
     337 	JMP	runtimemorestack(SB)
    338 
     339 TEXT runtimestackBarrier(SB),NOSPLIT,$0
     340 	// We came here via a RET to an overwritten return PC.
     341 	// AX may be live. Other registers are available.
     342 
     343 	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
     344 	get_tls(CX)
     345 	MOVQ	g(CX), CX
     346 	MOVQ	(g_stkbar+slice_array)(CX), DX
     347 	MOVQ	g_stkbarPos(CX), BX
	// stkbar entries are indexed by multiply (stkbar__size isn't a valid
	// SIB scale), then addressed as base(DX) + byte offset (BX*1).
     348 	IMULQ	$stkbar__size, BX	// Too big for SIB.
     349 	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
     350 	// Record that this stack barrier was hit.
     351 	ADDQ	$1, g_stkbarPos(CX)
     352 	// Jump to the original return PC.
     353 	JMP	BX
    354 
     355 // reflectcall: call a function with the given argument list
     356 // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
     357 // we don't have variable-sized frames, so we use a small number
     358 // of constant-sized-frame functions to encode a few bits of size in the pc.
     359 // Caution: ugly multiline assembly macros in your future!
     360 
// DISPATCH(NAME, MAXSIZE): if argsize in CX <= MAXSIZE, tail-jump to NAME;
// otherwise fall through (JA 3(PC) skips the MOVQ+JMP pair) to the next probe.
     361 #define DISPATCH(NAME,MAXSIZE)		\
     362 	CMPQ	CX, $MAXSIZE;		\
     363 	JA	3(PC);			\
     364 	MOVQ	$NAME(SB), AX;		\
     365 	JMP	AX
     366 // Note: can't just "JMP NAME(SB)" - bad inlining results.
    367 
// NOTE(review): in upstream sources this trampoline is "reflect·call"
// jumping to "·reflectcall"; the lost middle-dot rune makes the two symbols
// collide in this capture — confirm against upstream before assembling.
     368 TEXT reflectcall(SB), NOSPLIT, $0-0
     369 	JMP	reflectcall(SB)
    370 
     371 TEXT reflectcall(SB), NOSPLIT, $0-32
	// Dispatch on argsize to the smallest fixed-frame callN helper that can
	// hold the arguments (see CALLFN below). Falls through to badreflectcall
	// if argsize exceeds 1GB.
     372 	MOVLQZX argsize+24(FP), CX
     373 	// NOTE(rsc): No call16, because CALLFN needs four words
     374 	// of argument space to invoke callwritebarrier.
     375 	DISPATCH(runtimecall32, 32)
     376 	DISPATCH(runtimecall64, 64)
     377 	DISPATCH(runtimecall128, 128)
     378 	DISPATCH(runtimecall256, 256)
     379 	DISPATCH(runtimecall512, 512)
     380 	DISPATCH(runtimecall1024, 1024)
     381 	DISPATCH(runtimecall2048, 2048)
     382 	DISPATCH(runtimecall4096, 4096)
     383 	DISPATCH(runtimecall8192, 8192)
     384 	DISPATCH(runtimecall16384, 16384)
     385 	DISPATCH(runtimecall32768, 32768)
     386 	DISPATCH(runtimecall65536, 65536)
     387 	DISPATCH(runtimecall131072, 131072)
     388 	DISPATCH(runtimecall262144, 262144)
     389 	DISPATCH(runtimecall524288, 524288)
     390 	DISPATCH(runtimecall1048576, 1048576)
     391 	DISPATCH(runtimecall2097152, 2097152)
     392 	DISPATCH(runtimecall4194304, 4194304)
     393 	DISPATCH(runtimecall8388608, 8388608)
     394 	DISPATCH(runtimecall16777216, 16777216)
     395 	DISPATCH(runtimecall33554432, 33554432)
     396 	DISPATCH(runtimecall67108864, 67108864)
     397 	DISPATCH(runtimecall134217728, 134217728)
     398 	DISPATCH(runtimecall268435456, 268435456)
     399 	DISPATCH(runtimecall536870912, 536870912)
     400 	DISPATCH(runtimecall1073741824, 1073741824)
     401 	MOVQ	$runtimebadreflectcall(SB), AX
     402 	JMP	AX
    403 
// CALLFN(NAME, MAXSIZE): defines one fixed-frame reflectcall helper.
// Copies argsize bytes of arguments from argptr onto the local frame,
// calls f, copies the results (from retoffset onward) back to argptr,
// then invokes callwritebarrier so the GC sees pointers written into
// the caller-provided argument block.
     404 #define CALLFN(NAME,MAXSIZE)			\
     405 TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
     406 	NO_LOCAL_POINTERS;			\
     407 	/* copy arguments to stack */		\
     408 	MOVQ	argptr+16(FP), SI;		\
     409 	MOVLQZX argsize+24(FP), CX;		\
     410 	MOVQ	SP, DI;				\
     411 	REP;MOVSB;				\
     412 	/* call function */			\
     413 	MOVQ	f+8(FP), DX;			\
     414 	PCDATA  $PCDATA_StackMapIndex, $0;	\
     415 	CALL	(DX);				\
     416 	/* copy return values back */		\
     417 	MOVQ	argptr+16(FP), DI;		\
     418 	MOVLQZX	argsize+24(FP), CX;		\
     419 	MOVLQZX retoffset+28(FP), BX;		\
     420 	MOVQ	SP, SI;				\
     421 	ADDQ	BX, DI;				\
     422 	ADDQ	BX, SI;				\
     423 	SUBQ	BX, CX;				\
     424 	REP;MOVSB;				\
     425 	/* execute write barrier updates */	\
     426 	MOVQ	argtype+0(FP), DX;		\
     427 	MOVQ	argptr+16(FP), DI;		\
     428 	MOVLQZX	argsize+24(FP), CX;		\
     429 	MOVLQZX retoffset+28(FP), BX;		\
     430 	MOVQ	DX, 0(SP);			\
     431 	MOVQ	DI, 8(SP);			\
     432 	MOVQ	CX, 16(SP);			\
     433 	MOVQ	BX, 24(SP);			\
     434 	CALL	runtimecallwritebarrier(SB);	\
     435 	RET
    436 
// Instantiate the callN helpers for every power-of-two frame size that
// reflectcall dispatches to (32 bytes through 1GB).
     437 CALLFN(call32, 32)
     438 CALLFN(call64, 64)
     439 CALLFN(call128, 128)
     440 CALLFN(call256, 256)
     441 CALLFN(call512, 512)
     442 CALLFN(call1024, 1024)
     443 CALLFN(call2048, 2048)
     444 CALLFN(call4096, 4096)
     445 CALLFN(call8192, 8192)
     446 CALLFN(call16384, 16384)
     447 CALLFN(call32768, 32768)
     448 CALLFN(call65536, 65536)
     449 CALLFN(call131072, 131072)
     450 CALLFN(call262144, 262144)
     451 CALLFN(call524288, 524288)
     452 CALLFN(call1048576, 1048576)
     453 CALLFN(call2097152, 2097152)
     454 CALLFN(call4194304, 4194304)
     455 CALLFN(call8388608, 8388608)
     456 CALLFN(call16777216, 16777216)
     457 CALLFN(call33554432, 33554432)
     458 CALLFN(call67108864, 67108864)
     459 CALLFN(call134217728, 134217728)
     460 CALLFN(call268435456, 268435456)
     461 CALLFN(call536870912, 536870912)
     462 CALLFN(call1073741824, 1073741824)
    463 
     464 // bool cas(int32 *val, int32 old, int32 new)
     465 // Atomically:
     466 //	if(*val == old){
     467 //		*val = new;
     468 //		return 1;
     469 //	} else
     470 //		return 0;
     471 TEXT runtimecas(SB), NOSPLIT, $0-17
	// LOCK CMPXCHG requires the comparand in AX; ZF reports success.
     472 	MOVQ	ptr+0(FP), BX
     473 	MOVL	old+8(FP), AX
     474 	MOVL	new+12(FP), CX
     475 	LOCK
     476 	CMPXCHGL	CX, 0(BX)
     477 	SETEQ	ret+16(FP)
     478 	RET
    479 
     480 // bool	runtimecas64(uint64 *val, uint64 old, uint64 new)
     481 // Atomically:
     482 //	if(*val == *old){
     483 //		*val = new;
     484 //		return 1;
     485 //	} else {
     486 //		return 0;
     487 //	}
     488 TEXT runtimecas64(SB), NOSPLIT, $0-25
	// 64-bit variant of cas: LOCK CMPXCHGQ with comparand in AX.
     489 	MOVQ	ptr+0(FP), BX
     490 	MOVQ	old+8(FP), AX
     491 	MOVQ	new+16(FP), CX
     492 	LOCK
     493 	CMPXCHGQ	CX, 0(BX)
     494 	SETEQ	ret+24(FP)
     495 	RET
    496 
// uintptr is 64-bit on amd64: alias to cas64.
     497 TEXT runtimecasuintptr(SB), NOSPLIT, $0-25
     498 	JMP	runtimecas64(SB)
    499 
// uintptr load aliases the 64-bit atomic load on amd64.
     500 TEXT runtimeatomicloaduintptr(SB), NOSPLIT, $0-16
     501 	JMP	runtimeatomicload64(SB)
    502 
// uint is 64-bit on amd64: alias to atomicload64.
     503 TEXT runtimeatomicloaduint(SB), NOSPLIT, $0-16
     504 	JMP	runtimeatomicload64(SB)
    505 
// uintptr store aliases the 64-bit atomic store on amd64.
     506 TEXT runtimeatomicstoreuintptr(SB), NOSPLIT, $0-16
     507 	JMP	runtimeatomicstore64(SB)
    508 
     509 // bool casp(void **val, void *old, void *new)
     510 // Atomically:
     511 //	if(*val == old){
     512 //		*val = new;
     513 //		return 1;
     514 //	} else
     515 //		return 0;
     516 TEXT runtimecasp1(SB), NOSPLIT, $0-25
	// Pointer CAS; identical mechanics to cas64 (pointers are 8 bytes).
     517 	MOVQ	ptr+0(FP), BX
     518 	MOVQ	old+8(FP), AX
     519 	MOVQ	new+16(FP), CX
     520 	LOCK
     521 	CMPXCHGQ	CX, 0(BX)
     522 	SETEQ	ret+24(FP)
     523 	RET
    524 
     525 // uint32 xadd(uint32 volatile *val, int32 delta)
     526 // Atomically:
     527 //	*val += delta;
     528 //	return *val;
     529 TEXT runtimexadd(SB), NOSPLIT, $0-20
	// XADD returns the OLD value in AX; add delta (saved in CX) back to
	// produce the NEW value for the caller.
     530 	MOVQ	ptr+0(FP), BX
     531 	MOVL	delta+8(FP), AX
     532 	MOVL	AX, CX
     533 	LOCK
     534 	XADDL	AX, 0(BX)
     535 	ADDL	CX, AX
     536 	MOVL	AX, ret+16(FP)
     537 	RET
    538 
     539 TEXT runtimexadd64(SB), NOSPLIT, $0-24
	// 64-bit fetch-add; returns the new value (old-from-XADD + delta).
     540 	MOVQ	ptr+0(FP), BX
     541 	MOVQ	delta+8(FP), AX
     542 	MOVQ	AX, CX
     543 	LOCK
     544 	XADDQ	AX, 0(BX)
     545 	ADDQ	CX, AX
     546 	MOVQ	AX, ret+16(FP)
     547 	RET
    548 
// uintptr fetch-add aliases xadd64 on amd64.
     549 TEXT runtimexadduintptr(SB), NOSPLIT, $0-24
     550 	JMP	runtimexadd64(SB)
    551 
     552 TEXT runtimexchg(SB), NOSPLIT, $0-20
	// 32-bit atomic exchange: XCHG with a memory operand is implicitly
	// locked; returns the previous value.
     553 	MOVQ	ptr+0(FP), BX
     554 	MOVL	new+8(FP), AX
     555 	XCHGL	AX, 0(BX)
     556 	MOVL	AX, ret+16(FP)
     557 	RET
    558 
     559 TEXT runtimexchg64(SB), NOSPLIT, $0-24
	// 64-bit atomic exchange; returns the previous value.
     560 	MOVQ	ptr+0(FP), BX
     561 	MOVQ	new+8(FP), AX
     562 	XCHGQ	AX, 0(BX)
     563 	MOVQ	AX, ret+16(FP)
     564 	RET
    565 
     566 TEXT runtimexchgp1(SB), NOSPLIT, $0-24
	// Pointer atomic exchange; same mechanics as xchg64 (pointers are 8 bytes).
     567 	MOVQ	ptr+0(FP), BX
     568 	MOVQ	new+8(FP), AX
     569 	XCHGQ	AX, 0(BX)
     570 	MOVQ	AX, ret+16(FP)
     571 	RET
    572 
// uintptr exchange aliases xchg64 on amd64.
     573 TEXT runtimexchguintptr(SB), NOSPLIT, $0-24
     574 	JMP	runtimexchg64(SB)
    575 
     576 TEXT runtimeprocyield(SB),NOSPLIT,$0-0
	// Spin-wait hint: execute PAUSE `cycles` times (for spin locks).
     577 	MOVL	cycles+0(FP), AX
     578 again:
     579 	PAUSE
     580 	SUBL	$1, AX
     581 	JNZ	again
     582 	RET
    583 
     584 TEXT runtimeatomicstorep1(SB), NOSPLIT, $0-16
	// Pointer atomic store via XCHG (implicitly locked; full barrier).
     585 	MOVQ	ptr+0(FP), BX
     586 	MOVQ	val+8(FP), AX
     587 	XCHGQ	AX, 0(BX)
     588 	RET
    589 
     590 TEXT runtimeatomicstore(SB), NOSPLIT, $0-12
	// 32-bit atomic store via XCHG (implicitly locked; full barrier).
     591 	MOVQ	ptr+0(FP), BX
     592 	MOVL	val+8(FP), AX
     593 	XCHGL	AX, 0(BX)
     594 	RET
    595 
     596 TEXT runtimeatomicstore64(SB), NOSPLIT, $0-16
	// 64-bit atomic store via XCHG (implicitly locked; full barrier).
     597 	MOVQ	ptr+0(FP), BX
     598 	MOVQ	val+8(FP), AX
     599 	XCHGQ	AX, 0(BX)
     600 	RET
    601 
     602 // void	runtimeatomicor8(byte volatile*, byte);
     603 TEXT runtimeatomicor8(SB), NOSPLIT, $0-9
	// Atomic *ptr |= val on a single byte.
     604 	MOVQ	ptr+0(FP), AX
     605 	MOVB	val+8(FP), BX
     606 	LOCK
     607 	ORB	BX, (AX)
     608 	RET
    609 
     610 // void	runtimeatomicand8(byte volatile*, byte);
     611 TEXT runtimeatomicand8(SB), NOSPLIT, $0-9
	// Atomic *ptr &= val on a single byte.
     612 	MOVQ	ptr+0(FP), AX
     613 	MOVB	val+8(FP), BX
     614 	LOCK
     615 	ANDB	BX, (AX)
     616 	RET
    617 
     618 TEXT publicationBarrier(SB),NOSPLIT,$0-0
     619 	// Stores are already ordered on x86, so this is just a
     620 	// compile barrier.
     621 	RET
    622 
     623 // void jmpdefer(fn, sp);
     624 // called from deferreturn.
     625 // 1. pop the caller
     626 // 2. sub 5 bytes from the callers return
     627 // 3. jmp to the argument
     628 TEXT runtimejmpdefer(SB), NOSPLIT, $0-16
	// Rewinds the return address by 5 (the length of a CALL rel32
	// instruction) so that when fn returns, deferreturn is CALLed again —
	// looping until all defers are run. DX holds fn's closure for the callee.
     629 	MOVQ	fv+0(FP), DX	// fn
     630 	MOVQ	argp+8(FP), BX	// caller sp
     631 	LEAQ	-8(BX), SP	// caller sp after CALL
     632 	SUBQ	$5, (SP)	// return to CALL again
     633 	MOVQ	0(DX), BX
     634 	JMP	BX	// but first run the deferred function
    635 
     636 // Save state of caller into g->sched. Smashes R8, R9.
     637 TEXT gosave<>(SB),NOSPLIT,$0
	// File-local helper used by asmcgocall before switching stacks:
	// records the caller's PC/SP and BP into g->sched, clears ret/ctxt.
     638 	get_tls(R8)
     639 	MOVQ	g(R8), R8
     640 	MOVQ	0(SP), R9
     641 	MOVQ	R9, (g_sched+gobuf_pc)(R8)
     642 	LEAQ	8(SP), R9
     643 	MOVQ	R9, (g_sched+gobuf_sp)(R8)
     644 	MOVQ	$0, (g_sched+gobuf_ret)(R8)
     645 	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
     646 	MOVQ	BP, (g_sched+gobuf_bp)(R8)
     647 	RET
    648 
     649 // func asmcgocall(fn, arg unsafe.Pointer) int32
     650 // Call fn(arg) on the scheduler stack,
     651 // aligned appropriately for the gcc ABI.
     652 // See cgocall.go for more details.
     653 TEXT asmcgocall(SB),NOSPLIT,$0-20
	// Switches to m->g0's stack (unless already on g0/gsignal), aligns for
	// the C ABI, calls fn(arg), then restores g and SP. The stack depth
	// (hi - sp) is saved instead of SP itself because the goroutine stack
	// may be copied while a cgo callback runs.
     654 	MOVQ	fn+0(FP), AX
     655 	MOVQ	arg+8(FP), BX
     656 
     657 	MOVQ	SP, DX
     658 
     659 	// Figure out if we need to switch to m->g0 stack.
     660 	// We get called to create new OS threads too, and those
     661 	// come in on the m->g0 stack already.
     662 	get_tls(CX)
     663 	MOVQ	g(CX), R8
     664 	MOVQ	g_m(R8), R8
     665 	MOVQ	m_g0(R8), SI
     666 	MOVQ	g(CX), DI
     667 	CMPQ	SI, DI
     668 	JEQ	nosave
     669 	MOVQ	m_gsignal(R8), SI
     670 	CMPQ	SI, DI
     671 	JEQ	nosave
     672 
     673 	MOVQ	m_g0(R8), SI
     674 	CALL	gosave<>(SB)
     675 	MOVQ	SI, g(CX)
     676 	MOVQ	(g_sched+gobuf_sp)(SI), SP
     677 nosave:
     678 
     679 	// Now on a scheduling stack (a pthread-created stack).
     680 	// Make sure we have enough room for 4 stack-backed fast-call
     681 	// registers as per windows amd64 calling convention.
     682 	SUBQ	$64, SP
     683 	ANDQ	$~15, SP	// alignment for gcc ABI
     684 	MOVQ	DI, 48(SP)	// save g
     685 	MOVQ	(g_stack+stack_hi)(DI), DI
     686 	SUBQ	DX, DI
     687 	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
     688 	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
     689 	MOVQ	BX, CX		// CX = first argument in Win64
     690 	CALL	AX
     691 
     692 	// Restore registers, g, stack pointer.
     693 	get_tls(CX)
     694 	MOVQ	48(SP), DI
     695 	MOVQ	(g_stack+stack_hi)(DI), SI
     696 	SUBQ	40(SP), SI
     697 	MOVQ	DI, g(CX)
     698 	MOVQ	SI, SP
     699 
	// C return value comes back in AX (int32).
     700 	MOVL	AX, ret+16(FP)
     701 	RET
    702 
     703 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
     704 // Turn the fn into a Go func (by taking its address) and call
     705 // cgocallback_gofunc.
     706 TEXT runtimecgocallback(SB),NOSPLIT,$24-24
	// &fn acts as a *FuncVal: the first word at that address is fn itself.
     707 	LEAQ	fn+0(FP), AX
     708 	MOVQ	AX, 0(SP)
     709 	MOVQ	frame+8(FP), AX
     710 	MOVQ	AX, 8(SP)
     711 	MOVQ	framesize+16(FP), AX
     712 	MOVQ	AX, 16(SP)
     713 	MOVQ	$runtimecgocallback_gofunc(SB), AX
     714 	CALL	AX
     715 	RET
    716 
     717 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
     718 // See cgocall.go for more details.
     719 TEXT cgocallback_gofunc(SB),NOSPLIT,$8-24
	// C-to-Go callback entry: acquires an m if the thread has none (needm),
	// switches from g0 to m->curg, runs cgocallbackg, then switches back
	// and releases a borrowed m (dropm). R8 carries the "old m" (0 if
	// borrowed) across the whole function.
     720 	NO_LOCAL_POINTERS
     721 
     722 	// If g is nil, Go did not create the current thread.
     723 	// Call needm to obtain one m for temporary use.
     724 	// In this case, we're running on the thread stack, so there's
     725 	// lots of space, but the linker doesn't know. Hide the call from
     726 	// the linker analysis by using an indirect call through AX.
     727 	get_tls(CX)
     728 #ifdef GOOS_windows
     729 	MOVL	$0, BX
     730 	CMPQ	CX, $0
     731 	JEQ	2(PC)
     732 #endif
     733 	MOVQ	g(CX), BX
     734 	CMPQ	BX, $0
     735 	JEQ	needm
     736 	MOVQ	g_m(BX), BX
     737 	MOVQ	BX, R8 // holds oldm until end of function
     738 	JMP	havem
     739 needm:
     740 	MOVQ	$0, 0(SP)
     741 	MOVQ	$runtimeneedm(SB), AX
     742 	CALL	AX
     743 	MOVQ	0(SP), R8
     744 	get_tls(CX)
     745 	MOVQ	g(CX), BX
     746 	MOVQ	g_m(BX), BX
     747 
     748 	// Set m->sched.sp = SP, so that if a panic happens
     749 	// during the function we are about to execute, it will
     750 	// have a valid SP to run on the g0 stack.
     751 	// The next few lines (after the havem label)
     752 	// will save this SP onto the stack and then write
     753 	// the same SP back to m->sched.sp. That seems redundant,
     754 	// but if an unrecovered panic happens, unwindm will
     755 	// restore the g->sched.sp from the stack location
     756 	// and then systemstack will try to use it. If we don't set it here,
     757 	// that restored SP will be uninitialized (typically 0) and
     758 	// will not be usable.
     759 	MOVQ	m_g0(BX), SI
     760 	MOVQ	SP, (g_sched+gobuf_sp)(SI)
     761 
     762 havem:
     763 	// Now there's a valid m, and we're running on its m->g0.
     764 	// Save current m->g0->sched.sp on stack and then set it to SP.
     765 	// Save current sp in m->g0->sched.sp in preparation for
     766 	// switch back to m->curg stack.
     767 	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
     768 	MOVQ	m_g0(BX), SI
     769 	MOVQ	(g_sched+gobuf_sp)(SI), AX
     770 	MOVQ	AX, 0(SP)
     771 	MOVQ	SP, (g_sched+gobuf_sp)(SI)
     772 
     773 	// Switch to m->curg stack and call runtime.cgocallbackg.
     774 	// Because we are taking over the execution of m->curg
     775 	// but *not* resuming what had been running, we need to
     776 	// save that information (m->curg->sched) so we can restore it.
     777 	// We can restore m->curg->sched.sp easily, because calling
     778 	// runtime.cgocallbackg leaves SP unchanged upon return.
     779 	// To save m->curg->sched.pc, we push it onto the stack.
     780 	// This has the added benefit that it looks to the traceback
     781 	// routine like cgocallbackg is going to return to that
     782 	// PC (because the frame we allocate below has the same
     783 	// size as cgocallback_gofunc's frame declared above)
     784 	// so that the traceback will seamlessly trace back into
     785 	// the earlier calls.
     786 	//
     787 	// In the new goroutine, 0(SP) holds the saved R8.
     788 	MOVQ	m_curg(BX), SI
     789 	MOVQ	SI, g(CX)
     790 	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
     791 	MOVQ	(g_sched+gobuf_pc)(SI), BX
     792 	MOVQ	BX, -8(DI)
     793 	// Compute the size of the frame, including return PC and, if
     794 	// GOEXPERIMENT=framepointer, the saved based pointer
     795 	LEAQ	fv+0(FP), AX
     796 	SUBQ	SP, AX
     797 	SUBQ	AX, DI
     798 	MOVQ	DI, SP
     799 
     800 	MOVQ	R8, 0(SP)
     801 	CALL	runtimecgocallbackg(SB)
     802 	MOVQ	0(SP), R8
     803 
     804 	// Compute the size of the frame again.  FP and SP have
     805 	// completely different values here than they did above,
     806 	// but only their difference matters.
     807 	LEAQ	fv+0(FP), AX
     808 	SUBQ	SP, AX
     809 
     810 	// Restore g->sched (== m->curg->sched) from saved values.
     811 	get_tls(CX)
     812 	MOVQ	g(CX), SI
     813 	MOVQ	SP, DI
     814 	ADDQ	AX, DI
     815 	MOVQ	-8(DI), BX
     816 	MOVQ	BX, (g_sched+gobuf_pc)(SI)
     817 	MOVQ	DI, (g_sched+gobuf_sp)(SI)
     818 
     819 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
     820 	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
     821 	// so we do not have to restore it.)
     822 	MOVQ	g(CX), BX
     823 	MOVQ	g_m(BX), BX
     824 	MOVQ	m_g0(BX), SI
     825 	MOVQ	SI, g(CX)
     826 	MOVQ	(g_sched+gobuf_sp)(SI), SP
     827 	MOVQ	0(SP), AX
     828 	MOVQ	AX, (g_sched+gobuf_sp)(SI)
     829 
     830 	// If the m on entry was nil, we called needm above to borrow an m
     831 	// for the duration of the call. Since the call is over, return it with dropm.
	// R8 == 0 means we borrowed the m; JNE 3(PC) skips the dropm call
	// when an m already existed on entry.
     832 	CMPQ	R8, $0
     833 	JNE 3(PC)
     834 	MOVQ	$runtimedropm(SB), AX
     835 	CALL	AX
     836 
     837 	// Done!
     838 	RET
    839 
     840 // void setg(G*); set g. for use by needm.
     841 TEXT runtimesetg(SB), NOSPLIT, $0-8
	// On Windows, also maintain the TLS slot at 0x28(GS): nil g clears it,
	// non-nil g points it at m->tls.
     842 	MOVQ	gg+0(FP), BX
     843 #ifdef GOOS_windows
     844 	CMPQ	BX, $0
     845 	JNE	settls
     846 	MOVQ	$0, 0x28(GS)
     847 	RET
     848 settls:
     849 	MOVQ	g_m(BX), AX
     850 	LEAQ	m_tls(AX), AX
     851 	MOVQ	AX, 0x28(GS)
     852 #endif
     853 	get_tls(CX)
     854 	MOVQ	BX, g(CX)
     855 	RET
    856 
     857 // void setg_gcc(G*); set g called from gcc.
     858 TEXT setg_gcc<>(SB),NOSPLIT,$0
	// C ABI entry: the new g arrives in DI (first SysV argument register).
     859 	get_tls(AX)
     860 	MOVQ	DI, g(AX)
     861 	RET
    862 
    863 // check that SP is in range [g->stack.lo, g->stack.hi)
    864 TEXT runtimestackcheck(SB), NOSPLIT, $0-0
    865 	get_tls(CX)
    866 	MOVQ	g(CX), AX
    867 	CMPQ	(g_stack+stack_hi)(AX), SP
    868 	JHI	2(PC)
    869 	INT	$3
    870 	CMPQ	SP, (g_stack+stack_lo)(AX)
    871 	JHI	2(PC)
    872 	INT	$3
    873 	RET
    874 
     875 TEXT runtimegetcallerpc(SB),NOSPLIT,$8-16
	// Returns the caller's return PC. If that PC was overwritten by a
	// stack barrier, fetch the original PC via nextBarrierPC instead.
     876 	MOVQ	argp+0(FP),AX		// addr of first arg
     877 	MOVQ	-8(AX),AX		// get calling pc
     878 	CMPQ	AX, runtimestackBarrierPC(SB)
     879 	JNE	nobar
     880 	// Get original return PC.
     881 	CALL	runtimenextBarrierPC(SB)
     882 	MOVQ	0(SP), AX
     883 nobar:
     884 	MOVQ	AX, ret+8(FP)
     885 	RET
    886 
     887 TEXT runtimesetcallerpc(SB),NOSPLIT,$8-16
	// Overwrites the caller's return PC with pc. If the slot currently
	// holds the stack-barrier PC, update the barrier's saved PC instead
	// (so the barrier still fires, then returns to the new pc).
     888 	MOVQ	argp+0(FP),AX		// addr of first arg
     889 	MOVQ	pc+8(FP), BX
     890 	MOVQ	-8(AX), CX
     891 	CMPQ	CX, runtimestackBarrierPC(SB)
     892 	JEQ	setbar
     893 	MOVQ	BX, -8(AX)		// set calling pc
     894 	RET
     895 setbar:
     896 	// Set the stack barrier return PC.
     897 	MOVQ	BX, 0(SP)
     898 	CALL	runtimesetNextBarrierPC(SB)
     899 	RET
    900 
     901 TEXT runtimegetcallersp(SB),NOSPLIT,$0-16
	// The caller's SP is simply the address of its first argument slot.
     902 	MOVQ	argp+0(FP), AX
     903 	MOVQ	AX, ret+8(FP)
     904 	RET
    905 
     906 // func cputicks() int64
     907 TEXT runtimecputicks(SB),NOSPLIT,$0-0
	// Serialized RDTSC: LFENCE on Intel (flag set at startup in rt0_go),
	// MFENCE otherwise. The fences are hand-encoded as raw bytes.
	// RDTSC returns the counter split across EDX:EAX; recombine into AX.
     908 	CMPB	runtimelfenceBeforeRdtsc(SB), $1
     909 	JNE	mfence
     910 	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
     911 	JMP	done
     912 mfence:
     913 	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
     914 done:
     915 	RDTSC
     916 	SHLQ	$32, DX
     917 	ADDQ	DX, AX
     918 	MOVQ	AX, ret+0(FP)
     919 	RET
    920 
     921 // memhash_varlen(p unsafe.Pointer, h seed) uintptr
     922 // redirects to memhash(p, h, size) using the size
     923 // stored in the closure.
     924 TEXT runtimememhash_varlen(SB),NOSPLIT,$32-24
	// DX holds the closure pointer on entry; the hash size lives at 8(DX).
     925 	GO_ARGS
     926 	NO_LOCAL_POINTERS
     927 	MOVQ	p+0(FP), AX
     928 	MOVQ	h+8(FP), BX
     929 	MOVQ	8(DX), CX
     930 	MOVQ	AX, 0(SP)
     931 	MOVQ	BX, 8(SP)
     932 	MOVQ	CX, 16(SP)
     933 	CALL	runtimememhash(SB)
     934 	MOVQ	24(SP), AX
     935 	MOVQ	AX, ret+16(FP)
     936 	RET
    937 
     938 // hash function using AES hardware instructions
     939 TEXT runtimeaeshash(SB),NOSPLIT,$0-32
	// Loads the aeshashbody register contract: AX = data, CX = length,
	// DX = address of the result slot.
     940 	MOVQ	p+0(FP), AX	// ptr to data
     941 	MOVQ	s+16(FP), CX	// size
     942 	LEAQ	ret+24(FP), DX
     943 	JMP	runtimeaeshashbody(SB)
    944 
     945 TEXT runtimeaeshashstr(SB),NOSPLIT,$0-24
	// String variant: unpack the string header (data ptr at 0(AX),
	// length at 8(AX)) into the aeshashbody register contract.
     946 	MOVQ	p+0(FP), AX	// ptr to string struct
     947 	MOVQ	8(AX), CX	// length of string
     948 	MOVQ	(AX), AX	// string data
     949 	LEAQ	ret+16(FP), DX
     950 	JMP	runtimeaeshashbody(SB)
    951 
// AX: data
// CX: length
// DX: address to put return value
// Reads the hash seed from h+8(FP) of the *caller's* frame (NOSPLIT, $0-0
// frame), so every caller must keep the seed at that offset — aeshash and
// aeshashstr both do.
// Dispatches on length to a size-specialized routine; each routine mixes
// the data with the per-process AES key schedule (runtimeaeskeysched) via
// AESENC rounds and stores a 64-bit result at (DX).
TEXT runtimeaeshashbody(SB),NOSPLIT,$0-0
	MOVQ	h+8(FP), X6	// seed to low 64 bits of xmm6
	PINSRQ	$1, CX, X6	// size to high 64 bits of xmm6
	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
	MOVO	runtimeaeskeysched(SB), X7	// first 16 bytes of the key schedule
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX	// if AX+16 is within the first 16 bytes of a page,
	JE	endofpage	// a 16-byte load at the original AX may cross a page

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X0
	ADDQ	CX, CX			// CX*2: masks entries are 16 bytes (= 2*CX quadwords in)
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X0		// keep only the low CX(original) bytes of the load

	// scramble 3 times
	AESENC	X6, X0
	AESENC	X7, X0
	AESENC	X7, X0
	MOVQ	X0, (DX)
	RET

endofpage:
	// address ends in 1111xxxx.  Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X0	// AX was advanced by 16, so this ends at the last data byte
	ADDQ	CX, CX			// index 16-byte shifts entries
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X0		// move the CX data bytes down to the low end
	AESENC	X6, X0
	AESENC	X7, X0
	AESENC	X7, X0
	MOVQ	X0, (DX)
	RET

aes0:
	// return input seed
	MOVQ	h+8(FP), AX
	MOVQ	AX, (DX)
	RET

aes16:
	MOVOU	(AX), X0
	AESENC	X6, X0
	AESENC	X7, X0
	AESENC	X7, X0
	MOVQ	X0, (DX)
	RET

aes17to32:
	// load data to be hashed
	MOVOU	(AX), X0
	MOVOU	-16(AX)(CX*1), X1	// last 16 bytes (overlaps X0 when len < 32)

	// scramble 3 times
	AESENC	X6, X0
	AESENC	runtimeaeskeysched+16(SB), X1
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X0
	AESENC	X7, X1

	// combine results
	PXOR	X1, X0
	MOVQ	X0, (DX)
	RET

aes33to64:
	// first 32 bytes and last 32 bytes (possibly overlapping)
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	-32(AX)(CX*1), X2
	MOVOU	-16(AX)(CX*1), X3

	// one round with distinct key-schedule material per lane, then two with X7
	AESENC	X6, X0
	AESENC	runtimeaeskeysched+16(SB), X1
	AESENC	runtimeaeskeysched+32(SB), X2
	AESENC	runtimeaeskeysched+48(SB), X3
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3

	PXOR	X2, X0
	PXOR	X3, X1
	PXOR	X1, X0
	MOVQ	X0, (DX)
	RET

aes65to128:
	// first 64 bytes and last 64 bytes (possibly overlapping)
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	MOVOU	-64(AX)(CX*1), X4
	MOVOU	-48(AX)(CX*1), X5
	MOVOU	-32(AX)(CX*1), X8
	MOVOU	-16(AX)(CX*1), X9

	// one round with distinct key-schedule material per lane, then two with X7
	AESENC	X6, X0
	AESENC	runtimeaeskeysched+16(SB), X1
	AESENC	runtimeaeskeysched+32(SB), X2
	AESENC	runtimeaeskeysched+48(SB), X3
	AESENC	runtimeaeskeysched+64(SB), X4
	AESENC	runtimeaeskeysched+80(SB), X5
	AESENC	runtimeaeskeysched+96(SB), X8
	AESENC	runtimeaeskeysched+112(SB), X9
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X4
	AESENC	X7, X5
	AESENC	X7, X8
	AESENC	X7, X9
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X4
	AESENC	X7, X5
	AESENC	X7, X8
	AESENC	X7, X9

	// fold 8 lanes down to one
	PXOR	X4, X0
	PXOR	X5, X1
	PXOR	X8, X2
	PXOR	X9, X3
	PXOR	X2, X0
	PXOR	X3, X1
	PXOR	X1, X0
	MOVQ	X0, (DX)
	RET

aes129plus:
	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X0
	MOVOU	-112(AX)(CX*1), X1
	MOVOU	-96(AX)(CX*1), X2
	MOVOU	-80(AX)(CX*1), X3
	MOVOU	-64(AX)(CX*1), X4
	MOVOU	-48(AX)(CX*1), X5
	MOVOU	-32(AX)(CX*1), X8
	MOVOU	-16(AX)(CX*1), X9

	// scramble state once
	AESENC	X6, X0
	AESENC	runtimeaeskeysched+16(SB), X1
	AESENC	runtimeaeskeysched+32(SB), X2
	AESENC	runtimeaeskeysched+48(SB), X3
	AESENC	runtimeaeskeysched+64(SB), X4
	AESENC	runtimeaeskeysched+80(SB), X5
	AESENC	runtimeaeskeysched+96(SB), X8
	AESENC	runtimeaeskeysched+112(SB), X9

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX		// CX = ceil(len/128) - 1 >= 1 here

aesloop:
	// scramble state, xor in a block
	MOVOU	(AX), X10
	MOVOU	16(AX), X11
	MOVOU	32(AX), X12
	MOVOU	48(AX), X13
	AESENC	X10, X0
	AESENC	X11, X1
	AESENC	X12, X2
	AESENC	X13, X3
	MOVOU	64(AX), X10
	MOVOU	80(AX), X11
	MOVOU	96(AX), X12
	MOVOU	112(AX), X13
	AESENC	X10, X4
	AESENC	X11, X5
	AESENC	X12, X8
	AESENC	X13, X9

	// scramble state
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X4
	AESENC	X7, X5
	AESENC	X7, X8
	AESENC	X7, X9

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 2 more scrambles to finish
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X4
	AESENC	X7, X5
	AESENC	X7, X8
	AESENC	X7, X9
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X4
	AESENC	X7, X5
	AESENC	X7, X8
	AESENC	X7, X9

	// fold 8 lanes down to one
	PXOR	X4, X0
	PXOR	X5, X1
	PXOR	X8, X2
	PXOR	X9, X3
	PXOR	X2, X0
	PXOR	X3, X1
	PXOR	X1, X0
	MOVQ	X0, (DX)
	RET
   1194 
// func aeshash32(p unsafe.Pointer, h uintptr) uintptr
// AES hash of exactly 4 bytes: insert the data into the seed register
// and run 3 AESENC rounds with the key schedule.
TEXT runtimeaeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data (32-bit lane 2)
	AESENC	runtimeaeskeysched+0(SB), X0
	AESENC	runtimeaeskeysched+16(SB), X0
	AESENC	runtimeaeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET
   1204 
// func aeshash64(p unsafe.Pointer, h uintptr) uintptr
// AES hash of exactly 8 bytes: insert the data into the seed register
// and run 3 AESENC rounds with the key schedule.
TEXT runtimeaeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data (64-bit lane 1)
	AESENC	runtimeaeskeysched+0(SB), X0
	AESENC	runtimeaeskeysched+16(SB), X0
	AESENC	runtimeaeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET
   1214 
// simple mask to get rid of data in the high part of the register.
// Seventeen 16-byte entries (two quadwords each): entry i has its low
// i bytes set to 0xff and the rest zero, so PAND with masks[len]
// keeps exactly len bytes of a 16-byte load.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256
   1249 
// these are arguments to pshufb.  They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
// Sixteen 16-byte entries: entry i selects the top i bytes of the source
// into the low i bytes of the destination; 0xff selector bytes zero the
// remaining lanes (PSHUFB zeroes a lane when the selector's high bit is set).
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
   1286 
// memequal(a, b unsafe.Pointer, size uintptr) bool
// Loads the register arguments expected by memeqbody and tail-jumps to it;
// memeqbody writes the boolean result byte through AX.
TEXT runtimememeq(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX	// address of the result byte
	JMP	runtimememeqbody(SB)
   1293 
// memequal_varlen(a, b unsafe.Pointer) bool
// Closure-called variant: the size is not an argument but is stored by the
// compiler in the closure object, whose pointer is in DX at entry.
// Fast path: identical pointers are trivially equal.
TEXT runtimememequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX	// address of the result byte for memeqbody
	JMP	runtimememeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET
   1306 
// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
// Fast path: identical data pointers mean equal strings (lengths already match).
TEXT runtimeeqstring(SB),NOSPLIT,$0-33
	MOVQ	s1str+0(FP), SI
	MOVQ	s2str+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	s1len+8(FP), BX		// count for memeqbody (== s2len by contract)
	LEAQ	v+32(FP), AX		// address of the result byte
	JMP	runtimememeqbody(SB)
eq:
	MOVB	$1, v+32(FP)
	RET
   1323 
// a in SI
// b in DI
// count in BX
// address of result byte in AX
// Compares BX bytes at SI and DI and stores 1 at (AX) if equal, 0 otherwise.
// Strategy: 64 bytes at a time with SSE, then 8 at a time, then a final
// overlapping 8-byte load; counts < 8 use a page-boundary-safe partial load.
TEXT runtimememeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0		// AND the per-byte equality masks together
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX		// DX = 16-bit mask, all ones iff all 64 bytes equal
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff
	JEQ	hugeloop
	MOVB	$0, (AX)	// found a difference
	RET

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
leftover:
	// overlapping load of the final 8 bytes (BX >= 1 here, and >= 8 total)
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal

	LEAQ	0(BX*8), CX	// CX = bits of data (BX*8)
	NEGQ	CX		// shift count to discard the unwanted high bits (mod 64)

	CMPB	SI, $0xf8	// low address byte >= 0xf8: 8-byte load may cross a page
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx.  Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI	// load ending at the last wanted byte
	SHRQ	CX, SI			// shift wanted bytes down to the bottom
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	SUBQ	SI, DI		// zero iff the loaded quadwords match
	SHLQ	CX, DI		// discard bytes beyond the BX we care about; sets ZF
equal:
	SETEQ	(AX)
	RET
   1416 
// cmpstring(s1, s2 string) int
// Loads the register arguments expected by cmpbody and tail-jumps to it;
// cmpbody stores -1/0/+1 through R9.
TEXT runtimecmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	runtimecmpbody(SB)
   1424 
// bytes.Compare(s1, s2 []byte) int
// Slice variant of cmpstring: unpacks both slice headers (base, len; cap is
// ignored) and tail-jumps to cmpbody, which stores -1/0/+1 through R9.
TEXT bytesCompare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9
	JMP	runtimecmpbody(SB)
   1432 
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
// Lexicographic byte comparison. Compares min(alen, blen) bytes 16 at a
// time with SSE; on a mismatch, finds the first differing byte. If all
// compared bytes match, the shorter input sorts first.
TEXT runtimecmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame		// same pointer: result depends only on lengths
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// 1 if a's byte is greater (unsigned)
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// overlapping load of the final 8 bytes
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
	JEQ	allsame		// R8 == 0: nothing to compare, decide on lengths

	// load bytes of a into high bytes of AX
	CMPB	SI, $0xf8	// 8-byte load might cross a page; see memeqbody
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI		// wanted bytes now occupy the high end, rest zero

	// load bytes of b in to high bytes of BX
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	// all compared bytes equal: compare lengths
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET
   1547 
// bytes.IndexByte(s []byte, c byte) int
// Loads the register arguments expected by indexbytebody and tail-jumps to
// it; the index (or -1) is stored through R8.
TEXT bytesIndexByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  runtimeindexbytebody(SB)
   1554 
// strings.IndexByte(s string, c byte) int
// String variant of bytesIndexByte; shares indexbytebody.
TEXT stringsIndexByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  runtimeindexbytebody(SB)
   1561 
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// Stores the index of the first occurrence of AL (or -1) at (R8).
// Uses REPN SCASB for the unaligned head/tail and lengths < 16, and
// 16-byte SSE compares (PCMPEQB/PMOVMSKB) for the aligned middle.
TEXT runtimeindexbytebody(SB),NOSPLIT,$0
	MOVQ SI, DI		// DI scans; SI keeps the base for index math

	CMPQ BX, $16
	JLT small

	// round up to first 16-byte boundary
	TESTQ $15, SI
	JZ aligned
	MOVQ SI, CX
	ANDQ $~15, CX
	ADDQ $16, CX

	// search the beginning
	SUBQ SI, CX		// CX = bytes before the first 16-byte boundary
	REPN; SCASB
	JZ success

// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
	// round down to last 16-byte boundary
	MOVQ BX, R11
	ADDQ SI, R11
	ANDQ $~15, R11		// R11 = end of the aligned middle region

	// shuffle X0 around so that each byte contains c
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0
	JMP condition

sse:
	// move the next 16-byte chunk of the buffer into X1
	MOVO (DI), X1
	// compare bytes in X0 to X1
	PCMPEQB X0, X1
	// take the top bit of each byte in X1 and put the result in DX
	PMOVMSKB X1, DX
	TESTL DX, DX
	JNZ ssesuccess
	ADDQ $16, DI

condition:
	CMPQ DI, R11
	JLT sse

	// search the end
	MOVQ SI, CX
	ADDQ BX, CX
	SUBQ R11, CX		// CX = trailing bytes after the aligned region
	// if CX == 0, the zero flag will be set and we'll end up
	// returning a false success
	JZ failure
	REPN; SCASB
	JZ success

failure:
	MOVQ $-1, (R8)
	RET

// handle for lengths < 16
small:
	MOVQ BX, CX
	REPN; SCASB
	JZ success
	MOVQ $-1, (R8)
	RET

// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess:
	// get the index of the least significant set bit
	BSFW DX, DX		// offset of the match within the 16-byte chunk
	SUBQ SI, DI		// offset of the chunk within the buffer
	ADDQ DI, DX
	MOVQ DX, (R8)
	RET

success:
	// SCASB leaves DI one past the matching byte
	SUBQ SI, DI
	SUBL $1, DI
	MOVQ DI, (R8)
	RET
   1651 
// bytes.Equal(a, b []byte) bool
// Unequal lengths are trivially unequal; otherwise defer to memeqbody.
TEXT bytesEqual(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX	// address of the result byte for memeqbody
	JMP	runtimememeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET
   1664 
// func fastrand1() uint32
// Cheap per-M pseudo-random number generator. One step of the recurrence:
//   t = 2*x; x = t ^ 0x88888eef; if int32(x) < 0 { x = t }
// State lives in m.fastrand, so no locking is needed.
// NOTE(review): not cryptographically secure; quality/period not assessed here.
TEXT runtimefastrand1(SB), NOSPLIT, $0-4
	get_tls(CX)
	MOVQ	g(CX), AX		// AX = current g
	MOVQ	g_m(AX), AX		// AX = g.m
	MOVL	m_fastrand(AX), DX	// DX = current state
	ADDL	DX, DX			// t = 2*x
	MOVL	DX, BX			// save t
	XORL	$0x88888eef, DX
	CMOVLMI	BX, DX			// if XOR result negative, keep t
	MOVL	DX, m_fastrand(AX)	// store new state
	MOVL	DX, ret+0(FP)
	RET
   1677 
// return0 returns 0 in AX (register, not the Go frame).
TEXT runtimereturn0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET
   1681 
   1682 
// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
// (Return value in AX; only caller-saved registers are touched.)
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX		// AX = current g
	MOVQ	g_m(AX), AX		// AX = g.m
	MOVQ	m_curg(AX), AX		// AX = m.curg
	MOVQ	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi
	RET
   1692 
// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
// The NOPs pad the symbol so that both the entry point and the return
// address after the CALL fall inside goexit's code range for tracebacks.
TEXT runtimegoexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtimegoexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP
   1700 
// func prefetcht0(addr uintptr)
// Hint the CPU to prefetch addr into all cache levels (PREFETCHT0).
TEXT runtimeprefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET
   1705 
// func prefetcht1(addr uintptr)
// Hint the CPU to prefetch addr into L2 and higher (PREFETCHT1).
TEXT runtimeprefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET
   1710 
// func prefetcht2(addr uintptr)
// Hint the CPU to prefetch addr into L3 and higher (PREFETCHT2).
TEXT runtimeprefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET
   1715 
// func prefetchnta(addr uintptr)
// Hint the CPU to prefetch addr non-temporally, minimizing cache pollution (PREFETCHNTA).
TEXT runtimeprefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET
   1720 
// This is called from .init_array and follows the platform, not Go, ABI.
// DI (first C argument) = pointer to the new moduledata; append it to the
// runtime's linked list of module data.
TEXT runtimeaddmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtimelastmoduledatap(SB), AX
	MOVQ	DI, moduledata_next(AX)		// lastmoduledatap.next = DI
	MOVQ	DI, runtimelastmoduledatap(SB)	// lastmoduledatap = DI
	POPQ	R15
	RET
   1729