Home | History | Annotate | Download | only in runtime
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "go_asm.h"
      6 #include "go_tls.h"
      7 #include "funcdata.h"
      8 #include "textflag.h"
      9 
     10 TEXT runtimert0_go(SB),NOSPLIT,$0
        	// Runtime entry point. In order: realign SP and stash argc/argv,
        	// describe the current OS stack in g0, probe CPU features with CPUID,
        	// install and self-test TLS, link g0 and m0, run check/args/osinit/
        	// schedinit, queue a goroutine whose entry comes from mainPC, and
        	// call mstart. The store near the end crashes if mstart ever returns.
     11 	// copy arguments forward on an even stack
     12 	MOVL	argc+0(FP), AX
     13 	MOVL	argv+4(FP), BX
     14 	MOVL	SP, CX
     15 	SUBL	$128, CX		// plenty of scratch
     16 	ANDL	$~15, CX		// round down to a 16-byte boundary
     17 	MOVL	CX, SP
     18 
     19 	MOVL	AX, 16(SP)		// argc, reloaded before the call to args
     20 	MOVL	BX, 24(SP)		// argv, reloaded before the call to args
     21 
     22 	// create istack out of the given (operating system) stack.
     23 	MOVL	$runtimeg0(SB), DI
     24 	LEAL	(-64*1024+104)(SP), BX	// BX = SP - 64KB + 104: used as both guards and stack.lo
     25 	MOVL	BX, g_stackguard0(DI)
     26 	MOVL	BX, g_stackguard1(DI)
     27 	MOVL	BX, (g_stack+stack_lo)(DI)
     28 	MOVL	SP, (g_stack+stack_hi)(DI)
     29 
     30 	// find out information about the processor we're on
     31 	MOVL	$0, AX
     32 	CPUID
        	// CPUID leaf 0 returns the highest supported basic leaf in AX. Keep it
        	// in SI: the eax7 check below compares SI against 7, and nothing else
        	// in this function writes SI (later CPUID calls overwrite AX).
        	MOVL	AX, SI
     33 	CMPL	AX, $0
     34 	JE	nocpuinfo
     35 
     36 	CMPL	BX, $0x756E6547  // "Genu"
     37 	JNE	notintel
     38 	CMPL	DX, $0x49656E69  // "ineI"
     39 	JNE	notintel
     40 	CMPL	CX, $0x6C65746E  // "ntel"
     41 	JNE	notintel
     42 	MOVB	$1, runtimeisIntel(SB)
     43 notintel:
     44 
     45 	// Load EAX=1 cpuid flags
     46 	MOVL	$1, AX
     47 	CPUID
     48 	MOVL	AX, runtimeprocessorVersionInfo(SB)
     49 
     50 	TESTL	$(1<<26), DX // SSE2
     51 	SETNE	runtimesupport_sse2(SB)
     52 
     53 	TESTL	$(1<<9), CX // SSSE3
     54 	SETNE	runtimesupport_ssse3(SB)
     55 
     56 	TESTL	$(1<<19), CX // SSE4.1
     57 	SETNE	runtimesupport_sse41(SB)
     58 
     59 	TESTL	$(1<<20), CX // SSE4.2
     60 	SETNE	runtimesupport_sse42(SB)
     61 
     62 	TESTL	$(1<<23), CX // POPCNT
     63 	SETNE	runtimesupport_popcnt(SB)
     64 
     65 	TESTL	$(1<<25), CX // AES
     66 	SETNE	runtimesupport_aes(SB)
     67 
     68 	TESTL	$(1<<27), CX // OSXSAVE
     69 	SETNE	runtimesupport_osxsave(SB)
     70 
     71 	// If OS support for XMM and YMM is not present
     72 	// support_avx will be set back to false later.
     73 	TESTL	$(1<<28), CX // AVX
     74 	SETNE	runtimesupport_avx(SB)
     75 
     76 eax7:
     77 	// Load EAX=7/ECX=0 cpuid flags
     78 	CMPL	SI, $7			// SI = highest basic leaf saved after the first CPUID
     79 	JLT	osavx			// leaf 7 not available: skip the extended feature flags
     80 	MOVL	$7, AX
     81 	MOVL	$0, CX
     82 	CPUID
     83 
     84 	TESTL	$(1<<3), BX // BMI1
     85 	SETNE	runtimesupport_bmi1(SB)
     86 
     87 	// If OS support for XMM and YMM is not present
     88 	// support_avx2 will be set back to false later.
     89 	TESTL	$(1<<5), BX // AVX2
     90 	SETNE	runtimesupport_avx2(SB)
     91 
     92 	TESTL	$(1<<8), BX // BMI2
     93 	SETNE	runtimesupport_bmi2(SB)
     94 
     95 	TESTL	$(1<<9), BX // ERMS
     96 	SETNE	runtimesupport_erms(SB)
     97 
     98 osavx:
     99 	// nacl does not support XGETBV to test
    100 	// for XMM and YMM OS support.
    101 #ifndef GOOS_nacl
    102 	CMPB	runtimesupport_osxsave(SB), $1
    103 	JNE	noavx
    104 	MOVL	$0, CX
    105 	// For XGETBV, OSXSAVE bit is required and sufficient
    106 	XGETBV
    107 	ANDL	$6, AX
    108 	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
    109 	JE nocpuinfo	// both state bits enabled: leave the AVX flags as detected
    110 #endif
    111 noavx:
    112 	MOVB $0, runtimesupport_avx(SB)
    113 	MOVB $0, runtimesupport_avx2(SB)
    114 
    115 nocpuinfo:
    116 
    117 needtls:
    118 	LEAL	runtimem0+m_tls(SB), DI	// DI = address of m0's tls field, loaded for settls
    119 	CALL	runtimesettls(SB)
    120 
    121 	// store through it, to make sure it works
    122 	get_tls(BX)
    123 	MOVQ	$0x123, g(BX)
    124 	MOVQ	runtimem0+m_tls(SB), AX	// read the marker back through m0 directly
    125 	CMPQ	AX, $0x123
    126 	JEQ 2(PC)			// marker round-tripped: skip the abort
    127 	MOVL	AX, 0	// abort
    128 ok:
    129 	// set the per-goroutine and per-mach "registers"
    130 	get_tls(BX)
    131 	LEAL	runtimeg0(SB), CX
    132 	MOVL	CX, g(BX)
    133 	LEAL	runtimem0(SB), AX
    134 
    135 	// save m->g0 = g0
    136 	MOVL	CX, m_g0(AX)
    137 	// save m0 to g0->m
    138 	MOVL	AX, g_m(CX)
    139 
    140 	CLD				// convention is D is always left cleared
    141 	CALL	runtimecheck(SB)
    142 
    143 	MOVL	16(SP), AX		// copy argc
    144 	MOVL	AX, 0(SP)
    145 	MOVL	24(SP), AX		// copy argv
    146 	MOVL	AX, 4(SP)
    147 	CALL	runtimeargs(SB)
    148 	CALL	runtimeosinit(SB)
    149 	CALL	runtimeschedinit(SB)
    150 
    151 	// create a new goroutine to start program
    152 	MOVL	$runtimemainPC(SB), AX	// entry
    153 	MOVL	$0, 0(SP)		// first outgoing word is zero
    154 	MOVL	AX, 4(SP)		// second outgoing word is the address of mainPC
    155 	CALL	runtimenewproc(SB)
    156 
    157 	// start this M
    158 	CALL	runtimemstart(SB)
    159 
    160 	MOVL	$0xf1, 0xf1  // crash
    161 	RET
    162 
    163 DATA	runtimemainPC+0(SB)/4,$runtimemain(SB)	// 4-byte cell initialized with the address of runtimemain
    164 GLOBL	runtimemainPC(SB),RODATA,$4	// read-only, 4 bytes; rt0_go passes this symbol's address as the new goroutine's entry
    165 
    166 TEXT runtimebreakpoint(SB),NOSPLIT,$0-0
        	// Executes the x86 breakpoint trap and returns if execution resumes.
    167 	INT $3
    168 	RET
    169 
    170 TEXT runtimeasminit(SB),NOSPLIT,$0-0
    171 	// No per-thread init is needed on this target; return immediately.
    172 	RET
    173 
    174 /*
    175  *  go-routine
    176  */
    177 
    178 // void gosave(Gobuf*)
    179 // save state in Gobuf; setjmp
        // Records the caller's SP and PC and the current g in *buf, zeroes buf.ret,
        // and calls badctxt if buf.ctxt is not already zero.
    180 TEXT runtimegosave(SB), NOSPLIT, $0-4
    181 	MOVL	buf+0(FP), AX	// gobuf
    182 	LEAL	buf+0(FP), BX	// caller's SP
    183 	MOVL	BX, gobuf_sp(AX)
    184 	MOVL	0(SP), BX		// caller's PC
    185 	MOVL	BX, gobuf_pc(AX)
    186 	MOVQ	$0, gobuf_ret(AX)	// 8-byte store clears the whole ret slot
    187 	// Assert ctxt is zero. See func save.
    188 	MOVL	gobuf_ctxt(AX), BX
    189 	TESTL	BX, BX
    190 	JZ	2(PC)			// ctxt == 0: skip the failure call
    191 	CALL	runtimebadctxt(SB)
    192 	get_tls(CX)
    193 	MOVL	g(CX), BX
    194 	MOVL	BX, gobuf_g(AX)		// buf.g = current g
    195 	RET
    196 
    197 // void gogo(Gobuf*)
    198 // restore state from Gobuf; longjmp
        // Installs buf.g as the current g, loads SP, the context register (DX) and
        // the return value (AX) from *buf, clears those fields, and jumps to buf.pc.
        // Control does not come back to the caller.
    199 TEXT runtimegogo(SB), NOSPLIT, $8-4
    200 	MOVL	buf+0(FP), BX		// gobuf
    201 	MOVL	gobuf_g(BX), DX
    202 	MOVL	0(DX), CX		// make sure g != nil
    203 	get_tls(CX)
    204 	MOVL	DX, g(CX)		// current g = buf.g
    205 	MOVL	gobuf_sp(BX), SP	// restore SP
    206 	MOVL	gobuf_ctxt(BX), DX	// DX = saved context
    207 	MOVQ	gobuf_ret(BX), AX	// AX = saved return value (8 bytes)
    208 	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
    209 	MOVQ	$0, gobuf_ret(BX)
    210 	MOVL	$0, gobuf_ctxt(BX)
    211 	MOVL	gobuf_pc(BX), BX
    212 	JMP	BX			// resume at the saved PC
    213 
    214 // func mcall(fn func(*g))
    215 // Switch to m->g0's stack, call fn(g).
    216 // Fn must never return. It should gogo(&g->sched)
    217 // to keep running g.
        // Saves the caller's PC/SP and g in g->sched first. If the current g is
        // already m->g0 this jumps to badmcall; if fn returns it jumps to badmcall2.
    218 TEXT runtimemcall(SB), NOSPLIT, $0-4
    219 	MOVL	fn+0(FP), DI
    220 
    221 	get_tls(CX)
    222 	MOVL	g(CX), AX	// save state in g->sched
    223 	MOVL	0(SP), BX	// caller's PC
    224 	MOVL	BX, (g_sched+gobuf_pc)(AX)
    225 	LEAL	fn+0(FP), BX	// caller's SP
    226 	MOVL	BX, (g_sched+gobuf_sp)(AX)
    227 	MOVL	AX, (g_sched+gobuf_g)(AX)
    228 
    229 	// switch to m->g0 & its stack, call fn
    230 	MOVL	g(CX), BX
    231 	MOVL	g_m(BX), BX
    232 	MOVL	m_g0(BX), SI
    233 	CMPL	SI, AX	// if g == m->g0 call badmcall
    234 	JNE	3(PC)		// different g: skip the two-instruction badmcall jump
    235 	MOVL	$runtimebadmcall(SB), AX
    236 	JMP	AX
    237 	MOVL	SI, g(CX)	// g = m->g0
    238 	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
    239 	PUSHQ	AX		// the original g becomes fn's argument
    240 	MOVL	DI, DX		// DX = fn (the closure value)
    241 	MOVL	0(DI), DI	// DI = code pointer stored in the first word of fn
    242 	CALL	DI
    243 	POPQ	AX		// reached only if fn returned
    244 	MOVL	$runtimebadmcall2(SB), AX
    245 	JMP	AX
    246 	RET
    247 
    248 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
    249 // of the G stack. We need to distinguish the routine that
    250 // lives at the bottom of the G stack from the one that lives
    251 // at the top of the system stack because the one at the top of
    252 // the system stack terminates the stack walk (see topofstack()).
        // Its address is what systemstack stores as g->sched.pc; the body only returns.
    253 TEXT runtimesystemstack_switch(SB), NOSPLIT, $0-0
    254 	RET
    255 
    256 // func systemstack(fn func())
        // Runs fn on the g0 stack. When the current g is gsignal or g0, fn is
        // tail-called in place. When it is m->curg, the g's state is saved in
        // g->sched, execution moves to g0's saved SP, fn is called, and then the
        // code switches back to m->curg. Any other g is reported via badsystemstack.
    257 TEXT runtimesystemstack(SB), NOSPLIT, $0-4
    258 	MOVL	fn+0(FP), DI	// DI = fn
    259 	get_tls(CX)
    260 	MOVL	g(CX), AX	// AX = g
    261 	MOVL	g_m(AX), BX	// BX = m
    262 
    263 	MOVL	m_gsignal(BX), DX	// DX = gsignal
    264 	CMPL	AX, DX
    265 	JEQ	noswitch
    266 
    267 	MOVL	m_g0(BX), DX	// DX = g0
    268 	CMPL	AX, DX
    269 	JEQ	noswitch
    270 
    271 	MOVL	m_curg(BX), R8	// R8 = curg
    272 	CMPL	AX, R8
    273 	JEQ	switch
    274 
    275 	// Not g0, not curg. Must be gsignal, but that's not allowed.
    276 	// Hide call from linker nosplit analysis.
    277 	MOVL	$runtimebadsystemstack(SB), AX
    278 	CALL	AX
    279 
    280 switch:
    281 	// save our state in g->sched. Pretend to
    282 	// be systemstack_switch if the G stack is scanned.
    283 	MOVL	$runtimesystemstack_switch(SB), SI
    284 	MOVL	SI, (g_sched+gobuf_pc)(AX)
    285 	MOVL	SP, (g_sched+gobuf_sp)(AX)
    286 	MOVL	AX, (g_sched+gobuf_g)(AX)
    287 
    288 	// switch to g0
    289 	MOVL	DX, g(CX)	// DX still holds g0 from the check above
    290 	MOVL	(g_sched+gobuf_sp)(DX), SP
    291 
    292 	// call target function
    293 	MOVL	DI, DX		// DX = fn (the closure value)
    294 	MOVL	0(DI), DI	// DI = code pointer stored in the first word of fn
    295 	CALL	DI
    296 
    297 	// switch back to g
    298 	get_tls(CX)
    299 	MOVL	g(CX), AX
    300 	MOVL	g_m(AX), BX
    301 	MOVL	m_curg(BX), AX
    302 	MOVL	AX, g(CX)
    303 	MOVL	(g_sched+gobuf_sp)(AX), SP
    304 	MOVL	$0, (g_sched+gobuf_sp)(AX)	// clear the saved SP now that it has been restored
    305 	RET
    306 
    307 noswitch:
    308 	// already on m stack, just call directly
    309 	// Using a tail call here cleans up tracebacks since we won't stop
    310 	// at an intermediate systemstack.
    311 	MOVL	DI, DX
    312 	MOVL	0(DI), DI
    313 	JMP	DI
    314 
    315 /*
    316  * support for morestack
    317  */
    318 
    319 // Called during function prolog when more stack is needed.
    320 //
    321 // The traceback routines see morestack on a g0 as being
    322 // the top of a stack (for example, morestack calling newstack
    323 // calling the scheduler calling newm calling gc), so we must
    324 // record an argument size. For that purpose, it has no arguments.
        //
        // Refuses to run on m->g0 or m->gsignal, records f's caller in m->morebuf
        // and f's own context in g->sched (ctxt taken from DX), then switches to
        // m->g0's stack and calls newstack.
    325 TEXT runtimemorestack(SB),NOSPLIT,$0-0
    326 	get_tls(CX)
    327 	MOVL	g(CX), BX
    328 	MOVL	g_m(BX), BX
    329 
    330 	// Cannot grow scheduler stack (m->g0).
    331 	MOVL	m_g0(BX), SI
    332 	CMPL	g(CX), SI
    333 	JNE	3(PC)		// not g0: skip the failure call and the faulting load
    334 	CALL	runtimebadmorestackg0(SB)
    335 	MOVL	0, AX		// load from address 0: crash if the call above returns
    336 
    337 	// Cannot grow signal stack (m->gsignal).
    338 	MOVL	m_gsignal(BX), SI
    339 	CMPL	g(CX), SI
    340 	JNE	3(PC)		// not gsignal: skip the failure call and the faulting load
    341 	CALL	runtimebadmorestackgsignal(SB)
    342 	MOVL	0, AX		// load from address 0: crash if the call above returns
    343 
    344 	// Called from f.
    345 	// Set m->morebuf to f's caller.
    346 	MOVL	8(SP), AX	// f's caller's PC
    347 	MOVL	AX, (m_morebuf+gobuf_pc)(BX)
    348 	LEAL	16(SP), AX	// f's caller's SP
    349 	MOVL	AX, (m_morebuf+gobuf_sp)(BX)
    350 	get_tls(CX)
    351 	MOVL	g(CX), SI
    352 	MOVL	SI, (m_morebuf+gobuf_g)(BX)
    353 
    354 	// Set g->sched to context in f.
    355 	MOVL	0(SP), AX // f's PC
    356 	MOVL	AX, (g_sched+gobuf_pc)(SI)
    357 	MOVL	SI, (g_sched+gobuf_g)(SI)
    358 	LEAL	8(SP), AX // f's SP
    359 	MOVL	AX, (g_sched+gobuf_sp)(SI)
    360 	MOVL	DX, (g_sched+gobuf_ctxt)(SI)	// DX is the context register as it was on entry
    361 
    362 	// Call newstack on m->g0's stack.
    363 	MOVL	m_g0(BX), BX
    364 	MOVL	BX, g(CX)
    365 	MOVL	(g_sched+gobuf_sp)(BX), SP
    366 	CALL	runtimenewstack(SB)
    367 	MOVL	$0, 0x1003	// crash if newstack returns
    368 	RET
    369 
    370 // morestack trampolines
        // morestack_noctxt zeroes DX, which morestack saves as g->sched.ctxt,
        // and then tail-jumps to morestack.
    371 TEXT runtimemorestack_noctxt(SB),NOSPLIT,$0
    372 	MOVL	$0, DX
    373 	JMP	runtimemorestack(SB)
    374 
    375 // reflectcall: call a function with the given argument list
    376 // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
    377 // we don't have variable-sized frames, so we use a small number
    378 // of constant-sized-frame functions to encode a few bits of size in the pc.
    379 // Caution: ugly multiline assembly macros in your future!
        //
        // DISPATCH(NAME, MAXSIZE) expects the argument size in CX. If CX is at most
        // MAXSIZE (unsigned compare) it jumps to NAME through AX; otherwise the
        // JA skips the MOVL/JMP pair and execution falls through to whatever
        // follows the macro. AX is overwritten when the jump is taken.
    380 
    381 #define DISPATCH(NAME,MAXSIZE)		\
    382 	CMPL	CX, $MAXSIZE;		\
    383 	JA	3(PC);			\
    384 	MOVL	$NAME(SB), AX;		\
    385 	JMP	AX
    386 // Note: can't just "JMP NAME(SB)" - bad inlining results.
    387 
    388 TEXT reflectcall(SB), NOSPLIT, $0-0
        	// Frameless trampoline: forwards to the target with a tail jump.
        	// NOTE(review): as spelled in this listing the jump target has the same
        	// name as this function; presumably the two differ by a package
        	// qualifier that is not visible here — confirm against the original file.
    389 	JMP	reflectcall(SB)
    390 
    390 TEXT reflectcall(SB), NOSPLIT, $0-20
        	// Loads argsize into CX and walks the DISPATCH ladder: the first entry
        	// whose MAXSIZE is >= argsize takes the jump to its fixed-frame call
        	// routine. Sizes above 1073741824 fall off the end into badreflectcall.
    391 	MOVLQZX argsize+12(FP), CX	// zero-extend the 32-bit size into CX
    392 	DISPATCH(runtimecall16, 16)
    393 	DISPATCH(runtimecall32, 32)
    394 	DISPATCH(runtimecall64, 64)
    395 	DISPATCH(runtimecall128, 128)
    396 	DISPATCH(runtimecall256, 256)
    397 	DISPATCH(runtimecall512, 512)
    398 	DISPATCH(runtimecall1024, 1024)
    399 	DISPATCH(runtimecall2048, 2048)
    400 	DISPATCH(runtimecall4096, 4096)
    401 	DISPATCH(runtimecall8192, 8192)
    402 	DISPATCH(runtimecall16384, 16384)
    403 	DISPATCH(runtimecall32768, 32768)
    404 	DISPATCH(runtimecall65536, 65536)
    405 	DISPATCH(runtimecall131072, 131072)
    406 	DISPATCH(runtimecall262144, 262144)
    407 	DISPATCH(runtimecall524288, 524288)
    408 	DISPATCH(runtimecall1048576, 1048576)
    409 	DISPATCH(runtimecall2097152, 2097152)
    410 	DISPATCH(runtimecall4194304, 4194304)
    411 	DISPATCH(runtimecall8388608, 8388608)
    412 	DISPATCH(runtimecall16777216, 16777216)
    413 	DISPATCH(runtimecall33554432, 33554432)
    414 	DISPATCH(runtimecall67108864, 67108864)
    415 	DISPATCH(runtimecall134217728, 134217728)
    416 	DISPATCH(runtimecall268435456, 268435456)
    417 	DISPATCH(runtimecall536870912, 536870912)
    418 	DISPATCH(runtimecall1073741824, 1073741824)
    419 	MOVL	$runtimebadreflectcall(SB), AX	// no frame size was large enough
    420 	JMP	AX
    422 
    423 #define CALLFN(NAME,MAXSIZE)			\
    424 TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
    425 	NO_LOCAL_POINTERS;			\
    426 	/* copy argsize bytes from argptr into the frame starting at SP */		\
    427 	MOVL	argptr+8(FP), SI;		\
    428 	MOVL	argsize+12(FP), CX;		\
    429 	MOVL	SP, DI;				\
    430 	REP;MOVSB;				\
    431 	/* call the code pointer stored at f, leaving f itself in DX */			\
    432 	MOVL	f+4(FP), DX;			\
    433 	MOVL	(DX), AX;			\
    434 	CALL	AX;				\
    435 	/* copy return values back: callRet gets DX=argtype, DI=argptr+retoffset, SI=SP+retoffset, CX=argsize-retoffset */		\
    436 	MOVL	argtype+0(FP), DX;		\
    437 	MOVL	argptr+8(FP), DI;		\
    438 	MOVL	argsize+12(FP), CX;		\
    439 	MOVL	retoffset+16(FP), BX;		\
    440 	MOVL	SP, SI;				\
    441 	ADDL	BX, DI;				\
    442 	ADDL	BX, SI;				\
    443 	SUBL	BX, CX;				\
    444 	CALL	callRet<>(SB);			\
    445 	RET
    446 
    447 // callRet copies return values back at the end of call*. This is a
    448 // separate function so it can allocate stack space for the arguments
    449 // to reflectcallmove. It does not follow the Go ABI; it expects its
    450 // arguments in registers.
        // Register inputs, in the order they are stored to the outgoing frame:
        // DX, DI, SI, CX (set up by CALLFN as argtype, destination, source, size).
    451 TEXT callRet<>(SB), NOSPLIT, $16-0
    452 	MOVL	DX, 0(SP)
    453 	MOVL	DI, 4(SP)
    454 	MOVL	SI, 8(SP)
    455 	MOVL	CX, 12(SP)
    456 	CALL	runtimereflectcallmove(SB)
    457 	RET
    458 
    459 CALLFN(call16, 16)	// one fixed-frame call routine per power-of-two size, 16 bytes up to 1073741824
        // NOTE(review): reflectcall's DISPATCH ladder names its targets runtimecallN
        // while these are spelled callN; presumably the same symbols once package
        // qualification is applied — confirm against the original file.
    460 CALLFN(call32, 32)
    461 CALLFN(call64, 64)
    462 CALLFN(call128, 128)
    463 CALLFN(call256, 256)
    464 CALLFN(call512, 512)
    465 CALLFN(call1024, 1024)
    466 CALLFN(call2048, 2048)
    467 CALLFN(call4096, 4096)
    468 CALLFN(call8192, 8192)
    469 CALLFN(call16384, 16384)
    470 CALLFN(call32768, 32768)
    471 CALLFN(call65536, 65536)
    472 CALLFN(call131072, 131072)
    473 CALLFN(call262144, 262144)
    474 CALLFN(call524288, 524288)
    475 CALLFN(call1048576, 1048576)
    476 CALLFN(call2097152, 2097152)
    477 CALLFN(call4194304, 4194304)
    478 CALLFN(call8388608, 8388608)
    479 CALLFN(call16777216, 16777216)
    480 CALLFN(call33554432, 33554432)
    481 CALLFN(call67108864, 67108864)
    482 CALLFN(call134217728, 134217728)
    483 CALLFN(call268435456, 268435456)
    484 CALLFN(call536870912, 536870912)
    485 CALLFN(call1073741824, 1073741824)
    486 
    487 TEXT runtimeprocyield(SB),NOSPLIT,$0-0
        	// Spin-wait: executes PAUSE once per iteration, counting the cycles
        	// argument down to zero. The counter is decremented before it is
        	// tested, so an argument of 0 wraps around rather than returning at once.
    488 	MOVL	cycles+0(FP), AX
    489 again:
    490 	PAUSE
    491 	SUBL	$1, AX
    492 	JNZ	again
    493 	RET
    494 
    495 TEXT publicationBarrier(SB),NOSPLIT,$0-0
    496 	// Stores are already ordered on x86, so this is just a
    497 	// compile barrier: no instruction is emitted besides the return.
    498 	RET
    499 
    500 // void jmpdefer(fn, sp);
    501 // called from deferreturn.
    502 // 1. pop the caller
    503 // 2. sub 5 bytes from the callers return
    504 // 3. jmp to the argument
        // Does not return to its own caller: SP is rewound to just below argp and
        // the return address found there is moved back by 5 bytes, so that when
        // the deferred function returns, the CALL that led here executes again.
    505 TEXT runtimejmpdefer(SB), NOSPLIT, $0-8
    506 	MOVL	fv+0(FP), DX	// DX = fv (the closure value)
    507 	MOVL	argp+4(FP), BX
    508 	LEAL	-8(BX), SP	// caller sp after CALL
    509 	SUBL	$5, (SP)	// return to CALL again
    510 	MOVL	0(DX), BX	// BX = code pointer stored in the first word of fv
    511 	JMP	BX	// but first run the deferred function
    512 
    513 // func asmcgocall(fn, arg unsafe.Pointer) int32
    514 // Not implemented.
    515 TEXT runtimeasmcgocall(SB),NOSPLIT,$0-12
    516 	MOVL	0, AX	// load from address 0: faults if this stub is ever called
    517 	RET
    518 
    519 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
    520 // Not implemented.
    521 TEXT runtimecgocallback(SB),NOSPLIT,$0-16
    522 	MOVL	0, AX	// load from address 0: faults if this stub is ever called
    523 	RET
    524 
    525 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
    526 // Not implemented.
    527 TEXT cgocallback_gofunc(SB),NOSPLIT,$0-16
    528 	MOVL	0, AX	// load from address 0: faults if this stub is ever called
    529 	RET
    530 
    531 // void setg(G*); set g. for use by needm.
    532 // Not implemented.
    533 TEXT runtimesetg(SB), NOSPLIT, $0-4
    534 	MOVL	0, AX	// load from address 0: faults if this stub is ever called
    535 	RET
    536 
    537 // check that SP is in range [g->stack.lo, g->stack.hi)
        // A failed check executes a load from address 0, i.e. it crashes rather
        // than reporting an error. As coded, the lower check passes only when SP is
        // strictly above stack.lo (unsigned compare).
    538 TEXT runtimestackcheck(SB), NOSPLIT, $0-0
    539 	get_tls(CX)
    540 	MOVL	g(CX), AX
    541 	CMPL	(g_stack+stack_hi)(AX), SP
    542 	JHI	2(PC)		// stack.hi > SP: upper bound holds, skip the fault
    543 	MOVL	0, AX
    544 	CMPL	SP, (g_stack+stack_lo)(AX)
    545 	JHI	2(PC)		// SP > stack.lo: lower bound holds, skip the fault
    546 	MOVL	0, AX
    547 	RET
    548 
    549 // int64 runtimecputicks(void)
        // Returns the 64-bit time-stamp counter read with RDTSC.
    550 TEXT runtimecputicks(SB),NOSPLIT,$0-0
    551 	RDTSC			// counter arrives split: high half in DX, low half in AX
    552 	SHLQ	$32, DX
    553 	ADDQ	DX, AX		// AX = (DX << 32) + AX
    554 	MOVQ	AX, ret+0(FP)
    555 	RET
    556 
    557 // Hash functions using AES hardware instructions.
    558 // For now, our one amd64p32 system (NaCl) does not
    559 // support using AES instructions, so we have not bothered to
    560 // write the implementations. We can copy and adjust the ones
    561 // in asm_amd64.s when the time comes.
    562 
    563 TEXT runtimeaeshash(SB),NOSPLIT,$0-20
        	// Placeholder: no hashing is performed. The result slot simply receives
        	// whatever value AX holds on entry.
    564 	MOVL	AX, ret+16(FP)
    565 	RET
    566 
    567 TEXT runtimeaeshashstr(SB),NOSPLIT,$0-12
        	// Placeholder: no hashing is performed. The result slot simply receives
        	// whatever value AX holds on entry.
    568 	MOVL	AX, ret+8(FP)
    569 	RET
    570 
    571 TEXT runtimeaeshash32(SB),NOSPLIT,$0-12
        	// Placeholder: no hashing is performed. The result slot simply receives
        	// whatever value AX holds on entry.
    572 	MOVL	AX, ret+8(FP)
    573 	RET
    574 
    575 TEXT runtimeaeshash64(SB),NOSPLIT,$0-12
        	// Placeholder: no hashing is performed. The result slot simply receives
        	// whatever value AX holds on entry.
    576 	MOVL	AX, ret+8(FP)
    577 	RET
    578 
    579 // memequal(p, q unsafe.Pointer, size uintptr) bool
        // Returns true immediately when both pointers are identical; otherwise
        // hands SI/DI/BX to memeqbody and returns the low byte of its AX result.
    580 TEXT runtimememequal(SB),NOSPLIT,$0-17
    581 	MOVL	a+0(FP), SI
    582 	MOVL	b+4(FP), DI
    583 	CMPL	SI, DI
    584 	JEQ	eq		// same address: equal without reading memory
    585 	MOVL	size+8(FP), BX
    586 	CALL	runtimememeqbody(SB)
    587 	MOVB	AX, ret+16(FP)
    588 	RET
    589 eq:
    590 	MOVB    $1, ret+16(FP)
    591 	RET
    592 
    593 // memequal_varlen(a, b unsafe.Pointer) bool
        // Like memequal, but the byte count is read from offset 4 of the closure
        // that DX points to instead of being passed as an argument.
    594 TEXT runtimememequal_varlen(SB),NOSPLIT,$0-9
    595 	MOVL    a+0(FP), SI
    596 	MOVL    b+4(FP), DI
    597 	CMPL    SI, DI
    598 	JEQ     eq		// same address: equal without reading memory
    599 	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
    600 	CALL    runtimememeqbody(SB)
    601 	MOVB    AX, ret+8(FP)
    602 	RET
    603 eq:
    604 	MOVB    $1, ret+8(FP)
    605 	RET
    606 
    607 // a in SI
    608 // b in DI
    609 // count in BX
        // Result in AX: 1 if the two ranges hold the same bytes, 0 otherwise.
        // SI, DI, BX, CX, DX and X0-X7 are overwritten. This is a register-argument
        // helper with no stack arguments.
    610 TEXT runtimememeqbody(SB),NOSPLIT,$0-0
    611 	XORQ	AX, AX		// default result 0, returned as-is by the early mismatch exits
    612 
    613 	CMPQ	BX, $8
    614 	JB	small
    615 
    616 	// 64 bytes at a time using xmm registers
    617 hugeloop:
    618 	CMPQ	BX, $64
    619 	JB	bigloop
    620 	MOVOU	(SI), X0
    621 	MOVOU	(DI), X1
    622 	MOVOU	16(SI), X2
    623 	MOVOU	16(DI), X3
    624 	MOVOU	32(SI), X4
    625 	MOVOU	32(DI), X5
    626 	MOVOU	48(SI), X6
    627 	MOVOU	48(DI), X7
    628 	PCMPEQB	X1, X0
    629 	PCMPEQB	X3, X2
    630 	PCMPEQB	X5, X4
    631 	PCMPEQB	X7, X6
    632 	PAND	X2, X0
    633 	PAND	X6, X4
    634 	PAND	X4, X0		// X0 byte is 0xff only where all four 16-byte pairs matched
    635 	PMOVMSKB X0, DX
    636 	ADDQ	$64, SI
    637 	ADDQ	$64, DI
    638 	SUBQ	$64, BX
    639 	CMPL	DX, $0xffff	// all 16 mask bits set means the whole 64 bytes matched
    640 	JEQ	hugeloop
    641 	RET			// mismatch: AX is still 0
    642 
    643 	// 8 bytes at a time using 64-bit register
    644 bigloop:
    645 	CMPQ	BX, $8
    646 	JBE	leftover
    647 	MOVQ	(SI), CX
    648 	MOVQ	(DI), DX
    649 	ADDQ	$8, SI
    650 	ADDQ	$8, DI
    651 	SUBQ	$8, BX
    652 	CMPQ	CX, DX
    653 	JEQ	bigloop
    654 	RET			// mismatch: AX is still 0
    655 
    656 	// remaining 0-8 bytes
    657 leftover:
    658 	ADDQ	BX, SI
    659 	ADDQ	BX, DI
    660 	MOVQ	-8(SI), CX	// the last 8 bytes of each range; may overlap bytes already compared
    661 	MOVQ	-8(DI), DX
    662 	CMPQ	CX, DX
    663 	SETEQ	AX
    664 	RET
    665 
    666 small:
    667 	CMPQ	BX, $0
    668 	JEQ	equal		// zero length: ZF is set here, so SETEQ at equal yields 1
    669 
    670 	LEAQ	0(BX*8), CX
    671 	NEGQ	CX		// CX = -(8*count); used as a shift count below
    672 
    673 	CMPB	SI, $0xf8	// examine only the low byte of the address
    674 	JA	si_high
    675 
    676 	// load at SI won't cross a page boundary.
    677 	MOVQ	(SI), SI
    678 	JMP	si_finish
    679 si_high:
    680 	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
    681 	MOVQ	BX, DX
    682 	ADDQ	SI, DX
    683 	MOVQ	-8(DX), SI	// 8 bytes ending exactly at a+count
    684 	SHRQ	CX, SI		// shift the wanted bytes down to the low end
    685 si_finish:
    686 
    687 	// same for DI.
    688 	CMPB	DI, $0xf8
    689 	JA	di_high
    690 	MOVQ	(DI), DI
    691 	JMP	di_finish
    692 di_high:
    693 	MOVQ	BX, DX
    694 	ADDQ	DI, DX
    695 	MOVQ	-8(DX), DI
    696 	SHRQ	CX, DI
    697 di_finish:
    698 
    699 	SUBQ	SI, DI
    700 	SHLQ	CX, DI		// shift out the bytes beyond count; ZF is set iff the rest matched
    701 equal:
    702 	SETEQ	AX
    703 	RET
    704 
    704 TEXT runtimecmpstring(SB),NOSPLIT,$0-20
        	// Loads both strings' base pointers and lengths into the registers
        	// cmpbody expects (SI/BX for s1, DI/DX for s2) and returns its AX result.
    705 	MOVL	s1_base+0(FP), SI
    706 	MOVL	s1_len+4(FP), BX
    707 	MOVL	s2_base+8(FP), DI
    708 	MOVL	s2_len+12(FP), DX
    709 	CALL	runtimecmpbody(SB)
    710 	MOVL	AX, ret+16(FP)
    711 	RET
    713 
    713 TEXT bytesCompare(SB),NOSPLIT,$0-28
        	// Loads the pointer and length words of s1 (offsets 0 and 4) and s2
        	// (offsets 12 and 16) into the registers cmpbody expects and returns
        	// its AX result. The words at offsets 8 and 20 are not read.
    714 	MOVL	s1+0(FP), SI
    715 	MOVL	s1+4(FP), BX
    716 	MOVL	s2+12(FP), DI
    717 	MOVL	s2+16(FP), DX
    718 	CALL	runtimecmpbody(SB)
    719 	MOVL	AX, res+24(FP)
    720 	RET
    722 
    722 // input:
    723 //   SI = a
    724 //   DI = b
    725 //   BX = alen
    726 //   DX = blen
    727 // output:
    728 //   AX = 1/0/-1
        // Compares the first min(alen, blen) bytes; the first differing byte decides
        // the sign (unsigned byte order). If that common prefix is identical, or
        // a and b are the same address, the result comes from comparing the lengths.
        // SI, DI, BX, CX, R8, X0 and X1 are overwritten.
    729 TEXT runtimecmpbody(SB),NOSPLIT,$0-0
    730 	CMPQ	SI, DI
    731 	JEQ	allsame
    732 	CMPQ	BX, DX
    733 	MOVQ	DX, R8
    734 	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
    735 	CMPQ	R8, $8
    736 	JB	small
    737 
    738 loop:
    739 	CMPQ	R8, $16
    740 	JBE	_0through16
    741 	MOVOU	(SI), X0
    742 	MOVOU	(DI), X1
    743 	PCMPEQB X0, X1
    744 	PMOVMSKB X1, AX
    745 	XORQ	$0xffff, AX	// convert EQ to NE
    746 	JNE	diff16	// branch if at least one byte is not equal
    747 	ADDQ	$16, SI
    748 	ADDQ	$16, DI
    749 	SUBQ	$16, R8
    750 	JMP	loop
    751 
    752 	// AX = bit mask of differences
    753 diff16:
    754 	BSFQ	AX, BX	// index of first byte that differs
    755 	XORQ	AX, AX
    756 	ADDQ	BX, SI
    757 	MOVB	(SI), CX
    758 	ADDQ	BX, DI
    759 	CMPB	CX, (DI)
    760 	SETHI	AX	// AX = 1 if a's byte is greater (unsigned), else 0
    761 	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
    762 	RET
    763 
    764 	// 0 through 16 bytes left, alen>=8, blen>=8
    765 _0through16:
    766 	CMPQ	R8, $8
    767 	JBE	_0through8
    768 	MOVQ	(SI), AX
    769 	MOVQ	(DI), CX
    770 	CMPQ	AX, CX
    771 	JNE	diff8
    772 _0through8:
    773 	ADDQ	R8, SI
    774 	ADDQ	R8, DI
    775 	MOVQ	-8(SI), AX	// last 8 bytes of the common prefix; may overlap bytes already compared
    776 	MOVQ	-8(DI), CX
    777 	CMPQ	AX, CX
    778 	JEQ	allsame
    779 
    780 	// AX and CX contain parts of a and b that differ.
    781 diff8:
    782 	BSWAPQ	AX	// reverse order of bytes
    783 	BSWAPQ	CX
    784 	XORQ	AX, CX
    785 	BSRQ	CX, CX	// index of highest bit difference
    786 	SHRQ	CX, AX	// move a's bit to bottom
    787 	ANDQ	$1, AX	// mask bit
    788 	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
    789 	RET
    790 
    791 	// 0-7 bytes in common
    792 small:
    793 	LEAQ	(R8*8), CX	// bytes left -> bits left
    794 	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
    795 	JEQ	allsame		// nothing in common to compare: decide by length
    796 
    797 	// load bytes of a into high bytes of SI
    798 	CMPB	SI, $0xf8	// examine only the low byte of the address
    799 	JA	si_high
    800 	MOVQ	(SI), SI
    801 	JMP	si_finish
    802 si_high:
    803 	ADDQ	R8, SI
    804 	MOVQ	-8(SI), SI	// 8 bytes ending exactly at a+R8
    805 	SHRQ	CX, SI
    806 si_finish:
    807 	SHLQ	CX, SI
    808 
    809 	// load bytes of b into high bytes of DI
    810 	CMPB	DI, $0xf8
    811 	JA	di_high
    812 	MOVQ	(DI), DI
    813 	JMP	di_finish
    814 di_high:
    815 	ADDQ	R8, DI
    816 	MOVQ	-8(DI), DI
    817 	SHRQ	CX, DI
    818 di_finish:
    819 	SHLQ	CX, DI
    820 
    821 	BSWAPQ	SI	// reverse order of bytes
    822 	BSWAPQ	DI
    823 	XORQ	SI, DI	// find bit differences
    824 	JEQ	allsame
    825 	BSRQ	DI, CX	// index of highest bit difference
    826 	SHRQ	CX, SI	// move a's bit to bottom
    827 	ANDQ	$1, SI	// mask bit
    828 	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
    829 	RET
    830 
    831 allsame:
    832 	XORQ	AX, AX
    833 	XORQ	CX, CX
    834 	CMPQ	BX, DX
    835 	SETGT	AX	// 1 if alen > blen
    836 	SETEQ	CX	// 1 if alen == blen
    837 	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
    838 	RET
    840 
    840 TEXT bytesIndexByte(SB),NOSPLIT,$0-20
        	// Loads the data pointer, length and sought byte (argument offset 12)
        	// into SI, BX and AL for indexbytebody and returns its AX result.
    841 	MOVL s+0(FP), SI
    842 	MOVL s_len+4(FP), BX
    843 	MOVB c+12(FP), AL
    844 	CALL runtimeindexbytebody(SB)
    845 	MOVL AX, ret+16(FP)
    846 	RET
    848 
    848 TEXT stringsIndexByte(SB),NOSPLIT,$0-20
        	// Loads the data pointer, length and sought byte (argument offset 8)
        	// into SI, BX and AL for indexbytebody and returns its AX result.
    849 	MOVL s+0(FP), SI
    850 	MOVL s_len+4(FP), BX
    851 	MOVB c+8(FP), AL
    852 	CALL runtimeindexbytebody(SB)
    853 	MOVL AX, ret+16(FP)
    854 	RET
    856 
    856 // input:
    857 //   SI: data
    858 //   BX: data len
    859 //   AL: byte sought
    860 // output:
    861 //   AX
        //     index of the first matching byte, or -1 if there is none
        // Buffers shorter than 16 bytes are scanned with REPN SCASB alone. Longer
        // ones are scanned bytewise up to the first 16-byte boundary, then 16 bytes
        // at a time with SSE, then bytewise again for the unaligned tail.
        // DI, CX, DX, R11, X0 and X1 are overwritten.
    862 TEXT runtimeindexbytebody(SB),NOSPLIT,$0
    863 	MOVL SI, DI	// DI is the scan cursor; SI keeps the start for the index math
    864 
    865 	CMPL BX, $16
    866 	JLT small
    867 
    868 	// round up to first 16-byte boundary
    869 	TESTL $15, SI
    870 	JZ aligned
    871 	MOVL SI, CX
    872 	ANDL $~15, CX
    873 	ADDL $16, CX
    874 
    875 	// search the beginning
    876 	SUBL SI, CX	// CX = number of bytes before the boundary
    877 	REPN; SCASB
    878 	JZ success
    879 
    880 // DI is 16-byte aligned; get ready to search using SSE instructions
    881 aligned:
    882 	// round down to last 16-byte boundary
    883 	MOVL BX, R11
    884 	ADDL SI, R11
    885 	ANDL $~15, R11	// R11 = end of data rounded down to 16 bytes
    886 
    887 	// shuffle X0 around so that each byte contains c
    888 	MOVD AX, X0
    889 	PUNPCKLBW X0, X0
    890 	PUNPCKLBW X0, X0
    891 	PSHUFL $0, X0, X0
    892 	JMP condition
    893 
    894 sse:
    895 	// move the next 16-byte chunk of the buffer into X1
    896 	MOVO (DI), X1
    897 	// compare bytes in X0 to X1
    898 	PCMPEQB X0, X1
    899 	// take the top bit of each byte in X1 and put the result in DX
    900 	PMOVMSKB X1, DX
    901 	TESTL DX, DX
    902 	JNZ ssesuccess
    903 	ADDL $16, DI
    904 
    905 condition:
    906 	// NOTE(review): JLT is a signed comparison applied to 32-bit addresses; an
    907 	// address at or above 0x80000000 would compare as negative — confirm such
    908 	// addresses cannot occur on this target.
        	CMPL DI, R11
        	JLT sse
    909 
    910 	// search the end
    911 	MOVL SI, CX
    912 	ADDL BX, CX
    913 	SUBL R11, CX	// CX = bytes remaining after the last aligned chunk
    914 	// if CX == 0, the zero flag will be set and we'll end up
    915 	// returning a false success
    916 	JZ failure
    917 	REPN; SCASB
    918 	JZ success
    919 
    920 failure:
    921 	MOVL $-1, AX
    922 	RET
    923 
    924 // handle for lengths < 16
    925 small:
    926 	MOVL BX, CX
    927 	REPN; SCASB	// with CX == 0 nothing is scanned and ZF is still clear from CMPL BX, $16
    928 	JZ success
    929 	MOVL $-1, AX
    930 	RET
    931 
    932 // we've found the chunk containing the byte
    933 // now just figure out which specific byte it is
    934 ssesuccess:
    935 	// get the index of the least significant set bit
    936 	BSFW DX, DX
    937 	SUBL SI, DI	// DI = offset of the matching chunk from the start
    938 	ADDL DI, DX
    939 	MOVL DX, AX
    940 	RET
    941 
    942 success:
    943 	SUBL SI, DI
    944 	SUBL $1, DI	// SCASB left DI one past the matching byte
    945 	MOVL DI, AX
    946 	RET
    947 
    947 TEXT bytesEqual(SB),NOSPLIT,$0-25
        	// Returns false when the two lengths differ; otherwise returns the low
        	// byte of memeqbody's AX result for the two data pointers and that length.
    948 	MOVL	a_len+4(FP), BX
    949 	MOVL	b_len+16(FP), CX
    950 	XORL	AX, AX		// result defaults to 0 for the length-mismatch exit
    951 	CMPL	BX, CX
    952 	JNE	eqret
    953 	MOVL	a+0(FP), SI
    954 	MOVL	b+12(FP), DI
    955 	CALL	runtimememeqbody(SB)
    956 eqret:
    957 	MOVB	AX, ret+24(FP)
    958 	RET
    960 
    960 TEXT runtimereturn0(SB), NOSPLIT, $0
        	// Sets AX to zero and returns; nothing else is touched.
    961 	MOVL	$0, AX
    962 	RET
    964 
    964 // The top-most function running on a goroutine
    965 // returns to goexit+PCQuantum.
        // The leading one-byte NOP puts the CALL at offset 1 from the symbol start.
        // There is no RET because goexit1 does not return.
    966 TEXT runtimegoexit(SB),NOSPLIT,$0-0
    967 	BYTE	$0x90	// NOP
    968 	CALL	runtimegoexit1(SB)	// does not return
    969 	// traceback from goexit1 must hit code range of goexit
    970 	BYTE	$0x90	// NOP
    972 
    972 TEXT checkASM(SB),NOSPLIT,$0-1
        	// Unconditionally stores 1 (true) in the one-byte result.
    973 	MOVB	$1, ret+0(FP)
    974 	RET
    976