// Go runtime startup and core assembly for amd64 (runtime/asm_amd64.s).
// NOTE(review): this extracted listing dropped the middle-dot separators in
// symbol names (e.g. "runtimert0_go" is runtime·rt0_go, "reflectcall" is
// reflect·call / ·reflectcall) — restore them before assembling.
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "go_asm.h"
      6 #include "go_tls.h"
      7 #include "funcdata.h"
      8 #include "textflag.h"
      9 
     10 // _rt0_amd64 is common startup code for most amd64 systems when using
     11 // internal linking. This is the entry point for the program from the
     12 // kernel for an ordinary -buildmode=exe program. The stack holds the
     13 // number of arguments and the C-style argv.
     14 TEXT _rt0_amd64(SB),NOSPLIT,$-8
        // Move argc/argv into the System V C ABI argument registers
        // expected by rt0_go, then tail-jump to the common bootstrap.
     15 	MOVQ	0(SP), DI	// argc
     16 	LEAQ	8(SP), SI	// argv
     17 	JMP	runtimert0_go(SB)
     18 
     19 // main is common startup code for most amd64 systems when using
     20 // external linking. The C startup code will call the symbol "main"
     21 // passing argc and argv in the usual C ABI registers DI and SI.
     22 TEXT main(SB),NOSPLIT,$-8
        // argc/argv are already in DI/SI per the C ABI; just enter rt0_go.
     23 	JMP	runtimert0_go(SB)
     24 
     25 // _rt0_amd64_lib is common startup code for most amd64 systems when
     26 // using -buildmode=c-archive or -buildmode=c-shared. The linker will
     27 // arrange to invoke this function as a global constructor (for
     28 // c-archive) or when the shared library is loaded (for c-shared).
     29 // We expect argc and argv to be passed in the usual C ABI registers
     30 // DI and SI.
        //
        // Runtime initialization itself happens asynchronously on a new
        // thread (started via _cgo_sys_thread_create when cgo is present,
        // or runtime·newosproc0 otherwise), so this constructor returns
        // quickly to the host program.
     31 TEXT _rt0_amd64_lib(SB),NOSPLIT,$0x50
     32 	// Align stack per ELF ABI requirements.
     33 	MOVQ	SP, AX
     34 	ANDQ	$~15, SP
     35 	// Save C ABI callee-saved registers, as caller may need them.
     36 	MOVQ	BX, 0x10(SP)
     37 	MOVQ	BP, 0x18(SP)
     38 	MOVQ	R12, 0x20(SP)
     39 	MOVQ	R13, 0x28(SP)
     40 	MOVQ	R14, 0x30(SP)
     41 	MOVQ	R15, 0x38(SP)
     42 	MOVQ	AX, 0x40(SP)	// original (unaligned) SP, restored at "restore"
     43 
        // Stash argc/argv for _rt0_amd64_lib_go, which runs later on
        // another thread and cannot see our registers.
     44 	MOVQ	DI, _rt0_amd64_lib_argc<>(SB)
     45 	MOVQ	SI, _rt0_amd64_lib_argv<>(SB)
     46 
     47 	// Synchronous initialization.
     48 	CALL	runtimelibpreinit(SB)
     49 
     50 	// Create a new thread to finish Go runtime initialization.
     51 	MOVQ	_cgo_sys_thread_create(SB), AX
     52 	TESTQ	AX, AX
     53 	JZ	nocgo
     54 	MOVQ	$_rt0_amd64_lib_go(SB), DI
     55 	MOVQ	$0, SI
     56 	CALL	AX
     57 	JMP	restore
     58 
     59 nocgo:
     60 	MOVQ	$0x800000, 0(SP)		// stacksize (8MB)
     61 	MOVQ	$_rt0_amd64_lib_go(SB), AX
     62 	MOVQ	AX, 8(SP)			// fn
     63 	CALL	runtimenewosproc0(SB)
     64 
     65 restore:
        // Restore the C callee-saved registers and the caller's SP.
     66 	MOVQ	0x10(SP), BX
     67 	MOVQ	0x18(SP), BP
     68 	MOVQ	0x20(SP), R12
     69 	MOVQ	0x28(SP), R13
     70 	MOVQ	0x30(SP), R14
     71 	MOVQ	0x38(SP), R15
     72 	MOVQ	0x40(SP), SP
     73 	RET
     74 
     75 // _rt0_amd64_lib_go initializes the Go runtime.
     76 // This is started in a separate thread by _rt0_amd64_lib.
     77 TEXT _rt0_amd64_lib_go(SB),NOSPLIT,$0
        // Reload the argc/argv saved by _rt0_amd64_lib and run the
        // normal bootstrap path.
     78 	MOVQ	_rt0_amd64_lib_argc<>(SB), DI
     79 	MOVQ	_rt0_amd64_lib_argv<>(SB), SI
     80 	JMP	runtimert0_go(SB)
     81 
        // Static storage for argc/argv, written by _rt0_amd64_lib and read
        // by _rt0_amd64_lib_go on the initialization thread. NOPTR: holds
        // no Go heap pointers, so the GC need not scan it.
     82 DATA _rt0_amd64_lib_argc<>(SB)/8, $0
     83 GLOBL _rt0_amd64_lib_argc<>(SB),NOPTR, $8
     84 DATA _rt0_amd64_lib_argv<>(SB)/8, $0
     85 GLOBL _rt0_amd64_lib_argv<>(SB),NOPTR, $8
     86 
        // rt0_go is the common runtime bootstrap: it sets up g0's stack
        // bounds on the OS-provided stack, probes CPU features via CPUID,
        // runs _cgo_init if present, establishes TLS, wires up g0<->m0,
        // initializes the scheduler, queues runtime.main as the first
        // goroutine, and starts this M. It never returns (the trailing
        // RET is preceded by a deliberate crash store).
     87 TEXT runtimert0_go(SB),NOSPLIT,$0
     88 	// copy arguments forward on an even stack
     89 	MOVQ	DI, AX		// argc
     90 	MOVQ	SI, BX		// argv
     91 	SUBQ	$(4*8+7), SP		// 2args 2auto
     92 	ANDQ	$~15, SP
     93 	MOVQ	AX, 16(SP)
     94 	MOVQ	BX, 24(SP)
     95 
     96 	// create istack out of the given (operating system) stack.
     97 	// _cgo_init may update stackguard.
     98 	MOVQ	$runtimeg0(SB), DI
     99 	LEAQ	(-64*1024+104)(SP), BX	// assume 64KB of usable OS stack below SP
    100 	MOVQ	BX, g_stackguard0(DI)
    101 	MOVQ	BX, g_stackguard1(DI)
    102 	MOVQ	BX, (g_stack+stack_lo)(DI)
    103 	MOVQ	SP, (g_stack+stack_hi)(DI)
    104 
    105 	// find out information about the processor we're on
    106 	MOVL	$0, AX
    107 	CPUID
    108 	MOVL	AX, SI			// SI = max basic CPUID leaf, used below for leaf 7
    109 	CMPL	AX, $0
    110 	JE	nocpuinfo
    111 
    112 	// Figure out how to serialize RDTSC.
    113 	// On Intel processors LFENCE is enough. AMD requires MFENCE.
    114 	// Don't know about the rest, so let's do MFENCE.
    115 	CMPL	BX, $0x756E6547  // "Genu"
    116 	JNE	notintel
    117 	CMPL	DX, $0x49656E69  // "ineI"
    118 	JNE	notintel
    119 	CMPL	CX, $0x6C65746E  // "ntel"
    120 	JNE	notintel
    121 	MOVB	$1, runtimeisIntel(SB)
    122 	MOVB	$1, runtimelfenceBeforeRdtsc(SB)
    123 notintel:
    124 
    125 	// Load EAX=1 cpuid flags
    126 	MOVL	$1, AX
    127 	CPUID
    128 	MOVL	AX, runtimeprocessorVersionInfo(SB)
    129 
    130 	TESTL	$(1<<26), DX // SSE2
    131 	SETNE	runtimesupport_sse2(SB)
    132 
    133 	TESTL	$(1<<9), CX // SSSE3
    134 	SETNE	runtimesupport_ssse3(SB)
    135 
    136 	TESTL	$(1<<19), CX // SSE4.1
    137 	SETNE	runtimesupport_sse41(SB)
    138 
    139 	TESTL	$(1<<20), CX // SSE4.2
    140 	SETNE	runtimesupport_sse42(SB)
    141 
    142 	TESTL	$(1<<23), CX // POPCNT
    143 	SETNE	runtimesupport_popcnt(SB)
    144 
    145 	TESTL	$(1<<25), CX // AES
    146 	SETNE	runtimesupport_aes(SB)
    147 
    148 	TESTL	$(1<<27), CX // OSXSAVE
    149 	SETNE	runtimesupport_osxsave(SB)
    150 
    151 	// If OS support for XMM and YMM is not present
    152 	// support_avx will be set back to false later.
    153 	TESTL	$(1<<28), CX // AVX
    154 	SETNE	runtimesupport_avx(SB)
    155 
    156 eax7:
    157 	// Load EAX=7/ECX=0 cpuid flags
    158 	CMPL	SI, $7
    159 	JLT	osavx			// leaf 7 not supported on this CPU
    160 	MOVL	$7, AX
    161 	MOVL	$0, CX
    162 	CPUID
    163 
    164 	TESTL	$(1<<3), BX // BMI1
    165 	SETNE	runtimesupport_bmi1(SB)
    166 
    167 	// If OS support for XMM and YMM is not present
    168 	// support_avx2 will be set back to false later.
    169 	TESTL	$(1<<5), BX // AVX2
    170 	SETNE	runtimesupport_avx2(SB)
    171 
    172 	TESTL	$(1<<8), BX // BMI2
    173 	SETNE	runtimesupport_bmi2(SB)
    174 
    175 	TESTL	$(1<<9), BX // ERMS
    176 	SETNE	runtimesupport_erms(SB)
    177 
    178 osavx:
    179 	CMPB	runtimesupport_osxsave(SB), $1
    180 	JNE	noavx
    181 	MOVL	$0, CX
    182 	// For XGETBV, OSXSAVE bit is required and sufficient
    183 	XGETBV
    184 	ANDL	$6, AX
    185 	CMPL	AX, $6 // Check for OS support of XMM and YMM registers.
    186 	JE nocpuinfo
    187 noavx:
        // OS does not save AVX state across context switches; disable AVX.
    188 	MOVB $0, runtimesupport_avx(SB)
    189 	MOVB $0, runtimesupport_avx2(SB)
    190 
    191 nocpuinfo:
    192 	// if there is an _cgo_init, call it.
    193 	MOVQ	_cgo_init(SB), AX
    194 	TESTQ	AX, AX
    195 	JZ	needtls
    196 	// g0 already in DI
    197 	MOVQ	DI, CX	// Win64 uses CX for first parameter
    198 	MOVQ	$setg_gcc<>(SB), SI
    199 	CALL	AX
    200 
    201 	// update stackguard after _cgo_init
    202 	MOVQ	$runtimeg0(SB), CX
    203 	MOVQ	(g_stack+stack_lo)(CX), AX
    204 	ADDQ	$const__StackGuard, AX
    205 	MOVQ	AX, g_stackguard0(CX)
    206 	MOVQ	AX, g_stackguard1(CX)
    207 
    208 #ifndef GOOS_windows
    209 	JMP ok
    210 #endif
        // On Windows, fall through: _cgo_init does not set up TLS there.
    211 needtls:
    212 #ifdef GOOS_plan9
    213 	// skip TLS setup on Plan 9
    214 	JMP ok
    215 #endif
    216 #ifdef GOOS_solaris
    217 	// skip TLS setup on Solaris
    218 	JMP ok
    219 #endif
    220 
    221 	LEAQ	runtimem0+m_tls(SB), DI
    222 	CALL	runtimesettls(SB)
    223 
    224 	// store through it, to make sure it works
    225 	get_tls(BX)
    226 	MOVQ	$0x123, g(BX)
    227 	MOVQ	runtimem0+m_tls(SB), AX
    228 	CMPQ	AX, $0x123
    229 	JEQ 2(PC)
    230 	MOVL	AX, 0	// abort
    231 ok:
    232 	// set the per-goroutine and per-mach "registers"
    233 	get_tls(BX)
    234 	LEAQ	runtimeg0(SB), CX
    235 	MOVQ	CX, g(BX)
    236 	LEAQ	runtimem0(SB), AX
    237 
    238 	// save m->g0 = g0
    239 	MOVQ	CX, m_g0(AX)
    240 	// save m0 to g0->m
    241 	MOVQ	AX, g_m(CX)
    242 
    243 	CLD				// convention is D is always left cleared
    244 	CALL	runtimecheck(SB)
    245 
    246 	MOVL	16(SP), AX		// copy argc
    247 	MOVL	AX, 0(SP)
    248 	MOVQ	24(SP), AX		// copy argv
    249 	MOVQ	AX, 8(SP)
    250 	CALL	runtimeargs(SB)
    251 	CALL	runtimeosinit(SB)
    252 	CALL	runtimeschedinit(SB)
    253 
    254 	// create a new goroutine to start program
    255 	MOVQ	$runtimemainPC(SB), AX		// entry
    256 	PUSHQ	AX
    257 	PUSHQ	$0			// arg size
    258 	CALL	runtimenewproc(SB)
    259 	POPQ	AX
    260 	POPQ	AX
    261 
    262 	// start this M
    263 	CALL	runtimemstart(SB)
    264 
        // mstart never returns; if it does, crash loudly.
    265 	MOVL	$0xf1, 0xf1  // crash
    266 	RET
    267 
        // mainPC is a function value (pointer to runtime.main) used by
        // rt0_go above as the entry for the first goroutine.
    268 DATA	runtimemainPC+0(SB)/8,$runtimemain(SB)
    269 GLOBL	runtimemainPC(SB),RODATA,$8
    270 
        // breakpoint triggers a debugger trap (INT3, encoded as the raw
        // byte 0xcc).
    271 TEXT runtimebreakpoint(SB),NOSPLIT,$0-0
    272 	BYTE	$0xcc
    273 	RET
    274 
        // asminit is a hook for per-thread assembly initialization;
        // amd64 needs none.
    275 TEXT runtimeasminit(SB),NOSPLIT,$0-0
    276 	// No per-thread init.
    277 	RET
    278 
    279 /*
    280  *  go-routine
    281  */
    282 
    283 // void gosave(Gobuf*)
    284 // save state in Gobuf; setjmp
        // Records the caller's PC/SP (plus BP and g) into the Gobuf so a
        // later gogo can resume there. gobuf_ctxt must already be zero;
        // a nonzero ctxt indicates misuse and aborts via badctxt.
    285 TEXT runtimegosave(SB), NOSPLIT, $0-8
    286 	MOVQ	buf+0(FP), AX		// gobuf
    287 	LEAQ	buf+0(FP), BX		// caller's SP
    288 	MOVQ	BX, gobuf_sp(AX)
    289 	MOVQ	0(SP), BX		// caller's PC
    290 	MOVQ	BX, gobuf_pc(AX)
    291 	MOVQ	$0, gobuf_ret(AX)
    292 	MOVQ	BP, gobuf_bp(AX)
    293 	// Assert ctxt is zero. See func save.
    294 	MOVQ	gobuf_ctxt(AX), BX
    295 	TESTQ	BX, BX
    296 	JZ	2(PC)
    297 	CALL	runtimebadctxt(SB)
    298 	get_tls(CX)
    299 	MOVQ	g(CX), BX
    300 	MOVQ	BX, gobuf_g(AX)
    301 	RET
    302 
    303 // void gogo(Gobuf*)
    304 // restore state from Gobuf; longjmp
        // Switches the current g to gobuf_g, restores SP/BP/ret/ctxt from
        // the Gobuf, zeroes the buf fields so the GC does not retain dead
        // stack pointers, and jumps to the saved PC. Never returns.
    305 TEXT runtimegogo(SB), NOSPLIT, $16-8
    306 	MOVQ	buf+0(FP), BX		// gobuf
    307 	MOVQ	gobuf_g(BX), DX
    308 	MOVQ	0(DX), CX		// make sure g != nil (faults here if nil)
    309 	get_tls(CX)
    310 	MOVQ	DX, g(CX)
    311 	MOVQ	gobuf_sp(BX), SP	// restore SP
    312 	MOVQ	gobuf_ret(BX), AX
    313 	MOVQ	gobuf_ctxt(BX), DX
    314 	MOVQ	gobuf_bp(BX), BP
    315 	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
    316 	MOVQ	$0, gobuf_ret(BX)
    317 	MOVQ	$0, gobuf_ctxt(BX)
    318 	MOVQ	$0, gobuf_bp(BX)
    319 	MOVQ	gobuf_pc(BX), BX
    320 	JMP	BX
    321 
    322 // func mcall(fn func(*g))
    323 // Switch to m->g0's stack, call fn(g).
    324 // Fn must never return. It should gogo(&g->sched)
    325 // to keep running g.
    326 TEXT runtimemcall(SB), NOSPLIT, $0-8
    327 	MOVQ	fn+0(FP), DI
    328 
    329 	get_tls(CX)
    330 	MOVQ	g(CX), AX	// save state in g->sched
    331 	MOVQ	0(SP), BX	// caller's PC
    332 	MOVQ	BX, (g_sched+gobuf_pc)(AX)
    333 	LEAQ	fn+0(FP), BX	// caller's SP
    334 	MOVQ	BX, (g_sched+gobuf_sp)(AX)
    335 	MOVQ	AX, (g_sched+gobuf_g)(AX)
    336 	MOVQ	BP, (g_sched+gobuf_bp)(AX)
    337 
    338 	// switch to m->g0 & its stack, call fn
    339 	MOVQ	g(CX), BX
    340 	MOVQ	g_m(BX), BX
    341 	MOVQ	m_g0(BX), SI
    342 	CMPQ	SI, AX	// if g == m->g0 call badmcall
    343 	JNE	3(PC)
    344 	MOVQ	$runtimebadmcall(SB), AX
    345 	JMP	AX
    346 	MOVQ	SI, g(CX)	// g = m->g0
    347 	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
    348 	PUSHQ	AX		// push old g as fn's argument
    349 	MOVQ	DI, DX
    350 	MOVQ	0(DI), DI	// code pointer out of the func value
    351 	CALL	DI
        // fn must not return; if it does, report via badmcall2.
    352 	POPQ	AX
    353 	MOVQ	$runtimebadmcall2(SB), AX
    354 	JMP	AX
    355 	RET
    356 
    357 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
    358 // of the G stack. We need to distinguish the routine that
    359 // lives at the bottom of the G stack from the one that lives
    360 // at the top of the system stack because the one at the top of
    361 // the system stack terminates the stack walk (see topofstack()).
    362 TEXT runtimesystemstack_switch(SB), NOSPLIT, $0-0
    363 	RET
    364 
    365 // func systemstack(fn func())
        // Run fn on the system (g0) stack, then switch back to the calling
        // goroutine's stack. If already on g0 or gsignal, fn is tail-called
        // with no switch. Any other g is a fatal error (badsystemstack).
    366 TEXT runtimesystemstack(SB), NOSPLIT, $0-8
    367 	MOVQ	fn+0(FP), DI	// DI = fn
    368 	get_tls(CX)
    369 	MOVQ	g(CX), AX	// AX = g
    370 	MOVQ	g_m(AX), BX	// BX = m
    371 
    372 	MOVQ	m_gsignal(BX), DX	// DX = gsignal
    373 	CMPQ	AX, DX
    374 	JEQ	noswitch
    375 
    376 	MOVQ	m_g0(BX), DX	// DX = g0
    377 	CMPQ	AX, DX
    378 	JEQ	noswitch
    379 
    380 	MOVQ	m_curg(BX), R8
    381 	CMPQ	AX, R8
    382 	JEQ	switch
    383 
    384 	// Bad: g is not gsignal, not g0, not curg. What is it?
    385 	MOVQ	$runtimebadsystemstack(SB), AX
    386 	CALL	AX
    387 
    388 switch:
    389 	// save our state in g->sched. Pretend to
    390 	// be systemstack_switch if the G stack is scanned.
    391 	MOVQ	$runtimesystemstack_switch(SB), SI
    392 	MOVQ	SI, (g_sched+gobuf_pc)(AX)
    393 	MOVQ	SP, (g_sched+gobuf_sp)(AX)
    394 	MOVQ	AX, (g_sched+gobuf_g)(AX)
    395 	MOVQ	BP, (g_sched+gobuf_bp)(AX)
    396 
    397 	// switch to g0
    398 	MOVQ	DX, g(CX)
    399 	MOVQ	(g_sched+gobuf_sp)(DX), BX
    400 	// make it look like mstart called systemstack on g0, to stop traceback
    401 	SUBQ	$8, BX
    402 	MOVQ	$runtimemstart(SB), DX
    403 	MOVQ	DX, 0(BX)
    404 	MOVQ	BX, SP
    405 
    406 	// call target function
    407 	MOVQ	DI, DX		// DX = func value (closure context register)
    408 	MOVQ	0(DI), DI	// code pointer
    409 	CALL	DI
    410 
    411 	// switch back to g
    412 	get_tls(CX)
    413 	MOVQ	g(CX), AX
    414 	MOVQ	g_m(AX), BX
    415 	MOVQ	m_curg(BX), AX
    416 	MOVQ	AX, g(CX)
    417 	MOVQ	(g_sched+gobuf_sp)(AX), SP
    418 	MOVQ	$0, (g_sched+gobuf_sp)(AX)	// clear saved SP for the GC
    419 	RET
    420 
    421 noswitch:
    422 	// already on m stack; tail call the function
    423 	// Using a tail call here cleans up tracebacks since we won't stop
    424 	// at an intermediate systemstack.
    425 	MOVQ	DI, DX
    426 	MOVQ	0(DI), DI
    427 	JMP	DI
    428 
    429 /*
    430  * support for morestack
    431  */
    432 
    433 // Called during function prolog when more stack is needed.
    434 //
    435 // The traceback routines see morestack on a g0 as being
    436 // the top of a stack (for example, morestack calling newstack
    437 // calling the scheduler calling newm calling gc), so we must
    438 // record an argument size. For that purpose, it has no arguments.
        //
        // On entry (from a function prologue): 0(SP) is f's PC, 8(SP) is
        // f's caller's PC, and DX holds f's closure context (ctxt).
        // Growing g0 or gsignal is impossible and aborts.
    439 TEXT runtimemorestack(SB),NOSPLIT,$0-0
    440 	// Cannot grow scheduler stack (m->g0).
    441 	get_tls(CX)
    442 	MOVQ	g(CX), BX
    443 	MOVQ	g_m(BX), BX
    444 	MOVQ	m_g0(BX), SI
    445 	CMPQ	g(CX), SI
    446 	JNE	3(PC)
    447 	CALL	runtimebadmorestackg0(SB)
    448 	INT	$3
    449 
    450 	// Cannot grow signal stack (m->gsignal).
    451 	MOVQ	m_gsignal(BX), SI
    452 	CMPQ	g(CX), SI
    453 	JNE	3(PC)
    454 	CALL	runtimebadmorestackgsignal(SB)
    455 	INT	$3
    456 
    457 	// Called from f.
    458 	// Set m->morebuf to f's caller.
    459 	MOVQ	8(SP), AX	// f's caller's PC
    460 	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
    461 	LEAQ	16(SP), AX	// f's caller's SP
    462 	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
    463 	get_tls(CX)
    464 	MOVQ	g(CX), SI
    465 	MOVQ	SI, (m_morebuf+gobuf_g)(BX)
    466 
    467 	// Set g->sched to context in f.
    468 	MOVQ	0(SP), AX // f's PC
    469 	MOVQ	AX, (g_sched+gobuf_pc)(SI)
    470 	MOVQ	SI, (g_sched+gobuf_g)(SI)
    471 	LEAQ	8(SP), AX // f's SP
    472 	MOVQ	AX, (g_sched+gobuf_sp)(SI)
    473 	MOVQ	BP, (g_sched+gobuf_bp)(SI)
    474 	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
    475 
    476 	// Call newstack on m->g0's stack.
    477 	MOVQ	m_g0(BX), BX
    478 	MOVQ	BX, g(CX)
    479 	MOVQ	(g_sched+gobuf_sp)(BX), SP
    480 	CALL	runtimenewstack(SB)
    481 	MOVQ	$0, 0x1003	// crash if newstack returns
    482 	RET
    483 
    484 // morestack but not preserving ctxt.
        // Zeroes DX (the ctxt register) so morestack records a nil ctxt.
    485 TEXT runtimemorestack_noctxt(SB),NOSPLIT,$0
    486 	MOVL	$0, DX
    487 	JMP	runtimemorestack(SB)
    488 
    489 // reflectcall: call a function with the given argument list
    490 // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
    491 // we don't have variable-sized frames, so we use a small number
    492 // of constant-sized-frame functions to encode a few bits of size in the pc.
    493 // Caution: ugly multiline assembly macros in your future!
    494 
        // DISPATCH jumps to NAME when the argument size in CX is <= MAXSIZE;
        // otherwise it falls through to the next DISPATCH line.
    495 #define DISPATCH(NAME,MAXSIZE)		\
    496 	CMPQ	CX, $MAXSIZE;		\
    497 	JA	3(PC);			\
    498 	MOVQ	$NAME(SB), AX;		\
    499 	JMP	AX
    500 // Note: can't just "JMP NAME(SB)" - bad inlining results.
    501 
        // Thunk from package reflect into the runtime implementation below.
        // NOTE(review): the two symbols here were distinct in the original
        // source (reflect·call jumping to ·reflectcall); the stripped `·`
        // separators in this listing make them look identical.
    502 TEXT reflectcall(SB), NOSPLIT, $0-0
    503 	JMP	reflectcall(SB)
    504 
        // reflectcall proper: dispatch on argsize to the fixed-frame
        // call* helper with the smallest sufficient frame (powers of two
        // from 32 bytes to 1GB). Sizes beyond 1GB are a fatal error.
    505 TEXT reflectcall(SB), NOSPLIT, $0-32
    506 	MOVLQZX argsize+24(FP), CX
    507 	DISPATCH(runtimecall32, 32)
    508 	DISPATCH(runtimecall64, 64)
    509 	DISPATCH(runtimecall128, 128)
    510 	DISPATCH(runtimecall256, 256)
    511 	DISPATCH(runtimecall512, 512)
    512 	DISPATCH(runtimecall1024, 1024)
    513 	DISPATCH(runtimecall2048, 2048)
    514 	DISPATCH(runtimecall4096, 4096)
    515 	DISPATCH(runtimecall8192, 8192)
    516 	DISPATCH(runtimecall16384, 16384)
    517 	DISPATCH(runtimecall32768, 32768)
    518 	DISPATCH(runtimecall65536, 65536)
    519 	DISPATCH(runtimecall131072, 131072)
    520 	DISPATCH(runtimecall262144, 262144)
    521 	DISPATCH(runtimecall524288, 524288)
    522 	DISPATCH(runtimecall1048576, 1048576)
    523 	DISPATCH(runtimecall2097152, 2097152)
    524 	DISPATCH(runtimecall4194304, 4194304)
    525 	DISPATCH(runtimecall8388608, 8388608)
    526 	DISPATCH(runtimecall16777216, 16777216)
    527 	DISPATCH(runtimecall33554432, 33554432)
    528 	DISPATCH(runtimecall67108864, 67108864)
    529 	DISPATCH(runtimecall134217728, 134217728)
    530 	DISPATCH(runtimecall268435456, 268435456)
    531 	DISPATCH(runtimecall536870912, 536870912)
    532 	DISPATCH(runtimecall1073741824, 1073741824)
    533 	MOVQ	$runtimebadreflectcall(SB), AX
    534 	JMP	AX
    535 
        // CALLFN defines one fixed-frame call helper: copy the caller's
        // argument block onto our frame, call the function value in DX,
        // then copy results (from retoffset onward) back via callRet,
        // which invokes reflectcallmove so write barriers apply.
    536 #define CALLFN(NAME,MAXSIZE)			\
    537 TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
    538 	NO_LOCAL_POINTERS;			\
    539 	/* copy arguments to stack */		\
    540 	MOVQ	argptr+16(FP), SI;		\
    541 	MOVLQZX argsize+24(FP), CX;		\
    542 	MOVQ	SP, DI;				\
    543 	REP;MOVSB;				\
    544 	/* call function */			\
    545 	MOVQ	f+8(FP), DX;			\
    546 	PCDATA  $PCDATA_StackMapIndex, $0;	\
    547 	CALL	(DX);				\
    548 	/* copy return values back */		\
    549 	MOVQ	argtype+0(FP), DX;		\
    550 	MOVQ	argptr+16(FP), DI;		\
    551 	MOVLQZX	argsize+24(FP), CX;		\
    552 	MOVLQZX	retoffset+28(FP), BX;		\
    553 	MOVQ	SP, SI;				\
    554 	ADDQ	BX, DI;				\
    555 	ADDQ	BX, SI;				\
    556 	SUBQ	BX, CX;				\
    557 	CALL	callRet<>(SB);			\
    558 	RET
    559 
    560 // callRet copies return values back at the end of call*. This is a
    561 // separate function so it can allocate stack space for the arguments
    562 // to reflectcallmove. It does not follow the Go ABI; it expects its
    563 // arguments in registers.
        // In: DX = argtype, DI = dst, SI = src, CX = byte count.
    564 TEXT callRet<>(SB), NOSPLIT, $32-0
    565 	NO_LOCAL_POINTERS
    566 	MOVQ	DX, 0(SP)
    567 	MOVQ	DI, 8(SP)
    568 	MOVQ	SI, 16(SP)
    569 	MOVQ	CX, 24(SP)
    570 	CALL	runtimereflectcallmove(SB)
    571 	RET
    572 
        // Instantiate the call* helpers for every frame size reflectcall
        // dispatches to (32 bytes up to 1GB, powers of two).
    573 CALLFN(call32, 32)
    574 CALLFN(call64, 64)
    575 CALLFN(call128, 128)
    576 CALLFN(call256, 256)
    577 CALLFN(call512, 512)
    578 CALLFN(call1024, 1024)
    579 CALLFN(call2048, 2048)
    580 CALLFN(call4096, 4096)
    581 CALLFN(call8192, 8192)
    582 CALLFN(call16384, 16384)
    583 CALLFN(call32768, 32768)
    584 CALLFN(call65536, 65536)
    585 CALLFN(call131072, 131072)
    586 CALLFN(call262144, 262144)
    587 CALLFN(call524288, 524288)
    588 CALLFN(call1048576, 1048576)
    589 CALLFN(call2097152, 2097152)
    590 CALLFN(call4194304, 4194304)
    591 CALLFN(call8388608, 8388608)
    592 CALLFN(call16777216, 16777216)
    593 CALLFN(call33554432, 33554432)
    594 CALLFN(call67108864, 67108864)
    595 CALLFN(call134217728, 134217728)
    596 CALLFN(call268435456, 268435456)
    597 CALLFN(call536870912, 536870912)
    598 CALLFN(call1073741824, 1073741824)
    599 
        // procyield(cycles uint32): spin-wait hint loop. PAUSE tells the
        // CPU this is a spin loop (saves power, helps SMT siblings).
    600 TEXT runtimeprocyield(SB),NOSPLIT,$0-0
    601 	MOVL	cycles+0(FP), AX
    602 again:
    603 	PAUSE
    604 	SUBL	$1, AX
    605 	JNZ	again
    606 	RET
    607 
    608 
    609 TEXT publicationBarrier(SB),NOSPLIT,$0-0
    610 	// Stores are already ordered on x86, so this is just a
    611 	// compile barrier.
    612 	RET
    613 
    614 // void jmpdefer(fn, sp);
    615 // called from deferreturn.
    616 // 1. pop the caller
    617 // 2. sub 5 bytes from the callers return
    618 // 3. jmp to the argument
        // Subtracting 5 (the length of a CALL instruction) makes the return
        // address point back AT the CALL to deferreturn, so deferreturn
        // re-runs after fn and processes any remaining defers.
    619 TEXT runtimejmpdefer(SB), NOSPLIT, $0-16
    620 	MOVQ	fv+0(FP), DX	// fn
    621 	MOVQ	argp+8(FP), BX	// caller sp
    622 	LEAQ	-8(BX), SP	// caller sp after CALL
    623 	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
    624 	SUBQ	$5, (SP)	// return to CALL again
    625 	MOVQ	0(DX), BX
    626 	JMP	BX	// but first run the deferred function
    627 
    628 // Save state of caller into g->sched. Smashes R8, R9.
        // Internal helper used by asmcgocall before switching to the
        // system stack. Like runtime·gosave but targets g->sched directly.
    629 TEXT gosave<>(SB),NOSPLIT,$0
    630 	get_tls(R8)
    631 	MOVQ	g(R8), R8
    632 	MOVQ	0(SP), R9	// caller's PC
    633 	MOVQ	R9, (g_sched+gobuf_pc)(R8)
    634 	LEAQ	8(SP), R9	// caller's SP
    635 	MOVQ	R9, (g_sched+gobuf_sp)(R8)
    636 	MOVQ	$0, (g_sched+gobuf_ret)(R8)
    637 	MOVQ	BP, (g_sched+gobuf_bp)(R8)
    638 	// Assert ctxt is zero. See func save.
    639 	MOVQ	(g_sched+gobuf_ctxt)(R8), R9
    640 	TESTQ	R9, R9
    641 	JZ	2(PC)
    642 	CALL	runtimebadctxt(SB)
    643 	RET
    644 
    645 // func asmcgocall(fn, arg unsafe.Pointer) int32
    646 // Call fn(arg) on the scheduler stack,
    647 // aligned appropriately for the gcc ABI.
    648 // See cgocall.go for more details.
    649 TEXT asmcgocall(SB),NOSPLIT,$0-20
    650 	MOVQ	fn+0(FP), AX
    651 	MOVQ	arg+8(FP), BX
    652 
    653 	MOVQ	SP, DX		// remember starting SP for stack-depth bookkeeping
    654 
    655 	// Figure out if we need to switch to m->g0 stack.
    656 	// We get called to create new OS threads too, and those
    657 	// come in on the m->g0 stack already.
    658 	get_tls(CX)
    659 	MOVQ	g(CX), R8
    660 	CMPQ	R8, $0
    661 	JEQ	nosave
    662 	MOVQ	g_m(R8), R8
    663 	MOVQ	m_g0(R8), SI
    664 	MOVQ	g(CX), DI
    665 	CMPQ	SI, DI
    666 	JEQ	nosave
    667 	MOVQ	m_gsignal(R8), SI
    668 	CMPQ	SI, DI
    669 	JEQ	nosave
    670 
    671 	// Switch to system stack.
    672 	MOVQ	m_g0(R8), SI
    673 	CALL	gosave<>(SB)
    674 	MOVQ	SI, g(CX)
    675 	MOVQ	(g_sched+gobuf_sp)(SI), SP
    676 
    677 	// Now on a scheduling stack (a pthread-created stack).
    678 	// Make sure we have enough room for 4 stack-backed fast-call
    679 	// registers as per windows amd64 calling convention.
    680 	SUBQ	$64, SP
    681 	ANDQ	$~15, SP	// alignment for gcc ABI
    682 	MOVQ	DI, 48(SP)	// save g
    683 	MOVQ	(g_stack+stack_hi)(DI), DI
    684 	SUBQ	DX, DI
    685 	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
    686 	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
    687 	MOVQ	BX, CX		// CX = first argument in Win64
    688 	CALL	AX
    689 
    690 	// Restore registers, g, stack pointer.
        // Recompute our goroutine SP from stack_hi minus the saved depth,
        // in case the stack was moved while we were in C.
    691 	get_tls(CX)
    692 	MOVQ	48(SP), DI
    693 	MOVQ	(g_stack+stack_hi)(DI), SI
    694 	SUBQ	40(SP), SI
    695 	MOVQ	DI, g(CX)
    696 	MOVQ	SI, SP
    697 
    698 	MOVL	AX, ret+16(FP)	// return fn's int32 result
    699 	RET
    700 
    701 nosave:
    702 	// Running on a system stack, perhaps even without a g.
    703 	// Having no g can happen during thread creation or thread teardown
    704 	// (see needm/dropm on Solaris, for example).
    705 	// This code is like the above sequence but without saving/restoring g
    706 	// and without worrying about the stack moving out from under us
    707 	// (because we're on a system stack, not a goroutine stack).
    708 	// The above code could be used directly if already on a system stack,
    709 	// but then the only path through this code would be a rare case on Solaris.
    710 	// Using this code for all "already on system stack" calls exercises it more,
    711 	// which should help keep it correct.
    712 	SUBQ	$64, SP
    713 	ANDQ	$~15, SP
    714 	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
    715 	MOVQ	DX, 40(SP)	// save original stack pointer
    716 	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
    717 	MOVQ	BX, CX		// CX = first argument in Win64
    718 	CALL	AX
    719 	MOVQ	40(SP), SI	// restore original stack pointer
    720 	MOVQ	SI, SP
    721 	MOVL	AX, ret+16(FP)
    722 	RET
    723 
    724 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
    725 // Turn the fn into a Go func (by taking its address) and call
    726 // cgocallback_gofunc.
    727 TEXT runtimecgocallback(SB),NOSPLIT,$32-32
        // LEAQ of fn's slot yields a *funcval-shaped pointer, which is the
        // FuncVal* that cgocallback_gofunc expects as its first argument.
    728 	LEAQ	fn+0(FP), AX
    729 	MOVQ	AX, 0(SP)
    730 	MOVQ	frame+8(FP), AX
    731 	MOVQ	AX, 8(SP)
    732 	MOVQ	framesize+16(FP), AX
    733 	MOVQ	AX, 16(SP)
    734 	MOVQ	ctxt+24(FP), AX
    735 	MOVQ	AX, 24(SP)
    736 	MOVQ	$runtimecgocallback_gofunc(SB), AX
    737 	CALL	AX
    738 	RET
    739 
    740 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
    741 // See cgocall.go for more details.
        // Entry path for C code calling back into Go: acquire an m if this
        // thread has none (needm), switch from g0 to m->curg, run
        // runtime.cgocallbackg, then restore the previous state and
        // release a borrowed m (dropm).
    742 TEXT cgocallback_gofunc(SB),NOSPLIT,$16-32
    743 	NO_LOCAL_POINTERS
    744 
    745 	// If g is nil, Go did not create the current thread.
    746 	// Call needm to obtain one m for temporary use.
    747 	// In this case, we're running on the thread stack, so there's
    748 	// lots of space, but the linker doesn't know. Hide the call from
    749 	// the linker analysis by using an indirect call through AX.
    750 	get_tls(CX)
    751 #ifdef GOOS_windows
    752 	MOVL	$0, BX
    753 	CMPQ	CX, $0
    754 	JEQ	2(PC)
    755 #endif
    756 	MOVQ	g(CX), BX
    757 	CMPQ	BX, $0
    758 	JEQ	needm
    759 	MOVQ	g_m(BX), BX
    760 	MOVQ	BX, R8 // holds oldm until end of function
    761 	JMP	havem
    762 needm:
    763 	MOVQ	$0, 0(SP)	// R8 = nil marks "m was borrowed via needm"
    764 	MOVQ	$runtimeneedm(SB), AX
    765 	CALL	AX
    766 	MOVQ	0(SP), R8
    767 	get_tls(CX)
    768 	MOVQ	g(CX), BX
    769 	MOVQ	g_m(BX), BX
    770 
    771 	// Set m->sched.sp = SP, so that if a panic happens
    772 	// during the function we are about to execute, it will
    773 	// have a valid SP to run on the g0 stack.
    774 	// The next few lines (after the havem label)
    775 	// will save this SP onto the stack and then write
    776 	// the same SP back to m->sched.sp. That seems redundant,
    777 	// but if an unrecovered panic happens, unwindm will
    778 	// restore the g->sched.sp from the stack location
    779 	// and then systemstack will try to use it. If we don't set it here,
    780 	// that restored SP will be uninitialized (typically 0) and
    781 	// will not be usable.
    782 	MOVQ	m_g0(BX), SI
    783 	MOVQ	SP, (g_sched+gobuf_sp)(SI)
    784 
    785 havem:
    786 	// Now there's a valid m, and we're running on its m->g0.
    787 	// Save current m->g0->sched.sp on stack and then set it to SP.
    788 	// Save current sp in m->g0->sched.sp in preparation for
    789 	// switch back to m->curg stack.
    790 	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
    791 	MOVQ	m_g0(BX), SI
    792 	MOVQ	(g_sched+gobuf_sp)(SI), AX
    793 	MOVQ	AX, 0(SP)
    794 	MOVQ	SP, (g_sched+gobuf_sp)(SI)
    795 
    796 	// Switch to m->curg stack and call runtime.cgocallbackg.
    797 	// Because we are taking over the execution of m->curg
    798 	// but *not* resuming what had been running, we need to
    799 	// save that information (m->curg->sched) so we can restore it.
    800 	// We can restore m->curg->sched.sp easily, because calling
    801 	// runtime.cgocallbackg leaves SP unchanged upon return.
    802 	// To save m->curg->sched.pc, we push it onto the stack.
    803 	// This has the added benefit that it looks to the traceback
    804 	// routine like cgocallbackg is going to return to that
    805 	// PC (because the frame we allocate below has the same
    806 	// size as cgocallback_gofunc's frame declared above)
    807 	// so that the traceback will seamlessly trace back into
    808 	// the earlier calls.
    809 	//
    810 	// In the new goroutine, 8(SP) holds the saved R8.
    811 	MOVQ	m_curg(BX), SI
    812 	MOVQ	SI, g(CX)
    813 	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
    814 	MOVQ	(g_sched+gobuf_pc)(SI), BX
    815 	MOVQ	BX, -8(DI)
    816 	// Compute the size of the frame, including return PC and, if
    817 	// GOEXPERIMENT=framepointer, the saved base pointer
    818 	MOVQ	ctxt+24(FP), BX
    819 	LEAQ	fv+0(FP), AX
    820 	SUBQ	SP, AX		// AX = our frame size (FP - SP)
    821 	SUBQ	AX, DI		// allocate an identical frame on curg's stack
    822 	MOVQ	DI, SP
    823 
    824 	MOVQ	R8, 8(SP)
    825 	MOVQ	BX, 0(SP)	// pass ctxt to cgocallbackg
    826 	CALL	runtimecgocallbackg(SB)
    827 	MOVQ	8(SP), R8
    828 
    829 	// Compute the size of the frame again. FP and SP have
    830 	// completely different values here than they did above,
    831 	// but only their difference matters.
    832 	LEAQ	fv+0(FP), AX
    833 	SUBQ	SP, AX
    834 
    835 	// Restore g->sched (== m->curg->sched) from saved values.
    836 	get_tls(CX)
    837 	MOVQ	g(CX), SI
    838 	MOVQ	SP, DI
    839 	ADDQ	AX, DI
    840 	MOVQ	-8(DI), BX
    841 	MOVQ	BX, (g_sched+gobuf_pc)(SI)
    842 	MOVQ	DI, (g_sched+gobuf_sp)(SI)
    843 
    844 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
    845 	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
    846 	// so we do not have to restore it.)
    847 	MOVQ	g(CX), BX
    848 	MOVQ	g_m(BX), BX
    849 	MOVQ	m_g0(BX), SI
    850 	MOVQ	SI, g(CX)
    851 	MOVQ	(g_sched+gobuf_sp)(SI), SP
    852 	MOVQ	0(SP), AX
    853 	MOVQ	AX, (g_sched+gobuf_sp)(SI)
    854 
    855 	// If the m on entry was nil, we called needm above to borrow an m
    856 	// for the duration of the call. Since the call is over, return it with dropm.
    857 	CMPQ	R8, $0
    858 	JNE 3(PC)
    859 	MOVQ	$runtimedropm(SB), AX
    860 	CALL	AX
    861 
    862 	// Done!
    863 	RET
    864 
    865 // void setg(G*); set g. for use by needm.
        // On Windows the TLS slot at 0x28(GS) must also track g's m->tls
        // (or be cleared when g is nil).
    866 TEXT runtimesetg(SB), NOSPLIT, $0-8
    867 	MOVQ	gg+0(FP), BX
    868 #ifdef GOOS_windows
    869 	CMPQ	BX, $0
    870 	JNE	settls
    871 	MOVQ	$0, 0x28(GS)
    872 	RET
    873 settls:
    874 	MOVQ	g_m(BX), AX
    875 	LEAQ	m_tls(AX), AX
    876 	MOVQ	AX, 0x28(GS)
    877 #endif
    878 	get_tls(CX)
    879 	MOVQ	BX, g(CX)
    880 	RET
    881 
    882 // void setg_gcc(G*); set g called from gcc.
        // C ABI entry: the new g arrives in DI (first SysV argument).
    883 TEXT setg_gcc<>(SB),NOSPLIT,$0
    884 	get_tls(AX)
    885 	MOVQ	DI, g(AX)
    886 	RET
    887 
    888 // check that SP is in range [g->stack.lo, g->stack.hi)
        // Debug aid: traps with INT3 if SP is outside the current g's
        // stack bounds. JHI is an unsigned compare, as addresses require.
    889 TEXT runtimestackcheck(SB), NOSPLIT, $0-0
    890 	get_tls(CX)
    891 	MOVQ	g(CX), AX
    892 	CMPQ	(g_stack+stack_hi)(AX), SP
    893 	JHI	2(PC)
    894 	INT	$3
    895 	CMPQ	SP, (g_stack+stack_lo)(AX)
    896 	JHI	2(PC)
    897 	INT	$3
    898 	RET
    899 
    900 // func cputicks() int64
        // Serialize before RDTSC (LFENCE on Intel, MFENCE otherwise — see
        // the vendor check in rt0_go), then combine EDX:EAX into 64 bits.
    901 TEXT runtimecputicks(SB),NOSPLIT,$0-0
    902 	CMPB	runtimelfenceBeforeRdtsc(SB), $1
    903 	JNE	mfence
    904 	LFENCE
    905 	JMP	done
    906 mfence:
    907 	MFENCE
    908 done:
    909 	RDTSC
    910 	SHLQ	$32, DX
    911 	ADDQ	DX, AX
    912 	MOVQ	AX, ret+0(FP)
    913 	RET
    914 
    915 // hash function using AES hardware instructions
        // aeshash(p unsafe.Pointer, h, s uintptr) uintptr — loads the
        // pointer/length into the AX/CX/DX register contract expected by
        // aeshashbody and jumps there.
    916 TEXT runtimeaeshash(SB),NOSPLIT,$0-32
    917 	MOVQ	p+0(FP), AX	// ptr to data
    918 	MOVQ	s+16(FP), CX	// size
    919 	LEAQ	ret+24(FP), DX
    920 	JMP	runtimeaeshashbody(SB)
    921 
        // aeshashstr: like aeshash, but the argument is a *string header
        // (data pointer at offset 0, length at offset 8).
    922 TEXT runtimeaeshashstr(SB),NOSPLIT,$0-24
    923 	MOVQ	p+0(FP), AX	// ptr to string struct
    924 	MOVQ	8(AX), CX	// length of string
    925 	MOVQ	(AX), AX	// string data
    926 	LEAQ	ret+16(FP), DX
    927 	JMP	runtimeaeshashbody(SB)
    928 
// AX: data
// CX: length
// DX: address to put return value
//
// Common body for the AES-based hash functions. Mixes the
// per-table seed (at h+8(FP) of the trampoline's frame), the data
// length, and the per-process random key schedule
// (runtime·aeskeysched) into the data using AESENC rounds.
// Dispatches on length to a size-specialized path; each path ends
// with at least 3 AESENC scrambles before storing the 64-bit result.
TEXT runtimeaeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0			// 16 bits of length
	PSHUFHW $0, X0, X0			// repeat length 4 times total
	MOVO	X0, X1				// save unscrambled seed
	PXOR	runtimeaeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0				// scramble seed

	// Dispatch on data length (unsigned compares).
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	// Check whether a 16-byte load at AX would cross a page
	// boundary (low 12 bits of AX+16 all zero means the load
	// would end exactly at a page edge or beyond).
	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX			// CX*2: index into masks table (16-byte entries, 8-byte DATA rows)
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1		// zero bytes beyond the data length
final1:
	PXOR	X0, X1	// xor data with seed
	AESENC	X1, X1	// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	runtimeaeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed (the two loads overlap when CX < 32)
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	runtimeaeskeysched+16(SB), X1
	PXOR	runtimeaeskeysched+32(SB), X2
	PXOR	runtimeaeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	// first 32 bytes and last 32 bytes (possibly overlapping)
	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// fold the four lanes down to one 64-bit result
	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtimeaeskeysched+16(SB), X1
	PXOR	runtimeaeskeysched+32(SB), X2
	PXOR	runtimeaeskeysched+48(SB), X3
	PXOR	runtimeaeskeysched+64(SB), X4
	PXOR	runtimeaeskeysched+80(SB), X5
	PXOR	runtimeaeskeysched+96(SB), X6
	PXOR	runtimeaeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data: first 64 bytes and last 64 bytes (possibly overlapping)
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	runtimeaeskeysched+16(SB), X1
	PXOR	runtimeaeskeysched+32(SB), X2
	PXOR	runtimeaeskeysched+48(SB), X3
	PXOR	runtimeaeskeysched+64(SB), X4
	PXOR	runtimeaeskeysched+80(SB), X5
	PXOR	runtimeaeskeysched+96(SB), X6
	PXOR	runtimeaeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// fold the eight lanes down to one 64-bit result
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET
   1259 
// Specialized AES hash for 4-byte values: inserts the 32-bit datum
// into the seed register and runs 3 AESENC rounds against the
// per-process key schedule.
TEXT runtimeaeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtimeaeskeysched+0(SB), X0
	AESENC	runtimeaeskeysched+16(SB), X0
	AESENC	runtimeaeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET
   1269 
// Specialized AES hash for 8-byte values: inserts the 64-bit datum
// into the seed register and runs 3 AESENC rounds against the
// per-process key schedule.
TEXT runtimeaeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtimeaeskeysched+0(SB), X0
	AESENC	runtimeaeskeysched+16(SB), X0
	AESENC	runtimeaeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET
   1279 
// simple mask to get rid of data in the high part of the register.
// Entry i (each entry is 16 bytes, i.e. two 8-byte DATA rows) has
// the low i bytes set to 0xff and the rest zero, so PANDing with it
// keeps exactly the first i bytes of a 16-byte load.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256
   1314 
// checkASM returns true if this file's assumptions hold; called
// from Go at startup. Currently verifies the 16-byte alignment of
// the masks<> and shifts<> tables, which PAND/PSHUFB indexing
// relies on.
TEXT checkASM(SB),NOSPLIT,$0-1
	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
	MOVQ	$masks<>(SB), AX
	MOVQ	$shifts<>(SB), BX
	ORQ	BX, AX		// low 4 bits of either address being set means misaligned
	TESTQ	$15, AX
	SETEQ	ret+0(FP)
	RET
   1323 
// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
// Entry i (16 bytes each) maps source byte (16-i)+j to destination
// byte j; 0xff entries produce zero bytes (PSHUFB's high-bit rule).
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
   1360 
// memequal(p, q unsafe.Pointer, size uintptr) bool
// Fast path: identical pointers compare equal without touching
// memory; otherwise tail-call the common comparison body.
TEXT runtimememequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same pointer => equal, skip the byte compare
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX	// memeqbody writes the result byte here
	JMP	runtimememeqbody(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET
   1373 
// memequal_varlen(a, b unsafe.Pointer) bool
// Closure-called variant of memequal: the size is not an argument
// but lives in the closure context (DX) at offset 8.
TEXT runtimememequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq		// same pointer => equal, skip the byte compare
	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX	// memeqbody writes the result byte here
	JMP	runtimememeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET
   1386 
// a in SI
// b in DI
// count in BX
// address of result byte in AX
//
// Compares count bytes at a and b and stores 1 (equal) or 0 (not
// equal) into the byte at AX. Uses, in decreasing preference:
// 64-byte AVX2 chunks, 64-byte SSE chunks, 8-byte scalar chunks,
// then a page-boundary-safe tail for <8 bytes.
TEXT runtimememeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB    runtimesupport_avx2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0		// AND the per-byte equality masks together
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes equal
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff	// all 32 mask bits set <=> all 64 bytes equal
	JEQ	hugeloop_avx2
	VZEROUPPER		// avoid AVX->SSE transition penalty before returning
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER		// leaving AVX code; clear upper ymm state

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
leftover:
	// BX is in [1,8] here, so these loads overlap already-compared
	// bytes rather than reading out of bounds.
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal

	LEAQ	0(BX*8), CX	// CX = bits of data
	NEGQ	CX		// CX = 64 - bits of data (mod 64), shift amount

	// If the address ends in 11111xxx, an 8-byte load might cross
	// a page boundary; load ending at the last byte instead.
	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// Compare only the low BX bytes: subtract then shift out the
	// bytes beyond the data; ZF reflects equality of the data bytes.
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)
	RET
   1507 
// Three-way string comparison: loads both string headers and
// tail-calls the common compare body, which writes -1/0/+1.
TEXT runtimecmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9	// cmpbody writes the result word here
	JMP	runtimecmpbody(SB)
   1515 
// bytes.Compare: unpacks both slice headers (pointer at +0, length
// at +8 within each 24-byte slice argument) and tail-calls the
// common compare body, which writes -1/0/+1.
TEXT bytesCompare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9	// cmpbody writes the result word here
	JMP	runtimecmpbody(SB)
   1523 
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
//
// Three-way lexicographic compare of the first min(alen, blen)
// bytes; if those are all equal, the shorter operand sorts first.
// Uses AVX2 / SSE 16-byte chunks for long inputs, 8-byte scalar
// compares for the mid range, and a page-boundary-safe path for
// fewer than 8 bytes.
TEXT runtimecmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame		// same pointer: result depends only on lengths
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB    runtimesupport_avx2(SB), $1
	JEQ     big_loop_avx2
	JMP	big_loop
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diff64/diff48/diff32 adjust SI/DI so the differing 16-byte
	// chunk (found by big_loop at offsets 48/32/16) is at offset 0,
	// then fall into the common diff16 handler.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Loads overlap bytes already compared; never out of bounds
	// since both operands have at least 8 bytes.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of AX
	CMPB	SI, $0xf8
	JA	si_high		// 8-byte load might cross a page boundary
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI		// discard bytes beyond the data

	// load bytes of b in to high bytes of BX
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI		// discard bytes beyond the data

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	// Common prefix is identical; order by length.
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop
   1729 
// strings.indexShortStr: unpacks both string headers into the
// registers indexShortStr expects and tail-calls it.
TEXT stringsindexShortStr(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), DI
	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
	MOVQ s_len+8(FP), DX
	MOVQ c+16(FP), BP
	MOVQ c_len+24(FP), AX
	MOVQ DI, R10		// save original start so the index can be computed
	LEAQ ret+32(FP), R11
	JMP  runtimeindexShortStr(SB)
   1739 
// bytes.indexShortStr: same as the strings variant, but the
// arguments are slice headers (24 bytes each), hence the offsets.
TEXT bytesindexShortStr(SB),NOSPLIT,$0-56
	MOVQ s+0(FP), DI
	MOVQ s_len+8(FP), DX
	MOVQ c+24(FP), BP
	MOVQ c_len+32(FP), AX
	MOVQ DI, R10		// save original start so the index can be computed
	LEAQ ret+48(FP), R11
	JMP  runtimeindexShortStr(SB)
   1748 
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// BP: pointer to string, that we are searching for
// R11: address, where to put return value
//
// Substring search specialized by needle length: each size class
// loads the needle's head (and, for non-power-of-two sizes, its
// tail) into registers/vectors once, then slides a one-byte-at-a-
// time window over the haystack. Writes the byte index of the
// first match, or -1, to (R11). R10 must hold the original
// haystack start (set by the trampolines above).
TEXT runtimeindexShortStr(SB),NOSPLIT,$0
	CMPQ AX, DX
	JA fail			// needle longer than haystack: no match
	CMPQ DX, $16
	JAE sse42		// long enough haystack: consider PCMPESTRI
no_sse42:
	CMPQ AX, $2
	JA   _3_or_more
	MOVW (BP), BP		// needle fits in a 16-bit register
	LEAQ -1(DI)(DX*1), DX	// DX = one past last valid window start
loop2:
	MOVW (DI), SI
	CMPW SI,BP
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	CMPQ AX, $3
	JA   _4_or_more
	MOVW 1(BP), BX		// overlapping head (2 bytes) + tail (2 bytes)
	MOVW (BP), BP
	LEAQ -2(DI)(DX*1), DX
loop3:
	MOVW (DI), SI
	CMPW SI,BP
	JZ   partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	MOVW 1(DI), SI		// head matched; check the tail too
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	CMPQ AX, $4
	JA   _5_or_more
	MOVL (BP), BP		// needle fits in a 32-bit register
	LEAQ -3(DI)(DX*1), DX
loop4:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	CMPQ AX, $7
	JA   _8_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX		// DX = one past last valid window start
	MOVL -4(BP)(AX*1), BX	// last 4 bytes of needle (overlaps head)
	MOVL (BP), BP		// first 4 bytes of needle
loop5to7:
	MOVL (DI), SI
	CMPL SI,BP
	JZ   partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI	// compare the needle's tail at the window end
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	CMPQ AX, $8
	JA   _9_or_more
	MOVQ (BP), BP		// needle fits in a 64-bit register
	LEAQ -7(DI)(DX*1), DX
loop8:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	CMPQ AX, $15
	JA   _16_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVQ -8(BP)(AX*1), BX	// last 8 bytes of needle (overlaps head)
	MOVQ (BP), BP		// first 8 bytes of needle
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,BP
	JZ   partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI	// compare the needle's tail at the window end
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	CMPQ AX, $16
	JA   _17_or_more
	MOVOU (BP), X1		// needle fits in one 16-byte vector
	LEAQ -15(DI)(DX*1), DX
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff	// all 16 bytes equal?
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_or_more:
	CMPQ AX, $31
	JA   _32_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVOU -16(BP)(AX*1), X0	// last 16 bytes of needle (overlaps head)
	MOVOU (BP), X1		// first 16 bytes of needle
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	MOVOU -16(AX)(DI*1), X3	// compare the needle's tail at the window end
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ  SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	CMPQ AX, $32
	JA   _33_to_63
	VMOVDQU (BP), Y1	// needle fits in one 32-byte vector
	LEAQ -31(DI)(DX*1), DX
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff	// all 32 bytes equal?
	JE   success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop32
	JMP fail_avx2
_33_to_63:
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	VMOVDQU -32(BP)(AX*1), Y0	// last 32 bytes of needle (overlaps head)
	VMOVDQU (BP), Y1		// first 32 bytes of needle
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff
	JE   partial_success33to63
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	JMP fail_avx2
partial_success33to63:
	VMOVDQU -32(AX)(DI*1), Y3	// compare the needle's tail at the window end
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL  SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
fail_avx2:
	VZEROUPPER		// avoid AVX->SSE transition penalty
fail:
	MOVQ $-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP success
sse42:
	CMPB runtimesupport_sse42(SB), $1
	JNE no_sse42
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE _9_or_more
	LEAQ 16(BP), SI
	TESTW $0xff0, SI	// would a 16-byte load of the needle cross a page?
	JEQ no_sse42
	MOVOU (BP), X1
	LEAQ -15(DI)(DX*1), SI
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	PCMPESTRI $0x0c, -1(SI), X1	// final (overlapping) window at the end
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI
success:
	SUBQ R10, DI		// convert pointer to index from haystack start
	MOVQ DI, (R11)
	RET
   1997 
   1998 
// bytes.IndexByte: unpacks the slice header and sought byte, then
// tail-calls the common body, which writes the index or -1.
TEXT bytesIndexByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  runtimeindexbytebody(SB)
   2005 
// strings.IndexByte: unpacks the string header and sought byte,
// then tail-calls the common body, which writes the index or -1.
TEXT stringsIndexByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  runtimeindexbytebody(SB)
   2012 
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Finds the first occurrence of AL in the BX bytes at SI and
// writes its index (or -1) to (R8). Uses AVX2 32-byte chunks when
// available and len > 32, SSE 16-byte chunks for len >= 16, and a
// page-boundary-safe single-vector path for shorter inputs.
TEXT runtimeindexbytebody(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ SI, DI		// DI = current search position

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI	// Compute offset of chunk within data.
	ADDQ DX, DI	// Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1 // Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB   runtimesupport_avx2(SB), $1
	JNE sse			// no AVX2: fall back to the SSE loop
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB  X0, Y1	// broadcast sought byte to all 32 lanes
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	ADDQ $32, DI
	CMPQ DI, R11
	JLT avx2_loop
	// Final (possibly overlapping) 32-byte chunk.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	VZEROUPPER		// avoid AVX->SSE transition penalty
	MOVQ $-1, (R8)
	RET

avx2success:
	VPMOVMSKB Y3, DX
	BSFL DX, DX		// index of match within the chunk
	SUBQ SI, DI		// offset of chunk within data
	ADDQ DI, DX
	MOVQ DX, (R8)
	VZEROUPPER
	RET
   2139 
// bytesEqual reports whether byte slices a and b have the same length
// and contents.  Go signature (ABI0 stack frame, $0-49):
//   func Equal(a, b []byte) bool
// Frame: a = (ptr,len,cap) at +0/+8/+16, b at +24/+32/+40, bool result at +48.
// NOTE(review): the `·` separators in symbol names appear stripped by this
// listing (runtimememeqbody = runtime·memeqbody) — confirm against upstream.
TEXT bytesEqual(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX		// BX = len(a)
	MOVQ	b_len+32(FP), CX	// CX = len(b)
	CMPQ	BX, CX
	JNE	eqret			// different lengths => not equal
	MOVQ	a+0(FP), SI		// SI = &a[0]
	MOVQ	b+24(FP), DI		// DI = &b[0]
	LEAQ	ret+48(FP), AX		// AX = address where the bool result goes
	JMP	runtimememeqbody(SB)	// tail call; memeqbody stores the result via AX
eqret:
	MOVB	$0, ret+48(FP)		// lengths differ: result = false
	RET
   2152 
   2153 
// bytescountByte counts occurrences of byte c in slice s.
// Go signature (ABI0, $0-40): func countByte(s []byte, c byte) int
// c sits at +24 because the slice header occupies 24 bytes (ptr,len,cap).
// Marshals arguments into the registers expected by runtimecountByte.
TEXT bytescountByte(SB),NOSPLIT,$0-40
	MOVQ s+0(FP), SI	// SI = data pointer
	MOVQ s_len+8(FP), BX	// BX = data length
	MOVB c+24(FP), AL	// AL = byte to count
	LEAQ ret+32(FP), R8	// R8 = address of the int result slot
	JMP  runtimecountByte(SB)	// tail call into the shared body
   2160 
// stringscountByte counts occurrences of byte c in string s.
// Go signature (ABI0, $0-32): func countByte(s string, c byte) int
// c sits at +16 because the string header occupies 16 bytes (ptr,len).
// Marshals arguments into the registers expected by runtimecountByte.
TEXT stringscountByte(SB),NOSPLIT,$0-32
	MOVQ s+0(FP), SI	// SI = data pointer
	MOVQ s_len+8(FP), BX	// BX = data length
	MOVB c+16(FP), AL	// AL = byte to count
	LEAQ ret+24(FP), R8	// R8 = address of the int result slot
	JMP  runtimecountByte(SB)	// tail call into the shared body
   2167 
// runtimecountByte is the shared body behind bytescountByte and
// stringscountByte: it counts the bytes of the data equal to AL and
// stores the count, as a machine word, at the address in R8.
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This requires the POPCNT instruction
TEXT runtimecountByte(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0	// X0 = sought byte replicated into all 16 lanes

	CMPQ BX, $16
	JLT small	// lengths < 16 use a single masked load below

	MOVQ $0, R12 // Accumulator

	MOVQ SI, DI	// DI = current read position; SI keeps the start address

	CMPQ BX, $32
	JA avx2	// try 32-byte AVX2 chunks for longer inputs
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	// Main loop: count matches one aligned-size (16-byte) chunk at a time.
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Count number of matching bytes
	POPCNTL DX, DX
	// Accumulate into R12
	ADDQ DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop	// JBE (not JB): the final chunk at AX is also handled here

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ $15, BX
	JZ end	// len was a multiple of 16; nothing left to count

	// Create mask to ignore overlap between previous 16 byte block
	// and the next.
	MOVQ $16,CX
	SUBQ BX, CX	// CX = 16 - (len mod 16) = number of already-counted low bytes
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10	// R10 keeps only the high (len mod 16) bits of the 16-bit mask

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX	// drop match bits already counted by the loop
	POPCNTL DX, DX
	ADDQ DX, R12
end:
	MOVQ R12, (R8)	// store the final count
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero	// empty input: count is 0

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage	// a 16-byte load from SI would run into the next page

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask.
	MOVB BX, CX
	MOVQ $1, R10
	SALQ CL, R10
	SUBQ $1, R10	// R10 = (1 << len) - 1: low len bits set

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ $0, (R8)	// zero-length input: count = 0
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	MOVQ $16,CX
	SUBQ BX, CX	// CX = 16 - len = number of leading out-of-slice bytes
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10	// keep only the high len bits of the 16-bit mask

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1	// load ends exactly at SI+len, staying on this page
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask
	ANDQ R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB   runtimesupport_avx2(SB), $1
	JNE sse	// AVX2 not available at run time; use the SSE loop instead
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of the final 32-byte chunk
	VPBROADCASTB  X0, Y1	// Y1 = sought byte replicated into all 32 lanes
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3	// per-byte equality mask in Y3
	VPMOVMSKB Y3, DX	// one match bit per byte
	POPCNTL DX, DX	// count matches in this 32-byte chunk
	ADDQ DX, R12
	ADDQ $32, DI
	CMPQ DI, R11
	JLE avx2_loop

	// If last block is already processed,
	// skip to the end.
	// NOTE(review): after a JLE loop exit DI > R11, so this JEQ looks
	// never-taken — confirm upstream.  Harmless either way: when len is a
	// multiple of 32 the tail mask below becomes 0 (CL=32 clears R10), so
	// the overlapping chunk contributes no extra matches.
	CMPQ DI, R11
	JEQ endavx

	// Load address of the last 32 bytes.
	// There is an overlap with the previous block.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next.
	ANDQ $31, BX	// BX = len mod 32
	MOVQ $32,CX
	SUBQ BX, CX	// CX = number of already-counted low bytes
	MOVQ $0xFFFFFFFF, R10
	SARQ CL, R10
	SALQ CL, R10	// keep only the high (len mod 32) bits of the 32-bit mask
	// Apply mask
	ANDQ R10, DX
	POPCNTL DX, DX
	ADDQ DX, R12
	MOVQ R12, (R8)	// store the final count
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ R12, (R8)	// store the final count
	RET
   2342 
// runtimereturn0 sets AX, the integer return-value register, to zero.
// NOTE(review): callers are not visible in this chunk; presumably used where
// a zero return value must be forced into AX without Go code — confirm.
TEXT runtimereturn0(SB), NOSPLIT, $0
	MOVL	$0, AX	// 32-bit write zero-extends, clearing all 64 bits of AX
	RET
   2346 
   2347 
// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
// Only walks the g -> m -> curg -> stack chain; clobbers CX and AX,
// both caller-saved in the C ABI, and returns the result in AX.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)	// CX = TLS base holding the current g
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), AX	// AX = g.m
	MOVQ	m_curg(AX), AX	// AX = m.curg
	MOVQ	(g_stack+stack_hi)(AX), AX	// AX = curg.stack.hi (C return value)
	RET
   2357 
// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
// The leading NOP makes that saved return PC fall inside this function's
// code range rather than on its first byte.
TEXT runtimegoexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtimegoexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP (keeps the post-CALL PC inside goexit)
   2365 
// This is called from .init_array and follows the platform, not Go, ABI.
// DI (first C-ABI argument register) carries the new moduledata pointer,
// which is appended to the runtime's singly linked module list.
TEXT runtimeaddmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtimelastmoduledatap(SB), AX	// AX = current list tail
	MOVQ	DI, moduledata_next(AX)	// tail.next = new module
	MOVQ	DI, runtimelastmoduledatap(SB)	// new module becomes the tail
	POPQ	R15	// restore callee-saved R15 for the C caller
	RET
   2374 
// gcWriteBarrier performs a heap pointer write and informs the GC.
//
// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
// - DI is the destination of the write
// - AX is the value being written at DI
// It clobbers FLAGS. It does not clobber any general-purpose registers,
// but may clobber others (e.g., SSE registers).
//
// Frame layout ($120): slots 0-96 spill the general-purpose registers on
// the slow (flush) path; 104/112 save R14/R13 for the fast path.
TEXT runtimegcWriteBarrier(SB),NOSPLIT,$120
	// Save the registers clobbered by the fast path. This is slightly
	// faster than having the caller spill these.
	MOVQ	R14, 104(SP)
	MOVQ	R13, 112(SP)
	// TODO: Consider passing g.m.p in as an argument so they can be shared
	// across a sequence of write barriers.
	get_tls(R13)
	MOVQ	g(R13), R13	// R13 = g
	MOVQ	g_m(R13), R13	// R13 = g.m
	MOVQ	m_p(R13), R13	// R13 = g.m.p
	MOVQ	(p_wbBuf+wbBuf_next)(R13), R14	// R14 = p.wbBuf.next (record cursor)
	// Increment wbBuf.next position.
	LEAQ	16(R14), R14	// each barrier appends two 8-byte words
	MOVQ	R14, (p_wbBuf+wbBuf_next)(R13)
	CMPQ	R14, (p_wbBuf+wbBuf_end)(R13)	// flags consumed by JEQ below
	// Record the write.
	MOVQ	AX, -16(R14)	// Record value
	MOVQ	(DI), R13	// TODO: This turns bad writes into bad reads.
	MOVQ	R13, -8(R14)	// Record *slot
	// Is the buffer full? (flags set in CMPQ above)
	JEQ	flush
ret:
	MOVQ	104(SP), R14	// restore fast-path clobbers
	MOVQ	112(SP), R13
	// Do the write.
	MOVQ	AX, (DI)
	RET

flush:
	// Save all general purpose registers since these could be
	// clobbered by wbBufFlush and were not saved by the caller.
	// It is possible for wbBufFlush to clobber other registers
	// (e.g., SSE registers), but the compiler takes care of saving
	// those in the caller if necessary. This strikes a balance
	// with registers that are likely to be used.
	//
	// We don't have type information for these, but all code under
	// here is NOSPLIT, so nothing will observe these.
	//
	// TODO: We could strike a different balance; e.g., saving X0
	// and not saving GP registers that are less likely to be used.
	MOVQ	DI, 0(SP)	// Also first argument to wbBufFlush
	MOVQ	AX, 8(SP)	// Also second argument to wbBufFlush
	MOVQ	BX, 16(SP)
	MOVQ	CX, 24(SP)
	MOVQ	DX, 32(SP)
	// DI already saved
	MOVQ	SI, 40(SP)
	MOVQ	BP, 48(SP)
	MOVQ	R8, 56(SP)
	MOVQ	R9, 64(SP)
	MOVQ	R10, 72(SP)
	MOVQ	R11, 80(SP)
	MOVQ	R12, 88(SP)
	// R13 already saved
	// R14 already saved
	MOVQ	R15, 96(SP)

	// This takes arguments DI and AX
	CALL	runtimewbBufFlush(SB)

	// Restore every register spilled above, then finish on the fast path.
	MOVQ	0(SP), DI
	MOVQ	8(SP), AX
	MOVQ	16(SP), BX
	MOVQ	24(SP), CX
	MOVQ	32(SP), DX
	MOVQ	40(SP), SI
	MOVQ	48(SP), BP
	MOVQ	56(SP), R8
	MOVQ	64(SP), R9
	MOVQ	72(SP), R10
	MOVQ	80(SP), R11
	MOVQ	88(SP), R12
	MOVQ	96(SP), R15
	JMP	ret
   2458