Home | History | Annotate | Download | only in runtime
      1 // Copyright 2015 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 #include "go_asm.h"
      6 #include "go_tls.h"
      7 #include "tls_arm64.h"
      8 #include "funcdata.h"
      9 #include "textflag.h"
     10 
     11 TEXT runtimert0_go(SB),NOSPLIT,$0
        	// Runtime bootstrap. Derives g0's stack bounds from the incoming stack,
        	// calls _cgo_init when it is present, links g0 and m0, then runs
        	// check, args, osinit and schedinit, queues the function stored in
        	// runtimemainPC with newproc and enters mstart.
     12 	// SP = stack; R0 = argc; R1 = argv
     13 
     14 	SUB	$32, RSP
     15 	MOVW	R0, 8(RSP) // argc
     16 	MOVD	R1, 16(RSP) // argv
     17 
     18 	// create istack out of the given (operating system) stack.
     19 	// _cgo_init may update stackguard.
     20 	MOVD	$runtimeg0(SB), g
     21 	MOVD RSP, R7
     22 	MOVD	$(-64*1024)(R7), R0	// R0 = RSP - 64KB, provisional low bound
     23 	MOVD	R0, g_stackguard0(g)
     24 	MOVD	R0, g_stackguard1(g)
     25 	MOVD	R0, (g_stack+stack_lo)(g)
     26 	MOVD	R7, (g_stack+stack_hi)(g)
     27 
     28 	// if there is a _cgo_init, call it using the gcc ABI.
     29 	MOVD	_cgo_init(SB), R12
     30 	CMP	$0, R12
     31 	BEQ	nocgo
     32 
     33 	MRS_TPIDR_R0			// load TLS base pointer
     34 	MOVD	R0, R3			// arg 3: TLS base pointer
     35 #ifdef TLSG_IS_VARIABLE
     36 	MOVD	$runtimetls_g(SB), R2 	// arg 2: &tls_g
     37 #else
     38 	MOVD	$0, R2		        // arg 2: not used when using platform's TLS
     39 #endif
     40 	MOVD	$setg_gcc<>(SB), R1	// arg 1: setg
     41 	MOVD	g, R0			// arg 0: G
     42 	BL	(R12)
        	// Fall through to nocgo. Re-testing _cgo_init here would only
        	// branch to the very next instruction, so no test is needed.
     46 
     47 nocgo:
     48 	// update stackguard after _cgo_init
     49 	MOVD	(g_stack+stack_lo)(g), R0
     50 	ADD	$const__StackGuard, R0
     51 	MOVD	R0, g_stackguard0(g)
     52 	MOVD	R0, g_stackguard1(g)
     53 
     54 	// set the per-goroutine and per-mach "registers"
     55 	MOVD	$runtimem0(SB), R0
     56 
     57 	// save m->g0 = g0
     58 	MOVD	g, m_g0(R0)
     59 	// save m0 to g0->m
     60 	MOVD	R0, g_m(g)
     61 
     62 	BL	runtimecheck(SB)
     63 
     64 	MOVW	8(RSP), R0	// copy argc
     65 	MOVW	R0, -8(RSP)
     66 	MOVD	16(RSP), R0		// copy argv
     67 	MOVD	R0, 0(RSP)
     68 	BL	runtimeargs(SB)
     69 	BL	runtimeosinit(SB)
     70 	BL	runtimeschedinit(SB)
     71 
     72 	// create a new goroutine to start program
     73 	MOVD	$runtimemainPC(SB), R0		// entry
        	// Push four 8-byte words (0, entry, 0, 0) below RSP using
        	// pre-decrement stores through R7, then make that the new RSP.
     74 	MOVD	RSP, R7
     75 	MOVD.W	$0, -8(R7)
     76 	MOVD.W	R0, -8(R7)
     77 	MOVD.W	$0, -8(R7)
     78 	MOVD.W	$0, -8(R7)
     79 	MOVD	R7, RSP
     80 	BL	runtimenewproc(SB)
     81 	ADD	$32, RSP	// drop the four words pushed above
     82 
     83 	// start this M
     84 	BL	runtimemstart(SB)
     85 
        	// Reached only if mstart returns: force a fault by storing to address 0.
     86 	MOVD	$0, R0
     87 	MOVD	R0, (R0)	// boom
     88 	UNDEF
     89 
     90 DATA	runtimemainPC+0(SB)/8,$runtimemain(SB)	// 8-byte slot holding the address of runtimemain
     91 GLOBL	runtimemainPC(SB),RODATA,$8	// read-only, 8 bytes; rt0_go passes its address as the newproc entry
     92 
     93 TEXT runtimebreakpoint(SB),NOSPLIT,$-8-0
        	// Executes a BRK instruction, then returns if execution is resumed.
     94 	BRK
     95 	RET
     96 
     97 TEXT runtimeasminit(SB),NOSPLIT,$-8-0
        	// No work is done here: the function returns immediately.
     98 	RET
     99 
    100 /*
    101  *  go-routine
    102  */
    103 
    104 // void gosave(Gobuf*)
    105 // save state in Gobuf; setjmp
        // Stores the caller's SP, LR (as the resume PC) and g into *buf and
        // zeroes buf.lr and buf.ret. buf.ctxt must already be zero, otherwise
        // badctxt is called. R0 and R3 are used as scratch.
    106 TEXT runtimegosave(SB), NOSPLIT, $-8-8
    107 	MOVD	buf+0(FP), R3
    108 	MOVD	RSP, R0
    109 	MOVD	R0, gobuf_sp(R3)
    110 	MOVD	LR, gobuf_pc(R3)
    111 	MOVD	g, gobuf_g(R3)
    112 	MOVD	ZR, gobuf_lr(R3)
    113 	MOVD	ZR, gobuf_ret(R3)
    114 	// Assert ctxt is zero. See func save.
    115 	MOVD	gobuf_ctxt(R3), R0
    116 	CMP	$0, R0
    117 	BEQ	2(PC)	// ctxt == 0: skip the badctxt call
    118 	CALL	runtimebadctxt(SB)
    119 	RET
    120 
    121 // void gogo(Gobuf*)
    122 // restore state from Gobuf; longjmp
        // Loads g, SP, LR, the return value (R0) and the closure context (R26)
        // from *buf, clears buf's sp/ret/lr/ctxt fields, and jumps to buf.pc.
        // Control leaves through that jump; there is no RET.
    123 TEXT runtimegogo(SB), NOSPLIT, $24-8
    124 	MOVD	buf+0(FP), R5
    125 	MOVD	gobuf_g(R5), g
    126 	BL	runtimesave_g(SB)
    127 
    128 	MOVD	0(g), R4	// make sure g is not nil
    129 	MOVD	gobuf_sp(R5), R0
    130 	MOVD	R0, RSP
    131 	MOVD	gobuf_lr(R5), LR
    132 	MOVD	gobuf_ret(R5), R0
    133 	MOVD	gobuf_ctxt(R5), R26
        	// The values now live in registers; clear the buffer's copies.
    134 	MOVD	$0, gobuf_sp(R5)
    135 	MOVD	$0, gobuf_ret(R5)
    136 	MOVD	$0, gobuf_lr(R5)
    137 	MOVD	$0, gobuf_ctxt(R5)
    138 	CMP	ZR, ZR // set condition codes for == test, needed by stack split
    139 	MOVD	gobuf_pc(R5), R6
    140 	B	(R6)
    141 
    142 // void mcall(fn func(*g))
    143 // Switch to m->g0's stack, call fn(g).
    144 // Fn must never return. It should gogo(&g->sched)
    145 // to keep running g.
        // Calling mcall while already on g0 is reported through badmcall;
        // a return from fn is reported through badmcall2.
    146 TEXT runtimemcall(SB), NOSPLIT, $-8-8
    147 	// Save caller state in g->sched
    148 	MOVD	RSP, R0
    149 	MOVD	R0, (g_sched+gobuf_sp)(g)
    150 	MOVD	LR, (g_sched+gobuf_pc)(g)
    151 	MOVD	$0, (g_sched+gobuf_lr)(g)
    152 	MOVD	g, (g_sched+gobuf_g)(g)
    153 
    154 	// Switch to m->g0 & its stack, call fn.
    155 	MOVD	g, R3	// R3 = the g we are leaving
    156 	MOVD	g_m(g), R8
    157 	MOVD	m_g0(R8), g
    158 	BL	runtimesave_g(SB)
    159 	CMP	g, R3
    160 	BNE	2(PC)	// old g != g0: skip the badmcall jump
    161 	B	runtimebadmcall(SB)
    162 	MOVD	fn+0(FP), R26			// context
    163 	MOVD	0(R26), R4			// code pointer
    164 	MOVD	(g_sched+gobuf_sp)(g), R0
    165 	MOVD	R0, RSP	// sp = m->g0->sched.sp
    166 	MOVD	R3, -8(RSP)	// fn's argument: the g we switched away from
    167 	MOVD	$0, -16(RSP)	// zero the word that becomes 0(RSP) after the SUB
    168 	SUB	$16, RSP
    169 	BL	(R4)
    170 	B	runtimebadmcall2(SB)
    171 
    172 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
    173 // of the G stack. We need to distinguish the routine that
    174 // lives at the bottom of the G stack from the one that lives
    175 // at the top of the system stack because the one at the top of
    176 // the system stack terminates the stack walk (see topofstack()).
        // systemstack records this symbol's address plus 8 as the saved PC; the
        // body itself starts with UNDEF.
    177 TEXT runtimesystemstack_switch(SB), NOSPLIT, $0-0
    178 	UNDEF
    179 	BL	(LR)	// make sure this function is not leaf
    180 	RET
    181 
    182 // func systemstack(fn func())
        // Runs fn on the m->g0 stack. When the current g is already g0 or
        // gsignal, fn is tail-called directly. When it is m->curg, the state is
        // saved in g->sched, execution switches to g0, fn is called, and the
        // original g and stack are restored. Any other g goes to badsystemstack.
    183 TEXT runtimesystemstack(SB), NOSPLIT, $0-8
    184 	MOVD	fn+0(FP), R3	// R3 = fn
    185 	MOVD	R3, R26		// context
    186 	MOVD	g_m(g), R4	// R4 = m
    187 
    188 	MOVD	m_gsignal(R4), R5	// R5 = gsignal
    189 	CMP	g, R5
    190 	BEQ	noswitch
    191 
    192 	MOVD	m_g0(R4), R5	// R5 = g0
    193 	CMP	g, R5
    194 	BEQ	noswitch
    195 
    196 	MOVD	m_curg(R4), R6
    197 	CMP	g, R6
    198 	BEQ	switch
    199 
    200 	// Bad: g is not gsignal, not g0, not curg. What is it?
    201 	// Hide call from linker nosplit analysis.
    202 	MOVD	$runtimebadsystemstack(SB), R3
    203 	BL	(R3)
    204 
    205 switch:
    206 	// save our state in g->sched. Pretend to
    207 	// be systemstack_switch if the G stack is scanned.
    208 	MOVD	$runtimesystemstack_switch(SB), R6
    209 	ADD	$8, R6	// get past prologue
    210 	MOVD	R6, (g_sched+gobuf_pc)(g)
    211 	MOVD	RSP, R0
    212 	MOVD	R0, (g_sched+gobuf_sp)(g)
    213 	MOVD	$0, (g_sched+gobuf_lr)(g)
    214 	MOVD	g, (g_sched+gobuf_g)(g)
    215 
    216 	// switch to g0
    217 	MOVD	R5, g	// R5 still holds g0 from the check above
    218 	BL	runtimesave_g(SB)
    219 	MOVD	(g_sched+gobuf_sp)(g), R3
    220 	// make it look like mstart called systemstack on g0, to stop traceback
    221 	SUB	$16, R3
    222 	AND	$~15, R3	// round down to a 16-byte boundary
    223 	MOVD	$runtimemstart(SB), R4
    224 	MOVD	R4, 0(R3)
    225 	MOVD	R3, RSP
    226 
    227 	// call target function
    228 	MOVD	0(R26), R3	// code pointer
    229 	BL	(R3)
    230 
    231 	// switch back to g
    232 	MOVD	g_m(g), R3
    233 	MOVD	m_curg(R3), g
    234 	BL	runtimesave_g(SB)
    235 	MOVD	(g_sched+gobuf_sp)(g), R0
    236 	MOVD	R0, RSP
    237 	MOVD	$0, (g_sched+gobuf_sp)(g)
    238 	RET
    239 
    240 noswitch:
    241 	// already on m stack, just call directly
    242 	// Using a tail call here cleans up tracebacks since we won't stop
    243 	// at an intermediate systemstack.
    244 	MOVD	0(R26), R3	// code pointer
    245 	MOVD.P	16(RSP), R30	// restore LR
    246 	B	(R3)
    247 
    248 /*
    249  * support for morestack
    250  */
    251 
    252 // Called during function prolog when more stack is needed.
    253 // Caller has already loaded:
    254 // R3 prolog's LR (R30)
    255 //
    256 // The traceback routines see morestack on a g0 as being
    257 // the top of a stack (for example, morestack calling newstack
    258 // calling the scheduler calling newm calling gc), so we must
    259 // record an argument size. For that purpose, it has no arguments.
        //
        // Saves the interrupted function's context in g->sched and its caller's
        // context in m->morebuf, then switches to m->g0 and calls newstack.
    260 TEXT runtimemorestack(SB),NOSPLIT,$-8-0
    261 	// Cannot grow scheduler stack (m->g0).
    262 	MOVD	g_m(g), R8
    263 	MOVD	m_g0(R8), R4
    264 	CMP	g, R4
    265 	BNE	3(PC)	// not on g0: skip the two-instruction error path
    266 	BL	runtimebadmorestackg0(SB)
    267 	B	runtimeabort(SB)
    268 
    269 	// Cannot grow signal stack (m->gsignal).
    270 	MOVD	m_gsignal(R8), R4
    271 	CMP	g, R4
    272 	BNE	3(PC)	// not on gsignal: skip the two-instruction error path
    273 	BL	runtimebadmorestackgsignal(SB)
    274 	B	runtimeabort(SB)
    275 
    276 	// Called from f.
    277 	// Set g->sched to context in f
    278 	MOVD	RSP, R0
    279 	MOVD	R0, (g_sched+gobuf_sp)(g)
    280 	MOVD	LR, (g_sched+gobuf_pc)(g)
    281 	MOVD	R3, (g_sched+gobuf_lr)(g)
    282 	MOVD	R26, (g_sched+gobuf_ctxt)(g)
    283 
    284 	// Called from f.
    285 	// Set m->morebuf to f's callers.
    286 	MOVD	R3, (m_morebuf+gobuf_pc)(R8)	// f's caller's PC
    287 	MOVD	RSP, R0
    288 	MOVD	R0, (m_morebuf+gobuf_sp)(R8)	// f's caller's RSP
    289 	MOVD	g, (m_morebuf+gobuf_g)(R8)
    290 
    291 	// Call newstack on m->g0's stack.
    292 	MOVD	m_g0(R8), g
    293 	BL	runtimesave_g(SB)
    294 	MOVD	(g_sched+gobuf_sp)(g), R0
    295 	MOVD	R0, RSP
    296 	MOVD.W	$0, -16(RSP)	// create a call frame on g0 (saved LR; keep 16-aligned)
    297 	BL	runtimenewstack(SB)
    298 
    299 	// Not reached, but make sure the return PC from the call to newstack
    300 	// is still in this function, and not the beginning of the next.
    301 	UNDEF
    302 
    302 TEXT runtimemorestack_noctxt(SB),NOSPLIT,$-4-0
        	// Variant of morestack with no closure context: zeroes R26, which
        	// morestack stores as g->sched.ctxt, and then jumps to morestack.
    303 	MOVW	$0, R26
    304 	B runtimemorestack(SB)
    306 
    307 // reflectcall: call a function with the given argument list
    308 // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
    309 // we don't have variable-sized frames, so we use a small number
    310 // of constant-sized-frame functions to encode a few bits of size in the pc.
    311 // Caution: ugly multiline assembly macros in your future!
    312 
    313 #define DISPATCH(NAME,MAXSIZE)		\
    314 	MOVD	$MAXSIZE, R27;		\
    315 	CMP	R27, R16;		\
    316 	BGT	3(PC);			\
    317 	MOVD	$NAME(SB), R27;	\
    318 	B	(R27)
        // DISPATCH expects the argument size in R16. When R16 > MAXSIZE the BGT
        // skips the two-instruction jump and execution continues with whatever
        // follows the macro; otherwise control jumps to NAME through R27.
    319 // Note: can't just "B NAME(SB)" - bad inlining results.
    320 
    321 TEXT reflectcall(SB), NOSPLIT, $0-0
        	// NOTE(review): as written, this symbol has the same name as the TEXT
        	// that follows, and the branch below targets that same name. The
        	// comment block before the DISPATCH macro mentions both "reflectcall"
        	// and "func call", so this is presumably a forwarding stub whose
        	// package qualifier was lost in this copy - confirm against upstream.
    322 	B	reflectcall(SB)
    323 
    323 TEXT reflectcall(SB), NOSPLIT, $-8-32
        	// Loads argsize into R16 and jumps to the first call<N> routine whose
        	// N is not smaller than argsize. Sizes are tried in ascending order,
        	// and a size above the largest one goes to badreflectcall.
    324 	MOVWU argsize+24(FP), R16
    325 	DISPATCH(runtimecall32, 32)
    326 	DISPATCH(runtimecall64, 64)
    327 	DISPATCH(runtimecall128, 128)
    328 	DISPATCH(runtimecall256, 256)
    329 	DISPATCH(runtimecall512, 512)
    330 	DISPATCH(runtimecall1024, 1024)
    331 	DISPATCH(runtimecall2048, 2048)
    332 	DISPATCH(runtimecall4096, 4096)
    333 	DISPATCH(runtimecall8192, 8192)
    334 	DISPATCH(runtimecall16384, 16384)
    335 	DISPATCH(runtimecall32768, 32768)
    336 	DISPATCH(runtimecall65536, 65536)
    337 	DISPATCH(runtimecall131072, 131072)
    338 	DISPATCH(runtimecall262144, 262144)
    339 	DISPATCH(runtimecall524288, 524288)
    340 	DISPATCH(runtimecall1048576, 1048576)
    341 	DISPATCH(runtimecall2097152, 2097152)
    342 	DISPATCH(runtimecall4194304, 4194304)
    343 	DISPATCH(runtimecall8388608, 8388608)
    344 	DISPATCH(runtimecall16777216, 16777216)
    345 	DISPATCH(runtimecall33554432, 33554432)
    346 	DISPATCH(runtimecall67108864, 67108864)
    347 	DISPATCH(runtimecall134217728, 134217728)
    348 	DISPATCH(runtimecall268435456, 268435456)
    349 	DISPATCH(runtimecall536870912, 536870912)
    350 	DISPATCH(runtimecall1073741824, 1073741824)
        	// No size class matched.
    351 	MOVD	$runtimebadreflectcall(SB), R0
    352 	B	(R0)
    354 
    355 #define CALLFN(NAME,MAXSIZE)			\
    356 TEXT NAME(SB), WRAPPER, $MAXSIZE-24;		\
    357 	NO_LOCAL_POINTERS;			\
    358 	/* copy arguments to stack */		\
    359 	MOVD	arg+16(FP), R3;			\
    360 	MOVWU	argsize+24(FP), R4;		\
    361 	ADD	$8, RSP, R5;			\
    362 	BIC	$0xf, R4, R6;			\
    363 	CBZ	R6, 6(PC);			\
    364 	/* if R6=(argsize&~15) != 0 */		\
    365 	ADD	R6, R5, R6;			\
    366 	/* copy 16 bytes a time */		\
    367 	LDP.P	16(R3), (R7, R8);		\
    368 	STP.P	(R7, R8), 16(R5);		\
    369 	CMP	R5, R6;				\
    370 	BNE	-3(PC);				\
    371 	AND	$0xf, R4, R6;			\
    372 	CBZ	R6, 6(PC);			\
    373 	/* if R6=(argsize&15) != 0 */		\
    374 	ADD	R6, R5, R6;			\
    375 	/* copy 1 byte a time for the rest */	\
    376 	MOVBU.P	1(R3), R7;			\
    377 	MOVBU.P	R7, 1(R5);			\
    378 	CMP	R5, R6;				\
    379 	BNE	-3(PC);				\
    380 	/* call function */			\
    381 	MOVD	f+8(FP), R26;			\
    382 	MOVD	(R26), R0;			\
    383 	PCDATA  $PCDATA_StackMapIndex, $0;	\
    384 	BL	(R0);				\
    385 	/* copy return values back */		\
    386 	MOVD	argtype+0(FP), R7;		\
    387 	MOVD	arg+16(FP), R3;			\
    388 	MOVWU	n+24(FP), R4;			\
    389 	MOVWU	retoffset+28(FP), R6;		\
    390 	ADD	$8, RSP, R5;			\
    391 	ADD	R6, R5; 			\
    392 	ADD	R6, R3;				\
    393 	SUB	R6, R4;				\
    394 	BL	callRet<>(SB);			\
    395 	RET
        // CALLFN defines one fixed-frame routine NAME. It copies argsize bytes from
        // arg to 8(RSP) (16 bytes at a time, then byte by byte), calls the code
        // pointer stored at *f with f in R26, and finally hands callRet<> the
        // type (R7), destination arg+retoffset (R3), source 8(RSP)+retoffset (R5)
        // and byte count n-retoffset (R4).
    396 
    397 // callRet copies return values back at the end of call*. This is a
    398 // separate function so it can allocate stack space for the arguments
    399 // to reflectcallmove. It does not follow the Go ABI; it expects its
    400 // arguments in registers.
        // The CALLFN macro sets the registers up as: R7 = argtype,
        // R3 = arg + retoffset, R5 = 8(RSP) + retoffset, R4 = n - retoffset.
    401 TEXT callRet<>(SB), NOSPLIT, $40-0
    402 	MOVD	R7, 8(RSP)	// 1st argument of reflectcallmove
    403 	MOVD	R3, 16(RSP)	// 2nd argument
    404 	MOVD	R5, 24(RSP)	// 3rd argument
    405 	MOVD	R4, 32(RSP)	// 4th argument
    406 	BL	runtimereflectcallmove(SB)
    407 	RET
    408 
    409 // These have 8 added to make the overall frame size a multiple of 16,
    410 // as required by the ABI. (There is another +8 for the saved LR.)
        // One routine is instantiated per power-of-two size class from 32 bytes
        // to 1073741824 bytes, matching the DISPATCH list in reflectcall.
    411 CALLFN(call32, 40 )
    412 CALLFN(call64, 72 )
    413 CALLFN(call128, 136 )
    414 CALLFN(call256, 264 )
    415 CALLFN(call512, 520 )
    416 CALLFN(call1024, 1032 )
    417 CALLFN(call2048, 2056 )
    418 CALLFN(call4096, 4104 )
    419 CALLFN(call8192, 8200 )
    420 CALLFN(call16384, 16392 )
    421 CALLFN(call32768, 32776 )
    422 CALLFN(call65536, 65544 )
    423 CALLFN(call131072, 131080 )
    424 CALLFN(call262144, 262152 )
    425 CALLFN(call524288, 524296 )
    426 CALLFN(call1048576, 1048584 )
    427 CALLFN(call2097152, 2097160 )
    428 CALLFN(call4194304, 4194312 )
    429 CALLFN(call8388608, 8388616 )
    430 CALLFN(call16777216, 16777224 )
    431 CALLFN(call33554432, 33554440 )
    432 CALLFN(call67108864, 67108872 )
    433 CALLFN(call134217728, 134217736 )
    434 CALLFN(call268435456, 268435464 )
    435 CALLFN(call536870912, 536870920 )
    436 CALLFN(call1073741824, 1073741832 )
    437 
    438 // AES hashing not implemented for ARM64, issue #10109.
        // Each aeshash stub below loads from address 0, which is expected to
        // fault. There is no RET after the load.
    439 TEXT runtimeaeshash(SB),NOSPLIT,$-8-0
    440 	MOVW	$0, R0
    441 	MOVW	(R0), R1	// load from address 0
    442 TEXT runtimeaeshash32(SB),NOSPLIT,$-8-0
        	// Unimplemented stub: loads from address 0; no RET follows.
    443 	MOVW	$0, R0
    444 	MOVW	(R0), R1
    445 TEXT runtimeaeshash64(SB),NOSPLIT,$-8-0
        	// Unimplemented stub: loads from address 0; no RET follows.
    446 	MOVW	$0, R0
    447 	MOVW	(R0), R1
    448 TEXT runtimeaeshashstr(SB),NOSPLIT,$-8-0
        	// Unimplemented stub: loads from address 0; no RET follows.
    449 	MOVW	$0, R0
    450 	MOVW	(R0), R1
    451 
    452 TEXT runtimeprocyield(SB),NOSPLIT,$0-0
        	// Executes YIELD once per count in the 32-bit cycles argument.
        	// NOTE(review): the decrement happens before the test, so a count of
        	// 0 would wrap around instead of exiting at once - confirm that
        	// callers never pass 0.
    453 	MOVWU	cycles+0(FP), R0
    454 again:
    455 	YIELD
    456 	SUBW	$1, R0
    457 	CBNZ	R0, again
    458 	RET
    459 
    460 // void jmpdefer(fv, sp);
    461 // called from deferreturn.
    462 // 1. grab stored LR for caller
    463 // 2. sub 4 bytes to get back to BL deferreturn
    464 // 3. BR to fn
        // fv is loaded into R26 as the closure context, and RSP is set to
        // argp - 8 before jumping to the code pointer stored at 0(fv).
    465 TEXT runtimejmpdefer(SB), NOSPLIT, $-8-16
    466 	MOVD	0(RSP), R0
    467 	SUB	$4, R0	// one instruction (4 bytes) back
    468 	MOVD	R0, LR
    469 
    470 	MOVD	fv+0(FP), R26
    471 	MOVD	argp+8(FP), R0
    472 	MOVD	R0, RSP
    473 	SUB	$8, RSP
    474 	MOVD	0(R26), R3
    475 	B	(R3)
    476 
    477 // Save state of caller into g->sched. Smashes R0.
        // Stores LR as sched.pc and RSP as sched.sp, and zeroes sched.lr and
        // sched.ret. sched.ctxt must already be zero, otherwise badctxt is called.
    478 TEXT gosave<>(SB),NOSPLIT,$-8
    479 	MOVD	LR, (g_sched+gobuf_pc)(g)
    480 	MOVD RSP, R0
    481 	MOVD	R0, (g_sched+gobuf_sp)(g)
    482 	MOVD	$0, (g_sched+gobuf_lr)(g)
    483 	MOVD	$0, (g_sched+gobuf_ret)(g)
    484 	// Assert ctxt is zero. See func save.
    485 	MOVD	(g_sched+gobuf_ctxt)(g), R0
    486 	CMP	$0, R0
    487 	BEQ	2(PC)	// ctxt == 0: skip the badctxt call
    488 	CALL	runtimebadctxt(SB)
    489 	RET
    490 
    491 // func asmcgocall(fn, arg unsafe.Pointer) int32
    492 // Call fn(arg) on the scheduler stack,
    493 // aligned appropriately for the gcc ABI.
    494 // See cgocall.go for more details.
        // Register use: R1 = fn, R0 = arg, R2 = RSP on entry, R4 = g on entry,
        // R9 = temporary copy of R0 across calls that may clobber it.
    495 TEXT asmcgocall(SB),NOSPLIT,$0-20
    496 	MOVD	fn+0(FP), R1
    497 	MOVD	arg+8(FP), R0
    498 
    499 	MOVD	RSP, R2		// save original stack pointer
    500 	MOVD	g, R4
    501 
    502 	// Figure out if we need to switch to m->g0 stack.
    503 	// We get called to create new OS threads too, and those
    504 	// come in on the m->g0 stack already.
    505 	MOVD	g_m(g), R8
    506 	MOVD	m_g0(R8), R3
    507 	CMP	R3, g
    508 	BEQ	g0
    509 	MOVD	R0, R9	// gosave<> and save_g might clobber R0
    510 	BL	gosave<>(SB)
    511 	MOVD	R3, g
    512 	BL	runtimesave_g(SB)
    513 	MOVD	(g_sched+gobuf_sp)(g), R0
    514 	MOVD	R0, RSP
    515 	MOVD	R9, R0
    516 
    517 	// Now on a scheduling stack (a pthread-created stack).
    518 g0:
    519 	// Save room for two of our pointers /*, plus 32 bytes of callee
    520 	// save area that lives on the caller stack. */
    521 	MOVD	RSP, R13
    522 	SUB	$16, R13
    523 	MOVD	R13, RSP
    524 	MOVD	R4, 0(RSP)	// save old g on stack
    525 	MOVD	(g_stack+stack_hi)(R4), R4
    526 	SUB	R2, R4	// R4 = old g's stack.hi - original SP
    527 	MOVD	R4, 8(RSP)	// save depth in old g stack (can't just save SP, as stack might be copied during a callback)
    528 	BL	(R1)
    529 	MOVD	R0, R9
    530 
    531 	// Restore g, stack pointer. R0 is errno, so don't touch it
    532 	MOVD	0(RSP), g
    533 	BL	runtimesave_g(SB)
    534 	MOVD	(g_stack+stack_hi)(g), R5
    535 	MOVD	8(RSP), R6
    536 	SUB	R6, R5	// R5 = stack.hi - saved depth = SP to return to
    537 	MOVD	R9, R0
    538 	MOVD	R5, RSP
    539 
    540 	MOVW	R0, ret+16(FP)
    541 	RET
    542 
    543 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
    544 // Turn the fn into a Go func (by taking its address) and call
    545 // cgocallback_gofunc.
        // The four outgoing arguments are placed at 8..32(RSP), and the call is
        // made indirectly through R0.
    546 TEXT runtimecgocallback(SB),NOSPLIT,$40-32
    547 	MOVD	$fn+0(FP), R0	// address of the fn argument slot, not its value
    548 	MOVD	R0, 8(RSP)
    549 	MOVD	frame+8(FP), R0
    550 	MOVD	R0, 16(RSP)
    551 	MOVD	framesize+16(FP), R0
    552 	MOVD	R0, 24(RSP)
    553 	MOVD	ctxt+24(FP), R0
    554 	MOVD	R0, 32(RSP)
    555 	MOVD	$runtimecgocallback_gofunc(SB), R0
    556 	BL	(R0)
    557 	RET
    558 
    559 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
    560 // See cgocall.go for more details.
        // Outline: obtain g (borrowing an m through needm when there is none),
        // remember g0's sched.sp, switch to m->curg and call cgocallbackg, then
        // restore curg's and g0's saved state and release a borrowed m via dropm.
    561 TEXT cgocallback_gofunc(SB),NOSPLIT,$24-32
    562 	NO_LOCAL_POINTERS
    563 
    564 	// Load g from thread-local storage.
    565 	MOVB	runtimeiscgo(SB), R3
    566 	CMP	$0, R3
    567 	BEQ	nocgo
    568 	BL	runtimeload_g(SB)
    569 nocgo:
    570 
    571 	// If g is nil, Go did not create the current thread.
    572 	// Call needm to obtain one for temporary use.
    573 	// In this case, we're running on the thread stack, so there's
    574 	// lots of space, but the linker doesn't know. Hide the call from
    575 	// the linker analysis by using an indirect call.
    576 	CMP	$0, g
    577 	BEQ	needm
    578 
    579 	MOVD	g_m(g), R8
    580 	MOVD	R8, savedm-8(SP)
    581 	B	havem
    582 
    583 needm:
    584 	MOVD	g, savedm-8(SP) // g is zero, so is m.
    585 	MOVD	$runtimeneedm(SB), R0
    586 	BL	(R0)
    587 
    588 	// Set m->sched.sp = SP, so that if a panic happens
    589 	// during the function we are about to execute, it will
    590 	// have a valid SP to run on the g0 stack.
    591 	// The next few lines (after the havem label)
    592 	// will save this SP onto the stack and then write
    593 	// the same SP back to m->sched.sp. That seems redundant,
    594 	// but if an unrecovered panic happens, unwindm will
    595 	// restore the g->sched.sp from the stack location
    596 	// and then systemstack will try to use it. If we don't set it here,
    597 	// that restored SP will be uninitialized (typically 0) and
    598 	// will not be usable.
    599 	MOVD	g_m(g), R8
    600 	MOVD	m_g0(R8), R3
    601 	MOVD	RSP, R0
    602 	MOVD	R0, (g_sched+gobuf_sp)(R3)
    603 
    604 havem:
    605 	// Now there's a valid m, and we're running on its m->g0.
    606 	// Save current m->g0->sched.sp on stack and then set it to SP.
    607 	// Save current sp in m->g0->sched.sp in preparation for
    608 	// switch back to m->curg stack.
    609 	// NOTE: unwindm knows that the saved g->sched.sp is at 16(RSP) aka savedsp-16(SP).
    610 	// Beware that the frame size is actually 32.
    611 	MOVD	m_g0(R8), R3
    612 	MOVD	(g_sched+gobuf_sp)(R3), R4
    613 	MOVD	R4, savedsp-16(SP)
    614 	MOVD	RSP, R0
    615 	MOVD	R0, (g_sched+gobuf_sp)(R3)
    616 
    617 	// Switch to m->curg stack and call runtime.cgocallbackg.
    618 	// Because we are taking over the execution of m->curg
    619 	// but *not* resuming what had been running, we need to
    620 	// save that information (m->curg->sched) so we can restore it.
    621 	// We can restore m->curg->sched.sp easily, because calling
    622 	// runtime.cgocallbackg leaves SP unchanged upon return.
    623 	// To save m->curg->sched.pc, we push it onto the stack.
    624 	// This has the added benefit that it looks to the traceback
    625 	// routine like cgocallbackg is going to return to that
    626 	// PC (because the frame we allocate below has the same
    627 	// size as cgocallback_gofunc's frame declared above)
    628 	// so that the traceback will seamlessly trace back into
    629 	// the earlier calls.
    630 	//
    631 	// In the new goroutine, -8(SP) is unused (where SP refers to
    632 	// m->curg's SP while we're setting it up, before we've adjusted it).
    633 	MOVD	m_curg(R8), g
    634 	BL	runtimesave_g(SB)
    635 	MOVD	(g_sched+gobuf_sp)(g), R4 // prepare stack as R4
    636 	MOVD	(g_sched+gobuf_pc)(g), R5
    637 	MOVD	R5, -(24+8)(R4)	// saved sched.pc becomes 0(RSP) of the new frame
    638 	MOVD	ctxt+24(FP), R0
    639 	MOVD	R0, -(16+8)(R4)	// ctxt becomes 8(RSP) of the new frame
    640 	MOVD	$-(24+8)(R4), R0 // maintain 16-byte SP alignment
    641 	MOVD	R0, RSP
    642 	BL	runtimecgocallbackg(SB)
    643 
    644 	// Restore g->sched (== m->curg->sched) from saved values.
    645 	MOVD	0(RSP), R5
    646 	MOVD	R5, (g_sched+gobuf_pc)(g)
    647 	MOVD	RSP, R4
    648 	ADD	$(24+8), R4, R4
    649 	MOVD	R4, (g_sched+gobuf_sp)(g)
    650 
    651 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
    652 	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
    653 	// so we do not have to restore it.)
    654 	MOVD	g_m(g), R8
    655 	MOVD	m_g0(R8), g
    656 	BL	runtimesave_g(SB)
    657 	MOVD	(g_sched+gobuf_sp)(g), R0
    658 	MOVD	R0, RSP
    659 	MOVD	savedsp-16(SP), R4
    660 	MOVD	R4, (g_sched+gobuf_sp)(g)
    661 
    662 	// If the m on entry was nil, we called needm above to borrow an m
    663 	// for the duration of the call. Since the call is over, return it with dropm.
    664 	MOVD	savedm-8(SP), R6
    665 	CMP	$0, R6
    666 	BNE	droppedm	// an m existed on entry: nothing to give back
    667 	MOVD	$runtimedropm(SB), R0
    668 	BL	(R0)
    669 droppedm:
    670 
    671 	// Done!
    672 	RET
    673 
    674 // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
    675 // Must obey the gcc calling convention.
        // The result is returned in R0. R27 and g are saved in the local frame
        // around the load_g call and restored before returning.
    676 TEXT _cgo_topofstack(SB),NOSPLIT,$24
    677 	// g (R28) and REGTMP (R27)  might be clobbered by load_g. They
    678 	// are callee-save in the gcc calling convention, so save them.
    679 	MOVD	R27, savedR27-8(SP)
    680 	MOVD	g, saveG-16(SP)
    681 
    682 	BL	runtimeload_g(SB)
    683 	MOVD	g_m(g), R0
    684 	MOVD	m_curg(R0), R0
    685 	MOVD	(g_stack+stack_hi)(R0), R0
    686 
    687 	MOVD	saveG-16(SP), g
    688 	MOVD	savedR27-8(SP), R27	// same slot that R27 was saved to above
    689 	RET
    690 
    691 // void setg(G*); set g. for use by needm.
        // Loads the argument into the g register and calls save_g.
    692 TEXT runtimesetg(SB), NOSPLIT, $0-8
    693 	MOVD	gg+0(FP), g
    694 	// This only happens if iscgo, so jump straight to save_g
    695 	BL	runtimesave_g(SB)
    696 	RET
    697 
    698 // void setg_gcc(G*); set g called from gcc
        // Takes the new g in R0. R27 is saved in the frame and restored around
        // the save_g call.
    699 TEXT setg_gcc<>(SB),NOSPLIT,$8
    700 	MOVD	R0, g
    701 	MOVD	R27, savedR27-8(SP)
    702 	BL	runtimesave_g(SB)
    703 	MOVD	savedR27-8(SP), R27
    704 	RET
    705 
    705 TEXT runtimegetcallerpc(SB),NOSPLIT,$-8-8
        	// Returns the word found at 0(RSP) as the result.
    706 	MOVD	0(RSP), R0		// LR saved by caller
    707 	MOVD	R0, ret+0(FP)
    708 	RET
    710 
    710 TEXT runtimeabort(SB),NOSPLIT,$-8-0
        	// Branches through the zero register, i.e. to address 0; UNDEF follows
        	// in case execution continues.
    711 	B	(ZR)
    712 	UNDEF
    714 
    714 // memequal(a, b unsafe.Pointer, size uintptr) bool
        // Returns true at once for size 0; otherwise tail-jumps to memeqbody<>
        // with R0 = a, R1 = size, R2 = b and R8 = address of the result slot.
    715 TEXT runtimememequal(SB),NOSPLIT,$-8-25
    716 	MOVD	size+16(FP), R1
    717 	// short path to handle 0-byte case
    718 	CBZ	R1, equal
    719 	MOVD	a+0(FP), R0
    720 	MOVD	b+8(FP), R2
    721 	MOVD	$ret+24(FP), R8
    722 	B	runtimememeqbody<>(SB)
    723 equal:
    724 	MOVD	$1, R0
    725 	MOVB	R0, ret+24(FP)
    726 	RET
    728 
    728 // memequal_varlen(a, b unsafe.Pointer) bool
        // Returns true at once when a == b; otherwise calls memequal with the
        // size read from offset 8 of the closure in R26.
    729 TEXT runtimememequal_varlen(SB),NOSPLIT,$40-17
    730 	MOVD	a+0(FP), R3
    731 	MOVD	b+8(FP), R4
    732 	CMP	R3, R4
    733 	BEQ	eq
    734 	MOVD	8(R26), R5    // compiler stores size at offset 8 in the closure
    735 	MOVD	R3, 8(RSP)
    736 	MOVD	R4, 16(RSP)
    737 	MOVD	R5, 24(RSP)
    738 	BL	runtimememequal(SB)
    739 	MOVBU	32(RSP), R3	// memequal's result byte
    740 	MOVB	R3, ret+16(FP)
    741 	RET
    742 eq:
    743 	MOVD	$1, R3
    744 	MOVB	R3, ret+16(FP)
    745 	RET
    747 
    746 TEXT runtimecmpstring(SB),NOSPLIT,$-4-40
        	// Loads both string headers into the registers cmpbody<> expects
        	// (R2/R0 = s1 base/len, R3/R1 = s2 base/len, R7 = result address)
        	// and tail-jumps to it.
    747 	MOVD	s1_base+0(FP), R2
    748 	MOVD	s1_len+8(FP), R0
    749 	MOVD	s2_base+16(FP), R3
    750 	MOVD	s2_len+24(FP), R1
    751 	ADD	$40, RSP, R7
    752 	B	runtimecmpbody<>(SB)
    755 
    754 TEXT bytesCompare(SB),NOSPLIT,$-4-56
        	// Loads the base and length words of both arguments into the registers
        	// cmpbody<> expects (R2/R0 for s1, R3/R1 for s2, R7 = result address)
        	// and tail-jumps to it.
    755 	MOVD	s1+0(FP), R2
    756 	MOVD	s1+8(FP), R0
    757 	MOVD	s2+24(FP), R3
    758 	MOVD	s2+32(FP), R1
    759 	ADD	$56, RSP, R7
    760 	B	runtimecmpbody<>(SB)
    763 
    762 // On entry:
    763 // R0 is the length of s1
    764 // R1 is the length of s2
    765 // R2 points to the start of s1
    766 // R3 points to the start of s2
    767 // R7 points to return value (-1/0/1 will be written here)
    768 //
    769 // On exit:
    770 // R4, R5, and R6 are clobbered
        // R2 and R3 are also advanced by the post-increment loads in the loop.
    771 TEXT runtimecmpbody<>(SB),NOSPLIT,$-4-0
    772 	CMP	R2, R3
    773 	BEQ	samebytes // same starting pointers; compare lengths
    774 	CMP	R0, R1
    775 	CSEL    LT, R1, R0, R6 // R6 is min(R0, R1)
    776 
    777 	ADD	R2, R6	// R2 is current byte in s1, R6 is last byte in s1 to compare
    778 loop:
    779 	CMP	R2, R6
    780 	BEQ	samebytes // all compared bytes were the same; compare lengths
    781 	MOVBU.P	1(R2), R4
    782 	MOVBU.P	1(R3), R5
    783 	CMP	R4, R5
    784 	BEQ	loop
    785 	// bytes differed
    786 	MOVD	$1, R4
    787 	CSNEG	LT, R4, R4, R4	// keep 1 when LT holds, otherwise negate to -1
    788 	MOVD	R4, (R7)
    789 	RET
    790 samebytes:
    791 	MOVD	$1, R4
    792 	CMP	R0, R1
    793 	CSNEG	LT, R4, R4, R4	// keep 1 when LT holds, otherwise negate to -1
    794 	CSEL	EQ, ZR, R4, R4	// equal lengths: result is 0
    795 	MOVD	R4, (R7)
    796 	RET
    799 
    800 //
    801 // functions for other packages
    802 //
    803 TEXT bytesIndexByte(SB),NOSPLIT,$0-40
        	// Loads the data pointer (R0), length (R2), byte to find (R1) and the
        	// result address (R8), then tail-jumps to indexbytebody<>.
    804 	MOVD	b+0(FP), R0
    805 	MOVD	b_len+8(FP), R2
    806 	MOVBU	c+24(FP), R1
    807 	MOVD	$ret+32(FP), R8
    808 	B	runtimeindexbytebody<>(SB)
    809 
    810 TEXT stringsIndexByte(SB),NOSPLIT,$0-32
        	// Loads the data pointer (R0), length (R2), byte to find (R1) and the
        	// result address (R8), then tail-jumps to indexbytebody<>.
    811 	MOVD	s+0(FP), R0
    812 	MOVD	s_len+8(FP), R2
    813 	MOVBU	c+16(FP), R1
    814 	MOVD	$ret+24(FP), R8
    815 	B	runtimeindexbytebody<>(SB)
    816 
    817 // input:
    818 //   R0: data
    819 //   R1: byte to search
    820 //   R2: data len
    821 //   R8: address to put result
        // The index of the first match, or -1 when there is none (including for
        // an empty input), is stored at (R8).
        // Scratch: R3 = 32-byte-aligned read pointer, R9 = R0 & 0x1f,
        // R10 = len & 0x1f, R11 = original data pointer, R6 = syndrome.
    822 TEXT runtimeindexbytebody<>(SB),NOSPLIT,$0
    823 	// Core algorithm:
    824 	// For each 32-byte chunk we calculate a 64-bit syndrome value,
    825 	// with two bits per byte. For each tuple, bit 0 is set if the
    826 	// relevant byte matched the requested character and bit 1 is
    827 	// not used (faster than using a 32bit syndrome). Since the bits
    828 	// in the syndrome reflect exactly the order in which things occur
    829 	// in the original string, counting trailing zeros allows to
    830 	// identify exactly which byte has matched.
    831 
    832 	CBZ	R2, fail
    833 	MOVD	R0, R11
    834 	// Magic constant 0x40100401 allows us to identify
    835 	// which lane matches the requested byte.
    836 	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
    837 	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
    838 	MOVD	$0x40100401, R5
    839 	VMOV	R1, V0.B16
    840 	// Work with aligned 32-byte chunks
    841 	BIC	$0x1f, R0, R3
    842 	VMOV	R5, V5.S4
    843 	ANDS	$0x1f, R0, R9
    844 	AND	$0x1f, R2, R10
    845 	BEQ	loop
    846 
    847 	// Input string is not 32-byte aligned. We calculate the
    848 	// syndrome value for the aligned 32 bytes block containing
    849 	// the first bytes and mask off the irrelevant part.
    850 	VLD1.P	(R3), [V1.B16, V2.B16]
    851 	SUB	$0x20, R9, R4
    852 	ADDS	R4, R2, R2
    853 	VCMEQ	V0.B16, V1.B16, V3.B16
    854 	VCMEQ	V0.B16, V2.B16, V4.B16
    855 	VAND	V5.B16, V3.B16, V3.B16
    856 	VAND	V5.B16, V4.B16, V4.B16
    857 	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
    858 	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
    859 	VMOV	V6.D[0], R6
    860 	// Clear the irrelevant lower bits
    861 	LSL	$1, R9, R4
    862 	LSR	R4, R6, R6
    863 	LSL	R4, R6, R6
    864 	// The first block can also be the last
    865 	BLS	masklast
    866 	// Have we found something already?
    867 	CBNZ	R6, tail
    868 
    869 loop:
    870 	VLD1.P	(R3), [V1.B16, V2.B16]
    871 	SUBS	$0x20, R2, R2
    872 	VCMEQ	V0.B16, V1.B16, V3.B16
    873 	VCMEQ	V0.B16, V2.B16, V4.B16
    874 	// If we're out of data we finish regardless of the result
    875 	BLS	end
    876 	// Use a fast check for the termination condition
    877 	VORR	V4.B16, V3.B16, V6.B16
    878 	VADDP	V6.D2, V6.D2, V6.D2
    879 	VMOV	V6.D[0], R6
    880 	// We're not out of data, loop if we haven't found the character
    881 	CBZ	R6, loop
    882 
    883 end:
    884 	// Termination condition found, let's calculate the syndrome value
    885 	VAND	V5.B16, V3.B16, V3.B16
    886 	VAND	V5.B16, V4.B16, V4.B16
    887 	VADDP	V4.B16, V3.B16, V6.B16
    888 	VADDP	V6.B16, V6.B16, V6.B16
    889 	VMOV	V6.D[0], R6
    890 	// Only do the clear for the last possible block with less than 32 bytes
    891 	// Condition flags come from SUBS in the loop
    892 	BHS	tail
    893 
    894 masklast:
    895 	// Clear the irrelevant upper bits
    896 	ADD	R9, R10, R4
    897 	AND	$0x1f, R4, R4
    898 	SUB	$0x20, R4, R4
    899 	NEG	R4<<1, R4
    900 	LSL	R4, R6, R6
    901 	LSR	R4, R6, R6
    902 
    903 tail:
    904 	// Check that we have found a character
    905 	CBZ	R6, fail
    906 	// Count the trailing zeros using bit reversing
    907 	RBIT	R6, R6
    908 	// Compensate the last post-increment
    909 	SUB	$0x20, R3, R3
    910 	// And count the leading zeros
    911 	CLZ	R6, R6
    912 	// R6 is twice the offset into the fragment
    913 	ADD	R6>>1, R3, R0
    914 	// Compute the offset result
    915 	SUB	R11, R0, R0
    916 	MOVD	R0, (R8)
    917 	RET
    918 
    919 fail:
    920 	MOVD	$-1, R0
    921 	MOVD	R0, (R8)
    922 	RET
    923 
    924 // Equal(a, b []byte) bool
        // Different lengths give false and two empty inputs give true;
        // otherwise tail-jumps to memeqbody<> with R0 = a, R1 = length, R2 = b
        // and R8 = address of the result slot.
    925 TEXT bytesEqual(SB),NOSPLIT,$0-49
    926 	MOVD	a_len+8(FP), R1
    927 	MOVD	b_len+32(FP), R3
    928 	CMP	R1, R3
    929 	// unequal lengths are not equal
    930 	BNE	not_equal
    931 	// short path to handle 0-byte case
    932 	CBZ	R1, equal
    933 	MOVD	a+0(FP), R0
    934 	MOVD	b+24(FP), R2
    935 	MOVD	$ret+48(FP), R8
    936 	B	runtimememeqbody<>(SB)
    937 equal:
    938 	MOVD	$1, R0
    939 	MOVB	R0, ret+48(FP)
    940 	RET
    941 not_equal:
    942 	MOVB	ZR, ret+48(FP)
    943 	RET
    944 
    945 // input:
    946 // R0: pointer a
    947 // R1: data len
    948 // R2: pointer b
    949 // R8: address to put result
        // Writes 1 to the byte at (R8) when the two ranges are equal and 0
        // otherwise. Data is compared in 64-byte chunks, then 16-byte chunks,
        // then an 8/4/2/1-byte tail selected by the low bits of the length.
        // Neither caller visible in this file enters with a length of 0.
    950 TEXT runtimememeqbody<>(SB),NOSPLIT,$0
    951 	CMP	$1, R1
    952 	// handle 1-byte special case for better performance
    953 	BEQ	one
    954 	CMP	$16, R1
    955 	// handle specially if length < 16
    956 	BLO	tail
    957 	BIC	$0x3f, R1, R3
    958 	CBZ	R3, chunk16
    959 	// work with 64-byte chunks
    960 	ADD	R3, R0, R6	// end of chunks
    961 chunk64_loop:
    962 	VLD1.P	(R0), [V0.D2, V1.D2, V2.D2, V3.D2]
    963 	VLD1.P	(R2), [V4.D2, V5.D2, V6.D2, V7.D2]
    964 	VCMEQ	V0.D2, V4.D2, V8.D2
    965 	VCMEQ	V1.D2, V5.D2, V9.D2
    966 	VCMEQ	V2.D2, V6.D2, V10.D2
    967 	VCMEQ	V3.D2, V7.D2, V11.D2
    968 	VAND	V8.B16, V9.B16, V8.B16
    969 	VAND	V8.B16, V10.B16, V8.B16
    970 	VAND	V8.B16, V11.B16, V8.B16
    971 	CMP	R0, R6	// flags are consumed by the BNE below; CBZ leaves them intact
    972 	VMOV	V8.D[0], R4
    973 	VMOV	V8.D[1], R5
    974 	CBZ	R4, not_equal
    975 	CBZ	R5, not_equal
    976 	BNE	chunk64_loop
    977 	AND	$0x3f, R1, R1
    978 	CBZ	R1, equal
    979 chunk16:
    980 	// work with 16-byte chunks
    981 	BIC	$0xf, R1, R3
    982 	CBZ	R3, tail
    983 	ADD	R3, R0, R6	// end of chunks
    984 chunk16_loop:
    985 	VLD1.P	(R0), [V0.D2]
    986 	VLD1.P	(R2), [V1.D2]
    987 	VCMEQ	V0.D2, V1.D2, V2.D2
    988 	CMP	R0, R6	// flags are consumed by the BNE below
    989 	VMOV	V2.D[0], R4
    990 	VMOV	V2.D[1], R5
    991 	CBZ	R4, not_equal
    992 	CBZ	R5, not_equal
    993 	BNE	chunk16_loop
    994 	AND	$0xf, R1, R1
    995 	CBZ	R1, equal
    996 tail:
    997 	// special compare of tail with length < 16
    998 	TBZ	$3, R1, lt_8
    999 	MOVD.P	8(R0), R4
   1000 	MOVD.P	8(R2), R5
   1001 	CMP	R4, R5
   1002 	BNE	not_equal
   1003 lt_8:
   1004 	TBZ	$2, R1, lt_4
   1005 	MOVWU.P	4(R0), R4
   1006 	MOVWU.P	4(R2), R5
   1007 	CMP	R4, R5
   1008 	BNE	not_equal
   1009 lt_4:
   1010 	TBZ	$1, R1, lt_2
   1011 	MOVHU.P	2(R0), R4
   1012 	MOVHU.P	2(R2), R5
   1013 	CMP	R4, R5
   1014 	BNE	not_equal
   1015 lt_2:
   1016 	TBZ     $0, R1, equal
   1017 one:
   1018 	MOVBU	(R0), R4
   1019 	MOVBU	(R2), R5
   1020 	CMP	R4, R5
   1021 	BNE	not_equal
   1022 equal:
   1023 	MOVD	$1, R0
   1024 	MOVB	R0, (R8)
   1025 	RET
   1026 not_equal:
   1027 	MOVB	ZR, (R8)
   1028 	RET
   1029 
   1030 TEXT runtimereturn0(SB), NOSPLIT, $0
        	// Sets R0 to 0 and returns.
   1031 	MOVW	$0, R0
   1032 	RET
   1033 
   1034 // The top-most function running on a goroutine
   1035 // returns to goexit+PCQuantum.
        // The leading no-op occupies the first instruction slot, so the return
        // address described above lands on the BL to goexit1.
   1036 TEXT runtimegoexit(SB),NOSPLIT,$-8-0
   1037 	MOVD	R0, R0	// NOP
   1038 	BL	runtimegoexit1(SB)	// does not return
   1039 
   1040 TEXT runtimesigreturn(SB),NOSPLIT,$0-0
        	// Empty body: returns immediately.
   1041 	RET
   1042 
   1043 // This is called from .init_array and follows the platform, not Go, ABI.
        // Appends the record passed in R0 to the module list: stores R0 into the
        // next field of the current lastmoduledatap and then makes R0 the new
        // lastmoduledatap. R27 is preserved across the global accesses.
   1044 TEXT runtimeaddmoduledata(SB),NOSPLIT,$0-0
   1045 	SUB	$0x10, RSP
   1046 	MOVD	R27, 8(RSP) // The access to global variables below implicitly uses R27, which is callee-save
   1047 	MOVD	runtimelastmoduledatap(SB), R1
   1048 	MOVD	R0, moduledata_next(R1)
   1049 	MOVD	R0, runtimelastmoduledatap(SB)
   1050 	MOVD	8(RSP), R27
   1051 	ADD	$0x10, RSP
   1052 	RET
   1053 
   1054 TEXT checkASM(SB),NOSPLIT,$0-1
        	// Always stores 1 in the one-byte result.
   1055 	MOVW	$1, R3
   1056 	MOVB	R3, ret+0(FP)
   1057 	RET
   1057 	RET
   1058