      1 // Copyright 2014 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // +build ppc64 ppc64le
      6 
      7 #include "go_asm.h"
      8 #include "go_tls.h"
      9 #include "funcdata.h"
     10 #include "textflag.h"
     11 #include "asm_ppc64x.h"
     12 
     13 TEXT runtimert0_go(SB),NOSPLIT,$0
     14 	// R1 = stack; R3 = argc; R4 = argv; R13 = C TLS base pointer
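	// (FIXED_FRAME, defined in asm_ppc64x.h, is the size of the fixed frame
	// header that sits below a function's outgoing argument area -- the LR
	// save slot, the TOC save slot at 24(R1), and so on -- which is why the
	// outgoing arguments below are stored at FIXED_FRAME+off(R1).)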
     15 
     16 	// initialize essential registers
     17 	BL	runtimereginit(SB)
     18 
     19 	SUB	$(FIXED_FRAME+16), R1
     20 	MOVD	R2, 24(R1)		// stash the TOC pointer away again now we've created a new frame
     21 	MOVW	R3, FIXED_FRAME+0(R1)	// argc
     22 	MOVD	R4, FIXED_FRAME+8(R1)	// argv
     23 
     24 	// create istack out of the given (operating system) stack.
     25 	// _cgo_init may update stackguard.
     26 	MOVD	$runtimeg0(SB), g
     27 	MOVD	$(-64*1024), R31
     28 	ADD	R31, R1, R3
     29 	MOVD	R3, g_stackguard0(g)
     30 	MOVD	R3, g_stackguard1(g)
     31 	MOVD	R3, (g_stack+stack_lo)(g)
     32 	MOVD	R1, (g_stack+stack_hi)(g)
     33 
     34 	// if there is a _cgo_init, call it using the gcc ABI.
     35 	MOVD	_cgo_init(SB), R12
     36 	CMP	R0, R12
     37 	BEQ	nocgo
     38 	MOVD	R12, CTR		// r12 = "global function entry point"
     39 	MOVD	R13, R5			// arg 2: TLS base pointer
     40 	MOVD	$setg_gcc<>(SB), R4 	// arg 1: setg
     41 	MOVD	g, R3			// arg 0: G
     42 	// C functions expect 32 bytes of space on caller stack frame
     43 	// and a 16-byte aligned R1
     44 	MOVD	R1, R14			// save current stack
     45 	SUB	$32, R1			// reserve 32 bytes
     46 	RLDCR	$0, R1, $~15, R1	// 16-byte align
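	// (RLDCR with a zero rotate count and a ~15 mask simply clears the low
	// four bits of R1, rounding the stack pointer down to a 16-byte boundary.)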
     47 	BL	(CTR)			// may clobber R0, R3-R12
     48 	MOVD	R14, R1			// restore stack
     49 	MOVD	24(R1), R2
     50 	XOR	R0, R0			// fix R0
     51 
     52 nocgo:
     53 	// update stackguard after _cgo_init
     54 	MOVD	(g_stack+stack_lo)(g), R3
     55 	ADD	$const__StackGuard, R3
     56 	MOVD	R3, g_stackguard0(g)
     57 	MOVD	R3, g_stackguard1(g)
     58 
     59 	// set the per-goroutine and per-mach "registers"
     60 	MOVD	$runtimem0(SB), R3
     61 
     62 	// save m->g0 = g0
     63 	MOVD	g, m_g0(R3)
     64 	// save m0 to g0->m
     65 	MOVD	R3, g_m(g)
     66 
     67 	BL	runtimecheck(SB)
     68 
     69 	// args are already prepared
     70 	BL	runtimeargs(SB)
     71 	BL	runtimeosinit(SB)
     72 	BL	runtimeschedinit(SB)
     73 
     74 	// create a new goroutine to start program
     75 	MOVD	$runtimemainPC(SB), R3		// entry
     76 	MOVDU	R3, -8(R1)
     77 	MOVDU	R0, -8(R1)
     78 	MOVDU	R0, -8(R1)
     79 	MOVDU	R0, -8(R1)
     80 	MOVDU	R0, -8(R1)
     81 	MOVDU	R0, -8(R1)
     82 	BL	runtimenewproc(SB)
     83 	ADD	$(16+FIXED_FRAME), R1
     84 
     85 	// start this M
     86 	BL	runtimemstart(SB)
     87 
     88 	MOVD	R0, 0(R0)
     89 	RET
     90 
     91 DATA	runtimemainPC+0(SB)/8,$runtimemain(SB)
     92 GLOBL	runtimemainPC(SB),RODATA,$8
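// mainPC is, in effect, a one-word function value whose only field is the
// address of runtime.main; rt0_go passes its address to newproc above, so the
// goroutine it creates starts executing runtime.main.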
     93 
     94 TEXT runtimebreakpoint(SB),NOSPLIT|NOFRAME,$0-0
     95 	MOVD	R0, 0(R0) // TODO: TD
     96 	RET
     97 
     98 TEXT runtimeasminit(SB),NOSPLIT|NOFRAME,$0-0
     99 	RET
    100 
    101 TEXT _cgo_reginit(SB),NOSPLIT|NOFRAME,$0-0
    102 	// crosscall_ppc64 and crosscall2 need to reginit, but can't
    103 	// get at the 'runtime.reginit' symbol.
    104 	BR	runtimereginit(SB)
    105 
    106 TEXT runtimereginit(SB),NOSPLIT|NOFRAME,$0-0
    107 	// set R0 to zero, it's expected by the toolchain
    108 	XOR R0, R0
    109 	RET
    110 
    111 /*
    112  *  go-routine
    113  */
    114 
    115 // void gosave(Gobuf*)
    116 // save state in Gobuf; setjmp
    117 TEXT runtimegosave(SB), NOSPLIT|NOFRAME, $0-8
    118 	MOVD	buf+0(FP), R3
    119 	MOVD	R1, gobuf_sp(R3)
    120 	MOVD	LR, R31
    121 	MOVD	R31, gobuf_pc(R3)
    122 	MOVD	g, gobuf_g(R3)
    123 	MOVD	R0, gobuf_lr(R3)
    124 	MOVD	R0, gobuf_ret(R3)
    125 	// Assert ctxt is zero. See func save.
    126 	MOVD	gobuf_ctxt(R3), R3
    127 	CMP	R0, R3
    128 	BEQ	2(PC)
    129 	BL	runtimebadctxt(SB)
    130 	RET
    131 
    132 // void gogo(Gobuf*)
    133 // restore state from Gobuf; longjmp
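// For reference, the Gobuf fields used here correspond roughly to this sketch
// of the struct in runtime2.go:
//	type gobuf struct { sp, pc uintptr; g guintptr; ctxt unsafe.Pointer; ret, lr uintptr; ... }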
    134 TEXT runtimegogo(SB), NOSPLIT, $16-8
    135 	MOVD	buf+0(FP), R5
    136 	MOVD	gobuf_g(R5), g	// make sure g is not nil
    137 	BL	runtimesave_g(SB)
    138 
    139 	MOVD	0(g), R4
    140 	MOVD	gobuf_sp(R5), R1
    141 	MOVD	gobuf_lr(R5), R31
    142 	MOVD	R31, LR
    143 	MOVD	gobuf_ret(R5), R3
    144 	MOVD	gobuf_ctxt(R5), R11
    145 	MOVD	R0, gobuf_sp(R5)
    146 	MOVD	R0, gobuf_ret(R5)
    147 	MOVD	R0, gobuf_lr(R5)
    148 	MOVD	R0, gobuf_ctxt(R5)
    149 	CMP	R0, R0 // set condition codes for == test, needed by stack split
    150 	MOVD	gobuf_pc(R5), R12
    151 	MOVD	R12, CTR
    152 	BR	(CTR)
    153 
    154 // void mcall(fn func(*g))
    155 // Switch to m->g0's stack, call fn(g).
    156 // Fn must never return. It should gogo(&g->sched)
    157 // to keep running g.
    158 TEXT runtimemcall(SB), NOSPLIT|NOFRAME, $0-8
    159 	// Save caller state in g->sched
    160 	MOVD	R1, (g_sched+gobuf_sp)(g)
    161 	MOVD	LR, R31
    162 	MOVD	R31, (g_sched+gobuf_pc)(g)
    163 	MOVD	R0, (g_sched+gobuf_lr)(g)
    164 	MOVD	g, (g_sched+gobuf_g)(g)
    165 
    166 	// Switch to m->g0 & its stack, call fn.
    167 	MOVD	g, R3
    168 	MOVD	g_m(g), R8
    169 	MOVD	m_g0(R8), g
    170 	BL	runtimesave_g(SB)
    171 	CMP	g, R3
    172 	BNE	2(PC)
    173 	BR	runtimebadmcall(SB)
    174 	MOVD	fn+0(FP), R11			// context
    175 	MOVD	0(R11), R12			// code pointer
    176 	MOVD	R12, CTR
    177 	MOVD	(g_sched+gobuf_sp)(g), R1	// sp = m->g0->sched.sp
    178 	MOVDU	R3, -8(R1)
    179 	MOVDU	R0, -8(R1)
    180 	MOVDU	R0, -8(R1)
    181 	MOVDU	R0, -8(R1)
    182 	MOVDU	R0, -8(R1)
    183 	BL	(CTR)
    184 	MOVD	24(R1), R2
    185 	BR	runtimebadmcall2(SB)
    186 
    187 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
    188 // of the G stack. We need to distinguish the routine that
    189 // lives at the bottom of the G stack from the one that lives
    190 // at the top of the system stack because the one at the top of
    191 // the system stack terminates the stack walk (see topofstack()).
    192 TEXT runtimesystemstack_switch(SB), NOSPLIT, $0-0
	// We have several undefs here so that 16 bytes past
	// $runtimesystemstack_switch lies within them whether or not the
	// instructions that derive r2 from r12 are there.
    196 	UNDEF
    197 	UNDEF
    198 	UNDEF
    199 	BL	(LR)	// make sure this function is not leaf
    200 	RET
    201 
    202 // func systemstack(fn func())
    203 TEXT runtimesystemstack(SB), NOSPLIT, $0-8
    204 	MOVD	fn+0(FP), R3	// R3 = fn
    205 	MOVD	R3, R11		// context
    206 	MOVD	g_m(g), R4	// R4 = m
    207 
    208 	MOVD	m_gsignal(R4), R5	// R5 = gsignal
    209 	CMP	g, R5
    210 	BEQ	noswitch
    211 
    212 	MOVD	m_g0(R4), R5	// R5 = g0
    213 	CMP	g, R5
    214 	BEQ	noswitch
    215 
    216 	MOVD	m_curg(R4), R6
    217 	CMP	g, R6
    218 	BEQ	switch
    219 
    220 	// Bad: g is not gsignal, not g0, not curg. What is it?
    221 	// Hide call from linker nosplit analysis.
    222 	MOVD	$runtimebadsystemstack(SB), R12
    223 	MOVD	R12, CTR
    224 	BL	(CTR)
    225 
    226 switch:
    227 	// save our state in g->sched. Pretend to
    228 	// be systemstack_switch if the G stack is scanned.
    229 	MOVD	$runtimesystemstack_switch(SB), R6
    230 	ADD     $16, R6 // get past prologue (including r2-setting instructions when they're there)
    231 	MOVD	R6, (g_sched+gobuf_pc)(g)
    232 	MOVD	R1, (g_sched+gobuf_sp)(g)
    233 	MOVD	R0, (g_sched+gobuf_lr)(g)
    234 	MOVD	g, (g_sched+gobuf_g)(g)
    235 
    236 	// switch to g0
    237 	MOVD	R5, g
    238 	BL	runtimesave_g(SB)
    239 	MOVD	(g_sched+gobuf_sp)(g), R3
    240 	// make it look like mstart called systemstack on g0, to stop traceback
    241 	SUB	$FIXED_FRAME, R3
    242 	MOVD	$runtimemstart(SB), R4
    243 	MOVD	R4, 0(R3)
    244 	MOVD	R3, R1
    245 
    246 	// call target function
    247 	MOVD	0(R11), R12	// code pointer
    248 	MOVD	R12, CTR
    249 	BL	(CTR)
    250 
    251 	// restore TOC pointer. It seems unlikely that we will use systemstack
    252 	// to call a function defined in another module, but the results of
    253 	// doing so would be so confusing that it's worth doing this.
    254 	MOVD	g_m(g), R3
    255 	MOVD	m_curg(R3), g
    256 	MOVD	(g_sched+gobuf_sp)(g), R3
    257 	MOVD	24(R3), R2
    258 	// switch back to g
    259 	MOVD	g_m(g), R3
    260 	MOVD	m_curg(R3), g
    261 	BL	runtimesave_g(SB)
    262 	MOVD	(g_sched+gobuf_sp)(g), R1
    263 	MOVD	R0, (g_sched+gobuf_sp)(g)
    264 	RET
    265 
    266 noswitch:
    267 	// already on m stack, just call directly
    268 	// On other arches we do a tail call here, but it appears to be
    269 	// impossible to tail call a function pointer in shared mode on
    270 	// ppc64 because the caller is responsible for restoring the TOC.
    271 	MOVD	0(R11), R12	// code pointer
    272 	MOVD	R12, CTR
    273 	BL	(CTR)
    274 	MOVD	24(R1), R2
    275 	RET
    276 
    277 /*
    278  * support for morestack
    279  */
    280 
    281 // Called during function prolog when more stack is needed.
    282 // Caller has already loaded:
    283 // R3: framesize, R4: argsize, R5: LR
    284 //
    285 // The traceback routines see morestack on a g0 as being
    286 // the top of a stack (for example, morestack calling newstack
    287 // calling the scheduler calling newm calling gc), so we must
    288 // record an argument size. For that purpose, it has no arguments.
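// In outline: morestack records f's state in g->sched and f's caller in
// m->morebuf, switches to m->g0's stack, and calls runtime.newstack, which
// allocates a larger stack, copies the old one across, and restarts f; it
// never returns here.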
    289 TEXT runtimemorestack(SB),NOSPLIT|NOFRAME,$0-0
    290 	// Cannot grow scheduler stack (m->g0).
    291 	MOVD	g_m(g), R7
    292 	MOVD	m_g0(R7), R8
    293 	CMP	g, R8
    294 	BNE	3(PC)
    295 	BL	runtimebadmorestackg0(SB)
    296 	BL	runtimeabort(SB)
    297 
    298 	// Cannot grow signal stack (m->gsignal).
    299 	MOVD	m_gsignal(R7), R8
    300 	CMP	g, R8
    301 	BNE	3(PC)
    302 	BL	runtimebadmorestackgsignal(SB)
    303 	BL	runtimeabort(SB)
    304 
    305 	// Called from f.
    306 	// Set g->sched to context in f.
    307 	MOVD	R1, (g_sched+gobuf_sp)(g)
    308 	MOVD	LR, R8
    309 	MOVD	R8, (g_sched+gobuf_pc)(g)
    310 	MOVD	R5, (g_sched+gobuf_lr)(g)
    311 	MOVD	R11, (g_sched+gobuf_ctxt)(g)
    312 
    313 	// Called from f.
    314 	// Set m->morebuf to f's caller.
    315 	MOVD	R5, (m_morebuf+gobuf_pc)(R7)	// f's caller's PC
    316 	MOVD	R1, (m_morebuf+gobuf_sp)(R7)	// f's caller's SP
    317 	MOVD	g, (m_morebuf+gobuf_g)(R7)
    318 
    319 	// Call newstack on m->g0's stack.
    320 	MOVD	m_g0(R7), g
    321 	BL	runtimesave_g(SB)
    322 	MOVD	(g_sched+gobuf_sp)(g), R1
    323 	MOVDU   R0, -(FIXED_FRAME+0)(R1)	// create a call frame on g0
    324 	BL	runtimenewstack(SB)
    325 
    326 	// Not reached, but make sure the return PC from the call to newstack
    327 	// is still in this function, and not the beginning of the next.
    328 	UNDEF
    329 
    330 TEXT runtimemorestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
    331 	MOVD	R0, R11
    332 	BR	runtimemorestack(SB)
    333 
    334 // reflectcall: call a function with the given argument list
    335 // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
    336 // we don't have variable-sized frames, so we use a small number
    337 // of constant-sized-frame functions to encode a few bits of size in the pc.
    338 // Caution: ugly multiline assembly macros in your future!
    339 
    340 #define DISPATCH(NAME,MAXSIZE)		\
    341 	MOVD	$MAXSIZE, R31;		\
    342 	CMP	R3, R31;		\
    343 	BGT	4(PC);			\
    344 	MOVD	$NAME(SB), R12;		\
    345 	MOVD	R12, CTR;		\
    346 	BR	(CTR)
    347 // Note: can't just "BR NAME(SB)" - bad inlining results.
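// For example, DISPATCH(runtimecall64, 64) expands to roughly:
//	MOVD	$64, R31
//	CMP	R3, R31
//	BGT	4(PC)			// argsize > 64: fall through to the next size
//	MOVD	$runtimecall64(SB), R12
//	MOVD	R12, CTR
//	BR	(CTR)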
    348 
TEXT reflect·call(SB), NOSPLIT, $0-0
	BR	·reflectcall(SB)

TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32
    353 	MOVWZ argsize+24(FP), R3
    354 	DISPATCH(runtimecall32, 32)
    355 	DISPATCH(runtimecall64, 64)
    356 	DISPATCH(runtimecall128, 128)
    357 	DISPATCH(runtimecall256, 256)
    358 	DISPATCH(runtimecall512, 512)
    359 	DISPATCH(runtimecall1024, 1024)
    360 	DISPATCH(runtimecall2048, 2048)
    361 	DISPATCH(runtimecall4096, 4096)
    362 	DISPATCH(runtimecall8192, 8192)
    363 	DISPATCH(runtimecall16384, 16384)
    364 	DISPATCH(runtimecall32768, 32768)
    365 	DISPATCH(runtimecall65536, 65536)
    366 	DISPATCH(runtimecall131072, 131072)
    367 	DISPATCH(runtimecall262144, 262144)
    368 	DISPATCH(runtimecall524288, 524288)
    369 	DISPATCH(runtimecall1048576, 1048576)
    370 	DISPATCH(runtimecall2097152, 2097152)
    371 	DISPATCH(runtimecall4194304, 4194304)
    372 	DISPATCH(runtimecall8388608, 8388608)
    373 	DISPATCH(runtimecall16777216, 16777216)
    374 	DISPATCH(runtimecall33554432, 33554432)
    375 	DISPATCH(runtimecall67108864, 67108864)
    376 	DISPATCH(runtimecall134217728, 134217728)
    377 	DISPATCH(runtimecall268435456, 268435456)
    378 	DISPATCH(runtimecall536870912, 536870912)
    379 	DISPATCH(runtimecall1073741824, 1073741824)
    380 	MOVD	$runtimebadreflectcall(SB), R12
    381 	MOVD	R12, CTR
    382 	BR	(CTR)
    383 
    384 #define CALLFN(NAME,MAXSIZE)			\
    385 TEXT NAME(SB), WRAPPER, $MAXSIZE-24;		\
    386 	NO_LOCAL_POINTERS;			\
    387 	/* copy arguments to stack */		\
    388 	MOVD	arg+16(FP), R3;			\
    389 	MOVWZ	argsize+24(FP), R4;			\
    390 	MOVD	R1, R5;				\
    391 	ADD	$(FIXED_FRAME-1), R5;			\
    392 	SUB	$1, R3;				\
    393 	ADD	R5, R4;				\
    394 	CMP	R5, R4;				\
    395 	BEQ	4(PC);				\
    396 	MOVBZU	1(R3), R6;			\
    397 	MOVBZU	R6, 1(R5);			\
    398 	BR	-4(PC);				\
    399 	/* call function */			\
    400 	MOVD	f+8(FP), R11;			\
    401 	MOVD	(R11), R12;			\
    402 	MOVD	R12, CTR;			\
    403 	PCDATA  $PCDATA_StackMapIndex, $0;	\
    404 	BL	(CTR);				\
    405 	MOVD	24(R1), R2;			\
    406 	/* copy return values back */		\
    407 	MOVD	argtype+0(FP), R7;		\
    408 	MOVD	arg+16(FP), R3;			\
    409 	MOVWZ	n+24(FP), R4;			\
    410 	MOVWZ	retoffset+28(FP), R6;		\
    411 	ADD	$FIXED_FRAME, R1, R5;		\
    412 	ADD	R6, R5; 			\
    413 	ADD	R6, R3;				\
    414 	SUB	R6, R4;				\
    415 	BL	callRet<>(SB);			\
    416 	RET
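// Each call<N> instantiated below gets an N-byte frame: the arguments are
// copied byte by byte from the caller's buffer into that frame, f is called
// through CTR, and the results (from retoffset onwards) are copied back out
// via callRet and runtime.reflectcallmove so the copy is done with the
// argument type's layout in hand.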
    417 
    418 // callRet copies return values back at the end of call*. This is a
    419 // separate function so it can allocate stack space for the arguments
    420 // to reflectcallmove. It does not follow the Go ABI; it expects its
    421 // arguments in registers.
    422 TEXT callRet<>(SB), NOSPLIT, $32-0
    423 	MOVD	R7, FIXED_FRAME+0(R1)
    424 	MOVD	R3, FIXED_FRAME+8(R1)
    425 	MOVD	R5, FIXED_FRAME+16(R1)
    426 	MOVD	R4, FIXED_FRAME+24(R1)
    427 	BL	runtimereflectcallmove(SB)
    428 	RET
    429 
    430 CALLFN(call32, 32)
    431 CALLFN(call64, 64)
    432 CALLFN(call128, 128)
    433 CALLFN(call256, 256)
    434 CALLFN(call512, 512)
    435 CALLFN(call1024, 1024)
    436 CALLFN(call2048, 2048)
    437 CALLFN(call4096, 4096)
    438 CALLFN(call8192, 8192)
    439 CALLFN(call16384, 16384)
    440 CALLFN(call32768, 32768)
    441 CALLFN(call65536, 65536)
    442 CALLFN(call131072, 131072)
    443 CALLFN(call262144, 262144)
    444 CALLFN(call524288, 524288)
    445 CALLFN(call1048576, 1048576)
    446 CALLFN(call2097152, 2097152)
    447 CALLFN(call4194304, 4194304)
    448 CALLFN(call8388608, 8388608)
    449 CALLFN(call16777216, 16777216)
    450 CALLFN(call33554432, 33554432)
    451 CALLFN(call67108864, 67108864)
    452 CALLFN(call134217728, 134217728)
    453 CALLFN(call268435456, 268435456)
    454 CALLFN(call536870912, 536870912)
    455 CALLFN(call1073741824, 1073741824)
    456 
    457 TEXT runtimeprocyield(SB),NOSPLIT,$0-0
    458 	RET
    459 
    460 // void jmpdefer(fv, sp);
    461 // called from deferreturn.
    462 // 1. grab stored LR for caller
    463 // 2. sub 8 bytes to get back to either nop or toc reload before deferreturn
    464 // 3. BR to fn
    465 // When dynamically linking Go, it is not sufficient to rewind to the BL
    466 // deferreturn -- we might be jumping between modules and so we need to reset
    467 // the TOC pointer in r2. To do this, codegen inserts MOVD 24(R1), R2 *before*
    468 // the BL deferreturn and jmpdefer rewinds to that.
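// So the deferring function's call site looks roughly like:
//	MOVD	24(R1), R2		// (or a nop)  <- LR-8, where jmpdefer resumes
//	BL	runtime.deferreturn	//             <- LR-4
//	...				//             <- stored LR (return address)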
    469 TEXT runtimejmpdefer(SB), NOSPLIT|NOFRAME, $0-16
    470 	MOVD	0(R1), R31
    471 	SUB     $8, R31
    472 	MOVD	R31, LR
    473 
    474 	MOVD	fv+0(FP), R11
    475 	MOVD	argp+8(FP), R1
    476 	SUB	$FIXED_FRAME, R1
    477 	MOVD	0(R11), R12
    478 	MOVD	R12, CTR
    479 	BR	(CTR)
    480 
    481 // Save state of caller into g->sched. Smashes R31.
    482 TEXT gosave<>(SB),NOSPLIT|NOFRAME,$0
    483 	MOVD	LR, R31
    484 	MOVD	R31, (g_sched+gobuf_pc)(g)
    485 	MOVD	R1, (g_sched+gobuf_sp)(g)
    486 	MOVD	R0, (g_sched+gobuf_lr)(g)
    487 	MOVD	R0, (g_sched+gobuf_ret)(g)
    488 	// Assert ctxt is zero. See func save.
    489 	MOVD	(g_sched+gobuf_ctxt)(g), R31
    490 	CMP	R0, R31
    491 	BEQ	2(PC)
    492 	BL	runtimebadctxt(SB)
    493 	RET
    494 
    495 // func asmcgocall(fn, arg unsafe.Pointer) int32
    496 // Call fn(arg) on the scheduler stack,
    497 // aligned appropriately for the gcc ABI.
    498 // See cgocall.go for more details.
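// In outline: switch to m->g0's stack unless already on it, carve out and
// 16-byte align a C frame, call fn with arg in R3, then recover g, the Go
// stack pointer (recorded as a depth below stack.hi, since the goroutine
// stack may be copied during a callback) and the TOC pointer, and return
// fn's result.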
    499 TEXT asmcgocall(SB),NOSPLIT,$0-20
    500 	MOVD	fn+0(FP), R3
    501 	MOVD	arg+8(FP), R4
    502 
    503 	MOVD	R1, R7		// save original stack pointer
    504 	MOVD	g, R5
    505 
    506 	// Figure out if we need to switch to m->g0 stack.
    507 	// We get called to create new OS threads too, and those
    508 	// come in on the m->g0 stack already.
    509 	MOVD	g_m(g), R6
    510 	MOVD	m_g0(R6), R6
    511 	CMP	R6, g
    512 	BEQ	g0
    513 	BL	gosave<>(SB)
    514 	MOVD	R6, g
    515 	BL	runtimesave_g(SB)
    516 	MOVD	(g_sched+gobuf_sp)(g), R1
    517 
    518 	// Now on a scheduling stack (a pthread-created stack).
    519 g0:
    520 	// Save room for two of our pointers, plus 32 bytes of callee
    521 	// save area that lives on the caller stack.
    522 	SUB	$48, R1
    523 	RLDCR	$0, R1, $~15, R1	// 16-byte alignment for gcc ABI
    524 	MOVD	R5, 40(R1)	// save old g on stack
    525 	MOVD	(g_stack+stack_hi)(R5), R5
    526 	SUB	R7, R5
    527 	MOVD	R5, 32(R1)	// save depth in old g stack (can't just save SP, as stack might be copied during a callback)
    528 	MOVD	R0, 0(R1)	// clear back chain pointer (TODO can we give it real back trace information?)
    529 	// This is a "global call", so put the global entry point in r12
    530 	MOVD	R3, R12
    531 	MOVD	R12, CTR
    532 	MOVD	R4, R3		// arg in r3
    533 	BL	(CTR)
    534 
    535 	// C code can clobber R0, so set it back to 0.  F27-F31 are
    536 	// callee save, so we don't need to recover those.
    537 	XOR	R0, R0
    538 	// Restore g, stack pointer, toc pointer.
    539 	// R3 is errno, so don't touch it
    540 	MOVD	40(R1), g
    541 	MOVD    (g_stack+stack_hi)(g), R5
    542 	MOVD    32(R1), R6
    543 	SUB     R6, R5
    544 	MOVD    24(R5), R2
    545 	BL	runtimesave_g(SB)
    546 	MOVD	(g_stack+stack_hi)(g), R5
    547 	MOVD	32(R1), R6
    548 	SUB	R6, R5
    549 	MOVD	R5, R1
    550 
    551 	MOVW	R3, ret+16(FP)
    552 	RET
    553 
    554 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
    555 // Turn the fn into a Go func (by taking its address) and call
    556 // cgocallback_gofunc.
    557 TEXT runtimecgocallback(SB),NOSPLIT,$32-32
    558 	MOVD	$fn+0(FP), R3
    559 	MOVD	R3, FIXED_FRAME+0(R1)
    560 	MOVD	frame+8(FP), R3
    561 	MOVD	R3, FIXED_FRAME+8(R1)
    562 	MOVD	framesize+16(FP), R3
    563 	MOVD	R3, FIXED_FRAME+16(R1)
    564 	MOVD	ctxt+24(FP), R3
    565 	MOVD	R3, FIXED_FRAME+24(R1)
    566 	MOVD	$runtimecgocallback_gofunc(SB), R12
    567 	MOVD	R12, CTR
    568 	BL	(CTR)
    569 	RET
    570 
    571 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
    572 // See cgocall.go for more details.
    573 TEXT cgocallback_gofunc(SB),NOSPLIT,$16-32
    574 	NO_LOCAL_POINTERS
    575 
    576 	// Load m and g from thread-local storage.
    577 	MOVB	runtimeiscgo(SB), R3
    578 	CMP	R3, $0
    579 	BEQ	nocgo
    580 	BL	runtimeload_g(SB)
    581 nocgo:
    582 
    583 	// If g is nil, Go did not create the current thread.
    584 	// Call needm to obtain one for temporary use.
    585 	// In this case, we're running on the thread stack, so there's
    586 	// lots of space, but the linker doesn't know. Hide the call from
    587 	// the linker analysis by using an indirect call.
    588 	CMP	g, $0
    589 	BEQ	needm
    590 
    591 	MOVD	g_m(g), R8
    592 	MOVD	R8, savedm-8(SP)
    593 	BR	havem
    594 
    595 needm:
    596 	MOVD	g, savedm-8(SP) // g is zero, so is m.
    597 	MOVD	$runtimeneedm(SB), R12
    598 	MOVD	R12, CTR
    599 	BL	(CTR)
    600 
    601 	// Set m->sched.sp = SP, so that if a panic happens
    602 	// during the function we are about to execute, it will
    603 	// have a valid SP to run on the g0 stack.
    604 	// The next few lines (after the havem label)
    605 	// will save this SP onto the stack and then write
    606 	// the same SP back to m->sched.sp. That seems redundant,
    607 	// but if an unrecovered panic happens, unwindm will
    608 	// restore the g->sched.sp from the stack location
    609 	// and then systemstack will try to use it. If we don't set it here,
    610 	// that restored SP will be uninitialized (typically 0) and
    611 	// will not be usable.
    612 	MOVD	g_m(g), R8
    613 	MOVD	m_g0(R8), R3
    614 	MOVD	R1, (g_sched+gobuf_sp)(R3)
    615 
    616 havem:
    617 	// Now there's a valid m, and we're running on its m->g0.
    618 	// Save current m->g0->sched.sp on stack and then set it to SP.
    619 	// Save current sp in m->g0->sched.sp in preparation for
    620 	// switch back to m->curg stack.
    621 	// NOTE: unwindm knows that the saved g->sched.sp is at 8(R1) aka savedsp-16(SP).
    622 	MOVD	m_g0(R8), R3
    623 	MOVD	(g_sched+gobuf_sp)(R3), R4
    624 	MOVD	R4, savedsp-16(SP)
    625 	MOVD	R1, (g_sched+gobuf_sp)(R3)
    626 
    627 	// Switch to m->curg stack and call runtime.cgocallbackg.
    628 	// Because we are taking over the execution of m->curg
    629 	// but *not* resuming what had been running, we need to
    630 	// save that information (m->curg->sched) so we can restore it.
    631 	// We can restore m->curg->sched.sp easily, because calling
    632 	// runtime.cgocallbackg leaves SP unchanged upon return.
    633 	// To save m->curg->sched.pc, we push it onto the stack.
    634 	// This has the added benefit that it looks to the traceback
    635 	// routine like cgocallbackg is going to return to that
    636 	// PC (because the frame we allocate below has the same
    637 	// size as cgocallback_gofunc's frame declared above)
    638 	// so that the traceback will seamlessly trace back into
    639 	// the earlier calls.
    640 	//
    641 	// In the new goroutine, -8(SP) is unused (where SP refers to
    642 	// m->curg's SP while we're setting it up, before we've adjusted it).
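	// Rough layout of the frame built below, where SP is m->curg's old
	// sched.sp minus (FIXED_FRAME+16):
	//	0(SP)               m->curg's old sched.pc (what traceback sees)
	//	FIXED_FRAME+0(SP)   ctxt
	//	FIXED_FRAME+16(SP)  == old sched.sp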
    643 	MOVD	m_curg(R8), g
    644 	BL	runtimesave_g(SB)
    645 	MOVD	(g_sched+gobuf_sp)(g), R4 // prepare stack as R4
    646 	MOVD	(g_sched+gobuf_pc)(g), R5
    647 	MOVD	R5, -(FIXED_FRAME+16)(R4)
    648 	MOVD	ctxt+24(FP), R3
    649 	MOVD	R3, -16(R4)
    650 	MOVD	$-(FIXED_FRAME+16)(R4), R1
    651 	BL	runtimecgocallbackg(SB)
    652 
    653 	// Restore g->sched (== m->curg->sched) from saved values.
    654 	MOVD	0(R1), R5
    655 	MOVD	R5, (g_sched+gobuf_pc)(g)
    656 	MOVD	$(FIXED_FRAME+16)(R1), R4
    657 	MOVD	R4, (g_sched+gobuf_sp)(g)
    658 
    659 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
    660 	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
    661 	// so we do not have to restore it.)
    662 	MOVD	g_m(g), R8
    663 	MOVD	m_g0(R8), g
    664 	BL	runtimesave_g(SB)
    665 	MOVD	(g_sched+gobuf_sp)(g), R1
    666 	MOVD	savedsp-16(SP), R4
    667 	MOVD	R4, (g_sched+gobuf_sp)(g)
    668 
    669 	// If the m on entry was nil, we called needm above to borrow an m
    670 	// for the duration of the call. Since the call is over, return it with dropm.
    671 	MOVD	savedm-8(SP), R6
    672 	CMP	R6, $0
    673 	BNE	droppedm
    674 	MOVD	$runtimedropm(SB), R12
    675 	MOVD	R12, CTR
    676 	BL	(CTR)
    677 droppedm:
    678 
    679 	// Done!
    680 	RET
    681 
    682 // void setg(G*); set g. for use by needm.
    683 TEXT runtimesetg(SB), NOSPLIT, $0-8
    684 	MOVD	gg+0(FP), g
    685 	// This only happens if iscgo, so jump straight to save_g
    686 	BL	runtimesave_g(SB)
    687 	RET
    688 
    689 // void setg_gcc(G*); set g in C TLS.
    690 // Must obey the gcc calling convention.
    691 TEXT setg_gcc<>(SB),NOSPLIT|NOFRAME,$0-0
	// The standard prologue clobbers R31, which is callee-save in
	// the C ABI, so we declare NOFRAME and save LR ourselves.
    694 	MOVD	LR, R4
    695 	// Also save g and R31, since they're callee-save in C ABI
    696 	MOVD	R31, R5
    697 	MOVD	g, R6
    698 
    699 	MOVD	R3, g
    700 	BL	runtimesave_g(SB)
    701 
    702 	MOVD	R6, g
    703 	MOVD	R5, R31
    704 	MOVD	R4, LR
    705 	RET
    706 
    707 TEXT runtimegetcallerpc(SB),NOSPLIT|NOFRAME,$0-8
    708 	MOVD	0(R1), R3		// LR saved by caller
    709 	MOVD	R3, ret+0(FP)
    710 	RET
    711 
    712 TEXT runtimeabort(SB),NOSPLIT|NOFRAME,$0-0
    713 	MOVW	(R0), R0
    714 	UNDEF
    715 
    716 #define	TBRL	268
    717 #define	TBRU	269		/* Time base Upper/Lower */
    718 
    719 // int64 runtimecputicks(void)
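// The 64-bit time base is read as upper, lower, upper; if the upper half
// changed between the two reads, the lower half may have wrapped, so the
// BNE -4(PC) below retries the sequence before the halves are combined.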
    720 TEXT runtimecputicks(SB),NOSPLIT,$0-8
    721 	MOVW	SPR(TBRU), R4
    722 	MOVW	SPR(TBRL), R3
    723 	MOVW	SPR(TBRU), R5
    724 	CMPW	R4, R5
    725 	BNE	-4(PC)
    726 	SLD	$32, R5
    727 	OR	R5, R3
    728 	MOVD	R3, ret+0(FP)
    729 	RET
    730 
    731 // AES hashing not implemented for ppc64
    732 TEXT runtimeaeshash(SB),NOSPLIT|NOFRAME,$0-0
    733 	MOVW	(R0), R1
    734 TEXT runtimeaeshash32(SB),NOSPLIT|NOFRAME,$0-0
    735 	MOVW	(R0), R1
    736 TEXT runtimeaeshash64(SB),NOSPLIT|NOFRAME,$0-0
    737 	MOVW	(R0), R1
    738 TEXT runtimeaeshashstr(SB),NOSPLIT|NOFRAME,$0-0
    739 	MOVW	(R0), R1
    740 
    741 TEXT runtimememequal(SB),NOSPLIT,$0-25
    742 	MOVD    a+0(FP), R3
    743 	MOVD    b+8(FP), R4
    744 	MOVD    size+16(FP), R5
    745 
    746 	BL	runtimememeqbody(SB)
    747 	MOVB    R9, ret+24(FP)
    748 	RET
    749 
    750 // memequal_varlen(a, b unsafe.Pointer) bool
    751 TEXT runtimememequal_varlen(SB),NOSPLIT,$40-17
    752 	MOVD	a+0(FP), R3
    753 	MOVD	b+8(FP), R4
    754 	CMP	R3, R4
    755 	BEQ	eq
    756 	MOVD	8(R11), R5    // compiler stores size at offset 8 in the closure
    757 	BL	runtimememeqbody(SB)
    758 	MOVB	R9, ret+16(FP)
    759 	RET
    760 eq:
    761 	MOVD	$1, R3
    762 	MOVB	R3, ret+16(FP)
    763 	RET
    764 
    765 // Do an efficient memcmp for ppc64le
    766 // R3 = s1 len
    767 // R4 = s2 len
    768 // R5 = s1 addr
    769 // R6 = s2 addr
    770 // R7 = addr of return value
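// A note on the BC forms used below: CR2 holds the result of the length
// comparison (CMP R3,R4,CR2), and
//	BC 12,8,target	== branch if CR2 LT (BLT CR2)
//	BC 12,9,target	== branch if CR2 GT (BGT CR2)
//	BC 12,10,target	== branch if CR2 EQ (BEQ CR2)
//	BC 8,2,target	== decrement CTR; branch if CTR != 0 and CR0 EQ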
    771 TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
    772 	MOVD	R3,R8		// set up length
    773 	CMP	R3,R4,CR2	// unequal?
    774 	BC	12,8,setuplen	// BLT CR2
    775 	MOVD	R4,R8		// use R4 for comparison len
    776 setuplen:
    777 	MOVD	R8,CTR		// set up loop counter
    778 	CMP	R8,$8		// only optimize >=8
    779 	BLT	simplecheck
    780 	DCBT	(R5)		// cache hint
    781 	DCBT	(R6)
    782 	CMP	R8,$32		// optimize >= 32
    783 	MOVD	R8,R9
    784 	BLT	setup8a		// 8 byte moves only
    785 setup32a:
    786 	SRADCC	$5,R8,R9	// number of 32 byte chunks
    787 	MOVD	R9,CTR
    788 
	// Special processing for 32 bytes or longer.
	// Loading this way is faster and correct as long as the
	// doublewords being compared are equal. Once they
	// are found unequal, reload them in proper byte order
	// to determine greater or less than.
    794 loop32a:
    795 	MOVD	0(R5),R9	// doublewords to compare
    796 	MOVD	0(R6),R10	// get 4 doublewords
    797 	MOVD	8(R5),R14
    798 	MOVD	8(R6),R15
    799 	CMPU	R9,R10		// bytes equal?
    800 	MOVD	$0,R16		// set up for cmpne
    801 	BNE	cmpne		// further compare for LT or GT
    802 	MOVD	16(R5),R9	// get next pair of doublewords
    803 	MOVD	16(R6),R10
    804 	CMPU	R14,R15		// bytes match?
    805 	MOVD	$8,R16		// set up for cmpne
    806 	BNE	cmpne		// further compare for LT or GT
    807 	MOVD	24(R5),R14	// get next pair of doublewords
    808 	MOVD    24(R6),R15
    809 	CMPU	R9,R10		// bytes match?
    810 	MOVD	$16,R16		// set up for cmpne
    811 	BNE	cmpne		// further compare for LT or GT
    812 	MOVD	$-8,R16		// for cmpne, R5,R6 already inc by 32
    813 	ADD	$32,R5		// bump up to next 32
    814 	ADD	$32,R6
    815 	CMPU    R14,R15		// bytes match?
    816 	BC	8,2,loop32a	// br ctr and cr
    817 	BNE	cmpne
    818 	ANDCC	$24,R8,R9	// Any 8 byte chunks?
    819 	BEQ	leftover	// and result is 0
    820 setup8a:
    821 	SRADCC	$3,R9,R9	// get the 8 byte count
    822 	BEQ	leftover	// shifted value is 0
    823 	MOVD	R9,CTR		// loop count for doublewords
    824 loop8:
    825 	MOVDBR	(R5+R0),R9	// doublewords to compare
    826 	MOVDBR	(R6+R0),R10	// LE compare order
    827 	ADD	$8,R5
    828 	ADD	$8,R6
    829 	CMPU	R9,R10		// match?
    830 	BC	8,2,loop8	// bt ctr <> 0 && cr
    831 	BGT	greater
    832 	BLT	less
    833 leftover:
    834 	ANDCC	$7,R8,R9	// check for leftover bytes
    835 	MOVD	R9,CTR		// save the ctr
    836 	BNE	simple		// leftover bytes
    837 	BC	12,10,equal	// test CR2 for length comparison
    838 	BC	12,8,less
    839 	BR	greater
    840 simplecheck:
    841 	CMP	R8,$0		// remaining compare length 0
    842 	BNE	simple		// do simple compare
    843 	BC	12,10,equal	// test CR2 for length comparison
    844 	BC	12,8,less	// 1st len < 2nd len, result less
    845 	BR	greater		// 1st len > 2nd len must be greater
    846 simple:
    847 	MOVBZ	0(R5), R9	// get byte from 1st operand
    848 	ADD	$1,R5
    849 	MOVBZ	0(R6), R10	// get byte from 2nd operand
    850 	ADD	$1,R6
    851 	CMPU	R9, R10
    852 	BC	8,2,simple	// bc ctr <> 0 && cr
    853 	BGT	greater		// 1st > 2nd
    854 	BLT	less		// 1st < 2nd
    855 	BC	12,10,equal	// test CR2 for length comparison
	BC	12,9,greater	// 1st len > 2nd len, result greater
    857 	BR	less		// must be less
    858 cmpne:				// only here is not equal
    859 	MOVDBR	(R5+R16),R8	// reload in reverse order
    860 	MOVDBR	(R6+R16),R9
    861 	CMPU	R8,R9		// compare correct endianness
    862 	BGT	greater		// here only if NE
    863 less:
    864 	MOVD	$-1,R3
    865 	MOVD	R3,(R7)		// return value if A < B
    866 	RET
    867 equal:
    868 	MOVD	$0,(R7)		// return value if A == B
    869 	RET
    870 greater:
    871 	MOVD	$1,R3
    872 	MOVD	R3,(R7)		// return value if A > B
    873 	RET
    874 
    875 // Do an efficient memcmp for ppc64 (BE)
    876 // R3 = s1 len
    877 // R4 = s2 len
    878 // R5 = s1 addr
    879 // R6 = s2 addr
    880 // R7 = addr of return value
    881 TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
    882 	MOVD	R3,R8		// set up length
    883 	CMP	R3,R4,CR2	// unequal?
    884 	BC	12,8,setuplen	// BLT CR2
    885 	MOVD	R4,R8		// use R4 for comparison len
    886 setuplen:
    887 	MOVD	R8,CTR		// set up loop counter
    888 	CMP	R8,$8		// only optimize >=8
    889 	BLT	simplecheck
    890 	DCBT	(R5)		// cache hint
    891 	DCBT	(R6)
    892 	CMP	R8,$32		// optimize >= 32
    893 	MOVD	R8,R9
    894 	BLT	setup8a		// 8 byte moves only
    895 
    896 setup32a:
    897 	SRADCC	$5,R8,R9	// number of 32 byte chunks
    898 	MOVD	R9,CTR
    899 loop32a:
    900 	MOVD	0(R5),R9	// doublewords to compare
    901 	MOVD	0(R6),R10	// get 4 doublewords
    902 	MOVD	8(R5),R14
    903 	MOVD	8(R6),R15
    904 	CMPU	R9,R10		// bytes equal?
    905 	BLT	less		// found to be less
    906 	BGT	greater		// found to be greater
    907 	MOVD	16(R5),R9	// get next pair of doublewords
    908 	MOVD	16(R6),R10
    909 	CMPU	R14,R15		// bytes match?
    910 	BLT	less		// found less
    911 	BGT	greater		// found greater
    912 	MOVD	24(R5),R14	// get next pair of doublewords
    913 	MOVD	24(R6),R15
    914 	CMPU	R9,R10		// bytes match?
    915 	BLT	less		// found to be less
    916 	BGT	greater		// found to be greater
    917 	ADD	$32,R5		// bump up to next 32
    918 	ADD	$32,R6
    919 	CMPU	R14,R15		// bytes match?
    920 	BC	8,2,loop32a	// br ctr and cr
    921 	BLT	less		// with BE, byte ordering is
    922 	BGT	greater		// good for compare
    923 	ANDCC	$24,R8,R9	// Any 8 byte chunks?
    924 	BEQ	leftover	// and result is 0
    925 setup8a:
    926 	SRADCC	$3,R9,R9	// get the 8 byte count
    927 	BEQ	leftover	// shifted value is 0
    928 	MOVD	R9,CTR		// loop count for doublewords
    929 loop8:
    930 	MOVD	(R5),R9
    931 	MOVD	(R6),R10
    932 	ADD	$8,R5
    933 	ADD	$8,R6
    934 	CMPU	R9,R10		// match?
    935 	BC	8,2,loop8	// bt ctr <> 0 && cr
    936 	BGT	greater
    937 	BLT	less
    938 leftover:
    939 	ANDCC	$7,R8,R9	// check for leftover bytes
    940 	MOVD	R9,CTR		// save the ctr
    941 	BNE	simple		// leftover bytes
    942 	BC	12,10,equal	// test CR2 for length comparison
    943 	BC	12,8,less
    944 	BR	greater
    945 simplecheck:
    946 	CMP	R8,$0		// remaining compare length 0
    947 	BNE	simple		// do simple compare
    948 	BC	12,10,equal	// test CR2 for length comparison
	BC	12,8,less	// 1st len < 2nd len, result less
	BR	greater		// 1st len > 2nd len, result greater
    951 simple:
    952 	MOVBZ	0(R5),R9	// get byte from 1st operand
    953 	ADD	$1,R5
    954 	MOVBZ	0(R6),R10	// get byte from 2nd operand
    955 	ADD	$1,R6
    956 	CMPU	R9,R10
    957 	BC	8,2,simple	// bc ctr <> 0 && cr
    958 	BGT	greater		// 1st > 2nd
    959 	BLT	less		// 1st < 2nd
    960 	BC	12,10,equal	// test CR2 for length comparison
	BC	12,9,greater	// 1st len > 2nd len, result greater
    962 less:
    963 	MOVD	$-1,R3
    964 	MOVD    R3,(R7)		// return value if A < B
    965 	RET
    966 equal:
    967 	MOVD    $0,(R7)		// return value if A == B
    968 	RET
    969 greater:
    970 	MOVD	$1,R3
    971 	MOVD	R3,(R7)		// return value if A > B
    972 	RET
    973 
    974 // Do an efficient memequal for ppc64
    975 // R3 = s1
    976 // R4 = s2
    977 // R5 = len
    978 // R9 = return value
    979 TEXT runtimememeqbody(SB),NOSPLIT|NOFRAME,$0-0
    980 	MOVD    R5,CTR
    981 	CMP     R5,$8		// only optimize >=8
    982 	BLT     simplecheck
    983 	DCBT	(R3)		// cache hint
    984 	DCBT	(R4)
    985 	CMP	R5,$32		// optimize >= 32
    986 	MOVD	R5,R6		// needed if setup8a branch
    987 	BLT	setup8a		// 8 byte moves only
    988 setup32a:                       // 8 byte aligned, >= 32 bytes
    989 	SRADCC  $5,R5,R6        // number of 32 byte chunks to compare
    990 	MOVD	R6,CTR
    991 loop32a:
    992 	MOVD    0(R3),R6        // doublewords to compare
    993 	MOVD    0(R4),R7
    994 	MOVD	8(R3),R8	//
    995 	MOVD	8(R4),R9
	CMP     R6,R7           // bytes match?
    997 	BNE     noteq
    998 	MOVD	16(R3),R6
    999 	MOVD	16(R4),R7
   1000 	CMP     R8,R9		// bytes match?
   1001 	MOVD	24(R3),R8
   1002 	MOVD	24(R4),R9
   1003 	BNE     noteq
   1004 	CMP     R6,R7           // bytes match?
   1005 	BNE	noteq
   1006 	ADD     $32,R3		// bump up to next 32
   1007 	ADD     $32,R4
   1008 	CMP     R8,R9           // bytes match?
   1009 	BC      8,2,loop32a	// br ctr and cr
   1010 	BNE	noteq
   1011 	ANDCC	$24,R5,R6       // Any 8 byte chunks?
   1012 	BEQ	leftover	// and result is 0
   1013 setup8a:
   1014 	SRADCC  $3,R6,R6        // get the 8 byte count
   1015 	BEQ	leftover	// shifted value is 0
   1016 	MOVD    R6,CTR
   1017 loop8:
   1018 	MOVD    0(R3),R6        // doublewords to compare
   1019 	ADD	$8,R3
   1020 	MOVD    0(R4),R7
   1021 	ADD     $8,R4
   1022 	CMP     R6,R7           // match?
   1023 	BC	8,2,loop8	// bt ctr <> 0 && cr
   1024 	BNE     noteq
   1025 leftover:
   1026 	ANDCC   $7,R5,R6        // check for leftover bytes
   1027 	BEQ     equal
   1028 	MOVD    R6,CTR
   1029 	BR	simple
   1030 simplecheck:
   1031 	CMP	R5,$0
   1032 	BEQ	equal
   1033 simple:
   1034 	MOVBZ   0(R3), R6
   1035 	ADD	$1,R3
   1036 	MOVBZ   0(R4), R7
   1037 	ADD     $1,R4
   1038 	CMP     R6, R7
   1039 	BNE     noteq
   1040 	BC      8,2,simple
   1041 	BNE	noteq
   1042 	BR	equal
   1043 noteq:
   1044 	MOVD    $0, R9
   1045 	RET
   1046 equal:
   1047 	MOVD    $1, R9
   1048 	RET
   1049 
   1050 TEXT bytesEqual(SB),NOSPLIT,$0-49
   1051 	MOVD	a_len+8(FP), R4
   1052 	MOVD	b_len+32(FP), R5
   1053 	CMP	R5, R4		// unequal lengths are not equal
   1054 	BNE	noteq
   1055 	MOVD	a+0(FP), R3
   1056 	MOVD	b+24(FP), R4
   1057 	BL	runtimememeqbody(SB)
   1058 
   1059 	MOVBZ	R9,ret+48(FP)
   1060 	RET
   1061 
   1062 noteq:
   1063 	MOVBZ	$0,ret+48(FP)
   1064 	RET
   1065 
   1066 equal:
   1067 	MOVD	$1,R3
   1068 	MOVBZ	R3,ret+48(FP)
   1069 	RET
   1070 
   1071 TEXT bytesIndexByte(SB),NOSPLIT|NOFRAME,$0-40
   1072 	MOVD	s+0(FP), R3		// R3 = byte array pointer
   1073 	MOVD	s_len+8(FP), R4		// R4 = length
   1074 	MOVBZ	c+24(FP), R5		// R5 = byte
   1075 	MOVD	$ret+32(FP), R14	// R14 = &ret
   1076 	BR	runtimeindexbytebody<>(SB)
   1077 
   1078 TEXT stringsIndexByte(SB),NOSPLIT|NOFRAME,$0-32
   1079 	MOVD	s+0(FP), R3	  // R3 = string
   1080 	MOVD	s_len+8(FP), R4	  // R4 = length
   1081 	MOVBZ	c+16(FP), R5	  // R5 = byte
   1082 	MOVD	$ret+24(FP), R14  // R14 = &ret
   1083 	BR	runtimeindexbytebody<>(SB)
   1084 
   1085 TEXT runtimeindexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
   1086 	DCBT	(R3)		// Prepare cache line.
   1087 	MOVD	R3,R17		// Save base address for calculating the index later.
   1088 	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
   1089 	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
   1090 	ADD	R4,R3,R7	// Last acceptable address in R7.
   1091 
   1092 	RLDIMI	$16,R5,$32,R5
   1093 	CMPU	R4,$32		// Check if it's a small string (<32 bytes). Those will be processed differently.
   1094 	MOVD	$-1,R9
   1095 	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
   1096 	RLDIMI	$32,R5,$0,R5
   1097 	MOVD	R7,R10		// Save last acceptable address in R10 for later.
   1098 	ADD	$-1,R7,R7
   1099 #ifdef GOARCH_ppc64le
   1100 	SLD	R6,R9,R9	// Prepare mask for Little Endian
   1101 #else
   1102 	SRD	R6,R9,R9	// Same for Big Endian
   1103 #endif
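	// The scans below rely on CMPB: it compares R12 with the byte-replicated
	// pattern in R5 and sets each result byte to 0xFF where the two bytes
	// match (0x00 where they do not), so a nonzero result means the target
	// byte occurs somewhere in that doubleword. The mask in R9 clears result
	// bytes that precede the start of the buffer.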
   1104 	BLE	small_string	// Jump to the small string case if it's <32 bytes.
   1105 
   1106 	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
   1107 	// in V0, V1 and V10, then branch to the preloop.
   1108 	ANDCC	$63,R3,R11
   1109 	BEQ	CR0,qw_align
   1110 	RLDICL	$0,R3,$61,R11
   1111 
   1112 	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
   1113 	CMPB	R12,R5,R3	// Check for a match.
   1114 	AND	R9,R3,R3	// Mask bytes below s_base
   1115 	RLDICL	$0,R7,$61,R6	// length-1
   1116 	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
   1117 	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
   1118 	BNE	CR7,done
   1119 	ADD	$8,R8,R8
   1120 	ADD	$-8,R4,R4
   1121 	ADD	R4,R11,R4
   1122 
   1123 	// Check for quadword alignment
   1124 	ANDCC	$15,R8,R11
   1125 	BEQ	CR0,qw_align
   1126 
   1127 	// Not aligned, so handle the next doubleword
   1128 	MOVD	0(R8),R12
   1129 	CMPB	R12,R5,R3
   1130 	CMPU	R3,$0,CR7
   1131 	BNE	CR7,done
   1132 	ADD	$8,R8,R8
   1133 	ADD	$-8,R4,R4
   1134 
	// Either quadword or 64-byte aligned at this point. We can use LVX.
   1136 qw_align:
   1137 
   1138 	// Set up auxiliary data for the vectorized algorithm.
   1139 	VSPLTISB  $0,V0		// Replicate 0 across V0
   1140 	VSPLTISB  $3,V10	// Use V10 as control for VBPERMQ
   1141 	MTVRD	  R5,V1
   1142 	LVSL	  (R0+R0),V11
   1143 	VSLB	  V11,V10,V10
   1144 	VSPLTB	  $7,V1,V1	// Replicate byte across V1
   1145 	CMPU	  R4, $64	// If len <= 64, don't use the vectorized loop
   1146 	BLE	  tail
   1147 
	// We will load 4 quadwords per iteration in the loop, so check for
   1149 	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
   1150 	ANDCC	  $63,R8,R11
   1151 	BEQ	  CR0,preloop
   1152 
   1153 	// Not 64-byte aligned. Load one quadword at a time until aligned.
   1154 	LVX	    (R8+R0),V4
   1155 	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
   1156 	BNE	    CR6,found_qw_align
   1157 	ADD	    $16,R8,R8
   1158 	ADD	    $-16,R4,R4
   1159 
   1160 	ANDCC	    $63,R8,R11
   1161 	BEQ	    CR0,preloop
   1162 	LVX	    (R8+R0),V4
   1163 	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
   1164 	BNE	    CR6,found_qw_align
   1165 	ADD	    $16,R8,R8
   1166 	ADD	    $-16,R4,R4
   1167 
   1168 	ANDCC	    $63,R8,R11
   1169 	BEQ	    CR0,preloop
   1170 	LVX	    (R8+R0),V4
   1171 	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
   1172 	BNE	    CR6,found_qw_align
   1173 	ADD	    $-16,R4,R4
   1174 	ADD	    $16,R8,R8
   1175 
   1176 	// 64-byte aligned. Prepare for the main loop.
   1177 preloop:
   1178 	CMPU	R4,$64
   1179 	BLE	tail	      // If len <= 64, don't use the vectorized loop
   1180 
   1181 	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
   1182 	// per loop iteration. The last doubleword is in R10, so our loop counter
   1183 	// starts at (R10-R8)/64.
   1184 	SUB	R8,R10,R6
   1185 	SRD	$6,R6,R9      // Loop counter in R9
   1186 	MOVD	R9,CTR
   1187 
   1188 	MOVD	$16,R11      // Load offsets for the vector loads
   1189 	MOVD	$32,R9
   1190 	MOVD	$48,R7
   1191 
	// Main loop: we will load 64 bytes per iteration
   1193 loop:
   1194 	LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
   1195 	LVX	    (R11+R8),V3
   1196 	LVX	    (R9+R8),V4
   1197 	LVX	    (R7+R8),V5
   1198 	VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
   1199 	VCMPEQUB    V1,V3,V7
   1200 	VCMPEQUB    V1,V4,V8
   1201 	VCMPEQUB    V1,V5,V9
   1202 	VOR	    V6,V7,V11	      // Compress the result in a single vector
   1203 	VOR	    V8,V9,V12
   1204 	VOR	    V11,V12,V11
   1205 	VCMPEQUBCC  V0,V11,V11	      // Check for byte
   1206 	BGE	    CR6,found
   1207 	ADD	    $64,R8,R8
   1208 	BC	    16,0,loop	      // bdnz loop
   1209 
   1210 	// Handle the tailing bytes or R4 <= 64
   1211 	RLDICL	$0,R6,$58,R4
   1212 tail:
   1213 	CMPU	    R4,$0
   1214 	BEQ	    notfound
   1215 	LVX	    (R8+R0),V4
   1216 	VCMPEQUBCC  V1,V4,V6
   1217 	BNE	    CR6,found_qw_align
   1218 	ADD	    $16,R8,R8
   1219 	CMPU	    R4,$16,CR6
   1220 	BLE	    CR6,notfound
   1221 	ADD	    $-16,R4,R4
   1222 
   1223 	LVX	    (R8+R0),V4
   1224 	VCMPEQUBCC  V1,V4,V6
   1225 	BNE	    CR6,found_qw_align
   1226 	ADD	    $16,R8,R8
   1227 	CMPU	    R4,$16,CR6
   1228 	BLE	    CR6,notfound
   1229 	ADD	    $-16,R4,R4
   1230 
   1231 	LVX	    (R8+R0),V4
   1232 	VCMPEQUBCC  V1,V4,V6
   1233 	BNE	    CR6,found_qw_align
   1234 	ADD	    $16,R8,R8
   1235 	CMPU	    R4,$16,CR6
   1236 	BLE	    CR6,notfound
   1237 	ADD	    $-16,R4,R4
   1238 
   1239 	LVX	    (R8+R0),V4
   1240 	VCMPEQUBCC  V1,V4,V6
   1241 	BNE	    CR6,found_qw_align
   1242 
   1243 notfound:
   1244 	MOVD	$-1,R3
   1245 	MOVD	R3,(R14)
   1246 	RET
   1247 
   1248 found:
   1249 	// We will now compress the results into a single doubleword,
   1250 	// so it can be moved to a GPR for the final index calculation.
   1251 
   1252 	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
   1253 	// first bit of each byte into bits 48-63.
   1254 	VBPERMQ	  V6,V10,V6
   1255 	VBPERMQ	  V7,V10,V7
   1256 	VBPERMQ	  V8,V10,V8
   1257 	VBPERMQ	  V9,V10,V9
   1258 
   1259 	// Shift each 16-bit component into its correct position for
   1260 	// merging into a single doubleword.
   1261 #ifdef GOARCH_ppc64le
   1262 	VSLDOI	  $2,V7,V7,V7
   1263 	VSLDOI	  $4,V8,V8,V8
   1264 	VSLDOI	  $6,V9,V9,V9
   1265 #else
   1266 	VSLDOI	  $6,V6,V6,V6
   1267 	VSLDOI	  $4,V7,V7,V7
   1268 	VSLDOI	  $2,V8,V8,V8
   1269 #endif
   1270 
   1271 	// Merge V6-V9 into a single doubleword and move to a GPR.
   1272 	VOR	V6,V7,V11
   1273 	VOR	V8,V9,V4
   1274 	VOR	V4,V11,V4
   1275 	MFVRD	V4,R3
   1276 
   1277 #ifdef GOARCH_ppc64le
   1278 	ADD	  $-1,R3,R11
   1279 	ANDN	  R3,R11,R11
   1280 	POPCNTD	  R11,R11	// Count trailing zeros (Little Endian).
   1281 #else
   1282 	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
   1283 #endif
   1284 	ADD	R8,R11,R3	// Calculate byte address
   1285 
   1286 return:
   1287 	SUB	R17,R3
   1288 	MOVD	R3,(R14)
   1289 	RET
   1290 
   1291 found_qw_align:
   1292 	// Use the same algorithm as above. Compress the result into
   1293 	// a single doubleword and move it to a GPR for the final
   1294 	// calculation.
   1295 	VBPERMQ	  V6,V10,V6
   1296 
   1297 #ifdef GOARCH_ppc64le
   1298 	MFVRD	  V6,R3
   1299 	ADD	  $-1,R3,R11
   1300 	ANDN	  R3,R11,R11
   1301 	POPCNTD	  R11,R11
   1302 #else
   1303 	VSLDOI	  $6,V6,V6,V6
   1304 	MFVRD	  V6,R3
   1305 	CNTLZD	  R3,R11
   1306 #endif
   1307 	ADD	  R8,R11,R3
   1308 	CMPU	  R11,R4
   1309 	BLT	  return
   1310 	BR	  notfound
   1311 
   1312 done:
   1313 	// At this point, R3 has 0xFF in the same position as the byte we are
   1314 	// looking for in the doubleword. Use that to calculate the exact index
   1315 	// of the byte.
   1316 #ifdef GOARCH_ppc64le
   1317 	ADD	$-1,R3,R11
   1318 	ANDN	R3,R11,R11
   1319 	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
   1320 #else
   1321 	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
   1322 #endif
   1323 	CMPU	R8,R7		// Check if we are at the last doubleword.
   1324 	SRD	$3,R11		// Convert trailing zeros to bytes.
   1325 	ADD	R11,R8,R3
   1326 	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
   1327 	BNE	return
   1328 	BLE	CR7,return
   1329 	BR	notfound
   1330 
   1331 small_string:
   1332 	// We unroll this loop for better performance.
   1333 	CMPU	R4,$0		// Check for length=0
   1334 	BEQ	notfound
   1335 
   1336 	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
   1337 	CMPB	R12,R5,R3	// Check for a match.
   1338 	AND	R9,R3,R3	// Mask bytes below s_base.
   1339 	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
   1340 	RLDICL	$0,R7,$61,R6	// length-1
   1341 	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
   1342 	CMPU	R8,R7
   1343 	BNE	CR7,done
   1344 	BEQ	notfound	// Hit length.
   1345 
   1346 	MOVDU	8(R8),R12
   1347 	CMPB	R12,R5,R3
   1348 	CMPU	R3,$0,CR6
   1349 	CMPU	R8,R7
   1350 	BNE	CR6,done
   1351 	BEQ	notfound
   1352 
   1353 	MOVDU	8(R8),R12
   1354 	CMPB	R12,R5,R3
   1355 	CMPU	R3,$0,CR6
   1356 	CMPU	R8,R7
   1357 	BNE	CR6,done
   1358 	BEQ	notfound
   1359 
   1360 	MOVDU	8(R8),R12
   1361 	CMPB	R12,R5,R3
   1362 	CMPU	R3,$0,CR6
   1363 	CMPU	R8,R7
   1364 	BNE	CR6,done
   1365 	BEQ	notfound
   1366 
   1367 	MOVDU	8(R8),R12
   1368 	CMPB	R12,R5,R3
   1369 	CMPU	R3,$0,CR6
   1370 	BNE	CR6,done
   1371 	BR	notfound
   1372 
   1373 TEXT runtimecmpstring(SB),NOSPLIT|NOFRAME,$0-40
   1374 	MOVD	s1_base+0(FP), R5
   1375 	MOVD	s2_base+16(FP), R6
   1376 	MOVD	s1_len+8(FP), R3
   1377 	CMP	R5,R6,CR7
   1378 	MOVD	s2_len+24(FP), R4
   1379 	MOVD	$ret+32(FP), R7
   1380 	CMP	R3,R4,CR6
   1381 	BEQ	CR7,equal
   1382 
   1383 notequal:
   1384 #ifdef	GOARCH_ppc64le
   1385 	BR	cmpbodyLE<>(SB)
   1386 #else
   1387 	BR      cmpbodyBE<>(SB)
   1388 #endif
   1389 
   1390 equal:
   1391 	BEQ	CR6,done
   1392 	MOVD	$1, R8
   1393 	BGT	CR6,greater
   1394 	NEG	R8
   1395 
   1396 greater:
   1397 	MOVD	R8, (R7)
   1398 	RET
   1399 
   1400 done:
   1401 	MOVD	$0, (R7)
   1402 	RET
   1403 
   1404 TEXT bytesCompare(SB),NOSPLIT|NOFRAME,$0-56
   1405 	MOVD	s1+0(FP), R5
   1406 	MOVD	s2+24(FP), R6
   1407 	MOVD	s1+8(FP), R3
   1408 	CMP	R5,R6,CR7
   1409 	MOVD	s2+32(FP), R4
   1410 	MOVD	$ret+48(FP), R7
   1411 	CMP	R3,R4,CR6
   1412 	BEQ	CR7,equal
   1413 
   1414 #ifdef	GOARCH_ppc64le
   1415 	BR	cmpbodyLE<>(SB)
   1416 #else
   1417 	BR      cmpbodyBE<>(SB)
   1418 #endif
   1419 
   1420 equal:
   1421 	BEQ	CR6,done
   1422 	MOVD	$1, R8
   1423 	BGT	CR6,greater
   1424 	NEG	R8
   1425 
   1426 greater:
   1427 	MOVD	R8, (R7)
   1428 	RET
   1429 
   1430 done:
   1431 	MOVD	$0, (R7)
   1432 	RET
   1433 
   1434 TEXT runtimereturn0(SB), NOSPLIT, $0
   1435 	MOVW	$0, R3
   1436 	RET
   1437 
   1438 // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
   1439 // Must obey the gcc calling convention.
   1440 TEXT _cgo_topofstack(SB),NOSPLIT|NOFRAME,$0
   1441 	// g (R30) and R31 are callee-save in the C ABI, so save them
   1442 	MOVD	g, R4
   1443 	MOVD	R31, R5
   1444 	MOVD	LR, R6
   1445 
   1446 	BL	runtimeload_g(SB)	// clobbers g (R30), R31
   1447 	MOVD	g_m(g), R3
   1448 	MOVD	m_curg(R3), R3
   1449 	MOVD	(g_stack+stack_hi)(R3), R3
   1450 
   1451 	MOVD	R4, g
   1452 	MOVD	R5, R31
   1453 	MOVD	R6, LR
   1454 	RET
   1455 
   1456 // The top-most function running on a goroutine
   1457 // returns to goexit+PCQuantum.
   1458 //
   1459 // When dynamically linking Go, it can be returned to from a function
   1460 // implemented in a different module and so needs to reload the TOC pointer
// from the stack (although this function declares that it does not set up a
   1462 // frame, newproc1 does in fact allocate one for goexit and saves the TOC
   1463 // pointer in the correct place).
   1464 // goexit+_PCQuantum is halfway through the usual global entry point prologue
   1465 // that derives r2 from r12 which is a bit silly, but not harmful.
   1466 TEXT runtimegoexit(SB),NOSPLIT|NOFRAME,$0-0
   1467 	MOVD	24(R1), R2
   1468 	BL	runtimegoexit1(SB)	// does not return
   1469 	// traceback from goexit1 must hit code range of goexit
   1470 	MOVD	R0, R0	// NOP
   1471 
   1472 TEXT runtimesigreturn(SB),NOSPLIT,$0-0
   1473 	RET
   1474 
   1475 // prepGoExitFrame saves the current TOC pointer (i.e. the TOC pointer for the
   1476 // module containing runtime) to the frame that goexit will execute in when
   1477 // the goroutine exits. It's implemented in assembly mainly because that's the
   1478 // easiest way to get access to R2.
   1479 TEXT runtimeprepGoExitFrame(SB),NOSPLIT,$0-8
   1480       MOVD    sp+0(FP), R3
   1481       MOVD    R2, 24(R3)
   1482       RET
   1483 
   1484 TEXT runtimeaddmoduledata(SB),NOSPLIT|NOFRAME,$0-0
   1485 	ADD	$-8, R1
   1486 	MOVD	R31, 0(R1)
   1487 	MOVD	runtimelastmoduledatap(SB), R4
   1488 	MOVD	R3, moduledata_next(R4)
   1489 	MOVD	R3, runtimelastmoduledatap(SB)
   1490 	MOVD	0(R1), R31
   1491 	ADD	$8, R1
   1492 	RET
   1493 
   1494 TEXT checkASM(SB),NOSPLIT,$0-1
   1495 	MOVW	$1, R3
   1496 	MOVB	R3, ret+0(FP)
   1497 	RET
   1498