// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// copy arguments forward on an even stack
	MOVQ	DI, AX		// argc
	MOVQ	SI, BX		// argv
	SUBQ	$(4*8+7), SP	// 2args 2auto
	ANDQ	$~15, SP
	MOVQ	AX, 16(SP)
	MOVQ	BX, 24(SP)

	// create istack out of the given (operating system) stack.
	// _cgo_init may update stackguard.
	MOVQ	$runtime·g0(SB), DI
	LEAQ	(-64*1024+104)(SP), BX
	MOVQ	BX, g_stackguard0(DI)
	MOVQ	BX, g_stackguard1(DI)
	MOVQ	BX, (g_stack+stack_lo)(DI)
	MOVQ	SP, (g_stack+stack_hi)(DI)

	// find out information about the processor we're on
	MOVQ	$0, AX
	CPUID
	CMPQ	AX, $0
	JE	nocpuinfo

	// Figure out how to serialize RDTSC.
	// On Intel processors LFENCE is enough. AMD requires MFENCE.
	// Don't know about the rest, so let's do MFENCE.
	CMPL	BX, $0x756E6547	// "Genu"
	JNE	notintel
	CMPL	DX, $0x49656E69	// "ineI"
	JNE	notintel
	CMPL	CX, $0x6C65746E	// "ntel"
	JNE	notintel
	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
notintel:

	MOVQ	$1, AX
	CPUID
	MOVL	CX, runtime·cpuid_ecx(SB)
	MOVL	DX, runtime·cpuid_edx(SB)
nocpuinfo:

	// if there is an _cgo_init, call it.
	MOVQ	_cgo_init(SB), AX
	TESTQ	AX, AX
	JZ	needtls
	// g0 already in DI
	MOVQ	DI, CX	// Win64 uses CX for first parameter
	MOVQ	$setg_gcc<>(SB), SI
	CALL	AX

	// update stackguard after _cgo_init
	MOVQ	$runtime·g0(SB), CX
	MOVQ	(g_stack+stack_lo)(CX), AX
	ADDQ	$const__StackGuard, AX
	MOVQ	AX, g_stackguard0(CX)
	MOVQ	AX, g_stackguard1(CX)

	CMPL	runtime·iswindows(SB), $0
	JEQ	ok
needtls:
	// skip TLS setup on Plan 9
	CMPL	runtime·isplan9(SB), $1
	JEQ	ok
	// skip TLS setup on Solaris
	CMPL	runtime·issolaris(SB), $1
	JEQ	ok

	LEAQ	runtime·tls0(SB), DI
	CALL	runtime·settls(SB)

	// store through it, to make sure it works
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·tls0(SB), AX
	CMPQ	AX, $0x123
	JEQ	2(PC)
	MOVL	AX, 0	// abort
ok:
	// set the per-goroutine and per-mach "registers"
	get_tls(BX)
	LEAQ	runtime·g0(SB), CX
	MOVQ	CX, g(BX)
	LEAQ	runtime·m0(SB), AX

	// save m->g0 = g0
	MOVQ	CX, m_g0(AX)
	// save m0 to g0->m
	MOVQ	AX, g_m(CX)

	CLD	// convention is D is always left cleared
	CALL	runtime·check(SB)

	MOVL	16(SP), AX	// copy argc
	MOVL	AX, 0(SP)
	MOVQ	24(SP), AX	// copy argv
	MOVQ	AX, 8(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// create a new goroutine to start program
	MOVQ	$runtime·mainPC(SB), AX	// entry
	PUSHQ	AX
	PUSHQ	$0	// arg size
	CALL	runtime·newproc(SB)
	POPQ	AX
	POPQ	AX

	// start this M
	CALL	runtime·mstart(SB)

	MOVL	$0xf1, 0xf1	// crash
	RET

DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$8

TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	BYTE	$0xcc
	RET

TEXT runtime·asminit(SB),NOSPLIT,$0-0
	// No per-thread init.
	RET
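
// The bootstrap in rt0_go above amounts to the following call sequence,
// written as Go-like pseudocode (illustrative sketch, not actual source):
//
//	args(argc, argv)
//	osinit()
//	schedinit()
//	newproc(mainPC)	// queue runtime.main as the first goroutine
//	mstart()	// start this M; mstart never returns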

/*
 * go-routine
 */

// void gosave(Gobuf*)
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), AX	// gobuf
	LEAQ	buf+0(FP), BX	// caller's SP
	MOVQ	BX, gobuf_sp(AX)
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, gobuf_pc(AX)
	MOVQ	$0, gobuf_ret(AX)
	MOVQ	$0, gobuf_ctxt(AX)
	MOVQ	BP, gobuf_bp(AX)
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	BX, gobuf_g(AX)
	RET

// void gogo(Gobuf*)
// restore state from Gobuf; longjmp
TEXT runtime·gogo(SB), NOSPLIT, $0-8
	MOVQ	buf+0(FP), BX	// gobuf
	MOVQ	gobuf_g(BX), DX
	MOVQ	0(DX), CX	// make sure g != nil
	get_tls(CX)
	MOVQ	DX, g(CX)
	MOVQ	gobuf_sp(BX), SP	// restore SP
	MOVQ	gobuf_ret(BX), AX
	MOVQ	gobuf_ctxt(BX), DX
	MOVQ	gobuf_bp(BX), BP
	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
	MOVQ	$0, gobuf_ret(BX)
	MOVQ	$0, gobuf_ctxt(BX)
	MOVQ	$0, gobuf_bp(BX)
	MOVQ	gobuf_pc(BX), BX
	JMP	BX

// func mcall(fn func(*g))
// Switch to m->g0's stack, call fn(g).
// Fn must never return. It should gogo(&g->sched)
// to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI

	get_tls(CX)
	MOVQ	g(CX), AX	// save state in g->sched
	MOVQ	0(SP), BX	// caller's PC
	MOVQ	BX, (g_sched+gobuf_pc)(AX)
	LEAQ	fn+0(FP), BX	// caller's SP
	MOVQ	BX, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to m->g0 & its stack, call fn
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	SI, AX	// if g == m->g0 call badmcall
	JNE	3(PC)
	MOVQ	$runtime·badmcall(SB), AX
	JMP	AX
	MOVQ	SI, g(CX)	// g = m->g0
	MOVQ	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	POPQ	AX
	MOVQ	$runtime·badmcall2(SB), AX
	JMP	AX
	RET

// systemstack_switch is a dummy routine that systemstack leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the system stack because the one at the top of
// the system stack terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET

// func systemstack(fn func())
TEXT runtime·systemstack(SB), NOSPLIT, $0-8
	MOVQ	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVQ	g(CX), AX	// AX = g
	MOVQ	g_m(AX), BX	// BX = m

	MOVQ	m_gsignal(BX), DX	// DX = gsignal
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_g0(BX), DX	// DX = g0
	CMPQ	AX, DX
	JEQ	noswitch

	MOVQ	m_curg(BX), R8
	CMPQ	AX, R8
	JEQ	switch

	// Bad: g is not gsignal, not g0, not curg. What is it?
	MOVQ	$runtime·badsystemstack(SB), AX
	CALL	AX

switch:
	// save our state in g->sched. Pretend to
	// be systemstack_switch if the G stack is scanned.
	MOVQ	$runtime·systemstack_switch(SB), SI
	MOVQ	SI, (g_sched+gobuf_pc)(AX)
	MOVQ	SP, (g_sched+gobuf_sp)(AX)
	MOVQ	AX, (g_sched+gobuf_g)(AX)
	MOVQ	BP, (g_sched+gobuf_bp)(AX)

	// switch to g0
	MOVQ	DX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(DX), BX
	// make it look like mstart called systemstack on g0, to stop traceback
	SUBQ	$8, BX
	MOVQ	$runtime·mstart(SB), DX
	MOVQ	DX, 0(BX)
	MOVQ	BX, SP

	// call target function
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI

	// switch back to g
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), BX
	MOVQ	m_curg(BX), AX
	MOVQ	AX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(AX), SP
	MOVQ	$0, (g_sched+gobuf_sp)(AX)
	RET

noswitch:
	// already on m stack, just call directly
	MOVQ	DI, DX
	MOVQ	0(DI), DI
	CALL	DI
	RET
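
// Typical use of systemstack from Go code (illustrative sketch only):
//
//	systemstack(func() {
//		// runs on the m->g0 (system) stack, then execution
//		// switches back to the calling goroutine's stack
//	})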

/*
 * support for morestack
 */

// Called during function prolog when more stack is needed.
//
// The traceback routines see morestack on a g0 as being
// the top of a stack (for example, morestack calling newstack
// calling the scheduler calling newm calling gc), so we must
// record an argument size. For that purpose, it has no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	// Cannot grow scheduler stack (m->g0).
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Cannot grow signal stack (m->gsignal).
	MOVQ	m_gsignal(BX), SI
	CMPQ	g(CX), SI
	JNE	2(PC)
	INT	$3

	// Called from f.
	// Set m->morebuf to f's caller.
	MOVQ	8(SP), AX	// f's caller's PC
	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
	LEAQ	16(SP), AX	// f's caller's SP
	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SI, (m_morebuf+gobuf_g)(BX)

	// Set g->sched to context in f.
	MOVQ	0(SP), AX	// f's PC
	MOVQ	AX, (g_sched+gobuf_pc)(SI)
	MOVQ	SI, (g_sched+gobuf_g)(SI)
	LEAQ	8(SP), AX	// f's SP
	MOVQ	AX, (g_sched+gobuf_sp)(SI)
	MOVQ	DX, (g_sched+gobuf_ctxt)(SI)
	MOVQ	BP, (g_sched+gobuf_bp)(SI)

	// Call newstack on m->g0's stack.
	MOVQ	m_g0(BX), BX
	MOVQ	BX, g(CX)
	MOVQ	(g_sched+gobuf_sp)(BX), SP
	CALL	runtime·newstack(SB)
	MOVQ	$0, 0x1003	// crash if newstack returns
	RET

// morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)

TEXT runtime·stackBarrier(SB),NOSPLIT,$0
	// We came here via a RET to an overwritten return PC.
	// AX may be live. Other registers are available.

	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
	get_tls(CX)
	MOVQ	g(CX), CX
	MOVQ	(g_stkbar+slice_array)(CX), DX
	MOVQ	g_stkbarPos(CX), BX
	IMULQ	$stkbar__size, BX	// Too big for SIB.
	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
	// Record that this stack barrier was hit.
	ADDQ	$1, g_stkbarPos(CX)
	// Jump to the original return PC.
	JMP	BX
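
// For reference, the stack-growth check the compiler inserts in a function
// prologue behaves roughly like this (illustrative sketch, not the exact
// emitted instructions):
//
//	if SP <= g.stackguard0 {
//		morestack_noctxt()	// switch to g0, call newstack, then retry the function
//	}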

// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
// of constant-sized-frame functions to encode a few bits of size in the pc.
// Caution: ugly multiline assembly macros in your future!

#define DISPATCH(NAME,MAXSIZE)	\
	CMPQ	CX, $MAXSIZE;	\
	JA	3(PC);		\
	MOVQ	$NAME(SB), AX;	\
	JMP	AX
// Note: can't just "JMP NAME(SB)" - bad inlining results.

TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)

TEXT ·reflectcall(SB), NOSPLIT, $0-32
	MOVLQZX argsize+24(FP), CX
	// NOTE(rsc): No call16, because CALLFN needs four words
	// of argument space to invoke callwritebarrier.
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	MOVQ	$runtime·badreflectcall(SB), AX
	JMP	AX

#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-32;		\
	NO_LOCAL_POINTERS;			\
	/* copy arguments to stack */		\
	MOVQ	argptr+16(FP), SI;		\
	MOVLQZX argsize+24(FP), CX;		\
	MOVQ	SP, DI;				\
	REP;MOVSB;				\
	/* call function */			\
	MOVQ	f+8(FP), DX;			\
	PCDATA	$PCDATA_StackMapIndex, $0;	\
	CALL	(DX);				\
	/* copy return values back */		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	SP, SI;				\
	ADDQ	BX, DI;				\
	ADDQ	BX, SI;				\
	SUBQ	BX, CX;				\
	REP;MOVSB;				\
	/* execute write barrier updates */	\
	MOVQ	argtype+0(FP), DX;		\
	MOVQ	argptr+16(FP), DI;		\
	MOVLQZX	argsize+24(FP), CX;		\
	MOVLQZX retoffset+28(FP), BX;		\
	MOVQ	DX, 0(SP);			\
	MOVQ	DI, 8(SP);			\
	MOVQ	CX, 16(SP);			\
	MOVQ	BX, 24(SP);			\
	CALL	runtime·callwritebarrier(SB);	\
	RET

CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)

// bool cas(int32 *val, int32 old, int32 new)
// Atomically:
//	if(*val == old){
//		*val = new;
//		return 1;
//	} else
//		return 0;
TEXT runtime·cas(SB), NOSPLIT, $0-17
	MOVQ	ptr+0(FP), BX
	MOVL	old+8(FP), AX
	MOVL	new+12(FP), CX
	LOCK
	CMPXCHGL	CX, 0(BX)
	SETEQ	ret+16(FP)
	RET
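
// A typical caller retries cas in a loop, for example an atomic increment
// (illustrative sketch, not code from this file):
//
//	for {
//		old := *val
//		if cas(val, old, old+1) {
//			break
//		}
//	}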

// bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
// Atomically:
//	if(*val == old){
//		*val = new;
//		return 1;
//	} else {
//		return 0;
//	}
TEXT runtime·cas64(SB), NOSPLIT, $0-25
	MOVQ	ptr+0(FP), BX
	MOVQ	old+8(FP), AX
	MOVQ	new+16(FP), CX
	LOCK
	CMPXCHGQ	CX, 0(BX)
	SETEQ	ret+24(FP)
	RET

TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
	JMP	runtime·cas64(SB)

TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16
	JMP	runtime·atomicload64(SB)

TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16
	JMP	runtime·atomicload64(SB)

TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
	JMP	runtime·atomicstore64(SB)

// bool casp(void **val, void *old, void *new)
// Atomically:
//	if(*val == old){
//		*val = new;
//		return 1;
//	} else
//		return 0;
TEXT runtime·casp1(SB), NOSPLIT, $0-25
	MOVQ	ptr+0(FP), BX
	MOVQ	old+8(FP), AX
	MOVQ	new+16(FP), CX
	LOCK
	CMPXCHGQ	CX, 0(BX)
	SETEQ	ret+24(FP)
	RET

// uint32 xadd(uint32 volatile *val, int32 delta)
// Atomically:
//	*val += delta;
//	return *val;
TEXT runtime·xadd(SB), NOSPLIT, $0-20
	MOVQ	ptr+0(FP), BX
	MOVL	delta+8(FP), AX
	MOVL	AX, CX
	LOCK
	XADDL	AX, 0(BX)
	ADDL	CX, AX
	MOVL	AX, ret+16(FP)
	RET

TEXT runtime·xadd64(SB), NOSPLIT, $0-24
	MOVQ	ptr+0(FP), BX
	MOVQ	delta+8(FP), AX
	MOVQ	AX, CX
	LOCK
	XADDQ	AX, 0(BX)
	ADDQ	CX, AX
	MOVQ	AX, ret+16(FP)
	RET

TEXT runtime·xadduintptr(SB), NOSPLIT, $0-24
	JMP	runtime·xadd64(SB)

TEXT runtime·xchg(SB), NOSPLIT, $0-20
	MOVQ	ptr+0(FP), BX
	MOVL	new+8(FP), AX
	XCHGL	AX, 0(BX)
	MOVL	AX, ret+16(FP)
	RET

TEXT runtime·xchg64(SB), NOSPLIT, $0-24
	MOVQ	ptr+0(FP), BX
	MOVQ	new+8(FP), AX
	XCHGQ	AX, 0(BX)
	MOVQ	AX, ret+16(FP)
	RET

TEXT runtime·xchgp1(SB), NOSPLIT, $0-24
	MOVQ	ptr+0(FP), BX
	MOVQ	new+8(FP), AX
	XCHGQ	AX, 0(BX)
	MOVQ	AX, ret+16(FP)
	RET

TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
	JMP	runtime·xchg64(SB)

TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX
again:
	PAUSE
	SUBL	$1, AX
	JNZ	again
	RET

TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), BX
	MOVQ	val+8(FP), AX
	XCHGQ	AX, 0(BX)
	RET

TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
	MOVQ	ptr+0(FP), BX
	MOVL	val+8(FP), AX
	XCHGL	AX, 0(BX)
	RET

TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), BX
	MOVQ	val+8(FP), AX
	XCHGQ	AX, 0(BX)
	RET

// void	runtime·atomicor8(byte volatile*, byte);
TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
	MOVQ	ptr+0(FP), AX
	MOVB	val+8(FP), BX
	LOCK
	ORB	BX, (AX)
	RET

// void	runtime·atomicand8(byte volatile*, byte);
TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
	MOVQ	ptr+0(FP), AX
	MOVB	val+8(FP), BX
	LOCK
	ANDB	BX, (AX)
	RET

TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	// Stores are already ordered on x86, so this is just a
	// compile barrier.
	RET
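
// procyield (above) executes PAUSE the requested number of times and is
// meant for bounded active spinning. A lock slow path might use it roughly
// like this (illustrative sketch; names are not from this file):
//
//	for i := 0; i < spin; i++ {
//		if cas(&l.key, unlocked, locked) {
//			return
//		}
//		procyield(activeSpinCnt)
//	}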

// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller
// 2. sub 5 bytes from the caller's return
// 3. jmp to the argument
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
	MOVQ	fv+0(FP), DX	// fn
	MOVQ	argp+8(FP), BX	// caller sp
	LEAQ	-8(BX), SP	// caller sp after CALL
	SUBQ	$5, (SP)	// return to CALL again
	MOVQ	0(DX), BX
	JMP	BX	// but first run the deferred function

// Save state of caller into g->sched. Smashes R8, R9.
TEXT gosave<>(SB),NOSPLIT,$0
	get_tls(R8)
	MOVQ	g(R8), R8
	MOVQ	0(SP), R9
	MOVQ	R9, (g_sched+gobuf_pc)(R8)
	LEAQ	8(SP), R9
	MOVQ	R9, (g_sched+gobuf_sp)(R8)
	MOVQ	$0, (g_sched+gobuf_ret)(R8)
	MOVQ	$0, (g_sched+gobuf_ctxt)(R8)
	MOVQ	BP, (g_sched+gobuf_bp)(R8)
	RET

// func asmcgocall(fn, arg unsafe.Pointer) int32
// Call fn(arg) on the scheduler stack,
// aligned appropriately for the gcc ABI.
// See cgocall.go for more details.
TEXT ·asmcgocall(SB),NOSPLIT,$0-20
	MOVQ	fn+0(FP), AX
	MOVQ	arg+8(FP), BX

	MOVQ	SP, DX

	// Figure out if we need to switch to m->g0 stack.
	// We get called to create new OS threads too, and those
	// come in on the m->g0 stack already.
	get_tls(CX)
	MOVQ	g(CX), R8
	MOVQ	g_m(R8), R8
	MOVQ	m_g0(R8), SI
	MOVQ	g(CX), DI
	CMPQ	SI, DI
	JEQ	nosave
	MOVQ	m_gsignal(R8), SI
	CMPQ	SI, DI
	JEQ	nosave

	MOVQ	m_g0(R8), SI
	CALL	gosave<>(SB)
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
nosave:

	// Now on a scheduling stack (a pthread-created stack).
	// Make sure we have enough room for 4 stack-backed fast-call
	// registers as per windows amd64 calling convention.
	SUBQ	$64, SP
	ANDQ	$~15, SP	// alignment for gcc ABI
	MOVQ	DI, 48(SP)	// save g
	MOVQ	(g_stack+stack_hi)(DI), DI
	SUBQ	DX, DI
	MOVQ	DI, 40(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
	MOVQ	BX, DI	// DI = first argument in AMD64 ABI
	MOVQ	BX, CX	// CX = first argument in Win64
	CALL	AX

	// Restore registers, g, stack pointer.
	get_tls(CX)
	MOVQ	48(SP), DI
	MOVQ	(g_stack+stack_hi)(DI), SI
	SUBQ	40(SP), SI
	MOVQ	DI, g(CX)
	MOVQ	SI, SP

	MOVL	AX, ret+16(FP)
	RET

// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
// Turn the fn into a Go func (by taking its address) and call
// cgocallback_gofunc.
TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
	LEAQ	fn+0(FP), AX
	MOVQ	AX, 0(SP)
	MOVQ	frame+8(FP), AX
	MOVQ	AX, 8(SP)
	MOVQ	framesize+16(FP), AX
	MOVQ	AX, 16(SP)
	MOVQ	$runtime·cgocallback_gofunc(SB), AX
	CALL	AX
	RET
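
// The Go side of a cgo call (see cgocall.go) reaches asmcgocall above
// roughly as follows (illustrative sketch):
//
//	errno := asmcgocall(fn, arg)	// fn: C function, arg: pointer to its argument frame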

// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// See cgocall.go for more details.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
	NO_LOCAL_POINTERS

	// If g is nil, Go did not create the current thread.
	// Call needm to obtain one m for temporary use.
	// In this case, we're running on the thread stack, so there's
	// lots of space, but the linker doesn't know. Hide the call from
	// the linker analysis by using an indirect call through AX.
	get_tls(CX)
#ifdef GOOS_windows
	MOVL	$0, BX
	CMPQ	CX, $0
	JEQ	2(PC)
#endif
	MOVQ	g(CX), BX
	CMPQ	BX, $0
	JEQ	needm
	MOVQ	g_m(BX), BX
	MOVQ	BX, R8	// holds oldm until end of function
	JMP	havem
needm:
	MOVQ	$0, 0(SP)
	MOVQ	$runtime·needm(SB), AX
	CALL	AX
	MOVQ	0(SP), R8
	get_tls(CX)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX

	// Set m->sched.sp = SP, so that if a panic happens
	// during the function we are about to execute, it will
	// have a valid SP to run on the g0 stack.
	// The next few lines (after the havem label)
	// will save this SP onto the stack and then write
	// the same SP back to m->sched.sp. That seems redundant,
	// but if an unrecovered panic happens, unwindm will
	// restore the g->sched.sp from the stack location
	// and then systemstack will try to use it. If we don't set it here,
	// that restored SP will be uninitialized (typically 0) and
	// will not be usable.
	MOVQ	m_g0(BX), SI
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

havem:
	// Now there's a valid m, and we're running on its m->g0.
	// Save current m->g0->sched.sp on stack and then set it to SP.
	// Save current sp in m->g0->sched.sp in preparation for
	// switch back to m->curg stack.
	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
	MOVQ	m_g0(BX), SI
	MOVQ	(g_sched+gobuf_sp)(SI), AX
	MOVQ	AX, 0(SP)
	MOVQ	SP, (g_sched+gobuf_sp)(SI)

	// Switch to m->curg stack and call runtime.cgocallbackg.
	// Because we are taking over the execution of m->curg
	// but *not* resuming what had been running, we need to
	// save that information (m->curg->sched) so we can restore it.
	// We can restore m->curg->sched.sp easily, because calling
	// runtime.cgocallbackg leaves SP unchanged upon return.
	// To save m->curg->sched.pc, we push it onto the stack.
	// This has the added benefit that it looks to the traceback
	// routine like cgocallbackg is going to return to that
	// PC (because the frame we allocate below has the same
	// size as cgocallback_gofunc's frame declared above)
	// so that the traceback will seamlessly trace back into
	// the earlier calls.
	//
	// In the new goroutine, 0(SP) holds the saved R8.
	MOVQ	m_curg(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), DI	// prepare stack as DI
	MOVQ	(g_sched+gobuf_pc)(SI), BX
	MOVQ	BX, -8(DI)
	// Compute the size of the frame, including return PC and, if
	// GOEXPERIMENT=framepointer, the saved base pointer
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX
	SUBQ	AX, DI
	MOVQ	DI, SP

	MOVQ	R8, 0(SP)
	CALL	runtime·cgocallbackg(SB)
	MOVQ	0(SP), R8

	// Compute the size of the frame again. FP and SP have
	// completely different values here than they did above,
	// but only their difference matters.
	LEAQ	fv+0(FP), AX
	SUBQ	SP, AX

	// Restore g->sched (== m->curg->sched) from saved values.
	get_tls(CX)
	MOVQ	g(CX), SI
	MOVQ	SP, DI
	ADDQ	AX, DI
	MOVQ	-8(DI), BX
	MOVQ	BX, (g_sched+gobuf_pc)(SI)
	MOVQ	DI, (g_sched+gobuf_sp)(SI)

	// Switch back to m->g0's stack and restore m->g0->sched.sp.
	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
	// so we do not have to restore it.)
	MOVQ	g(CX), BX
	MOVQ	g_m(BX), BX
	MOVQ	m_g0(BX), SI
	MOVQ	SI, g(CX)
	MOVQ	(g_sched+gobuf_sp)(SI), SP
	MOVQ	0(SP), AX
	MOVQ	AX, (g_sched+gobuf_sp)(SI)

	// If the m on entry was nil, we called needm above to borrow an m
	// for the duration of the call. Since the call is over, return it with dropm.
	CMPQ	R8, $0
	JNE	3(PC)
	MOVQ	$runtime·dropm(SB), AX
	CALL	AX

	// Done!
	RET

// void setg(G*); set g. for use by needm.
TEXT runtime·setg(SB), NOSPLIT, $0-8
	MOVQ	gg+0(FP), BX
#ifdef GOOS_windows
	CMPQ	BX, $0
	JNE	settls
	MOVQ	$0, 0x28(GS)
	RET
settls:
	MOVQ	g_m(BX), AX
	LEAQ	m_tls(AX), AX
	MOVQ	AX, 0x28(GS)
#endif
	get_tls(CX)
	MOVQ	BX, g(CX)
	RET

// void setg_gcc(G*); set g called from gcc.
TEXT setg_gcc<>(SB),NOSPLIT,$0
	get_tls(AX)
	MOVQ	DI, g(AX)
	RET

// check that SP is in range [g->stack.lo, g->stack.hi)
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVQ	g(CX), AX
	CMPQ	(g_stack+stack_hi)(AX), SP
	JHI	2(PC)
	INT	$3
	CMPQ	SP, (g_stack+stack_lo)(AX)
	JHI	2(PC)
	INT	$3
	RET

TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX	// addr of first arg
	MOVQ	-8(AX),AX	// get calling pc
	CMPQ	AX, runtime·stackBarrierPC(SB)
	JNE	nobar
	// Get original return PC.
	CALL	runtime·nextBarrierPC(SB)
	MOVQ	0(SP), AX
nobar:
	MOVQ	AX, ret+8(FP)
	RET

TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
	MOVQ	argp+0(FP),AX	// addr of first arg
	MOVQ	pc+8(FP), BX
	MOVQ	-8(AX), CX
	CMPQ	CX, runtime·stackBarrierPC(SB)
	JEQ	setbar
	MOVQ	BX, -8(AX)	// set calling pc
	RET
setbar:
	// Set the stack barrier return PC.
	MOVQ	BX, 0(SP)
	CALL	runtime·setNextBarrierPC(SB)
	RET

TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
	MOVQ	argp+0(FP), AX
	MOVQ	AX, ret+8(FP)
	RET

// func cputicks() int64
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
	JNE	mfence
	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8	// LFENCE
	JMP	done
mfence:
	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0	// MFENCE
done:
	RDTSC
	SHLQ	$32, DX
	ADDQ	DX, AX
	MOVQ	AX, ret+0(FP)
	RET
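
// cputicks (above) combines the two RDTSC result halves, DX:AX, into a single
// 64-bit count: ticks = DX<<32 + AX. For example DX=0x1, AX=0x2 yields
// 0x0000000100000002 (worked example, for illustration only).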

// memhash_varlen(p unsafe.Pointer, h seed) uintptr
// redirects to memhash(p, h, size) using the size
// stored in the closure.
TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24
	GO_ARGS
	NO_LOCAL_POINTERS
	MOVQ	p+0(FP), AX
	MOVQ	h+8(FP), BX
	MOVQ	8(DX), CX
	MOVQ	AX, 0(SP)
	MOVQ	BX, 8(SP)
	MOVQ	CX, 16(SP)
	CALL	runtime·memhash(SB)
	MOVQ	24(SP), AX
	MOVQ	AX, ret+16(FP)
	RET

// hash function using AES hardware instructions
TEXT runtime·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	runtime·aeshashbody(SB)

TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	runtime·aeshashbody(SB)

// AX: data
// CX: length
// DX: address to put return value
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
	MOVQ	h+8(FP), X6	// seed to low 64 bits of xmm6
	PINSRQ	$1, CX, X6	// size to high 64 bits of xmm6
	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
	MOVO	runtime·aeskeysched(SB), X7
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X0
	ADDQ	CX, CX
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X0

	// scramble 3 times
	AESENC	X6, X0
	AESENC	X7, X0
	AESENC	X7, X0
	MOVQ	X0, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X0
	ADDQ	CX, CX
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X0
	AESENC	X6, X0
	AESENC	X7, X0
	AESENC	X7, X0
	MOVQ	X0, (DX)
	RET

aes0:
	// return input seed
	MOVQ	h+8(FP), AX
	MOVQ	AX, (DX)
	RET

aes16:
	MOVOU	(AX), X0
	AESENC	X6, X0
	AESENC	X7, X0
	AESENC	X7, X0
	MOVQ	X0, (DX)
	RET

aes17to32:
	// load data to be hashed
	MOVOU	(AX), X0
	MOVOU	-16(AX)(CX*1), X1

	// scramble 3 times
	AESENC	X6, X0
	AESENC	runtime·aeskeysched+16(SB), X1
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X0
	AESENC	X7, X1

	// combine results
	PXOR	X1, X0
	MOVQ	X0, (DX)
	RET

aes33to64:
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	-32(AX)(CX*1), X2
	MOVOU	-16(AX)(CX*1), X3

	AESENC	X6, X0
	AESENC	runtime·aeskeysched+16(SB), X1
	AESENC	runtime·aeskeysched+32(SB), X2
	AESENC	runtime·aeskeysched+48(SB), X3
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3

	PXOR	X2, X0
	PXOR	X3, X1
	PXOR	X1, X0
	MOVQ	X0, (DX)
	RET

aes65to128:
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	MOVOU	-64(AX)(CX*1), X4
	MOVOU	-48(AX)(CX*1), X5
	MOVOU	-32(AX)(CX*1), X8
	MOVOU	-16(AX)(CX*1), X9

	AESENC	X6, X0
	AESENC	runtime·aeskeysched+16(SB), X1
	AESENC	runtime·aeskeysched+32(SB), X2
	AESENC	runtime·aeskeysched+48(SB), X3
	AESENC	runtime·aeskeysched+64(SB), X4
	AESENC	runtime·aeskeysched+80(SB), X5
	AESENC	runtime·aeskeysched+96(SB), X8
	AESENC	runtime·aeskeysched+112(SB), X9
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X4
	AESENC	X7, X5
	AESENC	X7, X8
	AESENC	X7, X9
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X4
	AESENC	X7, X5
	AESENC	X7, X8
	AESENC	X7, X9

	PXOR	X4, X0
	PXOR	X5, X1
	PXOR	X8, X2
	PXOR	X9, X3
	PXOR	X2, X0
	PXOR	X3, X1
	PXOR	X1, X0
	MOVQ	X0, (DX)
	RET

aes129plus:
	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X0
	MOVOU	-112(AX)(CX*1), X1
	MOVOU	-96(AX)(CX*1), X2
	MOVOU	-80(AX)(CX*1), X3
	MOVOU	-64(AX)(CX*1), X4
	MOVOU	-48(AX)(CX*1), X5
	MOVOU	-32(AX)(CX*1), X8
	MOVOU	-16(AX)(CX*1), X9

	// scramble state once
	AESENC	X6, X0
	AESENC	runtime·aeskeysched+16(SB), X1
	AESENC	runtime·aeskeysched+32(SB), X2
	AESENC	runtime·aeskeysched+48(SB), X3
	AESENC	runtime·aeskeysched+64(SB), X4
	AESENC	runtime·aeskeysched+80(SB), X5
	AESENC	runtime·aeskeysched+96(SB), X8
	AESENC	runtime·aeskeysched+112(SB), X9

	// compute number of remaining 128-byte blocks
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state, xor in a block
	MOVOU	(AX), X10
	MOVOU	16(AX), X11
	MOVOU	32(AX), X12
	MOVOU	48(AX), X13
	AESENC	X10, X0
	AESENC	X11, X1
	AESENC	X12, X2
	AESENC	X13, X3
	MOVOU	64(AX), X10
	MOVOU	80(AX), X11
	MOVOU	96(AX), X12
	MOVOU	112(AX), X13
	AESENC	X10, X4
	AESENC	X11, X5
	AESENC	X12, X8
	AESENC	X13, X9

	// scramble state
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X4
	AESENC	X7, X5
	AESENC	X7, X8
	AESENC	X7, X9

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 2 more scrambles to finish
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X4
	AESENC	X7, X5
	AESENC	X7, X8
	AESENC	X7, X9
	AESENC	X7, X0
	AESENC	X7, X1
	AESENC	X7, X2
	AESENC	X7, X3
	AESENC	X7, X4
	AESENC	X7, X5
	AESENC	X7, X8
	AESENC	X7, X9

	PXOR	X4, X0
	PXOR	X5, X1
	PXOR	X8, X2
	PXOR	X9, X3
	PXOR	X2, X0
	PXOR	X3, X1
	PXOR	X1, X0
	MOVQ	X0, (DX)
	RET

TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	runtime·aeskeysched+0(SB), X0
	AESENC	runtime·aeskeysched+16(SB), X0
	AESENC	runtime·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of data in the high part of the register.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256
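
// Worked example for the masks table above (illustration): for a 3-byte key
// the aes0to15 path doubles CX to 6 and applies PAND with the 16 bytes at
// masks<>+0x30, i.e. $0x0000000000ffffff / $0x0000000000000000, which keeps
// only the low 3 data bytes before scrambling.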

// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256

TEXT runtime·memeq(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX
	JMP	runtime·memeqbody(SB)

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET
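
// The size-specialized helpers above (memequal_varlen, and memhash_varlen
// earlier) rely on the closure layout the compiler generates: DX is the
// context register and points at a closure whose second word is the element
// size. Conceptually (illustrative sketch only, not a real declaration):
//
//	struct {
//		fn   uintptr	// code pointer
//		size uintptr	// element size, read above via MOVQ 8(DX), BX
//	}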

// eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed
// to eqstring have equal length.
// See runtime_test.go:eqstring_generic for
// equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33
	MOVQ	s1str+0(FP), SI
	MOVQ	s2str+16(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	s1len+8(FP), BX
	LEAQ	v+32(FP), AX
	JMP	runtime·memeqbody(SB)
eq:
	MOVB	$1, v+32(FP)
	RET

// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
leftover:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPQ	BX, $0
	JEQ	equal

	LEAQ	0(BX*8), CX
	NEGQ	CX

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)
	RET

TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	MOVQ	s1_base+0(FP), SI
	MOVQ	s1_len+8(FP), BX
	MOVQ	s2_base+16(FP), DI
	MOVQ	s2_len+24(FP), DX
	LEAQ	ret+32(FP), R9
	JMP	runtime·cmpbody(SB)

TEXT bytes·Compare(SB),NOSPLIT,$0-56
	MOVQ	s1+0(FP), SI
	MOVQ	s1+8(FP), BX
	MOVQ	s2+24(FP), DI
	MOVQ	s2+32(FP), DX
	LEAQ	res+48(FP), R9
	JMP	runtime·cmpbody(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of SI
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b into high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	runtime·indexbytebody(SB)

TEXT strings·IndexByte(SB),NOSPLIT,$0-32
	MOVQ	s+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	runtime·indexbytebody(SB)

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	MOVQ	SI, DI

	CMPQ	BX, $16
	JLT	small

	// round up to first 16-byte boundary
	TESTQ	$15, SI
	JZ	aligned
	MOVQ	SI, CX
	ANDQ	$~15, CX
	ADDQ	$16, CX

	// search the beginning
	SUBQ	SI, CX
	REPN; SCASB
	JZ	success

// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
	// round down to last 16-byte boundary
	MOVQ	BX, R11
	ADDQ	SI, R11
	ANDQ	$~15, R11

	// shuffle X0 around so that each byte contains c
	MOVD	AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL	$0, X0, X0
	JMP	condition

sse:
	// move the next 16-byte chunk of the buffer into X1
	MOVO	(DI), X1
	// compare bytes in X0 to X1
	PCMPEQB	X0, X1
	// take the top bit of each byte in X1 and put the result in DX
	PMOVMSKB X1, DX
	TESTL	DX, DX
	JNZ	ssesuccess
	ADDQ	$16, DI

condition:
	CMPQ	DI, R11
	JLT	sse

	// search the end
	MOVQ	SI, CX
	ADDQ	BX, CX
	SUBQ	R11, CX
	// if CX == 0, the zero flag will be set and we'll end up
	// returning a false success
	JZ	failure
	REPN; SCASB
	JZ	success

failure:
	MOVQ	$-1, (R8)
	RET

// handle for lengths < 16
small:
	MOVQ	BX, CX
	REPN; SCASB
	JZ	success
	MOVQ	$-1, (R8)
	RET

// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess:
	// get the index of the least significant set bit
	BSFW	DX, DX
	SUBQ	SI, DI
	ADDQ	DI, DX
	MOVQ	DX, (R8)
	RET

success:
	SUBQ	SI, DI
	SUBL	$1, DI
	MOVQ	DI, (R8)
	RET

TEXT bytes·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	eqret
	MOVQ	a+0(FP), SI
	MOVQ	b+24(FP), DI
	LEAQ	ret+48(FP), AX
	JMP	runtime·memeqbody(SB)
eqret:
	MOVB	$0, ret+48(FP)
	RET

TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVL	m_fastrand(AX), DX
	ADDL	DX, DX
	MOVL	DX, BX
	XORL	$0x88888eef, DX
	CMOVLMI	BX, DX
	MOVL	DX, m_fastrand(AX)
	MOVL	DX, ret+0(FP)
	RET

TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET


// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
// Must obey the gcc calling convention.
TEXT _cgo_topofstack(SB),NOSPLIT,$0
	get_tls(CX)
	MOVQ	g(CX), AX
	MOVQ	g_m(AX), AX
	MOVQ	m_curg(AX), AX
	MOVQ	(g_stack+stack_hi)(AX), AX
	RET

// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// traceback from goexit1 must hit code range of goexit
	BYTE	$0x90	// NOP

TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET

TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET

TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET

TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
	MOVQ	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET

// This is called from .init_array and follows the platform, not Go, ABI.
TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
	PUSHQ	R15	// The access to global variables below implicitly uses R15, which is callee-save
	MOVQ	runtime·lastmoduledatap(SB), AX
	MOVQ	DI, moduledata_next(AX)
	MOVQ	DI, runtime·lastmoduledatap(SB)
	POPQ	R15
	RET