// memclr_amd64.s: memory clearing for amd64 (Go runtime).
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !plan9

#include "textflag.h"

// NOTE: Windows externalthreadhandler expects memclr to preserve DX.
     11 // void runtimememclrNoHeapPointers(void*, uintptr)
     12 TEXT runtimememclrNoHeapPointers(SB), NOSPLIT, $0-16
     13 	MOVQ	ptr+0(FP), DI
     14 	MOVQ	n+8(FP), BX
     15 	XORQ	AX, AX
     16 
     17 	// MOVOU seems always faster than REP STOSQ.
     18 tail:
     19 	TESTQ	BX, BX
     20 	JEQ	_0
     21 	CMPQ	BX, $2
     22 	JBE	_1or2
     23 	CMPQ	BX, $4
     24 	JBE	_3or4
     25 	CMPQ	BX, $8
     26 	JB	_5through7
     27 	JE	_8
     28 	CMPQ	BX, $16
     29 	JBE	_9through16
     30 	PXOR	X0, X0
     31 	CMPQ	BX, $32
     32 	JBE	_17through32
     33 	CMPQ	BX, $64
     34 	JBE	_33through64
     35 	CMPQ	BX, $128
     36 	JBE	_65through128
     37 	CMPQ	BX, $256
     38 	JBE	_129through256
     39 	CMPB	runtimesupport_avx2(SB), $1
     40 	JE loop_preheader_avx2
     41 	// TODO: use branch table and BSR to make this just a single dispatch
     42 	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
     43 
     44 loop:
     45 	MOVOU	X0, 0(DI)
     46 	MOVOU	X0, 16(DI)
     47 	MOVOU	X0, 32(DI)
     48 	MOVOU	X0, 48(DI)
     49 	MOVOU	X0, 64(DI)
     50 	MOVOU	X0, 80(DI)
     51 	MOVOU	X0, 96(DI)
     52 	MOVOU	X0, 112(DI)
     53 	MOVOU	X0, 128(DI)
     54 	MOVOU	X0, 144(DI)
     55 	MOVOU	X0, 160(DI)
     56 	MOVOU	X0, 176(DI)
     57 	MOVOU	X0, 192(DI)
     58 	MOVOU	X0, 208(DI)
     59 	MOVOU	X0, 224(DI)
     60 	MOVOU	X0, 240(DI)
     61 	SUBQ	$256, BX
     62 	ADDQ	$256, DI
     63 	CMPQ	BX, $256
     64 	JAE	loop
     65 	JMP	tail
     66 
     67 loop_preheader_avx2:
     68 	VPXOR Y0, Y0, Y0
     69 	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
     70 	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
     71 	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
     72 	CMPQ    BX, $0x2000000
     73 	JAE     loop_preheader_avx2_huge
     74 loop_avx2:
     75 	VMOVDQU	Y0, 0(DI)
     76 	VMOVDQU	Y0, 32(DI)
     77 	VMOVDQU	Y0, 64(DI)
     78 	VMOVDQU	Y0, 96(DI)
     79 	SUBQ	$128, BX
     80 	ADDQ	$128, DI
     81 	CMPQ	BX, $128
     82 	JAE	loop_avx2
     83 	VMOVDQU  Y0, -32(DI)(BX*1)
     84 	VMOVDQU  Y0, -64(DI)(BX*1)
     85 	VMOVDQU  Y0, -96(DI)(BX*1)
     86 	VMOVDQU  Y0, -128(DI)(BX*1)
     87 	VZEROUPPER
     88 	RET
     89 loop_preheader_avx2_huge:
     90 	// Align to 32 byte boundary
     91 	VMOVDQU  Y0, 0(DI)
     92 	MOVQ	DI, SI
     93 	ADDQ	$32, DI
     94 	ANDQ	$~31, DI
     95 	SUBQ	DI, SI
     96 	ADDQ	SI, BX
     97 loop_avx2_huge:
     98 	VMOVNTDQ	Y0, 0(DI)
     99 	VMOVNTDQ	Y0, 32(DI)
    100 	VMOVNTDQ	Y0, 64(DI)
    101 	VMOVNTDQ	Y0, 96(DI)
    102 	SUBQ	$128, BX
    103 	ADDQ	$128, DI
    104 	CMPQ	BX, $128
    105 	JAE	loop_avx2_huge
    106 	// In the description of MOVNTDQ in [1]
    107 	// "... fencing operation implemented with the SFENCE or MFENCE instruction
    108 	// should be used in conjunction with MOVNTDQ instructions..."
    109 	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
    110 	SFENCE
    111 	VMOVDQU  Y0, -32(DI)(BX*1)
    112 	VMOVDQU  Y0, -64(DI)(BX*1)
    113 	VMOVDQU  Y0, -96(DI)(BX*1)
    114 	VMOVDQU  Y0, -128(DI)(BX*1)
    115 	VZEROUPPER
    116 	RET
    117 
    118 _1or2:
    119 	MOVB	AX, (DI)
    120 	MOVB	AX, -1(DI)(BX*1)
    121 	RET
    122 _0:
    123 	RET
    124 _3or4:
    125 	MOVW	AX, (DI)
    126 	MOVW	AX, -2(DI)(BX*1)
    127 	RET
    128 _5through7:
    129 	MOVL	AX, (DI)
    130 	MOVL	AX, -4(DI)(BX*1)
    131 	RET
    132 _8:
    133 	// We need a separate case for 8 to make sure we clear pointers atomically.
    134 	MOVQ	AX, (DI)
    135 	RET
    136 _9through16:
    137 	MOVQ	AX, (DI)
    138 	MOVQ	AX, -8(DI)(BX*1)
    139 	RET
    140 _17through32:
    141 	MOVOU	X0, (DI)
    142 	MOVOU	X0, -16(DI)(BX*1)
    143 	RET
    144 _33through64:
    145 	MOVOU	X0, (DI)
    146 	MOVOU	X0, 16(DI)
    147 	MOVOU	X0, -32(DI)(BX*1)
    148 	MOVOU	X0, -16(DI)(BX*1)
    149 	RET
    150 _65through128:
    151 	MOVOU	X0, (DI)
    152 	MOVOU	X0, 16(DI)
    153 	MOVOU	X0, 32(DI)
    154 	MOVOU	X0, 48(DI)
    155 	MOVOU	X0, -64(DI)(BX*1)
    156 	MOVOU	X0, -48(DI)(BX*1)
    157 	MOVOU	X0, -32(DI)(BX*1)
    158 	MOVOU	X0, -16(DI)(BX*1)
    159 	RET
    160 _129through256:
    161 	MOVOU	X0, (DI)
    162 	MOVOU	X0, 16(DI)
    163 	MOVOU	X0, 32(DI)
    164 	MOVOU	X0, 48(DI)
    165 	MOVOU	X0, 64(DI)
    166 	MOVOU	X0, 80(DI)
    167 	MOVOU	X0, 96(DI)
    168 	MOVOU	X0, 112(DI)
    169 	MOVOU	X0, -128(DI)(BX*1)
    170 	MOVOU	X0, -112(DI)(BX*1)
    171 	MOVOU	X0, -96(DI)(BX*1)
    172 	MOVOU	X0, -80(DI)(BX*1)
    173 	MOVOU	X0, -64(DI)(BX*1)
    174 	MOVOU	X0, -48(DI)(BX*1)
    175 	MOVOU	X0, -32(DI)(BX*1)
    176 	MOVOU	X0, -16(DI)(BX*1)
    177 	RET
    178