Home | History | Annotate | Download | only in runtime
      1 // Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
      2 // http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s
      3 //
      4 //         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
      5 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
      6 //         Portions Copyright 2009 The Go Authors. All rights reserved.
      7 //
      8 // Permission is hereby granted, free of charge, to any person obtaining a copy
      9 // of this software and associated documentation files (the "Software"), to deal
     10 // in the Software without restriction, including without limitation the rights
     11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     12 // copies of the Software, and to permit persons to whom the Software is
     13 // furnished to do so, subject to the following conditions:
     14 //
     15 // The above copyright notice and this permission notice shall be included in
     16 // all copies or substantial portions of the Software.
     17 //
     18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
     21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     23 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     24 // THE SOFTWARE.
     25 
     26 // +build !plan9
     27 
     28 #include "textflag.h"
     29 
     30 // void runtimememmove(void*, void*, uintptr)
        //
        // Copies n bytes from `from` to `to`.  The regions may overlap: the code
        // below chooses a copy direction and chunking that never reads a source
        // byte after it has been overwritten.
        //
        // NOTE(review): the symbol reads `runtimememmove`; it looks as if a package
        // separator (runtime·memmove) was lost when this listing was extracted --
        // confirm against the upstream file before assembling.
        //
        // Frame: $0-24, i.e. no local frame and 24 bytes of arguments:
        //   to+0(FP)   -> DI   destination pointer
        //   from+8(FP) -> SI   source pointer
        //   n+16(FP)   -> BX   byte count
        // Registers written: AX, BX, CX, SI, DI, X0-X15 and the flags.  The
        // direction flag is set only around the backward REP MOVSQ and is cleared
        // again (CLD) before any other instruction runs.
        //
        // Strategy, by size and layout:
        //   n <= 256                 straight-line moves (move_* labels) that load
        //                            everything into registers before storing.
        //   n > 256, SI <= DI < SI+n backward REP MOVSQ, then the first n%8 bytes
        //                            through tail.
        //   otherwise, n <= 2048     forward loop, 256 bytes per iteration through
        //                            X0-X15, then the remainder through tail.
        //   otherwise, n > 2048      forward REP MOVSQ, then the last n%8 bytes
        //                            through tail.
     31 TEXT runtimememmove(SB), NOSPLIT, $0-24
     32 
     33 	MOVQ	to+0(FP), DI
     34 	MOVQ	from+8(FP), SI
     35 	MOVQ	n+16(FP), BX
     36 
     37 	// REP instructions have a high startup cost, so we handle small sizes
     38 	// with some straightline code.  The REP MOVSQ instruction is really fast
     39 	// for large sizes.  The cutover is approximately 2K.
        	//
        	// tail is both the initial size dispatch and the re-entry point used by
        	// the bulk paths to copy whatever is left in BX once SI/DI have been
        	// positioned on the remaining bytes.
     40 tail:
     41 	// move_129through256 or smaller work whether or not the source and the
     42 	// destination memory regions overlap because they load all data into
     43 	// registers before writing it back.  move_256through2048 on the other
     44 	// hand can be used only when the memory regions don't overlap or the copy
     45 	// direction is forward.
     46 	TESTQ	BX, BX
     47 	JEQ	move_0
     48 	CMPQ	BX, $2
     49 	JBE	move_1or2
     50 	CMPQ	BX, $4
     51 	JBE	move_3or4
     52 	CMPQ	BX, $8
     53 	JBE	move_5through8
     54 	CMPQ	BX, $16
     55 	JBE	move_9through16
     56 	CMPQ	BX, $32
     57 	JBE	move_17through32
     58 	CMPQ	BX, $64
     59 	JBE	move_33through64
     60 	CMPQ	BX, $128
     61 	JBE	move_65through128
     62 	CMPQ	BX, $256
     63 	JBE	move_129through256
     64 	// TODO: use branch table and BSR to make this just a single dispatch
        	// Falling through means BX > 256.
     65 
     66 /*
     67  * check and set for backwards
     68  */
     69 	CMPQ	SI, DI
     70 	JLS	back	// unsigned: source at or below destination, may need a backward copy
     71 
     72 /*
     73  * forward copy loop
     74  */
        	// Reached with BX > 256, either because the source lies above the
        	// destination or because `back` found that the regions do not overlap.
     75 forward:
     76 	CMPQ	BX, $2048
     77 	JLS	move_256through2048	// unsigned BX <= 2048: use the SSE loop
     78 
        	// BX > 2048: copy BX/8 quadwords with REP MOVSQ and leave BX%8 for tail.
        	// NOTE(review): relies on the direction flag being clear here --
        	// presumably guaranteed by the calling convention; confirm.
     79 	MOVQ	BX, CX
     80 	SHRQ	$3, CX	// CX = number of whole quadwords
     81 	ANDQ	$7, BX	// BX = leftover bytes (0..7)
     82 	REP;	MOVSQ
     83 	JMP	tail	// SI/DI now point just past the copied quadwords
     84 
     85 back:
     86 /*
     87  * check overlap
     88  */
     89 	MOVQ	SI, CX
     90 	ADDQ	BX, CX	// CX = SI + n, one past the end of the source
     91 	CMPQ	CX, DI
     92 	JLS	forward	// unsigned: source ends at or before destination, no overlap
     93 
     94 /*
     95  * whole thing backwards has
     96  * adjusted addresses
     97  */
     98 	ADDQ	BX, DI	// DI = one past the end of the destination
     99 	ADDQ	BX, SI	// SI = one past the end of the source
    100 	STD	// string instructions now decrement SI/DI
    101 
    102 /*
    103  * copy
    104  */
    105 	MOVQ	BX, CX
    106 	SHRQ	$3, CX	// CX = number of whole quadwords
    107 	ANDQ	$7, BX	// BX = leftover bytes (0..7) at the start of the regions
    108 
    109 	SUBQ	$8, DI	// point at the last quadword of each region
    110 	SUBQ	$8, SI
    111 	REP;	MOVSQ
    112 
        	// After the copy SI/DI sit 8 bytes below the lowest quadword moved.
        	// Adding 8 and subtracting the leftover count brings both back to the
        	// start of their regions, so tail can copy the first BX bytes.
    113 	CLD	// restore forward direction before anything else runs
    114 	ADDQ	$8, DI
    115 	ADDQ	$8, SI
    116 	SUBQ	BX, DI
    117 	SUBQ	BX, SI
    118 	JMP	tail
    119 
        // Small-size cases.  Each one loads a chunk from the start and a chunk from
        // the end of the source (the chunks overlap when n is not the maximum for
        // the case) and only then stores them, so no source byte is read after a
        // store has been issued.
        //
        // n is 1 or 2: first byte and last byte (the same byte when n == 1).
    120 move_1or2:
    121 	MOVB	(SI), AX
    122 	MOVB	-1(SI)(BX*1), CX
    123 	MOVB	AX, (DI)
    124 	MOVB	CX, -1(DI)(BX*1)
    125 	RET
        // n is 0: nothing to copy.
    126 move_0:
    127 	RET
        // n is 3 or 4: first 2 bytes and last 2 bytes.
    128 move_3or4:
    129 	MOVW	(SI), AX
    130 	MOVW	-2(SI)(BX*1), CX
    131 	MOVW	AX, (DI)
    132 	MOVW	CX, -2(DI)(BX*1)
    133 	RET
        // n is 5..8: first 4 bytes and last 4 bytes.
    134 move_5through8:
    135 	MOVL	(SI), AX
    136 	MOVL	-4(SI)(BX*1), CX
    137 	MOVL	AX, (DI)
    138 	MOVL	CX, -4(DI)(BX*1)
    139 	RET
        // n is 9..16: first 8 bytes and last 8 bytes.
    140 move_9through16:
    141 	MOVQ	(SI), AX
    142 	MOVQ	-8(SI)(BX*1), CX
    143 	MOVQ	AX, (DI)
    144 	MOVQ	CX, -8(DI)(BX*1)
    145 	RET
        // n is 17..32: first 16 bytes and last 16 bytes, via unaligned SSE moves.
    146 move_17through32:
    147 	MOVOU	(SI), X0
    148 	MOVOU	-16(SI)(BX*1), X1
    149 	MOVOU	X0, (DI)
    150 	MOVOU	X1, -16(DI)(BX*1)
    151 	RET
        // n is 33..64: first 32 bytes and last 32 bytes.
    152 move_33through64:
    153 	MOVOU	(SI), X0
    154 	MOVOU	16(SI), X1
    155 	MOVOU	-32(SI)(BX*1), X2
    156 	MOVOU	-16(SI)(BX*1), X3
    157 	MOVOU	X0, (DI)
    158 	MOVOU	X1, 16(DI)
    159 	MOVOU	X2, -32(DI)(BX*1)
    160 	MOVOU	X3, -16(DI)(BX*1)
    161 	RET
        // n is 65..128: first 64 bytes and last 64 bytes.
    162 move_65through128:
    163 	MOVOU	(SI), X0
    164 	MOVOU	16(SI), X1
    165 	MOVOU	32(SI), X2
    166 	MOVOU	48(SI), X3
    167 	MOVOU	-64(SI)(BX*1), X4
    168 	MOVOU	-48(SI)(BX*1), X5
    169 	MOVOU	-32(SI)(BX*1), X6
    170 	MOVOU	-16(SI)(BX*1), X7
    171 	MOVOU	X0, (DI)
    172 	MOVOU	X1, 16(DI)
    173 	MOVOU	X2, 32(DI)
    174 	MOVOU	X3, 48(DI)
    175 	MOVOU	X4, -64(DI)(BX*1)
    176 	MOVOU	X5, -48(DI)(BX*1)
    177 	MOVOU	X6, -32(DI)(BX*1)
    178 	MOVOU	X7, -16(DI)(BX*1)
    179 	RET
        // n is 129..256: first 128 bytes and last 128 bytes, using all of X0-X15.
    180 move_129through256:
    181 	MOVOU	(SI), X0
    182 	MOVOU	16(SI), X1
    183 	MOVOU	32(SI), X2
    184 	MOVOU	48(SI), X3
    185 	MOVOU	64(SI), X4
    186 	MOVOU	80(SI), X5
    187 	MOVOU	96(SI), X6
    188 	MOVOU	112(SI), X7
    189 	MOVOU	-128(SI)(BX*1), X8
    190 	MOVOU	-112(SI)(BX*1), X9
    191 	MOVOU	-96(SI)(BX*1), X10
    192 	MOVOU	-80(SI)(BX*1), X11
    193 	MOVOU	-64(SI)(BX*1), X12
    194 	MOVOU	-48(SI)(BX*1), X13
    195 	MOVOU	-32(SI)(BX*1), X14
    196 	MOVOU	-16(SI)(BX*1), X15
    197 	MOVOU	X0, (DI)
    198 	MOVOU	X1, 16(DI)
    199 	MOVOU	X2, 32(DI)
    200 	MOVOU	X3, 48(DI)
    201 	MOVOU	X4, 64(DI)
    202 	MOVOU	X5, 80(DI)
    203 	MOVOU	X6, 96(DI)
    204 	MOVOU	X7, 112(DI)
    205 	MOVOU	X8, -128(DI)(BX*1)
    206 	MOVOU	X9, -112(DI)(BX*1)
    207 	MOVOU	X10, -96(DI)(BX*1)
    208 	MOVOU	X11, -80(DI)(BX*1)
    209 	MOVOU	X12, -64(DI)(BX*1)
    210 	MOVOU	X13, -48(DI)(BX*1)
    211 	MOVOU	X14, -32(DI)(BX*1)
    212 	MOVOU	X15, -16(DI)(BX*1)
    213 	RET
        // Forward loop for 256 < n <= 2048.  Each iteration loads 256 bytes from
        // (SI) into X0-X15, stores them at (DI), and advances both pointers by 256.
        // It repeats while at least 256 bytes remain; anything less is handed to
        // tail.  Entered only from `forward`, i.e. never for a copy that must run
        // backward.
    214 move_256through2048:
    215 	SUBQ	$256, BX	// BX = bytes left after this iteration
    216 	MOVOU	(SI), X0
    217 	MOVOU	16(SI), X1
    218 	MOVOU	32(SI), X2
    219 	MOVOU	48(SI), X3
    220 	MOVOU	64(SI), X4
    221 	MOVOU	80(SI), X5
    222 	MOVOU	96(SI), X6
    223 	MOVOU	112(SI), X7
    224 	MOVOU	128(SI), X8
    225 	MOVOU	144(SI), X9
    226 	MOVOU	160(SI), X10
    227 	MOVOU	176(SI), X11
    228 	MOVOU	192(SI), X12
    229 	MOVOU	208(SI), X13
    230 	MOVOU	224(SI), X14
    231 	MOVOU	240(SI), X15
    232 	MOVOU	X0, (DI)
    233 	MOVOU	X1, 16(DI)
    234 	MOVOU	X2, 32(DI)
    235 	MOVOU	X3, 48(DI)
    236 	MOVOU	X4, 64(DI)
    237 	MOVOU	X5, 80(DI)
    238 	MOVOU	X6, 96(DI)
    239 	MOVOU	X7, 112(DI)
    240 	MOVOU	X8, 128(DI)
    241 	MOVOU	X9, 144(DI)
    242 	MOVOU	X10, 160(DI)
    243 	MOVOU	X11, 176(DI)
    244 	MOVOU	X12, 192(DI)
    245 	MOVOU	X13, 208(DI)
    246 	MOVOU	X14, 224(DI)
    247 	MOVOU	X15, 240(DI)
    248 	CMPQ	BX, $256	// sets the flags consumed by JGE below
    249 	LEAQ	256(SI), SI	// LEAQ advances the pointers without touching the flags
    250 	LEAQ	256(DI), DI
    251 	JGE	move_256through2048	// at least 256 bytes remain: go round again
    252 	JMP	tail	// fewer than 256 bytes remain (possibly 0)
    253