// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
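	// Roughly, the dispatch below corresponds to this Go-level sketch
	// (illustrative only, not compiled; the names mirror labels in this file):
	//
	//	switch {
	//	case n <= 256:
	//		// straight-line moves, safe even when the regions overlap
	//	case useAVXmemmove:
	//		// AVX path: save the unaligned head/tail, copy an aligned body
	//	case from <= to && to < from+n:
	//		// overlapping with the destination above the source: copy backward
	//	default:
	//		// forward copy: move_256through2048 up to 2K, else REP MOVSQ/MOVSB
	//	}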
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back.  move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
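	// For example, with n=16 and to = from+4 the regions overlap, but
	// move_9through16 loads both quadwords of the source into AX/CX before
	// storing either of them, so the destination still receives the original
	// source bytes.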
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JBE	move_3or4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
	// TODO: use branch table and BSR to make this just a single dispatch

	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
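	// (ORing the two addresses collects their low bits: if any of the low
	// three bits of either SI or DI is set, they are not both 8-byte aligned.)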
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD
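	// STD sets the direction flag, so REP MOVSQ below walks SI/DI downward;
	// the CLD afterwards restores the default forward direction.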

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_3or4:
	CMPQ	BX, $4
	JB	move_3
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
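	// (A torn 8-byte write could otherwise let a concurrent observer, such as
	// the garbage collector scanning memory, see a half-updated pointer.)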
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first one is for non-overlapping memory regions; it copies forward.
	// The second one is for overlapping regions; it copies backward.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward
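	// The unsigned subtraction above wraps around when DI < SI, producing a
	// huge value, so JC (unsigned CX < BX) fires only when DI lands inside
	// [SI, SI+BX), which is exactly the case where a forward copy would
	// overwrite source bytes that have not been read yet.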

	// Non-temporal copy would be better for big sizes.
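	// (0x100000 is 1 MiB; above that, the streaming stores used below avoid
	// displacing cache contents that are likely still useful.)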
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of the body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// Keeping the processor pipeline busy matters for small sizes, because
	// the cost of copying the unaligned head and tail is comparable with the
	// cost of the main loop, so the two are interleaved below.
	// A cleaner implementation of the same algorithm, for bigger sizes where
	// the cost of the unaligned parts is negligible, follows the
	// gobble_big_data_fwd label.
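	// Worked example (hypothetical addresses): if DI = 0x100f, then
	// (DI & -32) + 32 = 0x1020, so R11 = 0x11; the 17 destination bytes below
	// 0x1020 are covered by the unaligned 32-byte store of Y4, while the
	// aligned body starts at 0x1020.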
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly. We will use negative offsets there.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value now that the unaligned parts are prepared for copying.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in its place after the main body is copied.
	// It's time for the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust source address to point past head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned memory copying starts here.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// This is forward copying for big regions. It uses non-temporal mov
	// instructions. Details of the algorithm are commented above for the
	// small-size version.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// The prefetch distances were chosen empirically, following the approach
	// to prefetch usage described in 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
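	// (The prefetches above reach 0x1C0 = 448 and 0x280 = 640 bytes ahead of
	// the current source position.)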
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA		gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules, so we
	// need an SFENCE here to make the copied data visible in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying is much the same as the forward case.
	// First we load the unaligned tail, at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
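	// (R11 holds the low five bits of DI, so the XOR above rounds DI down to
	// a 32-byte boundary, the aligned end of the body for the backward loop.)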
	// Let's point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA		gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA		gobble_mem_bwd_loop
	// Let's store the unaligned data.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
    523