// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
// Copyright 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
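	//
	// Roughly, in Go-like pseudocode (illustrative only, not compiled;
	// n is the byte count held in BX):
	//
	//	switch {
	//	case n == 0:    goto move_0
	//	case n <= 2:    goto move_1or2
	//	case n <= 4:    goto move_3or4
	//	case n < 8:     goto move_5through7
	//	case n == 8:    goto move_8
	//	case n <= 16:   goto move_9through16
	//	case n <= 32:   goto move_17through32
	//	case n <= 64:   goto move_33through64
	//	case n <= 128:  goto move_65through128
	//	case n <= 256:  goto move_129through256
	//	default:        // fall through to the REP / AVX paths below
	//	}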
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JBE	move_3or4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
	// TODO: use branch table and BSR to make this just a single dispatch

	TESTB	$1, runtime·useRepMovs(SB)
	JZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	TESTL	$(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
	JEQ	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_3or4:
	CMPQ	BX, $4
	JB	move_3
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
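	// (On amd64, a single aligned 8-byte MOVQ store is atomic, so the garbage
	// collector can never observe a half-written pointer; splitting this into
	// two smaller moves could tear a pointer value.)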
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first, for non-overlapping memory regions, uses forward copying.
	// The second, for overlapping regions, uses backward copying.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward

	// Non-temporal copy would be better for big sizes.
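	// (0x100000 is 1MB; above that the data is assumed to be much larger than
	// the caches, so non-temporal stores that bypass the cache are likely the
	// better choice. The exact threshold is an empirical tuning value.)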
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of the body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// It can be important to satisfy the processor's pipeline requirements for
	// small sizes, as the cost of copying the unaligned parts is comparable to
	// the cost of the main loop, so the tail saves are interleaved with the
	// setup code here.
	// There is a cleaner implementation of this algorithm for bigger sizes,
	// where the cost of copying the unaligned parts is negligible.
	// You can see it after the gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly. We will use negative offsets there.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align the destination address.
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value now that the unaligned head has been accounted for.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in its place after the main body is copied.
	// Now for the unaligned head part.
	VMOVDQU	(SI), Y4
	// Adjust the source address to point past the head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned memory copying here.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions.
	// It uses non-temporal mov instructions.
	// Details of this algorithm are commented above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// Prefetch values were chosen empirically.
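	// (0x1C0 and 0x280 bytes ahead of SI, i.e. 7 and 10 cache lines of 64 bytes.)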
	// Approach for prefetch usage as in 7.6.6 of [1]
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules.
	// We need an SFENCE here to make the copied data available in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying is much the same as forward copying.
	// First we load the unaligned tail from the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Let's point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Now store the unaligned parts.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ Y0, -0x20(DI)
	VMOVNTDQ Y1, -0x40(DI)
	VMOVNTDQ Y2, -0x60(DI)
	VMOVNTDQ Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET