/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
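/* As in the [0..32] case above, the head and the tail of the region are
	loaded into registers before any store is issued, so the two halves
	may overlap in the middle and the copy is still correct.  */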
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Aligning the address of destination.  */
/* save first unaligned 64 bytes */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8		/* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi		/* rsi = src - dst = diff */

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to the %r8 position in the dst has been copied.
	%rdx now holds how many bytes are left to copy.  */
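/* %rsi still holds src - dst here, so %r9 = %r8 + %rsi is the source
	address that corresponds to the %r8 destination cursor.  */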
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
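/* Even on the backward (dst > src) branch these small cases reuse the
	forward-style sequence: every load is issued before the first store,
	so overlapping buffers are still copied correctly.  */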
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Aligning the address of destination. We need to save the last
	unaligned 64 bytes of the source in order not to overwrite them.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9		/* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8		/* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN

/* Big length copy forward part.  */
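/* For copies of at least SHARED_CACHE_SIZE_HALF bytes the loops below use
	non-temporal stores (movntdq) so the copied data does not pollute the
	cache; the sfence after each loop makes those stores globally visible
	before the ordinary stores that copy the remaining tail.  */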

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)