/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"
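/* cache.h is expected to provide SHARED_CACHE_SIZE_HALF, used below as
	the threshold for switching to the non-temporal copy loops.  */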

#ifndef MEMMOVE
# define MEMMOVE		memmove
#endif
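/* MEMMOVE may be predefined by the build to assemble this implementation
	under a different symbol name.  */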

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif
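/* The cfi_* wrappers above and the ENTRY/END macros below are fallbacks:
	when the including environment does not supply its own (glibc-style)
	definitions, plain DWARF .cfi_* directives and .type/.size are used.  */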

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (4);		\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-4);		\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;
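/* %rbx is callee-saved in the SysV AMD64 ABI and is used below as a scratch
	and loop-bound register, so ENTRANCE pushes it on entry and every RETURN
	pops it before ret.  */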

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
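/* SysV AMD64 calling convention: %rdi = dst, %rsi = src, %rdx = n.
	memmove returns dst, so keep a copy in %rax right away.  */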
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
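/* dst > src: copy backward so that, if the regions overlap, source bytes
	are not overwritten before they have been read.  */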

/* Dispatch on length. The [0..16], [17..32], [33..64] and [65..128]
	cases are handled separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address. First save the leading (unaligned)
	64 bytes of the block.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8  /* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi /* rsi = src - dst = diff */
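/* From here on, source addresses are formed as destination address + diff,
	so only %r8 has to be advanced in the copy loops.  */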

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
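/* %rbx = 64-byte aligned end of the destination; the main loop copies
	64 bytes per iteration while %r8 is below %rbx.  */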
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_forward)
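/* Copies of at least SHARED_CACHE_SIZE_HALF bytes take the non-temporal
	store loop at the bottom of the file, which bypasses the cache instead
	of displacing its contents.  */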

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to the %r8 position in the dst has been copied.
	%rdx now holds how many bytes are left to copy.
	Compute the matching source position into %r9.  */
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
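/* 1 or 2 bytes remain: copy the last and the first byte; for a single
	byte both refer to the same location, so the two stores coincide.  */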
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov 	%rbx, %rdx
	sub 	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Dispatch on length. The [0..16], [17..32], [33..64] and [65..128]
	cases are handled separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address. The last 64 bytes of the source are
	saved first so that later overlapping stores cannot clobber them.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9 /* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8 /* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx
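/* %rbx = 64-byte aligned lower bound near the start of dst; the backward
	loop copies 64 bytes per iteration while %r9 stays above %rbx.  */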

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_backward)
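/* As in the forward path, very large copies take the non-temporal loop
	at the bottom of the file.  */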

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)
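/* Falls through into mm_return.  */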

L(mm_return):
	RETURN

/* Big length copy forward part.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea 	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
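/* sfence makes the weakly ordered movntdq stores complete before the
	remaining tail is copied with ordinary stores.  */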
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea 	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)