/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE		memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (4);		\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-4);		\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

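/* Overview (annotation): memmove(dst=%rdi, src=%rsi, len=%rdx) must handle
   overlapping buffers, so it copies forward when dst < src and backward when
   dst > src.  Lengths up to 128 bytes are handled with size-bucketed
   head-and-tail copies; larger copies align the destination to 64 bytes and
   move 64 bytes per iteration, switching to non-temporal stores once the
   length reaches half the shared cache size.  A rough C sketch of the
   top-level dispatch, for orientation only (the helper names are
   hypothetical, not labels in this file):

	void *memmove_sketch(void *dst, const void *src, size_t n)
	{
		if (dst == src)
			return dst;
		if ((uintptr_t)dst < (uintptr_t)src)
			copy_forward(dst, src, n);
		else
			copy_backward(dst, src, n);
		return dst;
	}

   copy_forward walks up through ascending addresses and copy_backward walks
   down, so the overlapping region is always read before it is written.  */
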
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
#ifdef USE_AS_BCOPY
	xchg	%rsi, %rdi
#endif
	mov	%rdi, %rax
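/* bcopy(src, dst, n) takes its arguments in the opposite order, hence the
   xchg above.  memmove must return the original destination pointer, so it
   is kept in %rax until the final return.  */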

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths.  We do [0..16], [17..32], [33..64] and [65..128]
	bytes separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)
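/* Note that the first and the last 16 bytes are loaded before either store,
   so the two unaligned accesses may overlap in the middle without corrupting
   data.  The [33..64] and [65..128] cases below use the same head-and-tail
   pattern with more registers.  */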

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address to a 64-byte boundary.  First save the
	leading 64 (possibly unaligned) bytes of the source.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8  /* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi /* rsi = src - dst = diff */

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)
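/* %rbx is the end of the destination rounded down to a 64-byte boundary; the
   main loops copy aligned 64-byte blocks while %r8 stays below it, and the
   remaining tail is handled at L(mm_copy_remaining_forward).  */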

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_forward)
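/* For lengths of at least half the shared cache size (SHARED_CACHE_SIZE_HALF
   is provided via cache.h) the data cannot usefully stay cached, so the copy
   is done with the non-temporal loop further below instead of the cached
   main loop here.  */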

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to the %r8 position in dst has been copied; %rdx now holds
	the number of bytes still left to copy.  Compute in %r9 the source
	position that corresponds to %r8.  */
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

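/* Lengths of at most 16 bytes are dispatched below on bits of the length:
   testb $24 is non-zero for 8..16 bytes (bit 3 or 4 set), testb $4 for 4..7
   bytes, and testb $2 for 2..3 bytes.  Each case stores the first and the
   last chunk of the range, so the two accesses may overlap and still cover
   every byte.  */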
L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov 	%rbx, %rdx
	sub 	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

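/* The backward path mirrors the forward one: small lengths use head-and-tail
   copies, larger ones align the end of the destination to 64 bytes and copy
   64-byte blocks toward lower addresses, so overlapping source bytes are
   never overwritten before they have been read.  */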
/* Now do checks for lengths.  We do [0..16], [17..32], [33..64] and [65..128]
	bytes separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address to a 64-byte boundary.  We first save the
	trailing 64 bytes of the source in order not to overwrite them.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9 /* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8 /* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)
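/* %rbx is the first 64-byte boundary strictly above the start of the
   destination; the backward loops copy aligned 64-byte blocks while %r9
   stays above it, and L(mm_recalc_len) then recomputes %rdx as the distance
   from the start of the destination up to %rbx.  */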

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

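/* This case copies the top eight bytes of the range, then shrinks the length
   by eight and re-enters the 0..16 byte dispatch above to finish the rest.  */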
L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN

/* Big length copy forward part.  */

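/* The large-copy loops below use movntdq non-temporal stores, which bypass
   the cache so that a huge copy does not evict useful data; the sfence after
   each loop orders these weakly-ordered stores before the ordinary stores
   and the return that follow.  */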
	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea 	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea 	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)