/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif
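
/* bcopy(3) takes (src, dst, len) while memmove(3) takes (dst, src, len),
	so the first two argument offsets are swapped when this file is
	built as bcopy.  */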

#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)
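
/* On entry the return address occupies 4 bytes at (%esp) and ENTRANCE
	pushes %ebx, so the first argument is found at 8(%esp); hence
	PARMS = 8.  RETURN_END restores %ebx and returns; RETURN also
	re-asserts the CFI state for the saved %ebx so code that falls
	past an embedded return point still unwinds correctly.  */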

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

/* Check whether we should copy backward or forward.  */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
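
/* Rough C equivalent of the dispatch above (illustrative sketch only,
	not part of the build):

	     if (dst == src) return dst;
	     if (dst > src)  copy backward (descending addresses);
	     else            copy forward (ascending addresses);

	Copying in the direction that moves away from the overlapped region
	keeps source bytes from being clobbered before they are read.  */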

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Aligning the address of destination.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi
	subl	%edx, %eax
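
/* %edi now points at the first 64-byte boundary past dst and %eax
	holds the constant offset (src - dst), so (%eax, %edi) addresses
	the source byte matching the destination byte at (%edi) for the
	rest of the forward copy.  */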

	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)
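
/* %ebx is the last 64-byte boundary inside the destination; the main
	loop copies aligned 64-byte blocks until %edi reaches it, and the
	remainder code below finishes the tail.  */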

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)
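
/* For copies of at least half the shared cache size, use the
	non-temporal store loop below so a very large copy does not
	displace the entire cache.  */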

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax, %edi)

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* Everything up to the %edi position in the destination has been copied.
	%ecx now holds how many bytes are left to copy.
	Advance %esi to the matching source position.  */
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
	the main loop stops.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Aligning the address of destination. We need to save
	the last 64 bytes of the source so they are not overwritten.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

	movl	%eax, %esi
	subl	%edx, %esi
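
/* %edi is the end of the destination rounded down to a 64-byte
	boundary and %esi holds (src - dst), so (%edi, %esi) addresses the
	source byte matching the destination byte at (%edi).  */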

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx
	andl	$-64, %ebx
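
/* %ebx is the first 64-byte boundary past the start of the
	destination; the backward loop stops once %edi comes down to it,
	and the remaining head is finished via L(mm_recalc_len).  */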

	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
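/* Copy the last 8 bytes, shrink the length by 8 and re-dispatch the
	remaining head through the 0..16 byte path above.  */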
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

L(mm_return):
	movl	%edx, %eax
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Big length copy forward part.  */
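/* movntdq stores bypass the cache; the sfence at each loop exit makes
	the non-temporal stores globally visible before the stores that
	follow.  */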

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)