/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif
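
/* Note: bcopy(src, dst, n) takes its pointer arguments in the opposite
	order from memmove(dst, src, n), which is why the SRC/DEST offsets are
	swapped when this file is built with USE_AS_BCOPY.  */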

#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)
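
/* Stack layout right after ENTRANCE (i386 cdecl; a sketch, offsets in bytes):

	 0(%esp)  saved %ebx
	 4(%esp)  return address
	 8(%esp)  1st argument (dst for memmove, src for bcopy)
	12(%esp)  2nd argument
	16(%esp)  byte count

	hence PARMS is 8 above.  */
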
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

/* Check whether we should copy backward or forward.  */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
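
/* Destination below source: copy forward.  Even when the regions overlap,
	a forward copy never reads a byte it has already overwritten, because
	the writes trail the reads.  destination > source takes the
	L(mm_len_0_or_more_backward) path instead.  */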

/* Now do checks for lengths.  The [0..16], [17..32], [33..64] and
	[65..128] byte ranges are handled separately.  */
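/* Each fixed-size case below loads both the leading and the trailing
	chunk of the region before storing either, so the two chunks may
	overlap; that covers every length in a range with a few unaligned
	accesses and no loop.  */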
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address to a 64-byte boundary.  The first 64
	bytes are copied with unaligned stores below, then copying continues
	from the aligned address in %edi.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi
	subl	%edx, %eax
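
/* From now on %eax holds the source-minus-destination delta, so
	(%eax, %edi) is the source byte that corresponds to the destination
	byte at %edi.  Illustrative values only: with dst = 0x1010 and
	src = 0x2000, %edi becomes 0x1040 and %eax becomes 0xff0, so
	(%eax, %edi) = 0x2030 = src + (%edi - dst).  */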

	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
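
/* %ebx is dst + len rounded down to a 64-byte boundary.  The aligned
	main loop below stores 64 bytes per iteration while %edi is below
	%ebx; the remaining tail is finished by L(mm_copy_remaining_forward).  */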
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)
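
/* Copies of at least half the shared cache size (SHARED_CACHE_SIZE_HALF,
	provided via cache.h) take the non-temporal loop instead, so a huge
	copy does not flush the cache.  The backward path applies the same
	threshold.  */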

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax, %edi)

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* Everything up to %edi in the destination has been copied; %ecx now
	holds the number of bytes left.  Set %esi to the source position
	that corresponds to %edi.  */
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

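/* These CFI pseudo-ops emit no code; they roll the unwind state back to
	"%edi and %esi not saved" so that it is correct for L(mm_recalc_len)
	below, which is only reached from paths that have already popped
	both registers.  */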
	CFI_POP (%edi)
	CFI_POP (%esi)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
	the main loop stops.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backward.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths.  The [0..16], [17..32], [33..64] and
	[65..128] byte ranges are handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address.  The last 64 bytes of the source are
	loaded up front so that the overlapping stores below cannot clobber
	them before they are read.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

	movl	%eax, %esi
	subl	%edx, %esi
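
/* %esi now holds the source-minus-destination delta, so (%edi, %esi)
	addresses the source byte that corresponds to the destination byte
	at %edi, mirroring the forward path's use of %eax.  */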

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx
	andl	$-64, %ebx
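
/* %ebx is the first 64-byte boundary strictly above dst.  The aligned
	backward loop stores 64-byte chunks while %edi stays above %ebx; the
	leading [dst, %ebx) bytes are then finished via L(mm_recalc_len).  */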

	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

L(mm_return):
	movl	%edx, %eax
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

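/* Both large-copy loops below use movntdq non-temporal stores so a very
	large copy does not evict the rest of the working set from the cache.
	Non-temporal stores are weakly ordered, so each loop ends with sfence
	before ordinary stores resume.  */
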
/* Big length copy forward part.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)