/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4

#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)
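
/* Stack layout sketch: with the cdecl convention the arguments sit just
	above the return address, so on entry dst is at 4(%esp), src at 8(%esp)
	and len at 12(%esp).  ENTRANCE pushes %ebx, moving them 4 bytes further
	from %esp, which is why PARMS is 8 and DEST/SRC/LEN resolve to
	8/12/16(%esp).  The CFI_PUSH trailing RETURN emits no code; it only
	restores the unwind state for code that follows a mid-function return.  */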

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

/* Check whether we should copy backward or forward.  */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
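
/* If the destination starts above the source, the buffers may overlap with
	dst inside src, so the copy has to run backward (highest addresses
	first); otherwise copying forward is safe.  For example, a hypothetical
	memmove(p + 1, p, 100) copied forward would clobber source bytes before
	reading them.  Equal pointers need no copying at all.  */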

/* Now do checks for lengths. We do [0..16], [17..32], [33..64], [65..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [17..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)
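
/* This block (and the larger fixed-size blocks below) loads both the head
	and the tail of the region before storing anything: the two 16-byte
	chunks may overlap each other when the length is not a multiple of 16,
	and because every load precedes every store the copy is also safe when
	source and destination overlap.  For a hypothetical length of 20, bytes
	0..15 and 4..19 are loaded, then written back, covering all 20 bytes.  */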

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [33..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [65..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination to a 64-byte boundary.  The first 64 source
	bytes are loaded up front to cover the unaligned head.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi
	subl	%edx, %eax
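
/* %edi is now the first 64-byte-aligned address strictly above the start of
	the destination, and %eax holds the source-minus-destination delta, so
	(%eax, %edi) addresses the source byte matching destination byte %edi.
	For a hypothetical dst of 0x100313, (dst + 64) & -64 gives 0x100340,
	the next cache-line boundary.  */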

	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)
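
/* When the copy is at least half the shared cache size, the loop below uses
	non-temporal stores (movntdq), presumably to avoid displacing useful
	cache lines with data that will not be reused soon; smaller copies take
	the ordinary cached loop.  */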

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax, %edi)
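/* Prefetch the source 128 bytes (two iterations) ahead of the chunk being
	copied; the exact distance looks like a tuning choice for this routine.  */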

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* Everything up to %edi in the destination has been copied.
	%ecx now holds how many bytes are left to copy.
	Set %esi to the matching source position. */
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

	CFI_POP (%edi)
	CFI_POP (%esi)
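
/* The two CFI_POPs above emit no instructions.  CFI directives are processed
	in textual order, so they rewind the unwind info left over from the
	forward path (which had pushed %esi and %edi) to match the code below,
	which runs with only %ebx saved.  */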

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
	the main loop stops.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [17..32], [33..64], [65..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy [17..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy [33..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy [65..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the end of the destination to a 64-byte boundary.  The last 64
	bytes of the source are loaded first so that they are not
	overwritten before they have been read.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

	movl	%eax, %esi
	subl	%edx, %esi
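
/* %edi now holds the 64-byte-aligned address at or below the end of the
	destination, and %esi the source-minus-destination delta, so offsets of
	the form -16(%edi, %esi) address the source bytes matching the
	destination bytes just below %edi.  */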

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx
	andl	$-64, %ebx

	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)
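
/* As in the forward path, copies of at least half the shared cache size take
	the non-temporal (movntdq) loop instead of the cached loop below.  */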

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)
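
/* mm_len_9_16_bytes_backward above copies the last 8 bytes, shrinks the
	length by 8 and re-dispatches so that the [0..16] handler finishes the
	remaining 1..8 bytes.  */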

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

L(mm_return):
	movl	%edx, %eax
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Big length copy forward part.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)
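
/* The movntdq stores above (and in the backward loop below) are weakly
	ordered, which is why each non-temporal loop is followed by an sfence
	before normal execution continues.  */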

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)