/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

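/*
 * SSE2 memcpy for x86-64: unaligned 16-byte loads with overlapping
 * head/tail stores for sizes up to 128 bytes, a 64-byte-aligned
 * cached store loop for in-cache sizes, and non-temporal stores for
 * copies of at least SHARED_CACHE_SIZE_HALF bytes.
 */
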
#include "cache.h"

#ifndef MEMCPY
# define MEMCPY		memcpy
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (4);		\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-4);		\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

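/*
 * SysV AMD64 arguments: %rdi = dst, %rsi = src, %rdx = len; the
 * return value (dst) is set at L(return).  %rbx is callee-saved and
 * is pushed by ENTRANCE because the 0..16-byte paths use it as
 * scratch.
 */
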
	.section .text.sse2,"ax",@progbits
ENTRY (MEMCPY)
	ENTRANCE
	cmp	%rsi, %rdi
	je	L(return)

	cmp	$16, %rdx
	jbe	L(len_0_16_bytes)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(large_page)

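/*
 * 17..128 bytes: copy the first and last 16-byte chunks, then widen
 * toward the middle.  The stores may overlap, which is harmless
 * since memcpy sources and destinations must not overlap, so no
 * exact-size dispatch is needed beyond the jbe checks below.
 */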
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	cmp	$32, %rdx
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jbe	L(return)

	movdqu	16(%rsi), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	cmp	$64, %rdx
	movdqu	%xmm0, 16(%rdi)
	movdqu	%xmm1, -32(%rdi, %rdx)
	jbe	L(return)

	movdqu	32(%rsi), %xmm0
	movdqu	48(%rsi), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3
	cmp	$128, %rdx
	movdqu	%xmm0, 32(%rdi)
	movdqu	%xmm1, 48(%rdi)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	jbe	L(return)

/* More than 128 bytes: enter the main loop after aligning the
   destination address in %r8 to 64 bytes.  */
	lea	64(%rdi), %r8
	and	$-64, %r8

	add	%rdi, %rdx
	and	$-64, %rdx

	sub	%rdi, %rsi
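/* From here on %rsi holds src - dst, so (%r8, %rsi) addresses the
   source byte corresponding to the destination pointer in %r8.  */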

/* Stop two iterations before the aligned end so the prefetch in the
   loop never reads past the end of the source.  */
	sub	$64, %rdx
	cmp	%r8, %rdx
	je	L(main_loop_just_one_iteration)

	sub	$64, %rdx
	cmp	%r8, %rdx
	je	L(main_loop_last_two_iterations)

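/* Main cached loop: 64 bytes per iteration, unaligned loads from the
   source and movdqa stores to the 64-byte-aligned destination.  */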
	.p2align 4
L(main_loop_cache):

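	/* Prefetch two iterations (128 bytes) ahead; the loop stops two
	   iterations early, so this never touches past the source end.  */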
	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movdqa	%xmm1, 16(%r8)
	movdqa	%xmm2, 32(%r8)
	movdqa	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rdx
	jne	L(main_loop_cache)

L(main_loop_last_two_iterations):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqu	64(%r8, %rsi), %xmm4
	movdqu	80(%r8, %rsi), %xmm5
	movdqu	96(%r8, %rsi), %xmm6
	movdqu	112(%r8, %rsi), %xmm7
	movdqa	%xmm0, (%r8)
	movdqa	%xmm1, 16(%r8)
	movdqa	%xmm2, 32(%r8)
	movdqa	%xmm3, 48(%r8)
	movdqa	%xmm4, 64(%r8)
	movdqa	%xmm5, 80(%r8)
	movdqa	%xmm6, 96(%r8)
	movdqa	%xmm7, 112(%r8)
	jmp	L(return)

L(main_loop_just_one_iteration):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movdqa	%xmm1, 16(%r8)
	movdqa	%xmm2, 32(%r8)
	movdqa	%xmm3, 48(%r8)
	jmp	L(return)

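/* At least SHARED_CACHE_SIZE_HALF bytes (from cache.h): copy the
   first and last 128 bytes with regular stores, then stream the
   aligned middle with non-temporal stores that bypass the cache.  */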
L(large_page):
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)

	movdqu	64(%rsi), %xmm0
	movdqu	80(%rsi), %xmm1
	movdqu	96(%rsi), %xmm2
	movdqu	112(%rsi), %xmm3
	movdqu	-128(%rsi, %rdx), %xmm4
	movdqu	-112(%rsi, %rdx), %xmm5
	movdqu	-96(%rsi, %rdx), %xmm6
	movdqu	-80(%rsi, %rdx), %xmm7
	movdqu	%xmm0, 64(%rdi)
	movdqu	%xmm1, 80(%rdi)
	movdqu	%xmm2, 96(%rdi)
	movdqu	%xmm3, 112(%rdi)
	movdqu	%xmm4, -128(%rdi, %rdx)
	movdqu	%xmm5, -112(%rdi, %rdx)
	movdqu	%xmm6, -96(%rdi, %rdx)
	movdqu	%xmm7, -80(%rdi, %rdx)

/* Main loop with non-temporal stores: align the destination address
   in %r8 to 128 bytes.  */
	lea	128(%rdi), %r8
	and	$-128, %r8

	add	%rdi, %rdx
	and	$-128, %rdx

	sub	%rdi, %rsi
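/* As above, %rsi now holds src - dst.  */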

	.p2align 4
L(main_loop_large_page):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqu	64(%r8, %rsi), %xmm4
	movdqu	80(%r8, %rsi), %xmm5
	movdqu	96(%r8, %rsi), %xmm6
	movdqu	112(%r8, %rsi), %xmm7
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	movntdq	%xmm4, 64(%r8)
	movntdq	%xmm5, 80(%r8)
	movntdq	%xmm6, 96(%r8)
	movntdq	%xmm7, 112(%r8)
	lea	128(%r8), %r8
	cmp	%r8, %rdx
	jne	L(main_loop_large_page)
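	/* movntdq stores are weakly ordered; sfence makes them globally
	   visible before returning.  */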
	sfence
	jmp	L(return)

L(len_0_16_bytes):
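/* Dispatch on the length bits: len & 24 is non-zero for lengths
   8..16 and len & 4 for lengths 4..7; lengths 0..3 fall through to a
   byte copy plus an overlapping two-byte copy.  */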
	testb	$24, %dl
	jne	L(len_9_16_bytes)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(return)
	movzbl	(%rsi), %ebx
	testb	$2, %dl
	movb	%bl, (%rdi)
	je	L(return)
	movzwl	-2(%rsi,%rdx), %ebx
	movw	%bx, -2(%rdi,%rdx)
	jmp	L(return)

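/* Lengths 8..16 (the testb $24 above also catches 8, despite the
   label name): copy the first and last 8 bytes; they overlap for
   lengths below 16.  */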
L(len_9_16_bytes):
	movq	(%rsi), %xmm0
	movq	-8(%rsi, %rdx), %xmm1
	movq	%xmm0, (%rdi)
	movq	%xmm1, -8(%rdi, %rdx)
	jmp	L(return)

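/* Lengths 4..7 (the testb $4 above also catches 4, despite the label
   name): copy the first and last 4 bytes, overlapping as needed.  */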
L(len_5_8_bytes):
	movl	(%rsi), %ebx
	movl	%ebx, (%rdi)
	movl	-4(%rsi,%rdx), %ebx
	movl	%ebx, -4(%rdi,%rdx)
	jmp	L(return)

L(return):
	mov	%rdi, %rax
	RETURN

END (MEMCPY)