/* SSE2-optimized memcpy for 32-bit x86 (libc string routines). */
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"	/* provides SHARED_CACHE_SIZE_HALF */

/* Exported symbol name; overridable so this file can be assembled
   under an alternate name (e.g. for a CPU-variant build).  */
#ifndef MEMCPY
# define MEMCPY	memcpy
#endif

/* Assembler-local label helper: L(foo) expands to .Lfoo, which the
   GNU assembler keeps out of the symbol table.  */
#ifndef L
# define L(label)	.L##label
#endif

/* DWARF call-frame-information wrappers.  Defaulting them here lets a
   build environment override them (e.g. to no-ops) without touching
   the code below.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* Standard function entry: typed, global, 16-byte aligned, CFI opened.  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

/* Standard function end: CFI closed, size recorded for the symbol.  */
#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* Stack offsets of the cdecl arguments, valid after the EBX push:
   memcpy(void *DEST, const void *SRC, size_t LEN).  */
#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4

/* Push/pop helpers that keep the unwind info in sync with %esp.  */
#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
/* RETURN pops EBX and returns; the trailing CFI_PUSH re-asserts the
   "EBX saved on stack" CFI state so the code that textually follows a
   mid-function RETURN still carries correct unwind information.  */
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)

	.section .text.sse2,"ax",@progbits
/* void *memcpy(void *dest, const void *src, size_t len)
   32-bit x86, cdecl.  Returns dest (in %eax).
   EBX is the only callee-saved register used; ENTRANCE/RETURN save and
   restore it.  Register roles for the whole function:
     %eax = src   %edx = dest   %ecx = len
   Strategy: 0..16 bytes -> scalar/movq path; 17..128 bytes -> pairs of
   overlapping unaligned 16-byte copies from both ends; medium sizes ->
   destination-aligned 64-byte cached loop with prefetch; sizes at or
   above half the shared cache -> 128-byte non-temporal (movntdq) loop.
   Overlapping-from-both-ends stores make tail handling branch-free.  */
ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

	/* src == dest: nothing to move.  */
	cmp	%eax, %edx
	je	L(return)

	/* Tiny copies.  */
	cmp	$16, %ecx
	jbe	L(len_0_16_bytes)

	/* Huge copies bypass the cache with non-temporal stores.  */
	cmp     $SHARED_CACHE_SIZE_HALF, %ecx
	jae     L(large_page)

	/* 17..32 bytes: two overlapping 16-byte chunks (head + tail).  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	cmpl    $32, %ecx
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jbe	L(return)

	/* 33..64 bytes: one more 16-byte chunk from each end.  */
	movdqu	16(%eax), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	cmpl    $64, %ecx
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm1, -32(%edx, %ecx)
	jbe	L(return)

	/* 65..128 bytes: two more chunks from each end.  After this
	   point the first 64 and last 64 bytes are already copied.  */
	movdqu	32(%eax), %xmm0
	movdqu	48(%eax), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3
	cmpl    $128, %ecx
	movdqu	%xmm0, 32(%edx)
	movdqu	%xmm1, 48(%edx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	jbe	L(return)

/* Now the main loop: we align the address of the destination.  */
	/* %ebx = first 64-byte-aligned address above dest; bytes before
	   it were already written by the head copies above.  */
	leal	64(%edx), %ebx
	andl	$-64, %ebx

	/* %ecx = last 64-byte-aligned address inside [dest, dest+len);
	   bytes past it were already written by the tail copies.  */
	addl	%edx, %ecx
	andl	$-64, %ecx

	/* %eax = src - dest, so (%ebx,%eax) addresses the source byte
	   matching destination %ebx while only %ebx advances.  */
	subl	%edx, %eax

/* We should stop two iterations before the termination
	(in order not to misprefetch).  */
	/* Exactly one 64-byte block left: skip the prefetching loop.  */
	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_just_one_iteration)

	/* Exactly two blocks left: copy 128 bytes without prefetch.  */
	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_last_two_iterations)


	.p2align 4
L(main_loop_cache):
	/* Cached 64-bytes-per-iteration loop; unaligned loads, aligned
	   stores (%ebx is 64-byte aligned), prefetch two blocks ahead.  */

	prefetcht0 128(%ebx, %eax)

	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	lea	64(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_cache)

L(main_loop_last_two_iterations):
	/* Final 128 bytes of the aligned region, no prefetch (avoids
	   prefetching past the end of the source buffer).  */
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	movdqa	%xmm4, 64(%ebx)
	movdqa	%xmm5, 80(%ebx)
	movdqa	%xmm6, 96(%ebx)
	movdqa	%xmm7, 112(%ebx)
	jmp	L(return)

L(main_loop_just_one_iteration):
	/* Single remaining 64-byte block.  */
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	jmp	L(return)

L(large_page):
	/* Huge copy: first copy 128 bytes from each end with plain
	   unaligned stores, so the 128-byte-aligned loop below never has
	   to special-case head or tail bytes.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)

	movdqu	64(%eax), %xmm0
	movdqu	80(%eax), %xmm1
	movdqu	96(%eax), %xmm2
	movdqu	112(%eax), %xmm3
	movdqu	-128(%eax, %ecx), %xmm4
	movdqu	-112(%eax, %ecx), %xmm5
	movdqu	-96(%eax, %ecx), %xmm6
	movdqu	-80(%eax, %ecx), %xmm7
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	movdqu	%xmm4, -128(%edx, %ecx)
	movdqu	%xmm5, -112(%edx, %ecx)
	movdqu	%xmm6, -96(%edx, %ecx)
	movdqu	%xmm7, -80(%edx, %ecx)

/* Now the main loop with non temporal stores. We align
	the address of the destination.  */
	/* Same alignment scheme as the cached loop, but with a 128-byte
	   granule: %ebx = first aligned dest address, %ecx = aligned end,
	   %eax = src - dest.  */
	leal	128(%edx), %ebx
	andl	$-128, %ebx

	addl	%edx, %ecx
	andl	$-128, %ecx

	subl	%edx, %eax

	.p2align 4
L(main_loop_large_page):
	/* 128 bytes per iteration; movntdq writes bypass the cache to
	   avoid evicting useful data on copies >= half the shared cache.  */
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movntdq	%xmm0, (%ebx)
	movntdq	%xmm1, 16(%ebx)
	movntdq	%xmm2, 32(%ebx)
	movntdq	%xmm3, 48(%ebx)
	movntdq	%xmm4, 64(%ebx)
	movntdq	%xmm5, 80(%ebx)
	movntdq	%xmm6, 96(%ebx)
	movntdq	%xmm7, 112(%ebx)
	lea	128(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_large_page)
	sfence		/* make the non-temporal stores globally visible */
	jmp	L(return)

L(len_0_16_bytes):
	/* len <= 16, dispatch on the set bits of len.  */
	/* Bit 3 or 4 set => len in 8..16 (the label name notwithstanding,
	   len == 8 also takes this path; the overlapping movq pair
	   handles it correctly).  */
	testb	$24, %cl
	jne	L(len_9_16_bytes)
	/* Bit 2 set => len in 4..7 (len == 4 also works here).  */
	testb	$4, %cl
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	/* len in 0..3.  */
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	movzbl	(%eax), %ebx		/* len >= 1: copy the first byte */
	testb	$2, %cl
	movb	%bl, (%edx)
	je	L(return)
	movzwl	-2(%eax,%ecx), %ebx	/* len 2..3: overlapping word at end */
	movw	%bx, -2(%edx,%ecx)
	jmp	L(return)

L(len_9_16_bytes):
	/* Two (possibly overlapping) 8-byte chunks via XMM.  */
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

L(len_5_8_bytes):
	/* Two (possibly overlapping) 4-byte chunks.  */
	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	movl	-4(%eax,%ecx), %ebx
	movl	%ebx, -4(%edx,%ecx)
	jmp	L(return)

L(return):
	/* memcpy returns the destination pointer.  */
	movl	%edx, %eax
	RETURN

END (MEMCPY)