/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */

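/*
 * For reference, a minimal C-level sketch of the contract above (an
 * illustration of the interface only, not the implementation below; the
 * byte-wise loop stands in for the optimized copy):
 *
 *	void *memcpy(void *dest, const void *src, size_t count)
 *	{
 *		char *d = dest;
 *		const char *s = src;
 *
 *		while (count--)
 *			*d++ = *s++;
 *		return dest;		// rax carries the original destination
 *	}
 */
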
/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax		/* return the original destination */
	movq %rdx, %rcx
	shrq $3, %rcx		/* number of 8-byte words */
	andl $7, %edx		/* 0..7 leftover bytes */
	rep movsq
	movl %edx, %ecx
	rep movsb		/* copy the leftover bytes */
	ret
.Lmemcpy_e:
	.previous

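/*
 * A hedged C-level sketch of the strategy used by memcpy_c() above
 * (illustration only; the function name is made up and the loops stand
 * in for REP MOVSQ / REP MOVSB):
 *
 *	void *memcpy_rep_movsq(void *dest, const void *src, size_t count)
 *	{
 *		unsigned long *d8 = dest;
 *		const unsigned long *s8 = src;
 *		size_t qwords = count >> 3;
 *
 *		while (qwords--)		// bulk copy, 8 bytes at a time
 *			*d8++ = *s8++;
 *
 *		char *d1 = (char *)d8;
 *		const char *s1 = (const char *)s8;
 *		count &= 7;			// 0..7 bytes remain
 *		while (count--)
 *			*d1++ = *s1++;
 *		return dest;
 *	}
 */
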
/*
 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
 * memcpy_c. Use memcpy_c_e when possible.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
	movq %rdi, %rax		/* return the original destination */
	movq %rdx, %rcx		/* the whole byte count goes to REP MOVSB */
	rep movsb
	ret
.Lmemcpy_e_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax		/* return the original destination */

	cmpq $0x20, %rdx	/* copies below 32 bytes go straight to the tail code */
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
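	/*
	 * Rough C-level sketch of the check below; only the low bytes of the
	 * two pointers are compared (signed, as with "jl"), as a cheap
	 * heuristic for whether a forward copy could suffer store-to-load
	 * false dependences (illustration only):
	 *
	 *	if ((signed char)(unsigned long)src < (signed char)(unsigned long)dest)
	 *		goto copy_backward;
	 *	// otherwise fall through to the forward 32-byte loop
	 */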
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
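	/*
	 * Hedged C sketch of this loop (illustrative; d and s are
	 * unsigned long pointers, "left" mirrors rdx after the
	 * pre-decrement above, and "jae" is modelled as a signed test):
	 *
	 *	do {
	 *		left -= 32;
	 *		d[0] = s[0]; d[1] = s[1];
	 *		d[2] = s[2]; d[3] = s[3];
	 *		s += 4; d += 4;
	 *	} while (left >= 0);
	 *	left += 32;		// 0..31 bytes remain for the tail
	 */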
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop	/* CF is still from the subq at the loop top */
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate the copy position of the tail: point rsi/rdi past the
	 * end of the buffers.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations can go in one cycle,
	 * so append NOPs within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop	/* CF is still from the subq at the loop top */

	/*
	 * Calculate the copy position of the head: point rsi/rdi back at
	 * the start of the remaining bytes.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Copy 16 to 31 bytes.
	 */
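	/*
	 * Hedged C sketch of the trick below: copy the first 16 and the last
	 * 16 bytes as two possibly overlapping windows, which covers any
	 * length in [16, 31] without a loop (s, d are char pointers, len is
	 * the remaining count; unaligned accesses are fine on x86):
	 *
	 *	unsigned long a = *(unsigned long *)(s);
	 *	unsigned long b = *(unsigned long *)(s + 8);
	 *	unsigned long c = *(unsigned long *)(s + len - 16);
	 *	unsigned long e = *(unsigned long *)(s + len - 8);
	 *	*(unsigned long *)(d)            = a;
	 *	*(unsigned long *)(d + 8)        = b;
	 *	*(unsigned long *)(d + len - 16) = c;
	 *	*(unsigned long *)(d + len - 8)  = e;
	 */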
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Copy 8 to 15 bytes: two possibly overlapping 8-byte windows.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Copy 4 to 7 bytes: two possibly overlapping 4-byte windows.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Copy 1 to 3 bytes.  The subl above set CF for a zero count and ZF
	 * for a count of one; movzbl does not touch the flags, so the jz
	 * below still tests that result.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature.
	 * If the feature is supported, memcpy_c_e() is the first choice.
	 * If enhanced REP MOVSB copy is not available, use the fast string
	 * copy memcpy_c() when possible: it is faster and the code is
	 * simpler than the original memcpy().
	 * Otherwise, the original memcpy() is used.
	 * In the .altinstructions section, the ERMS feature is placed after
	 * the REP_GOOD feature to get the right patch order.
	 *
	 * Replace only the beginning of memcpy: memcpy itself is used to
	 * apply the alternatives, so it would be silly to overwrite it with
	 * NOPs - a reboot would be the only outcome...
	 */
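	/*
	 * A hedged C-level sketch of what the two entries below amount to
	 * once the alternatives have been applied (boot_cpu_has() is the
	 * usual feature test; the entries are actually applied in order,
	 * with the later ERMS patch overriding the earlier REP_GOOD one):
	 *
	 *	void *(*copy)(void *, const void *, size_t);
	 *
	 *	if (boot_cpu_has(X86_FEATURE_ERMS))
	 *		copy = memcpy_c_e;		// single REP MOVSB
	 *	else if (boot_cpu_has(X86_FEATURE_REP_GOOD))
	 *		copy = memcpy_c;		// REP MOVSQ + REP MOVSB
	 *	else
	 *		copy = memcpy;			// the unrolled version above
	 */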
	.section .altinstructions, "a"
	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
	.previous