/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This memcpy routine is optimised for Cortex-M3/M4 cores with or without
   unaligned access support.

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)

   Prototype: void *memcpy (void *dst, const void *src, size_t count);

   The job is done in 5 steps.
   Step 1: Align the src/dst pointers; use a misaligned copy path if
           both cannot be aligned
   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes
   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes
   Step 4: Copy word by word
   Step 5: Copy the remaining bytes one by one

   Tunable options:
     __OPT_BIG_BLOCK_SIZE: Size of the big block in bytes.  Defaults to 64.
     __OPT_MID_BLOCK_SIZE: Size of the mid block in bytes.  Defaults to 16.
 */
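
/* Roughly equivalent C for the aligned fast path -- an illustrative
   sketch only, not part of the build, and it ignores the misaligned
   entry handling of step 1.  BIG_BLOCK_BYTES/MID_BLOCK_BYTES stand in
   for the __OPT_*_BLOCK_SIZE values defined below.

	#include <stddef.h>
	#include <stdint.h>

	enum { BIG_BLOCK_BYTES = 64, MID_BLOCK_BYTES = 16 };  // defaults

	void *memcpy_sketch (void *dst, const void *src, size_t count)
	{
	  uint32_t *d = dst;
	  const uint32_t *s = src;

	  while (count >= BIG_BLOCK_BYTES)        // step 2
	    for (unsigned i = 0; i < BIG_BLOCK_BYTES / 4; i++)
	      { *d++ = *s++; count -= 4; }
	  while (count >= MID_BLOCK_BYTES)        // step 3
	    for (unsigned i = 0; i < MID_BLOCK_BYTES / 4; i++)
	      { *d++ = *s++; count -= 4; }
	  while (count >= 4)                      // step 4
	    { *d++ = *s++; count -= 4; }

	  uint8_t *db = (uint8_t *) d;            // step 5
	  const uint8_t *sb = (const uint8_t *) s;
	  while (count--)
	    *db++ = *sb++;
	  return dst;
	}
 */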
#ifndef __OPT_BIG_BLOCK_SIZE
#define __OPT_BIG_BLOCK_SIZE (4 * 16)
#endif

#ifndef __OPT_MID_BLOCK_SIZE
#define __OPT_MID_BLOCK_SIZE (4 * 4)
#endif

#if __OPT_BIG_BLOCK_SIZE == 16
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12
#elif __OPT_BIG_BLOCK_SIZE == 32
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12,16,20,24,28
#elif __OPT_BIG_BLOCK_SIZE == 64
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
#else
#error "Illegal __OPT_BIG_BLOCK_SIZE"
#endif

#if __OPT_MID_BLOCK_SIZE == 8
#define BEGIN_UNROLL_MID_BLOCK \
  .irp offset, 0,4
#elif __OPT_MID_BLOCK_SIZE == 16
#define BEGIN_UNROLL_MID_BLOCK \
  .irp offset, 0,4,8,12
#else
#error "Illegal __OPT_MID_BLOCK_SIZE"
#endif

#define END_UNROLL .endr
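
/* For example, with __OPT_MID_BLOCK_SIZE == 16 the pair
   BEGIN_UNROLL_MID_BLOCK ... END_UNROLL wraps its body in
   ".irp offset, 0,4,8,12", so the __ARM_ARCH_7M__ variant of the loop
   body below assembles to four ldr/str pairs:

	ldr	r3, [r1, 0]
	str	r3, [r0, 0]
	ldr	r3, [r1, 4]
	str	r3, [r0, 4]
	...

   One branch then retires a whole block instead of a single word.  */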

	.syntax unified
	.text
	.align	2
	.global	memcpy
	.thumb
	.thumb_func
	.type	memcpy, %function
memcpy:
	@ r0: dst
	@ r1: src
	@ r2: len
#ifdef __ARM_FEATURE_UNALIGNED
	/* When unaligned access is supported, ip is not otherwise used
	   in the function body, so save the return value (dst) there
	   instead of pushing it.  The misaligned-src path compiled in
	   below when unaligned access is unavailable does use ip, hence
	   the push in that configuration.  */
	mov	ip, r0
#else
	push	{r0}
#endif
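	/* Both pointers are word aligned iff the OR of their low two
	   bits is zero, so one test covers src and dst at once.  */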
	orr	r3, r1, r0
	ands	r3, r3, #3
	bne	.Lmisaligned_copy

.Lbig_block:
	subs	r2, __OPT_BIG_BLOCK_SIZE
	blo	.Lmid_block

	/* Kernel loop for big block copy */
	.align 2
.Lbig_block_loop:
	BEGIN_UNROLL_BIG_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	END_UNROLL
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
	END_UNROLL
	adds	r0, __OPT_BIG_BLOCK_SIZE
	adds	r1, __OPT_BIG_BLOCK_SIZE
#endif
	subs	r2, __OPT_BIG_BLOCK_SIZE
	bhs	.Lbig_block_loop

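	/* On exit from the loop above, r2 is still biased by
	   -__OPT_BIG_BLOCK_SIZE.  Each following stage re-biases it with
	   a single adds (by -__OPT_MID_BLOCK_SIZE below, then by -4), so
	   the true remaining length is never recomputed.  */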
.Lmid_block:
	adds	r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
	blo	.Lcopy_word_by_word

	/* Kernel loop for mid-block copy */
	.align 2
.Lmid_block_loop:
	BEGIN_UNROLL_MID_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	END_UNROLL
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
	END_UNROLL
	adds	r0, __OPT_MID_BLOCK_SIZE
	adds	r1, __OPT_MID_BLOCK_SIZE
#endif
	subs	r2, __OPT_MID_BLOCK_SIZE
	bhs	.Lmid_block_loop

.Lcopy_word_by_word:
	adds	r2, __OPT_MID_BLOCK_SIZE - 4
	blo	.Lcopy_less_than_4

	/* Kernel loop for small block copy */
	.align 2
.Lcopy_word_by_word_loop:
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	subs	r2, #4
	bhs	.Lcopy_word_by_word_loop

.Lcopy_less_than_4:
	adds	r2, #4
	beq	.Ldone

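	/* 1 to 3 bytes remain.  The lsls #31 moves bit 0 of the count
	   into the sign position (Z is clear exactly when bit 0 is set)
	   and bit 1 into the carry: the ldrb/strb pair below copies one
	   byte when bit 0 is set, and the bcc skips the final two-byte
	   copy when bit 1 is clear.  */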
	lsls	r2, r2, #31
	itt	ne
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1

	bcc	.Ldone
#ifdef __ARM_FEATURE_UNALIGNED
	ldrh	r3, [r1]
	strh	r3, [r0]
#else
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
#endif /* __ARM_FEATURE_UNALIGNED */

.Ldone:
#ifdef __ARM_FEATURE_UNALIGNED
	mov	r0, ip
#else
	pop	{r0}
#endif
	bx	lr

	.align 2
.Lmisaligned_copy:
#ifdef __ARM_FEATURE_UNALIGNED
	/* Define the label Ldst_aligned as an alias for Lbig_block: once
	   the destination has been aligned, control goes straight to the
	   aligned copy.  */
#define Ldst_aligned Lbig_block

	/* Copy word by word using LDR/STR when the hardware supports
	   unaligned accesses, i.e., when unaligned LDR and STR do not
	   trap (CCR.UNALIGN_TRP clear on ARMv7-M).  */

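	/* With fewer than 8 bytes, the alignment preamble costs more
	   than it saves, so take the plain byte loop instead.  */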
	cmp	r2, #8
	blo	.Lbyte_copy

	/* If src is aligned too, just go to the big block loop.  */
	lsls	r3, r1, #30
	beq	.Ldst_aligned
#else
	/* If len < 12, the misalignment adjustment has more overhead
	   than a plain byte-by-byte copy.  len must also be >= 8 for the
	   code below to work correctly.  */
	cmp	r2, #12
	blo	.Lbyte_copy
#endif /* __ARM_FEATURE_UNALIGNED */

	/* Align dst only; don't try to align src.  Handling an aligned
	   src with a misaligned dst costs more than the other way round.
	   The worst case is when src starts out aligned: up to 4 extra
	   bytes are then copied one at a time, which is acceptable.  */

	ands	r3, r0, #3
	beq	.Ldst_aligned

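	/* Here r3 = dst & 3, so 4 - r3 is the number of bytes needed to
	   align dst.  The lsls #31 below applies the same bit-0/bit-1
	   flag trick as the tail copy: one byte is copied when bit 0 of
	   that count is set, two more when bit 1 is set (carry).  */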
	rsb	r3, #4
	subs	r2, r3

	lsls	r3, r3, #31
	itt	ne
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1

	bcc	.Ldst_aligned

#ifdef __ARM_FEATURE_UNALIGNED
	ldrh	r3, [r1], #2
	strh	r3, [r0], #2
	b	.Ldst_aligned
#else
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	/* Now dst is aligned.  */
.Ldst_aligned:
	/* If r1 is now aligned as well, r0/r1 had the same misalignment
	   and both are aligned; go to the aligned copy.  */
	ands	r3, r1, #3
	beq	.Lbig_block

	/* dst is aligned, but src isn't.  Misaligned copy.  */

	push	{r4, r5}
	subs	r2, #4

	/* Step r1 back by its misaligned bytes so that it is word
	   aligned.  Since r1 must be restored to the unaligned address
	   after the loop, keep the offset in ip and subtract it from r1
	   afterwards.  */
	subs	r1, r3
	rsb	ip, r3, #4

	/* Pre-load one word.  */
	ldr	r4, [r1], #4

	cmp	r3, #2
	beq	.Lmisaligned_copy_2_2
	cmp	r3, #3
	beq	.Lmisaligned_copy_3_1

	.macro mis_src_copy shift
1:
	lsrs	r4, r4, \shift
	ldr	r3, [r1], #4
	lsls	r5, r3, 32-\shift
	orr	r4, r4, r5
	str	r4, [r0], #4
	mov	r4, r3
	subs	r2, #4
	bhs	1b
	.endm
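
/* What mis_src_copy does, as a C sketch (illustrative only).  r4
   holds the source word straddling the word boundary; each iteration
   shifts out the bytes already consumed, loads the next aligned word,
   and ORs its leading bytes in, so every ldr/str in the loop is word
   aligned.  For little-endian data (the usual configuration for these
   cores), with src misaligned by m bytes, shift == 8*m:

	uint32_t prev = *s++;          // pre-loaded word (r4)
	int32_t n = count - 4;         // r2 enters the loop biased by -4
	do
	  {
	    uint32_t next = *s++;      // aligned load (r3)
	    *d++ = (prev >> shift) | (next << (32 - shift));
	    prev = next;
	  }
	while ((n -= 4) >= 0);
 */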

.Lmisaligned_copy_1_3:
	mis_src_copy shift=8
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_3_1:
	mis_src_copy shift=24
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_2_2:
	/* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
	mis_src_copy shift=16

.Lsrc_misaligned_tail:
	adds	r2, #4
	subs	r1, ip
	pop	{r4, r5}

#endif /* __ARM_FEATURE_UNALIGNED */

.Lbyte_copy:
	subs	r2, #4
	blo	.Lcopy_less_than_4

.Lbyte_copy_loop:
	subs	r2, #1
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	bhs	.Lbyte_copy_loop

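	/* The loop above enters with r2 = len - 4 and exits on borrow,
	   so exactly three bytes always remain; copy them
	   unconditionally.  */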
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
	ldrb	r3, [r1, #2]
	strb	r3, [r0, #2]

#ifdef __ARM_FEATURE_UNALIGNED
	mov	r0, ip
#else
	pop	{r0}
#endif
	bx	lr

	.size	memcpy, .-memcpy