/* (extraction artifact: code-browser navigation header, not part of the source)
   Home | History | Annotate | Download | only in bionic */
      1 /* Copyright (c) 2012-2013, Linaro Limited
      2    All rights reserved.
      3 
      4    Redistribution and use in source and binary forms, with or without
      5    modification, are permitted provided that the following conditions are met:
      6        * Redistributions of source code must retain the above copyright
      7          notice, this list of conditions and the following disclaimer.
      8        * Redistributions in binary form must reproduce the above copyright
      9          notice, this list of conditions and the following disclaimer in the
     10          documentation and/or other materials provided with the distribution.
     11        * Neither the name of the Linaro nor the
     12          names of its contributors may be used to endorse or promote products
     13          derived from this software without specific prior written permission.
     14 
     15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     19    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     20    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     21    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     22    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     23    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
     26 
     27 /*
     28  * Copyright (c) 2015 ARM Ltd
     29  * All rights reserved.
     30  *
     31  * Redistribution and use in source and binary forms, with or without
     32  * modification, are permitted provided that the following conditions
     33  * are met:
     34  * 1. Redistributions of source code must retain the above copyright
     35  *    notice, this list of conditions and the following disclaimer.
     36  * 2. Redistributions in binary form must reproduce the above copyright
     37  *    notice, this list of conditions and the following disclaimer in the
     38  *    documentation and/or other materials provided with the distribution.
     39  * 3. The name of the company may not be used to endorse or promote
     40  *    products derived from this software without specific prior written
     41  *    permission.
     42  *
     43  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
     44  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
     45  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     46  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     47  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
     48  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     49  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     50  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     51  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     52  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     53  */
     54 
     55 /* Assumptions:
     56  *
     57  * ARMv8-a, AArch64, unaligned accesses.
     58  *
     59  */
     60 
     61 #include <private/bionic_asm.h>
     62 
/* Register aliases.  Per AAPCS64 the incoming arguments are
   x0 = dst, x1 = src, x2 = count; all other registers used here
   (x3-x13) are caller-saved, so no stack frame is needed.  */
      63 #define dstin	x0
      64 #define src	x1
      65 #define count	x2
      66 #define dst	x3
      67 #define srcend	x4
      68 #define dstend	x5
      69 #define A_l	x6
      70 #define A_lw	w6
      71 #define A_h	x7
      72 #define A_hw	w7
      73 #define B_l	x8
      74 #define B_lw   w8
      75 #define B_h	x9
      76 #define C_l	x10
      77 #define C_h	x11
      78 #define D_l	x12
      79 #define D_h	x13
/* The E and F pairs alias the argument/scratch registers src, count,
   srcend and dst; they are only loaded once those values are dead
   (copy96 and the copy_long tail).  */
      80 #define E_l	src
      81 #define E_h	count
      82 #define F_l	srcend
      83 #define F_h	dst
/* NOTE(review): tmp1 aliases B_h (both x9) — tmp1 is only used before
   the B pair is loaded on any given path; keep it that way.  */
      84 #define tmp1	x9
      85 
/* Expands to a .L-prefixed (assembler-local, non-exported) label.  */
      86 #define L(l) .L ## l
     87 
     88 /* Copies are split into 3 main cases: small copies of up to 16 bytes,
     89    medium copies of 17..96 bytes which are fully unrolled. Large copies
     90    of more than 96 bytes align the destination and use an unrolled loop
     91    processing 64 bytes per iteration.
     92    Small and medium copies read all data before writing, allowing any
     93    kind of overlap, and memmove tailcalls memcpy for these cases as
     94    well as non-overlapping copies.
     95 */
     96 
/* memcpy entry point.  NOTE(review): the ENTRY(memcpy)/END(memcpy)
   directives from <private/bionic_asm.h> are not visible in this chunk —
   presumably lost in extraction; confirm against the original file.  */
      97 	prfm    PLDL1KEEP, [src]
	/* srcend/dstend point one past the last byte; the small and medium
	   paths copy from both ends with (possibly overlapping) accesses.  */
      98 	add	srcend, src, count
      99 	add	dstend, dstin, count
     100         cmp     count, 16
     101         b.ls    L(copy16)
     102 	cmp	count, 96
     103 	b.hi	L(copy_long)
     104 
     105 	/* Medium copies: 17..96 bytes.  */
     106 	sub	tmp1, count, 1
     107 	ldp	A_l, A_h, [src]
	/* Bit 6 of count-1 set => count >= 65: take the 64..96 path
	   (A pair already loaded for it above).  */
     108 	tbnz	tmp1, 6, L(copy96)
     109 	ldp	D_l, D_h, [srcend, -16]
	/* Bit 5 of count-1 clear => count <= 32: the overlapping A and D
	   pairs alone cover the whole copy; skip the middle 32 bytes.  */
     110 	tbz	tmp1, 5, 1f
     111 	ldp	B_l, B_h, [src, 16]
     112 	ldp	C_l, C_h, [srcend, -32]
     113 	stp	B_l, B_h, [dstin, 16]
     114 	stp	C_l, C_h, [dstend, -32]
     115 1:
	/* All loads completed before any store, so src/dst overlap of any
	   kind is safe (memmove relies on this — see header comment).  */
     116 	stp	A_l, A_h, [dstin]
     117 	stp	D_l, D_h, [dstend, -16]
     118 	ret
    119 
     120 	.p2align 4
     121 
     122 	/* Small copies: 0..16 bytes.  */
     123 L(copy16):
	/* 8..16 bytes: two overlapping 8-byte accesses, one from each end.  */
     124 	cmp	count, 8
     125 	b.lo	1f
     126 	ldr	A_l, [src]
     127 	ldr	A_h, [srcend, -8]
     128 	str	A_l, [dstin]
     129 	str	A_h, [dstend, -8]
     130 	ret
     131 	.p2align 4
     132 1:
	/* Bit 2 of count set => 4..7 bytes: two overlapping 4-byte
	   accesses from each end; bit 2 clear here means count is 0..3.  */
     133 	tbz	count, 2, 1f
     134 	ldr	A_lw, [src]
     135 	ldr	A_hw, [srcend, -4]
     136 	str	A_lw, [dstin]
     137 	str	A_hw, [dstend, -4]
     138 	ret
     139 
     140 	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
     141 	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
     142 1:
     143 	cbz	count, 2f
	/* tmp1 = count/2: offset of the "middle" byte (0 when count==1,
	   so B simply re-copies the first byte).  */
     144 	lsr	tmp1, count, 1
     145 	ldrb	A_lw, [src]
     146 	ldrb	A_hw, [srcend, -1]
     147 	ldrb	B_lw, [src, tmp1]
     148 	strb	A_lw, [dstin]
     149 	strb	B_lw, [dstin, tmp1]
     150 	strb	A_hw, [dstend, -1]
     151 2:	ret
    152 
     153 	.p2align 4
     154 	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
     155 	   32 bytes from the end.  */
     156 L(copy96):
	/* The A pair was already loaded from [src] on the medium path.
	   E_l/E_h and F_l/F_h alias src/count/srcend/dst, which are dead
	   once the two srcend-relative loads below complete — all loads
	   finish before any store, preserving full-overlap safety.  */
     157 	ldp	B_l, B_h, [src, 16]
     158 	ldp	C_l, C_h, [src, 32]
     159 	ldp	D_l, D_h, [src, 48]
     160 	ldp	E_l, E_h, [srcend, -32]
     161 	ldp	F_l, F_h, [srcend, -16]
     162 	stp	A_l, A_h, [dstin]
     163 	stp	B_l, B_h, [dstin, 16]
     164 	stp	C_l, C_h, [dstin, 32]
     165 	stp	D_l, D_h, [dstin, 48]
	/* For count < 96 these end-relative stores overlap the ones above;
	   the data was all read first, so the result is still correct.  */
     166 	stp	E_l, E_h, [dstend, -32]
     167 	stp	F_l, F_h, [dstend, -16]
     168 	ret
    169 
     170 	/* Align DST to 16 byte alignment so that we don't cross cache line
     171 	   boundaries on both loads and stores.	 There are at least 96 bytes
     172 	   to copy, so copy 16 bytes unaligned and then align.	The loop
     173 	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
     174 
     175 	.p2align 4
     176 L(copy_long):
	/* tmp1 = dstin % 16; dst = dstin rounded down to 16 bytes.  src is
	   biased down by the same amount so [src, N] and [dst, N] address
	   corresponding bytes throughout the loop.  */
     177 	and	tmp1, dstin, 15
     178 	bic	dst, dstin, 15
     179 	ldp	D_l, D_h, [src]
     180 	sub	src, src, tmp1
     181 	add	count, count, tmp1	/* Count is now 16 too large.  */
     182 	ldp	A_l, A_h, [src, 16]
	/* First 16 bytes stored unaligned at the true destination; the
	   aligned loop below may partially re-cover them.  */
     183 	stp	D_l, D_h, [dstin]
     184 	ldp	B_l, B_h, [src, 32]
     185 	ldp	C_l, C_h, [src, 48]
     186 	ldp	D_l, D_h, [src, 64]!
     187 	subs	count, count, 128 + 16	/* Test and readjust count.  */
     188 	b.ls	2f
	/* Software-pipelined main loop: store the four pairs loaded on the
	   previous iteration while loading the next 64 bytes.  src and dst
	   advance via the pre-index writeback on the D-pair accesses.  */
     189 1:
     190 	stp	A_l, A_h, [dst, 16]
     191 	ldp	A_l, A_h, [src, 16]
     192 	stp	B_l, B_h, [dst, 32]
     193 	ldp	B_l, B_h, [src, 32]
     194 	stp	C_l, C_h, [dst, 48]
     195 	ldp	C_l, C_h, [src, 48]
     196 	stp	D_l, D_h, [dst, 64]!
     197 	ldp	D_l, D_h, [src, 64]!
     198 	subs	count, count, 64
     199 	b.hi	1b
     200 
     201 	/* Write the last full set of 64 bytes.	 The remainder is at most 64
     202 	   bytes, so it is safe to always copy 64 bytes from the end even if
     203 	   there is just 1 byte left.  */
     204 2:
	/* E_l/E_h alias src/count, which are dead past this point; the tail
	   is loaded end-relative via srcend and stored via dstend.  */
     205 	ldp	E_l, E_h, [srcend, -64]
	/* Drain the pipelined A..D pairs while loading the final 64 bytes.  */
     206 	stp	A_l, A_h, [dst, 16]
     207 	ldp	A_l, A_h, [srcend, -48]
     208 	stp	B_l, B_h, [dst, 32]
     209 	ldp	B_l, B_h, [srcend, -32]
     210 	stp	C_l, C_h, [dst, 48]
     211 	ldp	C_l, C_h, [srcend, -16]
     212 	stp	D_l, D_h, [dst, 64]
     213 	stp	E_l, E_h, [dstend, -64]
     214 	stp	A_l, A_h, [dstend, -48]
     215 	stp	B_l, B_h, [dstend, -32]
     216 	stp	C_l, C_h, [dstend, -16]
     217 	ret
    218