/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */
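
/* For reference, a rough C-level sketch of the three entry points this file
 * can provide (illustrative only, not part of the build):
 *
 *   void *memmove(void *dst, const void *src, size_t n);            // n in bytes
 *   void bcopy(const void *src, void *dst, size_t n);               // arguments swapped
 *   wchar_t *wmemmove(wchar_t *dst, const wchar_t *src, size_t n);  // n in wchar_t units
 *
 * All three must behave as if the data were first copied to a temporary
 * buffer and then to dst, i.e. they must handle overlapping regions. */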

#include <private/bionic_asm.h>

/* Parameters and result.  */
#ifdef BCOPY
#define origdstin	x1
#define origsrc	x0
#endif
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14
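
/* A_l/A_h .. D_l/D_h hold four 16-byte ldp/stp pairs, so the bulk copy loops
 * below move 64 bytes per iteration.  (tmp3/tmp3w are defined but unused.) */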

#ifdef BCOPY
ENTRY(bcopy)
	/* bcopy takes (src, dst, n) while memmove takes (dst, src, n);
	 * swap x0 and x1 so the shared code below (and any branch to
	 * memcpy) sees the destination in x0 and the source in x1.  */
	mov	tmp1, origsrc
	mov	origsrc, origdstin
	mov	origdstin, tmp1
#elif defined(WMEMMOVE)
ENTRY(wmemmove)
	/* count is in wchar_t units; wchar_t is 4 bytes, so convert to bytes.  */
	lsl	count, count, #2
#else
ENTRY(memmove)
#endif
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */
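	/* The dispatch above is, roughly (an illustrative C sketch, not built):
	 *
	 *   if (dst < src)            goto downwards;              // forward copy
	 *   else if (dst >= src + n)  return memcpy(dst, src, n);  // disjoint
	 *   else                      copy backwards from the end; // dst overlaps the tail of src
	 */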

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
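	/* Note: tmp1 = count & 0x30 is 0, 16, 32 or 48.  dst/src are first
	 * moved down by that amount, and the compare against 0x20 picks the
	 * entry point: 48 bytes falls through all three ldp/stp pairs, 32
	 * starts at label 1, 16 starts at label 2.  */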
.Ltail15up:
	/* Move up to 15 bytes of data.  Does not assume any additional data
	 * is being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret
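	/* Each tbz above tests one low bit of count and skips the copy if it
	 * is clear, so bits 3..0 move 8, 4, 2 and 1 bytes respectively with
	 * pre-decrement addressing.  Roughly (illustrative sketch only):
	 *
	 *   if (n & 8) { src -= 8; dst -= 8; copy 8 bytes; }
	 *   if (n & 4) { src -= 4; dst -= 4; copy 4 bytes; }
	 *   ...down to the final single byte, which needs no pointer update.
	 */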

.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that the loads never cross a
	 * cache line boundary (at worst only the stores do).  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* Fewer than 64 bytes may remain now.  */
	cmp	count, #63
	b.le	.Ltail63up
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret
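	/* When the b.ge above is not taken, count was between 64 and 127
	 * before the subs: one 64-byte block is copied with four ldp/stp
	 * pairs, and tst count, #0x3f checks whether up to 63 residual bytes
	 * remain for .Ltail63up (count is now negative, but its low 6 bits
	 * still equal the residual byte count).  */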

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret
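	/* The loop above is software pipelined: the four ldp before label 1
	 * load the first 64-byte block, and each iteration interleaves stores
	 * of the previously loaded block with loads of the next one, keeping
	 * the loads one block ahead of the stores.  The four stp after the
	 * loop drain the final block.  Because dst is above src and the copy
	 * runs from high to low addresses, a store only ever lands on source
	 * bytes at higher addresses, which have already been read.  */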


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is at least 16 bytes below SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */
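	/* Why a 16-byte gap is enough is a property of the memcpy this pairs
	 * with: presumably its largest single access is a 16-byte ldp/stp and
	 * it copies strictly forward, so every source chunk is loaded before
	 * the store that could overwrite it.  That is an assumption about the
	 * memcpy implementation, not something enforced here.  */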

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Does not assume any additional data
	 * is being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret
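	/* Same bit-by-bit tail as .Ltail15up, but with post-increment
	 * addressing since this copy walks forwards; the final byte needs no
	 * pointer update, so it uses a plain [src]/[dst] access.  */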

.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that the loads never cross a
	 * cache line boundary (at worst only the stores do).  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* Fewer than 64 bytes may remain now.  */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret
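	/* The two adds sit between tst and b.ne deliberately: tst sets the
	 * flags, add (without the s suffix) leaves them untouched, and the
	 * adds advance src/dst past the 64 bytes just copied so that
	 * .Ltail63down sees pointers to the remaining data.  */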

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 48 (64 minus the pre-bias).  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
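	/* The pre-bias of 16 on dst (and the matching 48-byte advance of src)
	 * lets the loop address everything with fixed positive offsets and a
	 * single writeback per pointer per iteration.  The add of 16 to src
	 * and of 64 + 16 to dst after the final stores undo the bias, so
	 * .Ltail63down again sees src/dst pointing at the first byte not yet
	 * moved.  */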
#ifdef BCOPY
END(bcopy)
#elif defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif