/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */

#include <private/bionic_asm.h>

/* Parameters and result.  */
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

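/* When built as wmemmove, the entry point below converts the element
 * count to a byte count (count <<= 2; wchar_t is 4 bytes, as assumed
 * above) and then shares the rest of the code with memmove.  */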
#if defined(WMEMMOVE)
ENTRY(wmemmove)
	lsl	count, count, #2
#else
ENTRY(memmove)
#endif
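	/* Overlap dispatch.  As a rough C sketch (illustrative only; dst,
	 * src and count here mean dstin, src and the byte count):
	 *
	 *	if (dst < src)			// move down; forward copy path
	 *		goto downwards;
	 *	if (dst >= src + count)		// no overlap at all
	 *		return memcpy(dst, src, count);
	 *	// src <= dst < src + count: overlap; copy backwards from the tail.
	 */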
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
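	/* count & 0x30 gives the number of whole 16-byte blocks still to
	 * move (0, 16, 32 or 48 bytes); the compare/branch pair below then
	 * falls through one, two or three of the ldp/stp pairs.  */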
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
.Ltail15up:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
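	/* Each tbz below tests one bit of count and, when it is set,
	 * copies 8, 4, 2 or 1 byte(s) immediately below the current
	 * pointers (the larger copies pre-decrement src and dst; the
	 * final byte needs no writeback).  */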
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret

.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
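	/* Since the copy runs backwards, src & 15 is exactly the number of
	 * bytes that must be moved first so that SRC becomes 16-byte
	 * aligned for the bulk copy; count is at least 64 here, so those
	 * bytes are always present.  */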
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be 63 or fewer bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63up
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
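	/* count currently holds (bytes remaining - 128).  After the
	 * 64-byte block below, the true remainder is count + 64, whose low
	 * six bits equal count's, so the tst and the tail code can keep
	 * using count unchanged.  */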
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
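	/* The loop is software pipelined: the four ldp results from the
	 * previous iteration are stored while the next 64 bytes are
	 * loaded, and the stores after the loop drain the final set of
	 * loads.  */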
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is at least 16 bytes below SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */
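	/* memcpy copies upwards and may move a few bytes twice (for
	 * example while fixing up source alignment), but with DST at least
	 * 16 bytes below SRC any such re-read still sees the original
	 * data, so the plain forward copy remains correct.  */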

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
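	/* tmp2 = (-src) & 15 is the number of bytes needed to bring SRC up
	 * to the next 16-byte boundary for a forwards copy; count is at
	 * least 64 here, so those bytes are always present.  */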
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be 63 or fewer bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
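	/* Both pointers are kept 16 bytes behind the data being processed
	 * (DST via the explicit pre-bias, SRC via the #48 writeback), so
	 * the loop can use positive offsets of 16..64 and advance each
	 * pointer by 64 per iteration; it is software pipelined in the
	 * same way as the upwards copy.  */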
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 48 (64 minus the 16-byte pre-bias).  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
#if defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif