/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use). */

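/* The register aliases below match what is presumably the usual libcutils
   interface (an assumption; the prototypes are not spelled out in this file):
       android_memset16(dst, value, size) / android_memset32(dst, value, size)
   with dst in x0, the 16/32-bit fill value in w1 and the size in bytes in x2. */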
#define dst		x0
#define count		x2
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x1
#define A_lw		w1
#define tmp3w		w9

#define ENTRY(f) \
  .text; \
  .globl f; \
  .align 0; \
  .type f, %function; \
  f: \
  .cfi_startproc \

#define END(f) \
  .cfi_endproc; \
  .size f, .-f; \

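/* android_memset16: w1 holds the 16-bit fill value.  A value of zero is
   routed to the shared DC ZVA zeroing path; otherwise the pattern is
   replicated into 32 bits and the 32-bit expansion path is reused via
   .Lexpand_to_64. */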
ENTRY(android_memset16)
	ands	A_lw, A_lw, #0xffff
	b.eq	.Lzero_mem
	orr	A_lw, A_lw, A_lw, lsl #16
	b	.Lexpand_to_64
END(android_memset16)

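/* android_memset32: w1 holds the 32-bit fill value.  Zero again takes the
   DC ZVA path; any other value is replicated across all 64 bits of A_l so
   the rest of the routine can work in 16-byte stp stores. */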
ENTRY(android_memset32)
	cmp	A_lw, #0
	b.eq	.Lzero_mem
.Lexpand_to_64:
	orr	A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
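	/* .Ltail63 finishes the last 0..63 bytes.  Bits 5:4 of count select
	 * how many 16-byte stp stores are still needed (dst is advanced first
	 * and the stores fall through, writing backwards), then .Ltail15
	 * covers the remaining 0..15 bytes with one overlapping 16-byte store,
	 * which may repeat bytes that were already set. */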
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
.Lnot_short:
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
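	/* Main store loop.  dst is pre-biased by -16 so the writeback form
	 * [dst, #64]! both stores the last 16 bytes of each 64-byte block and
	 * advances dst; count is pre-biased by -64 so the loop keeps going
	 * while at least one full 64-byte block remains. */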
2:
	sub	dst, dst, #16		/* Pre-bias.  */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
	ret

	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Lnot_short
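	/* DCZID_EL0: bit 4 (DZP) set means DC ZVA must not be used, so fall
	 * back to plain stores; bits 3:0 hold log2 of the block size in words,
	 * giving zva_len = 4 << DCZID_EL0[3:0] bytes. */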
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to zero after alignment.  */
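	/* ccmp compares tmp1 against zva_len_x only if the previous cmp found
	 * tmp1 >= 64; otherwise it forces NZCV to 0b1000 (N set), so the b.lt
	 * below is taken whenever either check fails. */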
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
1:
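	/* DC ZVA loop: each iteration zeroes one whole zva_len-byte block.
	 * count is pre-biased by one block so the loop runs while a full block
	 * remains; the final ands recovers count mod zva_len for the tail. */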
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
END(android_memset32)