Home | History | Annotate | Download | only in bionic
      1 /* Copyright (c) 2013-2015, Linaro Limited
      2    All rights reserved.
      3 
      4    Redistribution and use in source and binary forms, with or without
      5    modification, are permitted provided that the following conditions are met:
      6        * Redistributions of source code must retain the above copyright
      7 	 notice, this list of conditions and the following disclaimer.
      8        * Redistributions in binary form must reproduce the above copyright
      9 	 notice, this list of conditions and the following disclaimer in the
     10 	 documentation and/or other materials provided with the distribution.
     11        * Neither the name of the Linaro nor the
     12 	 names of its contributors may be used to endorse or promote products
     13 	 derived from this software without specific prior written permission.
     14 
     15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     19    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     20    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     21    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     22    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     23    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
     26 
     27 /* Assumptions:
     28  *
     29  * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
     30  */
     31 
     32 #include <private/bionic_asm.h>
     33 
     34 /* To test the page crossing code path more thoroughly, compile with
     35    -DTEST_PAGE_CROSS - this will force all calls through the slower
     36    entry path.  This option is not intended for production use.	 */
     37 
     38 /* Arguments and results.  */
     39 #define srcin		x0
     40 #define len		x0
     41 
     42 /* Locals and temporaries.  */
     43 #define src		x1
     44 #define data1		x2
     45 #define data2		x3
     46 #define has_nul1	x4
     47 #define has_nul2	x5
     48 #define tmp1		x4
     49 #define tmp2		x5
     50 #define tmp3		x6
     51 #define tmp4		x7
     52 #define zeroones	x8
     53 
     54 #define L(l) .L ## l
     55 
     56 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
     57 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
     58 	   can be done in parallel across the entire word. A faster check
     59 	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
     60 	   false hits for characters 129..255.	*/
     61 
     62 #define REP8_01 0x0101010101010101
     63 #define REP8_7f 0x7f7f7f7f7f7f7f7f
     64 #define REP8_80 0x8080808080808080
     65 
     66 #ifdef TEST_PAGE_CROSS
     67 # define MIN_PAGE_SIZE 15
     68 #else
     69 # define MIN_PAGE_SIZE 4096
     70 #endif
     71 
     72 	/* Since strings are short on average, we check the first 16 bytes
     73 	   of the string for a NUL character.  In order to do an unaligned ldp
     74 	   safely we have to do a page cross check first.  If there is a NUL
     75 	   byte we calculate the length from the 2 8-byte words using
     76 	   conditional select to reduce branch mispredictions (it is unlikely
     77 	   strlen will be repeatedly called on strings with the same length).
     78 
     79 	   If the string is longer than 16 bytes, we align src so don't need
     80 	   further page cross checks, and process 32 bytes per iteration
     81 	   using the fast NUL check.  If we encounter non-ASCII characters,
     82 	   fallback to a second loop using the full NUL check.
     83 
     84 	   If the page cross check fails, we read 16 bytes from an aligned
     85 	   address, remove any characters before the string, and continue
     86 	   in the main loop using aligned loads.  Since strings crossing a
     87 	   page in the first 16 bytes are rare (probability of
     88 	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
     89 
     90 	   AArch64 systems have a minimum page size of 4k.  We don't bother
     91 	   checking for larger page sizes - the cost of setting up the correct
     92 	   page size is just not worth the extra gain from a small reduction in
     93 	   the cases taking the slow path.  Note that we only care about
     94 	   whether the first fetch, which may be misaligned, crosses a page
     95 	   boundary.  */
     96 
     97 ENTRY(strlen)
     98 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
     99 	mov	zeroones, REP8_01
    100 	cmp	tmp1, MIN_PAGE_SIZE - 16
    101 	b.gt	L(page_cross)
    102 	ldp	data1, data2, [srcin]
    103 #ifdef __AARCH64EB__
    104 	/* For big-endian, carry propagation (if the final byte in the
    105 	   string is 0x01) means we cannot use has_nul1/2 directly.
    106 	   Since we expect strings to be small and early-exit,
    107 	   byte-swap the data now so has_null1/2 will be correct.  */
    108 	rev	data1, data1
    109 	rev	data2, data2
    110 #endif
    111 	sub	tmp1, data1, zeroones
    112 	orr	tmp2, data1, REP8_7f
    113 	sub	tmp3, data2, zeroones
    114 	orr	tmp4, data2, REP8_7f
    115 	bics	has_nul1, tmp1, tmp2
    116 	bic	has_nul2, tmp3, tmp4
    117 	ccmp	has_nul2, 0, 0, eq
    118 	beq	L(main_loop_entry)
    119 
    120 	/* Enter with C = has_nul1 == 0.  */
    121 	csel	has_nul1, has_nul1, has_nul2, cc
    122 	mov	len, 8
    123 	rev	has_nul1, has_nul1
    124 	clz	tmp1, has_nul1
    125 	csel	len, xzr, len, cc
    126 	add	len, len, tmp1, lsr 3
    127 	ret
    128 
    129 	/* The inner loop processes 32 bytes per iteration and uses the fast
    130 	   NUL check.  If we encounter non-ASCII characters, use a second
    131 	   loop with the accurate NUL check.  */
    132 	.p2align 4
    133 L(main_loop_entry):
    134 	bic	src, srcin, 15
    135 	sub	src, src, 16
    136 L(main_loop):
    137 	ldp	data1, data2, [src, 32]!
    138 .Lpage_cross_entry:
    139 	sub	tmp1, data1, zeroones
    140 	sub	tmp3, data2, zeroones
    141 	orr	tmp2, tmp1, tmp3
    142 	tst	tmp2, zeroones, lsl 7
    143 	bne	1f
    144 	ldp	data1, data2, [src, 16]
    145 	sub	tmp1, data1, zeroones
    146 	sub	tmp3, data2, zeroones
    147 	orr	tmp2, tmp1, tmp3
    148 	tst	tmp2, zeroones, lsl 7
    149 	beq	L(main_loop)
    150 	add	src, src, 16
    151 1:
    152 	/* The fast check failed, so do the slower, accurate NUL check.	 */
    153 	orr	tmp2, data1, REP8_7f
    154 	orr	tmp4, data2, REP8_7f
    155 	bics	has_nul1, tmp1, tmp2
    156 	bic	has_nul2, tmp3, tmp4
    157 	ccmp	has_nul2, 0, 0, eq
    158 	beq	L(nonascii_loop)
    159 
    160 	/* Enter with C = has_nul1 == 0.  */
    161 L(tail):
    162 #ifdef __AARCH64EB__
    163 	/* For big-endian, carry propagation (if the final byte in the
    164 	   string is 0x01) means we cannot use has_nul1/2 directly.  The
    165 	   easiest way to get the correct byte is to byte-swap the data
    166 	   and calculate the syndrome a second time.  */
    167 	csel	data1, data1, data2, cc
    168 	rev	data1, data1
    169 	sub	tmp1, data1, zeroones
    170 	orr	tmp2, data1, REP8_7f
    171 	bic	has_nul1, tmp1, tmp2
    172 #else
    173 	csel	has_nul1, has_nul1, has_nul2, cc
    174 #endif
    175 	sub	len, src, srcin
    176 	rev	has_nul1, has_nul1
    177 	add	tmp2, len, 8
    178 	clz	tmp1, has_nul1
    179 	csel	len, len, tmp2, cc
    180 	add	len, len, tmp1, lsr 3
    181 	ret
    182 
    183 L(nonascii_loop):
    184 	ldp	data1, data2, [src, 16]!
    185 	sub	tmp1, data1, zeroones
    186 	orr	tmp2, data1, REP8_7f
    187 	sub	tmp3, data2, zeroones
    188 	orr	tmp4, data2, REP8_7f
    189 	bics	has_nul1, tmp1, tmp2
    190 	bic	has_nul2, tmp3, tmp4
    191 	ccmp	has_nul2, 0, 0, eq
    192 	bne	L(tail)
    193 	ldp	data1, data2, [src, 16]!
    194 	sub	tmp1, data1, zeroones
    195 	orr	tmp2, data1, REP8_7f
    196 	sub	tmp3, data2, zeroones
    197 	orr	tmp4, data2, REP8_7f
    198 	bics	has_nul1, tmp1, tmp2
    199 	bic	has_nul2, tmp3, tmp4
    200 	ccmp	has_nul2, 0, 0, eq
    201 	beq	L(nonascii_loop)
    202 	b	L(tail)
    203 
    204 	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
    205 	   srcin to 0x7f, so we ignore any NUL bytes before the string.
    206 	   Then continue in the aligned loop.  */
    207 L(page_cross):
    208 	bic	src, srcin, 15
    209 	ldp	data1, data2, [src]
    210 	lsl	tmp1, srcin, 3
    211 	mov	tmp4, -1
    212 #ifdef __AARCH64EB__
    213 	/* Big-endian.	Early bytes are at MSB.	 */
    214 	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
    215 #else
    216 	/* Little-endian.  Early bytes are at LSB.  */
    217 	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
    218 #endif
    219 	orr	tmp1, tmp1, REP8_80
    220 	orn	data1, data1, tmp1
    221 	orn	tmp2, data2, tmp1
    222 	tst	srcin, 8
    223 	csel	data1, data1, tmp4, eq
    224 	csel	data2, data2, tmp2, eq
    225 	b	L(page_cross_entry)
    226 
    227 END(strlen)
    228