Home | History | Annotate | Download | only in bionic
      1 /* Copyright (c) 2014, Linaro Limited
      2    All rights reserved.
      3 
      4    Redistribution and use in source and binary forms, with or without
      5    modification, are permitted provided that the following conditions are met:
      6        * Redistributions of source code must retain the above copyright
      7          notice, this list of conditions and the following disclaimer.
      8        * Redistributions in binary form must reproduce the above copyright
      9          notice, this list of conditions and the following disclaimer in the
     10          documentation and/or other materials provided with the distribution.
     11        * Neither the name of the Linaro nor the
     12          names of its contributors may be used to endorse or promote products
     13          derived from this software without specific prior written permission.
     14 
     15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     19    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     20    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     21    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     22    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     23    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 /* Assumptions:
     29  *
     30  * ARMv8-a, AArch64
     31  */
     32 
     33 #include <private/bionic_asm.h>
     34 
     35 /* Arguments and results.  srcin and len both name x0: the incoming
	   pointer is overwritten by the result once the pointer value is no
	   longer needed.  */
     36 #define srcin		x0	/* In: start of the string.  */
     37 #define len		x0	/* Out: returned length.  */
     38 #define limit		x1	/* In: maximum length to return.  */
     39 
     40 /* Locals and temporaries.  All live in x2-x14, which are caller-saved
	   (volatile) under AAPCS64, so the routine saves and restores nothing.  */
     41 #define src		x2	/* 16-byte-aligned read cursor.  */
     42 #define data1		x3	/* First 8 bytes of the current 16-byte block.  */
     43 #define data2		x4	/* Second 8 bytes of the current 16-byte block.  */
     44 #define data2a		x5	/* Masked copy of data2 (misaligned start only).  */
     45 #define has_nul1	x6	/* NUL syndrome for data1; non-zero iff it has a zero byte.  */
     46 #define has_nul2	x7	/* NUL syndrome for data2; non-zero iff it has a zero byte.  */
     47 #define tmp1		x8
     48 #define tmp2		x9
     49 #define tmp3		x10
     50 #define tmp4		x11
     51 #define zeroones	x12	/* Holds REP8_01 for the whole routine.  */
     52 #define pos		x13	/* Bit index of the first NUL marker within a Dword.  */
     53 #define limit_wd	x14	/* Number of 16-byte blocks still to examine, minus one.  */
     54 /* 64-bit constants with the same byte value replicated in all 8 bytes.  */
     55 #define REP8_01 0x0101010101010101	/* Subtracted from the data in the NUL test.  */
     56 #define REP8_7f 0x7f7f7f7f7f7f7f7f	/* ORed into the data in the NUL test.  */
     57 #define REP8_80 0x8080808080808080	/* NOTE(review): not referenced by the code visible here.  */
     58 
     59 	.text
     60 	.p2align	6	/* Align .Lstart to 2^6 = 64 bytes.  */
     61 .Lstart:
     62 	/* Pre-pad to ensure the critical loop begins an icache line.
	   Layout from .Lstart: 7 nops + the 2-instruction .Lhit_limit stub
	   + the 7 instructions between ENTRY and .Lloop = 16 instructions,
	   i.e. 64 bytes at 4 bytes per instruction.
	   NOTE(review): this count assumes ENTRY() emits no instructions of
	   its own and that every mnemonic assembles to exactly one
	   instruction -- confirm against bionic_asm.h and a disassembly.  */
     63 	.rep 7
     64 	nop
     65 	.endr
     66 	/* Put this code here to avoid wasting more space with pre-padding.
	   Exit path taken when limit is zero, or when no NUL byte was found
	   in any of the blocks examined: the result is limit itself.  */
     67 .Lhit_limit:
     68 	mov	len, limit
     69 	ret
     70 /* strnlen: length of the string at srcin, capped at limit.
	   In:   x0 = srcin (start of the string), x1 = limit.
	   Out:  x0 = len = MIN (offset of the first NUL byte, limit).
	   Uses: x2-x14 and the condition flags as scratch; no stack access.
	   Memory is read with 16-byte loads from a 16-byte-aligned cursor,
	   so bytes before srcin and bytes after the NUL/limit that share a
	   block with wanted bytes are loaded as well; the former are masked
	   and the latter are discarded by the final MIN.  */
     71 ENTRY(strnlen)
     72 	cbz	limit, .Lhit_limit	/* limit == 0: return 0 before any load.  */
     73 	mov	zeroones, #REP8_01
     74 	bic	src, srcin, #15	/* src = srcin rounded down to a 16-byte boundary.  */
     75 	ands	tmp1, srcin, #15	/* tmp1 = offset of srcin within that block; Z set if aligned.  */
     76 	b.ne	.Lmisaligned
     77 	/* Calculate the number of full and partial words -1.
	   Aligned start: limit_wd = (limit - 1) / 16, the number of 16-byte
	   blocks that cover limit bytes, minus one.  */
     78 	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
     79 	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */
     80 
     81 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
     82 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
     83 	   can be done in parallel across the entire word.  */
     84 	/* The inner loop deals with two Dwords at a time.  This has a
     85 	   slightly higher start-up cost, but we should win quite quickly,
     86 	   especially on cores with a high number of issue slots per
     87 	   cycle, as we get much better parallelism out of the operations.  */
     88 
     89 	/* Start of critical section -- keep to one 64Byte cache line.  */
     90 .Lloop:
     91 	ldp	data1, data2, [src], #16	/* Load one block, then src += 16.  */
     92 .Lrealigned:
     93 	sub	tmp1, data1, zeroones
     94 	orr	tmp2, data1, #REP8_7f
     95 	sub	tmp3, data2, zeroones
     96 	orr	tmp4, data2, #REP8_7f
     97 	bic	has_nul1, tmp1, tmp2	/* (data1 - 0x01..) & ~(data1 | 0x7f..).  */
     98 	bic	has_nul2, tmp3, tmp4	/* (data2 - 0x01..) & ~(data2 | 0x7f..).  */
     99 	subs	limit_wd, limit_wd, #1	/* Goes negative (N set) once the last block was loaded.  */
    100 	orr	tmp1, has_nul1, has_nul2	/* Non-zero iff this block holds a NUL.  */
    101 	ccmp	tmp1, #0, #0, pl	/* If blocks remain (pl): flags = (tmp1 cmp 0).
					   Otherwise NZCV = 0000, i.e. "ne".  */
    102 	b.eq	.Lloop	/* Loop only while blocks remain AND no NUL was seen.  */
    103 	/* End of critical section -- keep to one 64Byte cache line.  */
    104 
    105 	orr	tmp1, has_nul1, has_nul2	/* NOTE(review): same value tmp1 already holds from the loop.  */
    106 	cbz	tmp1, .Lhit_limit	/* No null in final Qword.  */
    107 
    108 	/* We know there's a null in the final Qword.  The easiest thing
    109 	   to do now is work out the length of the string and return
    110 	   MIN (len, limit).
	   src points just past the block that was loaded last, so the block
	   starts at src - 16: step back 16 bytes if the NUL is in data1, or
	   8 bytes if it is in data2, then add the byte index of the NUL
	   inside that Dword.  */
    111 
    112 	sub	len, src, srcin	/* Overwrites srcin (same register); it is not read again.  */
    113 	cbz	has_nul1, .Lnul_in_data2
    114 #ifdef __AARCH64EB__
    115 	mov	data2, data1
    116 #endif
    117 	sub	len, len, #8	/* NUL is in the first Dword of the block.  */
    118 	mov	has_nul2, has_nul1	/* From here has_nul2 is the syndrome of the Dword with the NUL.  */
    119 .Lnul_in_data2:
    120 #ifdef __AARCH64EB__
    121 	/* For big-endian, carry propagation (if the final byte in the
    122 	   string is 0x01) means we cannot use has_nul directly.  The
    123 	   easiest way to get the correct byte is to byte-swap the data
    124 	   and calculate the syndrome a second time.  */
    125 	rev	data2, data2
    126 	sub	tmp1, data2, zeroones
    127 	orr	tmp2, data2, #REP8_7f
    128 	bic	has_nul2, tmp1, tmp2
    129 #endif
    130 	sub	len, len, #8	/* len = offset of the Dword that contains the NUL.  */
    131 	rev	has_nul2, has_nul2	/* Byte-reverse so the leading-zero count below yields
					   8 * (index of the first zero byte).  */
    132 	clz	pos, has_nul2
    133 	add	len, len, pos, lsr #3		/* Bits to bytes.  */
    134 	cmp	len, limit
    135 	csel	len, len, limit, ls		/* Return the lower value (unsigned compare).  */
    136 	ret
    137 
    138 .Lmisaligned:
    139 	/* Deal with a partial first word.
    140 	   We're doing two things in parallel here;
    141 	   1) Calculate the number of words (but avoiding overflow if
    142 	      limit is near ULONG_MAX) - to do this we need to work out
    143 	      limit + tmp1 - 1 as a 65-bit value before shifting it;
    144 	   2) Load and mask the initial data words - we force the bytes
    145 	      before the ones we are interested in to 0xff - this ensures
    146 	      early bytes will not hit any zero detection.
	   For 1) the sum is never formed in full: (limit - 1) is split into
	   its high part (>> 4) and low 4 bits, tmp1 is added to the low
	   bits only, and the carry out of that small sum is added back
	   into the high part.
	   On entry tmp1 = srcin & 15, which is non-zero on this path.  */
    147 	sub	limit_wd, limit, #1
    148 	neg	tmp4, tmp1
    149 	cmp	tmp1, #8	/* Flags are consumed by the csinv/csel at the end of
				   this path; nothing in between modifies them.  */
    150 
    151 	and	tmp3, limit_wd, #15	/* Low 4 bits of (limit - 1).  */
    152 	lsr	limit_wd, limit_wd, #4	/* High part: (limit - 1) / 16.  */
    153 	mov	tmp2, #~0
    154 
    155 	ldp	data1, data2, [src], #16	/* First block, read from the aligned-down address.  */
    156 	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
    157 	add	tmp3, tmp3, tmp1	/* Low bits + misalignment; at most 15 + 15.  */
    158 
    159 #ifdef __AARCH64EB__
    160 	/* Big-endian.  Early bytes are at MSB.  */
    161 	lsl	tmp2, tmp2, tmp4	/* Shift by (tmp4 & 63).  */
    162 #else
    163 	/* Little-endian.  Early bytes are at LSB.  */
    164 	lsr	tmp2, tmp2, tmp4	/* Shift by (tmp4 & 63).  */
    165 #endif
    166 	add	limit_wd, limit_wd, tmp3, lsr #4	/* Add the carry from the low-bit sum.  */
    167 
    168 	orr	data1, data1, tmp2	/* Candidate: mask applied to the first Dword.  */
    169 	orr	data2a, data2, tmp2	/* Candidate: mask applied to the second Dword.  */
    170 
    171 	csinv	data1, data1, xzr, le	/* tmp1 <= 8: keep masked data1; else data1 = all ones.  */
    172 	csel	data2, data2, data2a, le	/* tmp1 <= 8: data2 unmasked; else use masked data2a.  */
    173 	b	.Lrealigned	/* Join the main loop just after its load.  */
    174 END(strnlen)
    175