Home | History | Annotate | Download | only in bionic
      1 /*
      2  *
      3    Copyright (c) 2014, ARM Limited
      4    All rights Reserved.
      5    Copyright (c) 2014, Linaro Ltd.
      6 
      7    Redistribution and use in source and binary forms, with or without
      8    modification, are permitted provided that the following conditions are met:
      9        * Redistributions of source code must retain the above copyright
     10          notice, this list of conditions and the following disclaimer.
     11        * Redistributions in binary form must reproduce the above copyright
     12          notice, this list of conditions and the following disclaimer in the
     13          documentation and/or other materials provided with the distribution.
     14        * Neither the name of the company nor the names of its contributors
     15          may be used to endorse or promote products derived from this
     16          software without specific prior written permission.
     17 
     18    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 */
     30 
     31 /* Assumptions:
     32  *
     33  * ARMv8-a, AArch64
     34  * Neon Available.
     35  */
     36 
     37 #include <private/bionic_asm.h>
     38 
     39 /* Arguments and results.  */
     40 #define srcin		x0
     41 #define chrin		w1
     42 #define cntin		x2
     43 
     44 #define result		x0
     45 
     46 #define src		x3
     47 #define	tmp		x4
     48 #define wtmp2		w5
     49 #define synd		x6
     50 #define soff		x9
     51 #define cntrem		x10
     52 
     53 #define vrepchr		v0
     54 #define vdata1		v1
     55 #define vdata2		v2
     56 #define vhas_chr1	v3
     57 #define vhas_chr2	v4
     58 #define vrepmask	v5
     59 #define vend		v6
     60 
     61 /*
     62  * Core algorithm:
     63  *
     64  * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
     65  * per byte. For each tuple, bit 0 is set if the relevant byte matched the
     66  * requested character and bit 1 is not used (faster than using a 32bit
     67  * syndrome). Since the bits in the syndrome reflect exactly the order in which
     68  * things occur in the original string, counting trailing zeros allows to
     69  * identify exactly which byte has matched.
     70  */
     71 
     72 ENTRY(memchr)
     73 	/*
     74 	 * Magic constant 0x40100401 allows us to identify which lane matches
     75 	 * the requested byte.
     76 	 */
     77 	cbz	cntin, .Lzero_length
     78 	mov	wtmp2, #0x0401
     79 	movk	wtmp2, #0x4010, lsl #16
     80 	dup	vrepchr.16b, chrin
     81 	/* Work with aligned 32-byte chunks */
     82 	bic	src, srcin, #31
     83 	dup	vrepmask.4s, wtmp2
     84 	ands	soff, srcin, #31
     85 	and	cntrem, cntin, #31
     86 	b.eq	.Lloop
     87 
     88 	/*
     89 	 * Input string is not 32-byte aligned. We calculate the syndrome
     90 	 * value for the aligned 32 bytes block containing the first bytes
     91 	 * and mask the irrelevant part.
     92 	 */
     93 
     94 	ld1	{vdata1.16b, vdata2.16b}, [src], #32
     95 	sub	tmp, soff, #32
     96 	adds	cntin, cntin, tmp
     97 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
     98 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
     99 	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    100 	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    101 	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
    102 	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
    103 	mov	synd, vend.d[0]
    104 	/* Clear the soff*2 lower bits */
    105 	lsl	tmp, soff, #1
    106 	lsr	synd, synd, tmp
    107 	lsl	synd, synd, tmp
    108 	/* The first block can also be the last */
    109 	b.ls	.Lmasklast
    110 	/* Have we found something already? */
    111 	cbnz	synd, .Ltail
    112 
    113 .Lloop:
    114 	ld1	{vdata1.16b, vdata2.16b}, [src], #32
    115 	subs	cntin, cntin, #32
    116 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
    117 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
    118 	/* If we're out of data we finish regardless of the result */
    119 	b.ls	.Lend
    120 	/* Use a fast check for the termination condition */
    121 	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
    122 	addp	vend.2d, vend.2d, vend.2d
    123 	mov	synd, vend.d[0]
    124 	/* We're not out of data, loop if we haven't found the character */
    125 	cbz	synd, .Lloop
    126 
    127 .Lend:
    128 	/* Termination condition found, let's calculate the syndrome value */
    129 	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    130 	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    131 	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
    132 	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
    133 	mov	synd, vend.d[0]
    134 	/* Only do the clear for the last possible block */
    135 	b.hi	.Ltail
    136 
    137 .Lmasklast:
    138 	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
    139 	add	tmp, cntrem, soff
    140 	and	tmp, tmp, #31
    141 	sub	tmp, tmp, #32
    142 	neg	tmp, tmp, lsl #1
    143 	lsl	synd, synd, tmp
    144 	lsr	synd, synd, tmp
    145 
    146 .Ltail:
    147 	/* Count the trailing zeros using bit reversing */
    148 	rbit	synd, synd
    149 	/* Compensate the last post-increment */
    150 	sub	src, src, #32
    151 	/* Check that we have found a character */
    152 	cmp	synd, #0
    153 	/* And count the leading zeros */
    154 	clz	synd, synd
    155 	/* Compute the potential result */
    156 	add	result, src, synd, lsr #1
    157 	/* Select result or NULL */
    158 	csel	result, xzr, result, eq
    159 	ret
    160 
    161 .Lzero_length:
    162 	mov	result, xzr
    163 	ret
    164 END(memchr)
    165