1 /* 2 * 3 Copyright (c) 2014, ARM Limited 4 All rights Reserved. 5 Copyright (c) 2014, Linaro Ltd. 6 7 Redistribution and use in source and binary forms, with or without 8 modification, are permitted provided that the following conditions are met: 9 * Redistributions of source code must retain the above copyright 10 notice, this list of conditions and the following disclaimer. 11 * Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 * Neither the name of the company nor the names of its contributors 15 may be used to endorse or promote products derived from this 16 software without specific prior written permission. 17 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 /* Assumptions: 32 * 33 * ARMv8-a, AArch64 34 * Neon Available. 35 */ 36 37 #include <private/bionic_asm.h> 38 39 /* Arguments and results. 
*/

/* Register aliases.  Integer temporaries stay in the caller-saved
   x0-x5 range; NEON temporaries use v0-v7/v16-v18, all volatile
   under AAPCS64, so nothing needs to be saved/restored.  */
#define srcin		x0
#define chrin		w1

#define result		x0

#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask_0	v7
#define vrepmask_c	v16
#define vend1		v17
#define vend2		v18

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character; bit 1 is set
   iff the relevant byte matched the NUL end of string (we trigger
   off bit0 for the special case of looking for NUL).  Since the bits
   in the syndrome reflect exactly the order in which things occur
   in the original string a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination, and why.  */

/* Locals and temporaries.  */

/*----------------------------------------------------------------------
 * char *strchr(const char *s, int c)
 *
 * ABI:   AAPCS64
 * In:    x0 = s (NUL-terminated string), w1 = c (only the low byte
 *        is significant after the dup below)
 * Out:   x0 = pointer to the first occurrence of (char)c in s, or
 *        NULL if not found before the terminating NUL.  Searching
 *        for NUL itself returns a pointer to the terminator.
 * Clobb: x2-x5, v0-v7, v16-v18, flags.  No stack usage (leaf).
 *
 * Reads are 32-byte aligned, so the loop may load bytes before the
 * start and after the end of the string, but never crosses into an
 * unmapped page (a 32-byte-aligned hunk cannot straddle a page
 * boundary).
 *--------------------------------------------------------------------*/
ENTRY(strchr)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 used
	   similarly for NUL termination.  Replicated across a vector,
	   these place a unique bit pair per byte lane so that the two
	   addp reductions below fold 32 lanes into a 64-bit syndrome
	   without losing position information.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin		/* Splat target byte into all 16 lanes.  */
	bic	src, srcin, #31			/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	ands	tmp1, srcin, #31		/* tmp1 = misalignment in bytes; Z set if aligned.  */
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s	/* equiv: lsl #1 -> 0x80200802 NUL mask.  */
	b.eq	.Lloop

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1			/* tmp1 = -(misalignment); used to size the mask.  */
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b	/* Keep NUL hits in bit 1 of each pair.  */
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b	/* Keep char hits in bit 0 of each pair.  */
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
	lsl	tmp1, tmp1, #1			/* Two syndrome bits per padding byte.  */
	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64
	/* NOTE(review): the second addp reuses vend2; only the low 64
	   bits of vend1 (the fully reduced syndrome) are consumed, so
	   the upper half's contents are irrelevant — deliberate, do
	   not "fix".  */
	lsr	tmp1, tmp3, tmp1		/* tmp1 = ~0 >> (2*padding): keep-mask for real bytes.
						   lsr wraps mod 64, but padding here is 1..31 so the
						   shift amount 2..62 is always in range.  */

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1		/* Mask padding bits.  (Clears the low 2*padding
						   syndrome bits, which belong to bytes before s.)  */
	cbnz	tmp1, .Ltail

.Lloop:
	/* Main loop: one aligned 32-byte hunk per iteration.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Use a fast check for the termination condition: any lane hit
	   at all, without the (more expensive) positional masking.  */
	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
	orr	vend1.16b, vend1.16b, vend2.16b
	addp	vend1.2d, vend1.2d, vend1.2d	/* Fold 128 bits to 64 for the scalar test.  */
	mov	tmp1, vend1.d[0]
	cbz	tmp1, .Lloop

	/* Termination condition found.  Now need to establish exactly
	   why we terminated: redo the masked reduction to build the
	   positional syndrome from the comparison results still held
	   in vhas_*.  */
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64

	mov	tmp1, vend1.d[0]
.Ltail:
	/* tmp1 = 64-bit syndrome, 2 bits per byte of the last hunk.  */
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source: src was post-incremented past the hunk.  */
	sub	src, src, #32
	clz	tmp1, tmp1			/* ...and counting the leading zeros.  */
	/* Tmp1 is even if the target character was found first (bit 0
	   of a pair); odd means we hit the end of string and we
	   weren't looking for NUL.  */
	tst	tmp1, #1
	add	result, src, tmp1, lsr #1	/* Byte index = syndrome index / 2.  */
	csel	result, result, xzr, eq		/* NULL if terminated by NUL, not a match.  */
	ret
END(strchr)