1 /* 2 * 3 Copyright (c) 2014, ARM Limited 4 All rights Reserved. 5 Copyright (c) 2014, Linaro Ltd. 6 7 Redistribution and use in source and binary forms, with or without 8 modification, are permitted provided that the following conditions are met: 9 * Redistributions of source code must retain the above copyright 10 notice, this list of conditions and the following disclaimer. 11 * Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 * Neither the name of the company nor the names of its contributors 15 may be used to endorse or promote products derived from this 16 software without specific prior written permission. 17 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 /* Assumptions: 32 * 33 * ARMv8-a, AArch64 34 * Neon Available. 35 */ 36 37 #include <private/bionic_asm.h> 38 39 /* Arguments and results. */ 40 #define srcin x0 41 #define chrin w1 42 #define cntin x2 43 44 #define result x0 45 46 #define src x3 47 #define tmp x4 48 #define wtmp2 w5 49 #define synd x6 50 #define soff x9 51 #define cntrem x10 52 53 #define vrepchr v0 54 #define vdata1 v1 55 #define vdata2 v2 56 #define vhas_chr1 v3 57 #define vhas_chr2 v4 58 #define vrepmask v5 59 #define vend v6 60 61 /* 62 * Core algorithm: 63 * 64 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits 65 * per byte. For each tuple, bit 0 is set if the relevant byte matched the 66 * requested character and bit 1 is not used (faster than using a 32bit 67 * syndrome). Since the bits in the syndrome reflect exactly the order in which 68 * things occur in the original string, counting trailing zeros allows to 69 * identify exactly which byte has matched. 70 */ 71 72 ENTRY(memchr) 73 /* 74 * Magic constant 0x40100401 allows us to identify which lane matches 75 * the requested byte. 76 */ 77 cbz cntin, .Lzero_length 78 mov wtmp2, #0x0401 79 movk wtmp2, #0x4010, lsl #16 80 dup vrepchr.16b, chrin 81 /* Work with aligned 32-byte chunks */ 82 bic src, srcin, #31 83 dup vrepmask.4s, wtmp2 84 ands soff, srcin, #31 85 and cntrem, cntin, #31 86 b.eq .Lloop 87 88 /* 89 * Input string is not 32-byte aligned. We calculate the syndrome 90 * value for the aligned 32 bytes block containing the first bytes 91 * and mask the irrelevant part. 92 */ 93 94 ld1 {vdata1.16b, vdata2.16b}, [src], #32 95 sub tmp, soff, #32 96 adds cntin, cntin, tmp 97 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b 98 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b 99 and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b 100 and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b 101 addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ 102 addp vend.16b, vend.16b, vend.16b /* 128->64 */ 103 mov synd, vend.d[0] 104 /* Clear the soff*2 lower bits */ 105 lsl tmp, soff, #1 106 lsr synd, synd, tmp 107 lsl synd, synd, tmp 108 /* The first block can also be the last */ 109 b.ls .Lmasklast 110 /* Have we found something already? */ 111 cbnz synd, .Ltail 112 113 .Lloop: 114 ld1 {vdata1.16b, vdata2.16b}, [src], #32 115 subs cntin, cntin, #32 116 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b 117 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b 118 /* If we're out of data we finish regardless of the result */ 119 b.ls .Lend 120 /* Use a fast check for the termination condition */ 121 orr vend.16b, vhas_chr1.16b, vhas_chr2.16b 122 addp vend.2d, vend.2d, vend.2d 123 mov synd, vend.d[0] 124 /* We're not out of data, loop if we haven't found the character */ 125 cbz synd, .Lloop 126 127 .Lend: 128 /* Termination condition found, let's calculate the syndrome value */ 129 and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b 130 and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b 131 addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ 132 addp vend.16b, vend.16b, vend.16b /* 128->64 */ 133 mov synd, vend.d[0] 134 /* Only do the clear for the last possible block */ 135 b.hi .Ltail 136 137 .Lmasklast: 138 /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ 139 add tmp, cntrem, soff 140 and tmp, tmp, #31 141 sub tmp, tmp, #32 142 neg tmp, tmp, lsl #1 143 lsl synd, synd, tmp 144 lsr synd, synd, tmp 145 146 .Ltail: 147 /* Count the trailing zeros using bit reversing */ 148 rbit synd, synd 149 /* Compensate the last post-increment */ 150 sub src, src, #32 151 /* Check that we have found a character */ 152 cmp synd, #0 153 /* And count the leading zeros */ 154 clz synd, synd 155 /* Compute the potential result */ 156 add result, src, synd, lsr #1 157 /* Select result or NULL */ 158 csel result, xzr, result, eq 159 ret 160 161 .Lzero_length: 162 mov result, xzr 163 ret 164 END(memchr) 165