//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//

// Arguments and results.
// Incoming arguments (AAPCS64: x0-x2) and the returned value (x0).
#define srcin     x0            // in:  base address of the region to scan
#define cntin     x1            // in:  number of bytes to scan
#define chrin     w2            // in:  byte value to search for
                                //      (only the low 8 bits are used by dup .16b)

#define result    x0            // out: address of first match, or 0 if none

// Scratch general-purpose registers (all caller-saved under AAPCS64).
#define src       x3            // running 32-byte-aligned read pointer
#define tmp       x4            // general scratch
#define wtmp2     w5            // holds the 0x40100401 mask constant
#define synd      x6            // 64-bit syndrome: 2 bits per scanned byte
#define soff      x9            // srcin % 32: misalignment of the start
#define cntrem    x10           // cntin % 32: bytes in the final partial block

// NEON registers (v0-v7 are caller-saved under AAPCS64).
#define vrepchr   v0            // chrin replicated into all 16 byte lanes
#define vdata1    v1            // first 16 bytes of the current 32-byte chunk
#define vdata2    v2            // second 16 bytes of the current 32-byte chunk
#define vhas_chr1 v3            // per-byte compare result for vdata1 (0x00/0xFF)
#define vhas_chr2 v4            // per-byte compare result for vdata2 (0x00/0xFF)
#define vrepmask  v5            // 0x40100401 replicated per 32-bit lane
#define vend      v6            // pairwise-add reduction scratch

//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. For each tuple, bit 0 is set if the relevant byte matched the
// requested character and bit 1 is not used (faster than using a 32bit
// syndrome). Since the bits in the syndrome reflect exactly the order in which
// things occur in the original string, counting trailing zeros allows to
// identify exactly which byte has matched.
//

//
// InternalMemScanMem8 - find the first occurrence of a byte in a buffer.
//
// C-equivalent signature (by register usage; confirm against the caller):
//   void *InternalMemScanMem8 (const void *Buffer /*x0*/,
//                              UINTN       Length /*x1*/,
//                              UINT8       Value  /*w2*/);
//
// Returns (x0): pointer to the first byte equal to Value, or NULL when
//               Length is zero or no byte matches.
// Clobbers:     x1, x3, x4, x6, x9, x10, w5, v0-v6, NZCV flags.
//               Leaf function: no stack use, x29/x30 untouched.
//
// NOTE(review): reads are done in aligned 32-byte chunks, so up to 31 bytes
// outside [Buffer, Buffer+Length) but inside the same 32-byte-aligned blocks
// may be dereferenced; assumes such over-read cannot fault — standard for
// aligned-block scanners, but verify it holds for the memory map in use.
//
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if no bytes to compare.
    cbz     cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte: per 32-bit lane the four byte masks are
    // 0x01, 0x04, 0x10, 0x40, i.e. one bit every 2 bit positions.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin      // replicate target byte to 16 lanes
    // Work with aligned 32-byte chunks: round srcin down to a 32B boundary.
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31        // soff = misalignment; sets Z for b.eq below
    and     cntrem, cntin, #31      // remainder bytes in the last block
    b.eq    .Lloop                  // already aligned: straight into main loop

    //
    // Input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32 bytes block containing the first bytes
    // and mask the irrelevant part.
    //

    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32          // tmp = -(32 - soff) = -(bytes consumed)
    adds    cntin, cntin, tmp       // cntin -= (32 - soff); flags used by b.ls
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b  // 256->128
    addp    vend.16b, vend.16b, vend.16b            // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits: matches before srcin are irrelevant.
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
    // The first block can also be the last (flags still from "adds" above;
    // the shifts do not touch NZCV). b.ls: cntin went to 0 or wrapped below.
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    // Main loop: one aligned 32-byte chunk per iteration.
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32       // flags consumed by b.ls / b.hi below
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data we finish regardless of the result.
    b.ls    .Lend
    // Use a fast check for the termination condition: any non-zero lane at
    // all means a match somewhere in this chunk (no mask/ordering needed yet).
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We're not out of data, loop if we haven't found the character.
    cbz     synd, .Lloop

.Lend:
    // Termination condition found, let's calculate the syndrome value.
    // (NEON ops and the vector-to-GP mov below do not alter NZCV, so the
    // flags from "subs" in the loop are still live for b.hi.)
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b  // 256->128
    addp    vend.16b, vend.16b, vend.16b            // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block: if bytes remain
    // (b.hi: cntin still > 0), every bit of the syndrome is valid.
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits — matches past
    // the end of the requested Length must not count.
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1        // tmp = 2 * (32 - ((cntrem+soff) % 32))
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp

.Ltail:
    // Count the trailing zeros using bit reversing (rbit+clz = ctz).
    rbit    synd, synd
    // Compensate the last post-increment: src points 32 past the chunk.
    sub     src, src, #32
    // Check that we have found a character (sets Z for csel; rbit/clz/add
    // below do not modify flags).
    cmp     synd, #0
    // And count the leading zeros.
    clz     synd, synd
    // Compute the potential result: 2 syndrome bits per byte, hence lsr #1.
    add     result, src, synd, lsr #1
    // Select result or NULL (Z set => syndrome was 0 => no match).
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    // Length == 0: never touch memory, just return NULL.
    mov     result, #0
    ret