//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//
// Arguments and results.
#define srcin     x0
#define cntin     x1
#define chrin     w2

#define result    x0

#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6

//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. In each two-bit tuple, bit 0 is set if the corresponding byte
// matched the requested character; bit 1 is unused (this is faster than
// using a 32-bit syndrome with one bit per byte). Since the bits in the
// syndrome reflect exactly the order in which bytes occur in the original
// string, counting trailing zeros identifies exactly which byte matched.
//
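// As a rough C sketch of the same idea (an illustrative addition, not part
// of the original sources; chunk, c, and match are hypothetical names), the
// syndrome for one chunk could be built and consumed like this:
//
//   uint64_t syndrome = 0;
//   for (int i = 0; i < 32; i++)            // one 2-bit slot per byte
//     if (chunk[i] == c)
//       syndrome |= 1ULL << (2 * i);        // bit 0 of the slot marks a hit
//   if (syndrome != 0)
//     match = chunk + (__builtin_ctzll (syndrome) >> 1);
//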

ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if no bytes to compare.
    cbz     cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
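    // (Added note, not in the original sources: replicated with the dup
    // below, the constant yields the repeating byte pattern
    // 0x01 0x04 0x10 0x40. ANDed with the all-ones/all-zeros cmeq results,
    // each byte then carries a distinct weight within its group of four,
    // so two pairwise addp passes can sum neighbouring bytes without
    // carries (a group sums to at most 0x55) and pack all 32 match flags
    // into one 64-bit syndrome, two bits per input byte.)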
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31
    and     cntrem, cntin, #31
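    // (Added note: ands sets the Z flag exactly when srcin is already
    // 32-byte aligned, so the b.eq below can skip the unaligned-head
    // fixup; cntrem = cntin % 32 is kept for the final-block masking
    // in .Lmasklast.)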
    b.eq    .Lloop

    //
    // Input string is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes
    // and mask off the irrelevant part.
    //
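    // (Added worked example: for soff == 5 the aligned load below covers
    // srcin - 5 .. srcin + 26, so the lsr/lsl pair clears the
    // 2 * 5 = 10 syndrome bits belonging to the 5 bytes before srcin.
    // The adds of tmp = soff - 32 = -27 deducts this block's 27 in-range
    // bytes from cntin and sets the flags consumed by b.ls, which catches
    // the case where the first block is also the last.)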

    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b        // 256->128
    addp    vend.16b, vend.16b, vend.16b                  // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
    // The first block can also be the last
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data, we finish regardless of the result
    b.ls    .Lend
    // Use a fast check for the termination condition
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
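    // (Added note: orr plus one addp over the .2d halves folds all 32
    // comparison bytes into a single 64-bit value that is nonzero iff
    // some byte matched; the precise syndrome is only computed at .Lend,
    // keeping the hot loop short.)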
    mov     synd, vend.d[0]
    // We're not out of data; loop if we haven't found the character
    cbz     synd, .Lloop

.Lend:
    // Termination condition found; calculate the syndrome value
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
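    // (Added worked example: if (cntrem + soff) % 32 == 5, only the low
    // 5 bytes of this block are in range; tmp below becomes
    // -((5 - 32) << 1) = 54, and the lsl/lsr pair zeroes the 54 stale
    // upper syndrome bits.)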
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp

.Ltail:
    // Count the trailing zeros using bit reversal
    rbit    synd, synd
    // Compensate for the last post-increment
    sub     src, src, #32
    // Check that we have found a character
    cmp     synd, #0
    // And count the leading zeros
    clz     synd, synd
    // Compute the potential result
    add     result, src, synd, lsr #1
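    // (Added note: rbit + clz together count trailing zeros; with two
    // syndrome bits per byte, the "lsr #1" halves that bit index into
    // the matching byte's offset within the block.)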
    // Select result or NULL
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov     result, #0
    ret
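
//
// C-level contract of the routine above, read off the register #defines
// (a hedged sketch; the exact EDK2 prototype and qualifiers may differ):
//
//   // Returns a pointer to the first occurrence of Value in the first
//   // Length bytes of Buffer, or NULL if it does not occur.
//   CONST VOID *InternalMemScanMem8 (CONST VOID *Buffer, UINTN Length, UINT8 Value);
//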