Home | History | Annotate | Download | only in AArch64
      1 //
      2 // Copyright (c) 2013, Linaro Limited
      3 // All rights reserved.
      4 //
      5 // Redistribution and use in source and binary forms, with or without
      6 // modification, are permitted provided that the following conditions are met:
      7 //     * Redistributions of source code must retain the above copyright
      8 //       notice, this list of conditions and the following disclaimer.
      9 //     * Redistributions in binary form must reproduce the above copyright
     10 //       notice, this list of conditions and the following disclaimer in the
     11 //       documentation and/or other materials provided with the distribution.
     12 //     * Neither the name of the Linaro nor the
     13 //       names of its contributors may be used to endorse or promote products
     14 //       derived from this software without specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     20 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 //
     28 
     29 // Assumptions:
     30 //
     31 // ARMv8-a, AArch64
     32 //
     33 
     34 
     35 // Parameters and result.
     36 #define src1      x0
     37 #define src2      x1
     38 #define limit     x2
     39 #define result    x0
     40 
     41 // Internal variables.
     42 #define data1     x3
     43 #define data1w    w3
     44 #define data2     x4
     45 #define data2w    w4
     46 #define diff      x6
     47 #define endloop   x7
     48 #define tmp1      x8
     49 #define tmp2      x9
     50 #define pos       x11
     51 #define limit_wd  x12
     52 #define mask      x13
     53 
     54     .p2align 6
     55 ASM_GLOBAL ASM_PFX(InternalMemCompareMem)
     56 ASM_PFX(InternalMemCompareMem):
     57     eor     tmp1, src1, src2
     58     tst     tmp1, #7
     59     b.ne    .Lmisaligned8
     60     ands    tmp1, src1, #7
     61     b.ne    .Lmutual_align
     62     add     limit_wd, limit, #7
     63     lsr     limit_wd, limit_wd, #3
     64 
     65     // Start of performance-critical section  -- one 64B cache line.
     66 .Lloop_aligned:
     67     ldr     data1, [src1], #8
     68     ldr     data2, [src2], #8
     69 .Lstart_realigned:
     70     subs    limit_wd, limit_wd, #1
     71     eor     diff, data1, data2        // Non-zero if differences found.
     72     csinv   endloop, diff, xzr, ne    // Last Dword or differences.
     73     cbz     endloop, .Lloop_aligned
     74     // End of performance-critical section  -- one 64B cache line.
     75 
     76     // Not reached the limit, must have found a diff.
     77     cbnz    limit_wd, .Lnot_limit
     78 
     79     // Limit % 8 == 0 => all bytes significant.
     80     ands    limit, limit, #7
     81     b.eq    .Lnot_limit
     82 
     83     lsl     limit, limit, #3              // Bits -> bytes.
     84     mov     mask, #~0
     85     lsl     mask, mask, limit
     86     bic     data1, data1, mask
     87     bic     data2, data2, mask
     88 
     89     orr     diff, diff, mask
     90 
     91 .Lnot_limit:
     92     rev     diff, diff
     93     rev     data1, data1
     94     rev     data2, data2
     95 
     96     // The MS-non-zero bit of DIFF marks either the first bit
     97     // that is different, or the end of the significant data.
     98     // Shifting left now will bring the critical information into the
     99     // top bits.
    100     clz     pos, diff
    101     lsl     data1, data1, pos
    102     lsl     data2, data2, pos
    103 
    104     // But we need to zero-extend (char is unsigned) the value and then
    105     // perform a signed 32-bit subtraction.
    106     lsr     data1, data1, #56
    107     sub     result, data1, data2, lsr #56
    108     ret
    109 
    110 .Lmutual_align:
    111     // Sources are mutually aligned, but are not currently at an
    112     // alignment boundary.  Round down the addresses and then mask off
    113     // the bytes that precede the start point.
    114     bic     src1, src1, #7
    115     bic     src2, src2, #7
    116     add     limit, limit, tmp1          // Adjust the limit for the extra.
    117     lsl     tmp1, tmp1, #3              // Bytes beyond alignment -> bits.
    118     ldr     data1, [src1], #8
    119     neg     tmp1, tmp1                  // Bits to alignment -64.
    120     ldr     data2, [src2], #8
    121     mov     tmp2, #~0
    122 
    123     // Little-endian.  Early bytes are at LSB.
    124     lsr     tmp2, tmp2, tmp1            // Shift (tmp1 & 63).
    125     add     limit_wd, limit, #7
    126     orr     data1, data1, tmp2
    127     orr     data2, data2, tmp2
    128     lsr     limit_wd, limit_wd, #3
    129     b       .Lstart_realigned
    130 
    131     .p2align 6
    132 .Lmisaligned8:
    133     sub     limit, limit, #1
    134 1:
    135     // Perhaps we can do better than this.
    136     ldrb    data1w, [src1], #1
    137     ldrb    data2w, [src2], #1
    138     subs    limit, limit, #1
    139     ccmp    data1w, data2w, #0, cs      // NZCV = 0b0000.
    140     b.eq    1b
    141     sub     result, data1, data2
    142     ret
    143