//
// Copyright (c) 2013, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64
//
// NOTE(review): the code below is the little-endian variant only (see the
// shift directions and the REV instructions) -- confirm the build never
// targets big-endian.
//


// Parameters and result.
#define src1      x0
#define src2      x1
#define limit     x2
#define result    x0

// Internal variables.
#define data1     x3
#define data1w    w3
#define data2     x4
#define data2w    w4
#define diff      x6
#define endloop   x7
#define tmp1      x8
#define tmp2      x9
#define pos       x11
#define limit_wd  x12
#define mask      x13

//-----------------------------------------------------------------------
// InternalMemCompareMem
//
// Compares the first `limit` bytes of two buffers.
//
// In:    x0 (src1)  = address of the first buffer
//        x1 (src2)  = address of the second buffer
//        x2 (limit) = number of bytes to compare
// Out:   x0 (result) = 0 when all `limit` bytes match (or limit == 0);
//                      otherwise non-zero, with its sign taken from an
//                      unsigned comparison at the first difference
//                      (negative when src1's data is the smaller).
// Clobb: x1-x4, x6-x9, x11-x13, condition flags.
//
// Strategy:
//   * limit == 0                      -> return 0 without touching memory.
//   * src1/src2 differ in bits [2:0]  -> byte-at-a-time loop.
//   * same offset, 8-byte aligned     -> 8 bytes per iteration.
//   * same offset, not aligned        -> round both pointers down, force
//                                        the leading bytes equal, then
//                                        join the 8-byte loop.
//
// The word paths load whole aligned doublewords, so they may read (but
// never compare) bytes outside [src, src + limit) within the same
// aligned doubleword.
//-----------------------------------------------------------------------
    .p2align 6
ASM_GLOBAL ASM_PFX(InternalMemCompareMem)
ASM_PFX(InternalMemCompareMem):
    // Zero-length compare: nothing differs. Without this guard the aligned
    // path would load from both buffers before testing the count, and the
    // bytewise path would wrap `limit` to all-ones.
    cbz     limit, .Lret0

    eor     tmp1, src1, src2
    tst     tmp1, #7
    b.ne    .Lmisaligned8               // Low address bits differ: bytewise.
    ands    tmp1, src1, #7
    b.ne    .Lmutual_align              // Same offset, but not 8-byte aligned.
    add     limit_wd, limit, #7
    lsr     limit_wd, limit_wd, #3      // Doublewords to examine, rounded up.

    // Start of performance-critical section -- one 64B cache line.
    // (8 entry instructions + 6 loop instructions = 56 bytes from the
    // 64-byte aligned entry point.)
.Lloop_aligned:
    ldr     data1, [src1], #8
    ldr     data2, [src2], #8
.Lstart_realigned:
    subs    limit_wd, limit_wd, #1
    eor     diff, data1, data2          // Non-zero if differences found.
    csinv   endloop, diff, xzr, ne      // Last Dword or differences.
    cbz     endloop, .Lloop_aligned
    // End of performance-critical section -- one 64B cache line.

    // Not reached the limit, must have found a diff.
    cbnz    limit_wd, .Lnot_limit

    // Limit % 8 == 0 => all bytes significant.
    ands    limit, limit, #7
    b.eq    .Lnot_limit

    // Final, partial doubleword: discard the bytes beyond the limit and
    // plant a marker in DIFF at the end of the significant data.
    lsl     limit, limit, #3            // Bytes -> bits.
    mov     mask, #~0
    lsl     mask, mask, limit           // Ones over the insignificant bytes.
    bic     data1, data1, mask
    bic     data2, data2, mask

    orr     diff, diff, mask

.Lnot_limit:
    // Byte-reverse so that the earliest byte in memory becomes the most
    // significant one.
    rev     diff, diff
    rev     data1, data1
    rev     data2, data2

    // The MS-non-zero bit of DIFF marks either the first bit
    // that is different, or the end of the significant data.
    // Shifting left now will bring the critical information into the
    // top bits.
    clz     pos, diff
    lsl     data1, data1, pos
    lsl     data2, data2, pos

    // Zero-extend the top 8 bits of each value (the data is unsigned) and
    // subtract them; the subtraction is performed on 64-bit registers.
    lsr     data1, data1, #56
    sub     result, data1, data2, lsr #56
    ret

.Lmutual_align:
    // Sources are mutually aligned, but are not currently at an
    // alignment boundary. Round down the addresses and then mask off
    // the bytes that precede the start point.
    bic     src1, src1, #7
    bic     src2, src2, #7
    add     limit, limit, tmp1          // Adjust the limit for the extra.
    lsl     tmp1, tmp1, #3              // Bytes beyond alignment -> bits.
    ldr     data1, [src1], #8
    neg     tmp1, tmp1                  // Bits to alignment -64.
    ldr     data2, [src2], #8
    mov     tmp2, #~0

    // Little-endian. Early bytes are at LSB.
    lsr     tmp2, tmp2, tmp1            // Shift (tmp1 & 63).
    add     limit_wd, limit, #7
    orr     data1, data1, tmp2          // Force the leading bytes of both
    orr     data2, data2, tmp2          // words to all-ones so they match.
    lsr     limit_wd, limit_wd, #3
    b       .Lstart_realigned

    .p2align 6
.Lmisaligned8:
    // Bytewise compare. `limit` is biased by -1 so that the SUBS below
    // borrows (C clear) exactly when the byte just loaded is the last one.
    sub     limit, limit, #1
1:
    // Perhaps we can do better than this.
    ldrb    data1w, [src1], #1
    ldrb    data2w, [src2], #1
    subs    limit, limit, #1
    // Bytes remain (C set): compare the two bytes. Otherwise force
    // NZCV = 0b0000, which reads as "not equal" and ends the loop.
    ccmp    data1w, data2w, #0, cs
    b.eq    1b
    sub     result, data1, data2        // Difference of the last byte pair.
    ret

.Lret0:
    mov     result, #0
    ret