/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

        .text

        .global     __memcmp16
        .type       __memcmp16, %function
        .align      4

/*
 * Optimized memcmp16() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loops compare 16 half-words (congruent pointers) or
 *     8 half-words (non-congruent pointers) per iteration
 * (2) The loads are scheduled in a way they won't stall
 *
 * Interface, as used by the code below:
 *   In:    r0 = first buffer, r1 = second buffer,
 *          r2 = number of 16-bit half-words to compare
 *   Out:   r0 = 0 if all compared half-words are equal, otherwise the
 *          difference (first - second) of the first differing half-words
 *   Clobb: r1, r2, r3, ip, flags (lr is also used as scratch on the
 *          large-block path, but is saved and restored around that use)
 *
 * NOTE(review): the code only tests bit 1 of the pointers, so both
 * buffers are assumed to be at least 2-byte aligned -- confirm with
 * callers. The shift directions used in the non-congruent loop assume
 * little-endian byte order.
 * NOTE(review): r4 is pushed and popped but never otherwise used;
 * presumably it only keeps the stack 8-byte aligned -- confirm.
 */

__memcmp16:
        .fnstart
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where the length is zero or the
         * buffers are the same
         */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r3, r0

        /* make sure we have at least 12 half-words, this simplifies
         * things below and avoids some overhead for small blocks
         */

        cmp         r2, #12
        bpl         0f

        /* small blocks (fewer than 12 half-words); r2 != 0 here */
        PLD         (r0, #32)
        PLD         (r1, #32)

1:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         1b
        bx          lr


        .save       {r4, lr}
        /* save registers */
0:      stmfd       sp!, {r4, lr}

        /* align first pointer to word boundary */
        tst         r3, #2
        beq         0f

        ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        sub         r2, r2, #1
        subs        r0, r0, ip
        /* restore registers and return */
        ldmnefd     sp!, {r4, lr}
        bxne        lr


0:      /* here the first pointer is word-aligned, and we have at
         * least 11 half-words left to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r3, r1
        ands        r0, r0, #2
        bne         5f

        /* congruent case, 16 half-words per iteration
         * We need to make sure there are at least 16+2 half-words left
         * because we effectively read ahead one long word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(16 + 2)
        bmi         1f

        /* in this loop r2 = (half-words remaining) - 18; ip and lr
         * alternate as the read-ahead word from the second buffer
         */
0:
        PLD         (r3, #64)
        PLD         (r1, #64)
        ldr         r0, [r3], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #16
        bhs         0b

        /* do we have at least 2 half-words left?
         * after the add, r2 = (half-words remaining) - 2
         */
1:      adds        r2, r2, #(16 - 2 + 2)
        bmi         4f

        /* finish off 2 half-words (one word) at a time */
3:      ldr         r0, [r3], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #2
        bhs         3b

        /* are we done? after the add, r2 = half-words remaining (0 or 1) */
4:      adds        r2, r2, #2
        bne         8f
        /* restore registers and return */
        mov         r0, #0
        ldmfd       sp!, {r4, lr}
        bx          lr

2:      /* the last 2 half-words compared are different; re-read them
         * one half-word at a time to compute the return value
         */
        ldrh        r0, [r3, #-4]
        ldrh        ip, [r1, #-4]
        subs        r0, r0, ip
        ldreqh      r0, [r3, #-2]
        ldreqh      ip, [r1, #-2]
        subeqs      r0, r0, ip
        /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

        /* process the last few half-words one at a time */
8:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/

        /* align the unaligned pointer; lr then holds the word whose
         * upper half is the next half-word of the second buffer
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
        sub         r2, r2, #8

        /* 8 half-words per iteration; in this loop
         * r2 = (half-words remaining) - 8
         */
6:
        PLD         (r3, #64)
        PLD         (r1, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r3], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b
        /* undo the 2-byte read-ahead so r1 matches r3 again */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b
        /* finish off the remaining half-words */
        b           8b

7:      /* fix up the 2 pointers and fall through to the common
         * mismatch handling
         */
        sub         r1, r1, #2
        b           2b

        /* the unwind region ends here so that it covers every path
         * on which {r4, lr} are saved on the stack
         */
        .fnend
        .size       __memcmp16, . - __memcmp16