1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in 12 * the documentation and/or other materials provided with the 13 * distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <machine/cpu-features.h> 30 #include <machine/asm.h> 31 32 /* 33 * Optimized memcmp16() for ARM9. 34 * This would not be optimal on XScale or ARM11, where more prefetching 35 * and use of PLD will be needed. 
* The 2 major optimizations here are
 * (1) The main loops compare several words per iteration: 8 word loads
 *     (16 half-words, 32 bytes) in the congruent loop and 4 word loads
 *     (8 half-words, 16 bytes) in the non-congruent loop
 * (2) The loads are scheduled in a way they won't stall
 */

/*
 * __memcmp16: compare two buffers of 16-bit units.
 *
 * In:
 *   r0 = first buffer
 *   r1 = second buffer
 *   r2 = number of 16-bit units (half-words) to compare
 * Out:
 *   r0 = 0 if the pointers are equal, the count is zero, or all units match;
 *        otherwise (unit from first buffer) - (unit from second buffer) for
 *        the first differing pair, both zero-extended by ldrh.
 * Uses:
 *   r3 (working copy of the first pointer), ip, lr, flags.
 *   {r4, lr} are pushed on the large-block path and popped before returning;
 *   r4 is never written in this routine.
 *   NOTE(review): r4 is presumably saved only to keep the stack 8-byte
 *   aligned -- confirm.
 *
 * NOTE(review): the C prototype is presumably
 *   int __memcmp16(const unsigned short*, const unsigned short*, size_t)
 * -- confirm against the header that declares it.
 *
 * Both pointers are assumed to be at least half-word aligned: only bit 1 of
 * the addresses is ever examined.
 *
 * PLD, ENTRY and END are macros supplied by the included headers; their
 * expansion is not visible in this file (PLD is presumably a cache preload
 * hint).
 *
 * In the comments below "half-word" means a 16-bit unit and "word" a 32-bit
 * unit.
 */

ENTRY(__memcmp16)
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where the length is zero or the buffers
         * are the same: return 0 immediately
         */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r3, r0

        /* make sure we have at least 12 half-words, this simplifies things
         * below and avoids some overhead for small blocks
         */

        cmp         r2, #12
        bpl         0f

        /* small blocks (less than 12 half-words): compare one half-word at
         * a time; lr has not been saved, so return directly with bx lr
         */
        PLD         (r0, #32)
        PLD         (r1, #32)

1:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip          /* r0 = difference, 0 if equal */
        bxne        lr                  /* mismatch: return the difference */
        subs        r2, r2, #1
        bne         1b
        bx          lr                  /* all equal: r0 is 0 here */


        /* unwind annotation for the register push just below */
        .save {r4, lr}
        /* save registers (lr is used as a data register further down) */
0:      stmfd       sp!, {r4, lr}

        /* align first pointer to word boundary */
        tst         r3, #2
        beq         0f

        /* first pointer is only half-word aligned: compare one half-word
         * so that r3 becomes word aligned
         */
        ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        sub         r2, r2, #1
        subs        r0, r0, ip
        /* on mismatch, restore registers and return the difference */
        ldmnefd     sp!, {r4, lr}
        bxne        lr


0:      /* here the first pointer is word aligned, and at least 11
         * half-words remain (>= 12 on entry, minus at most one consumed
         * by the alignment step above).
         */

        /* see if the pointers are congruent, i.e. whether bit 1 of both
         * addresses is the same so that r1 is word aligned too
         */
        eor         r0, r3, r1
        ands        r0, r0, #2
        bne         5f

        /* congruent case, 16 half-words per iteration
         * We need to make sure there are at least 16+2 half-words left
         * because we effectively read ahead one word from r1, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]            /* read ahead: first word of 2nd buffer */
        subs        r2, r2, #(16 + 2)
        bmi         1f

        /* Main congruent loop. Each step loads a word from the first
         * buffer (post-increment), pre-loads the NEXT word of the second
         * buffer (pre-increment with writeback), then compares against the
         * word of the second buffer loaded one step earlier. ip and lr
         * alternate as "current" and "next" second-buffer word.
         * After the first mismatch the Z flag is clear, so every following
         * conditional instruction is skipped and r3/r1 stop advancing.
         */
0:
        PLD         (r3, #64)
        PLD         (r1, #64)
        ldr         r0, [r3], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f                  /* some word differed */
        subs        r2, r2, #16
        bhs         0b

        /* do we have at least 2 half-words left?
         * r2 holds (remaining - 18) here; adding 16 leaves (remaining - 2)
         */
1:      adds        r2, r2, #(16 - 2 + 2)
        bmi         4f

        /* finish off 2 half-words (one word) at a time; r1 still points at
         * the word that was read ahead, so it is simply loaded again
         */
3:      ldr         r0, [r3], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #2
        bhs         3b

        /* are we done? r2 + 2 is the number of half-words still left */
4:      adds        r2, r2, #2
        bne         8f
        /* nothing left: restore registers and return 0 */
        mov         r0, #0
        ldmfd       sp!, {r4, lr}
        bx          lr

2:      /* the last pair of words compared differs: re-compare them one
         * half-word at a time to produce the result. Both pointers are
         * 4 bytes past the start of the differing word here.
         */
        ldrh        r0, [r3, #-4]
        ldrh        ip, [r1, #-4]
        subs        r0, r0, ip
        ldreqh      r0, [r3, #-2]       /* first halves equal: try the second */
        ldreqh      ip, [r1, #-2]
        subeqs      r0, r0, ip
        /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

        /* process the last few half-words one at a time (r2 = count > 0) */
8:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return; r0 already holds the result */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/

        /* align the unaligned pointer: round r1 down to a word boundary
         * and load that word. With half-word aligned inputs this backs up
         * by 2 bytes, so the low half of the loaded word lies before the
         * second buffer; it is discarded by the "lsr #16" below.
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
        sub         r2, r2, #8

        /* Main non-congruent loop, 8 half-words per iteration. Each step
         * rebuilds one word of the second buffer from the upper half of
         * the previously loaded word and the lower half of the next one,
         * then compares it with a word of the first buffer.
         * NOTE(review): this lsr/lsl composition matches memory order only
         * for little-endian loads -- confirm the target is little-endian.
         */
6:
        PLD         (r3, #64)
        PLD         (r1, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r3], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f                  /* some word differed */
        subs        r2, r2, #8
        bhs         6b
        /* r1 is 2 bytes ahead of the next half-word to compare; undo that */
        sub         r1, r1, #2
        /* are we done? r2 + 8 is the number of half-words still left */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b
        /* finish off the remaining half-words */
        b           8b

7:      /* fix up the second pointer so that both are 4 bytes past the
         * differing word, then reuse the half-word re-compare code
         */
        sub         r1, r1, #2
        b           2b
END(__memcmp16)