/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>


#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE 32
#else
#define CACHE_LINE_SIZE 64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */

ENTRY(memcmp)
        pld     [r0, #(CACHE_LINE_SIZE * 0)]
        pld     [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where length is 0 or the buffers are the same */
        cmp     r0, r1
        moveq   r0, #0
        bxeq    lr

        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        pld     [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes; this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp     r2, #(8+4)
        bmi     10f

/*
 * Neon optimization
 * Comparing 32 bytes at a time
 */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs    r2, r2, #32
        blo     3f

        /* preload all the cache lines we need. */
        pld     [r0, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8  {d0 - d3}, [r0]!
        pld     [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8 q0, q2
        vsub.i8 q1, q3
        vorr    q2, q0, q1
        vorr    d4, d5
        vmov    r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs    r3, ip
        bne     2f
        subs    r2, r2, #32
        bhs     1b
        b       3f
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub     r0, #32
        vorr    d0, d1
        sub     r1, #32
        vmov    r3, ip, d0
        orrs    r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt    eq
        subeq   r2, #16
        addeq   r0, #16
        addeq   r1, #16

3:      /* fix up the remaining count */
        add     r2, r2, #32

        cmp     r2, #(8+4)
        bmi     10f
#endif
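/*
 * The NEON loop above finds a mismatch without comparing byte by byte;
 * a rough C sketch of the idea (illustrative only, hypothetical helper
 * name, types from <stdint.h>):
 *
 *     // nonzero iff two 32-byte blocks differ
 *     int blocks_differ(const uint8_t *a, const uint8_t *b) {
 *         uint8_t acc = 0;
 *         for (int i = 0; i < 32; i++)
 *             acc |= (uint8_t)(a[i] - b[i]);   // zero iff a[i] == b[i]
 *         return acc != 0;
 *     }
 *
 * vsub.i8 forms the byte-wise differences, the vorr/vmov/orrs sequence
 * folds them into one scalar test, and on a mismatch the code rewinds
 * 16 or 32 bytes so the scalar tail can locate the exact differing byte.
 */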
        .save {r4, lr}
        /* save registers */
        stmfd   sp!, {r4, lr}

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov     r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb     r3, r4, #0
        ands    r3, r3, #3
        beq     0f

        /* align first pointer */
        sub     r2, r2, r3
1:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        subs    r0, r0, ip
        bne     9f
        subs    r3, r3, #1
        bne     1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor     r0, r4, r1
        ands    r0, r0, #3
        bne     5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr     ip, [r1]
        subs    r2, r2, #(32 + 4)
        bmi     1f

0:      pld     [r4, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        ldr     r0, [r4], #4
        ldr     lr, [r1, #4]!
        eors    r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        bne     2f
        subs    r2, r2, #32
        bhs     0b

        /* do we have at least 4 bytes left? */
1:      adds    r2, r2, #(32 - 4 + 4)
        bmi     4f

        /* finish off 4 bytes at a time */
3:      ldr     r0, [r4], #4
        ldr     ip, [r1], #4
        eors    r0, r0, ip
        bne     2f
        subs    r2, r2, #4
        bhs     3b

        /* are we done? */
4:      adds    r2, r2, #4
        moveq   r0, #0
        beq     9f

        /* finish off the remaining bytes */
        b       8f

2:      /* the last 4 bytes are different: back up and redo them bytewise */
        sub     r4, r4, #4
        sub     r1, r1, #4
        mov     r2, #4

        /* process the last few bytes */
8:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        // stall
        subs    r0, r0, ip
        bne     9f
        subs    r2, r2, #1
        bne     8b

9:      /* restore registers and return */
        ldmfd   sp!, {r4, lr}
        bx      lr

10:     /* process fewer than 12 bytes */
        cmp     r2, #0
        moveq   r0, #0
        bxeq    lr
        mov     r3, r0
11:     ldrb    r0, [r3], #1
        ldrb    ip, [r1], #1
        subs    r0, ip
        bxne    lr
        subs    r2, r2, #1
        bne     11b
        bx      lr

5:      /*************** non-congruent case ***************/
        and     r0, r1, #3
        cmp     r0, #2
        bne     4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs    r2, r2, #16
        addmi   r2, r2, #16
        bmi     8b

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     lr, [r1], #4

6:      pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r4, #(CACHE_LINE_SIZE * 2)]
        mov     ip, lr, lsr #16
        ldr     lr, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, lr, lsl #16
        eors    r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #16
        bhs     6b
        sub     r1, r1, #2
        /* are we done? */
        adds    r2, r2, #16
        moveq   r0, #0
        beq     9b
        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the two pointers and fall through... */
        sub     r1, r1, #(4+2)
        sub     r4, r4, #4
        mov     r2, #4
        b       8b

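/*
 * Both the halfword-aligned loop above and the byte-offset loop below
 * rebuild each logical word of the unaligned buffer from two aligned
 * loads, reading one word ahead. A C sketch of the trick (illustrative
 * only, little-endian, hypothetical names, types from <stdint.h>):
 *
 *     // rshift = 8 * (addr & 3), lshift = 32 - rshift
 *     uint32_t next = *p32++;        // aligned read-ahead word
 *     uint32_t word = (prev >> rshift) | (next << lshift);
 *     prev = next;
 *
 * With offset 2 the shifts are the constant 16 used above; the generic
 * loop below keeps them in r5 (right shift) and r6 (left shift).
 */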

4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd   sp!, {r5, r6, r7}

        // r5 = right-shift amount (offset * 8)
        // r6 = left-shift amount (32 - r5)
        // r7 = scratch (read-ahead word from the unaligned pointer)

        mov     r5, r0, lsl #3      /* r5 = right shift */
        rsb     r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     r7, [r1], #4
        sub     r2, r2, #8

6:      mov     ip, r7, lsr r5
        ldr     r7, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, r7, lsl r6
        eors    r0, r0, ip
        moveq   ip, r7, lsr r5
        ldreq   r7, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, r7, lsl r6
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #8
        bhs     6b

        sub     r1, r1, r6, lsr #3
        ldmfd   sp!, {r5, r6, r7}

        /* are we done? */
        adds    r2, r2, #8
        moveq   r0, #0
        beq     9b

        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the two pointers and fall through... */
        sub     r1, r1, #4
        sub     r1, r1, r6, lsr #3
        sub     r4, r4, #4
        mov     r2, #4
        ldmfd   sp!, {r5, r6, r7}
        b       8b
END(memcmp)
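/*
 * For reference: the routine above implements standard memcmp() semantics.
 * Bytes are compared as unsigned chars and the result is the difference of
 * the first mismatching pair, or 0 if the buffers are equal. A minimal C
 * sketch of that contract (illustrative only, hypothetical name, size_t
 * from <stddef.h>):
 *
 *     int ref_memcmp(const void *lhs, const void *rhs, size_t n) {
 *         const unsigned char *a = lhs, *b = rhs;
 *         while (n--) {
 *             int d = *a++ - *b++;   // difference of the current bytes
 *             if (d != 0)
 *                 return d;          // first mismatch decides the result
 *         }
 *         return 0;                  // all n bytes matched
 *     }
 */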