/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>
#include <private/libc_events.h>

/*
 * Optimized memcpy() for ARM.
 *
 * Note that memcpy() always returns the destination pointer,
 * so we have to preserve R0.
 */

        .syntax unified

ENTRY(__memcpy_chk)
        cmp     r2, r3
        bhi     __memcpy_chk_fail

        // Fall through to memcpy...
END(__memcpy_chk)

ENTRY(memcpy)
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        stmfd   sp!, {r0, r4, lr}
        .cfi_def_cfa_offset 12
        .cfi_rel_offset r0, 0
        .cfi_rel_offset r4, 4
        .cfi_rel_offset lr, 8
        /* Make room for r5-r11, which will be spilled later. */
        sub     sp, sp, #28
        .cfi_adjust_cfa_offset 28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        pld     [r0, #0]
        pld     [r1, #0]
        pld     [r1, #32]

        /* it simplifies things to take care of len < 4 early */
        cmp     r2, #4
        blo     .Lcopy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb     r3, r1, #0
        ands    r3, r3, #3
        beq     .Lsrc_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrbmi  r3, [r1], #1
        ldrbcs  r4, [r1], #1
        ldrbcs  r12, [r1], #1
        strbmi  r3, [r0], #1
        strbcs  r4, [r0], #1
        strbcs  r12, [r0], #1

.Lsrc_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor     r12, r0, r1
        tst     r12, #3
        bne     .Lnon_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}
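
        /* A note on the branch-free copies above and below: "movs rX, rLen,
         * lsl #k" moves bit (32-k) of the length into C and bit (31-k) into
         * N, so the cs-predicated loads/stores copy the larger remaining
         * chunk and the mi-predicated ones the smaller, with no branches.
         */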

        /* align the destination to a cache-line */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     .Lcongruent_aligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs    r12, r3, lsl #28
        ldmcs   r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmi   r1!, {r8, r9}           /*  8 bytes */
        stmcs   r0!, {r4, r5, r6, r7}
        stmmi   r0!, {r8, r9}
        tst     r3, #0x4
        ldrne   r10, [r1], #4           /*  4 bytes */
        strne   r10, [r0], #4
        sub     r2, r2, r3

.Lcongruent_aligned32:
        /*
         * here source is aligned to 32 bytes.
         */

.Lcached_aligned32:
        subs    r2, r2, #32
        blo     .Lless_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the write-buffer will
         * start dumping its content into memory.
         *
         * While all this is going, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic     r12, r1, #0x1F
        add     r12, r12, #64

1:      ldmia   r1!, {r4-r11}
        pld     [r12, #64]
        subs    r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded, the only way to get a SIGSEGV here
        // is if the caller overstates the length.
        ldrhi   r3, [r12], #32          /* cheap ARM9 preload */
        stmia   r0!, {r4-r11}
        bhs     1b

        add     r2, r2, #32

.Lless_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst     r2, #0x1F
        beq     1f

        /* conditionally copies 0 to 31 bytes */
        movs    r12, r2, lsl #28
        ldmcs   r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmi   r1!, {r8, r9}           /*  8 bytes */
        stmcs   r0!, {r4, r5, r6, r7}
        stmmi   r0!, {r8, r9}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /*  4 bytes */
        ldrhmi  r4, [r1], #2            /*  2 bytes */
        strcs   r3, [r0], #4
        strhmi  r4, [r0], #2
        tst     r2, #0x1
        ldrbne  r3, [r1]                /* last byte */
        strbne  r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd   sp!, {r5-r11}
        ldmfd   sp!, {r0, r4, pc}

        /********************************************************************/

.Lnon_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp     r2, #4
        blo     .Lcopy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}
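
        /* In the non-congruent path below, r3 acts as a "shift queue": it
         * holds the low-order bytes that have been read from the source but
         * not yet written. With the complementary shift amounts computed
         * below (r12 + lr == 32), each output word is conceptually
         *     out   = queue | (next << lr)
         *     queue = next >> r12
         * on a little-endian core, which is what the orr/mov pairs in the
         * copy loops implement.
         */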

        /* compute shifts needed to align src to dest */
        rsb     r5, r0, #0
        and     r5, r5, #3              /* r5 = # bytes in partial words */
        mov     r12, r5, lsl #3         /* r12 = right */
        rsb     lr, r12, #32            /* lr = left  */

        /* read the first word */
        ldr     r3, [r1], #4
        sub     r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment)
         */
        movs    r5, r5, lsl #31
        strbmi  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strbcs  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strbcs  r3, [r0], #1
        movcs   r3, r3, lsr #8

        cmp     r2, #4
        blo     .Lpartial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst     r0, #0x1c
        beq     2f
        ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b
        blo     .Lpartial_word_tail

        /* copy 32 bytes at a time */
2:      subs    r2, r2, #32
        blo     .Lless_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for a performance hit of
         * up to 50%.
         */

        cmp     r12, #24
        beq     .Lloop24
        cmp     r12, #8
        beq     .Lloop8

.Lloop16:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r8, lsl #16
        mov     r8, r8, lsr #16
        orr     r8, r8, r9, lsl #16
        mov     r9, r9, lsr #16
        orr     r9, r9, r10, lsl #16
        mov     r10, r10, lsr #16
        orr     r10, r10, r11, lsl #16
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #16
        bhs     1b
        b       .Lless_than_thirtytwo

.Lloop8:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r6, lsl #24
        mov     r6, r6, lsr #8
        orr     r6, r6, r7, lsl #24
        mov     r7, r7, lsr #8
        orr     r7, r7, r8, lsl #24
        mov     r8, r8, lsr #8
        orr     r8, r8, r9, lsl #24
        mov     r9, r9, lsr #8
        orr     r9, r9, r10, lsl #24
        mov     r10, r10, lsr #8
        orr     r10, r10, r11, lsl #24
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #8
        bhs     1b
        b       .Lless_than_thirtytwo

.Lloop24:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r6, lsl #8
        mov     r6, r6, lsr #24
        orr     r6, r6, r7, lsl #8
        mov     r7, r7, lsr #24
        orr     r7, r7, r8, lsl #8
        mov     r8, r8, lsr #24
        orr     r8, r8, r9, lsl #8
        mov     r9, r9, lsr #24
        orr     r9, r9, r10, lsl #8
        mov     r10, r10, lsr #24
        orr     r10, r10, r11, lsl #8
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #24
        bhs     1b

.Lless_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb     r12, lr, #32            /* we corrupted r12, recompute it */
        add     r2, r2, #32
        cmp     r2, #4
        blo     .Lpartial_word_tail

1:      ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b
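
        /* r3 (the shift queue) still holds the last lr/8 bytes that were
         * read but not yet written; the movs below uses the same N/C flag
         * trick as above to flush exactly that many of them.
         */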

.Lpartial_word_tail:
        /* we have a partial word in the shift queue */
        movs    r5, lr, lsl #(31-3)
        strbmi  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strbcs  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strbcs  r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd   sp, {r5-r11}

.Lcopy_last_3_and_return:
        movs    r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
        ldrbmi  r2, [r1], #1
        ldrbcs  r3, [r1], #1
        ldrbcs  r12, [r1]
        strbmi  r2, [r0], #1
        strbcs  r3, [r0], #1
        strbcs  r12, [r0]

        /* we're done! restore sp and spilled registers and return */
        add     sp, sp, #28
        ldmfd   sp!, {r0, r4, pc}
END(memcpy)

// Only reached when the __memcpy_chk check fails.
ENTRY_PRIVATE(__memcpy_chk_fail)
        // Preserve lr for backtrace.
        push    {lr}
        .cfi_def_cfa_offset 4
        .cfi_rel_offset lr, 0

        ldr     r0, error_message
        ldr     r1, error_code
1:
        add     r0, pc
        bl      __fortify_chk_fail
error_code:
        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
        .word   error_string-(1b+8)
END(__memcpy_chk_fail)

        .data
error_string:
        .string "memcpy: prevented write past end of buffer"