/*
 * Copyright (C) 2017 The Android Open Source Project
 * All rights reserved.
 *
 * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

/* Prefetch distance, in units of PLDSIZE bytes ahead of the source pointer. */
#define PLDOFFS (16)
#define PLDSIZE (128) /* L2 cache line size */

/*
 * void* memcpy(void* dst, const void* src, size_t n)
 *
 * In:    r0 = dst, r1 = src, r2 = n (byte count, handled as unsigned)
 * Out:   r0 = dst (saved on entry, restored on exit)
 * Uses:  r1, r2, r3, r12, q0-q3, q8-q11, condition flags;
 *        4 bytes of stack for the saved dst.
 *
 * Strategy: 128-byte NEON blocks (with software prefetch once more than
 * PLDOFFS blocks remain), then 32-byte NEON blocks, then one optional
 * 16-byte NEON copy, then an 8/4/2/1-byte tail selected by the low bits
 * of the remaining count.
 *
 * NOTE(review): neither pointer is aligned or checked before the NEON and
 * word accesses; presumably the target permits unaligned accesses for
 * these instructions -- confirm against the platform configuration.
 */
        .code 32
ENTRY(memcpy)
        push            {r0}                    /* keep dst for the return value */
        .cfi_def_cfa_offset 4
        .cfi_rel_offset r0, 0

        /*
         * Dispatch on size. The count is a size_t, so the compares must be
         * unsigned (blo): a signed branch would treat n >= 2^31 as negative
         * and fall into the <4-byte tail, copying only n & 3 bytes.
         */
        cmp             r2, #4
        blo             .Lneon_lt4
        cmp             r2, #16
        blo             .Lneon_lt16
        cmp             r2, #32
        blo             .Lneon_16
        cmp             r2, #128
        blo             .Lneon_copy_32_a

        /* Copy blocks of 128 bytes at a time. */
        /* Code below is optimized for PLDSIZE=128 only */
        mov             r12, r2, lsr #7         /* r12 = number of 128-byte blocks, >= 1 */
        cmp             r12, #PLDOFFS
        bls             .Lneon_copy_128_loop_nopld
        sub             r12, #PLDOFFS           /* last PLDOFFS blocks run without prefetch */
        pld             [r1, #(PLDOFFS-1)*PLDSIZE]

        /* Prefetching loop: r12 = blocks left before the final PLDOFFS blocks. */
.Lneon_copy_128_loop_outer:
        pld             [r1, #(PLDOFFS*PLDSIZE)]
        pld             [r1, #(PLDOFFS)*(PLDSIZE)+64]
        vld1.32         {q0, q1}, [r1]!
        vld1.32         {q2, q3}, [r1]!
        vld1.32         {q8, q9}, [r1]!
        vld1.32         {q10, q11}, [r1]!
        subs            r12, r12, #1            /* flags survive the NEON stores below */
        vst1.32         {q0, q1}, [r0]!
        vst1.32         {q2, q3}, [r0]!
        vst1.32         {q8, q9}, [r0]!
        vst1.32         {q10, q11}, [r0]!
        bne             .Lneon_copy_128_loop_outer
        mov             r12, #PLDOFFS           /* remaining blocks, copied without prefetch */

        /* Non-prefetching loop: r12 = 128-byte blocks left, >= 1. */
.Lneon_copy_128_loop_nopld:
        vld1.32         {q0, q1}, [r1]!
        vld1.32         {q2, q3}, [r1]!
        vld1.32         {q8, q9}, [r1]!
        vld1.32         {q10, q11}, [r1]!
        subs            r12, r12, #1
        vst1.32         {q0, q1}, [r0]!
        vst1.32         {q2, q3}, [r0]!
        vst1.32         {q8, q9}, [r0]!
        vst1.32         {q10, q11}, [r0]!
        bne             .Lneon_copy_128_loop_nopld

        ands            r2, r2, #0x7f           /* r2 = bytes left after the 128-byte blocks */
        beq             .Lneon_exit
        cmp             r2, #32
        blo             .Lneon_16
        nop                                     /* kept from the original; presumably padding */

        /* Copy blocks of 32 bytes at a time; r2 >= 32 here, so r12 >= 1. */
.Lneon_copy_32_a:
        mov             r12, r2, lsr #5         /* r12 = number of 32-byte blocks */
.Lneon_copy_32_loop_a:
        vld1.32         {q0, q1}, [r1]!
        subs            r12, r12, #1
        vst1.32         {q0, q1}, [r0]!
        bne             .Lneon_copy_32_loop_a
        ands            r2, r2, #0x1f           /* r2 = bytes left after the 32-byte blocks */
        beq             .Lneon_exit

        /* 1..31 bytes left: copy one 16-byte chunk if there is room for it. */
.Lneon_16:
        subs            r2, r2, #16
        blo             .Lneon_lt16             /* fewer than 16: low 4 bits of r2 still valid */
        vld1.32         {q8}, [r1]!
        vst1.32         {q8}, [r0]!
        beq             .Lneon_exit             /* Z from the subs: exactly 16 bytes were left */

        /*
         * Tail of fewer than 16 bytes. Only the low 4 bits of r2 matter from
         * here on; each shift moves two of them into the C and N flags.
         */
.Lneon_lt16:
        movs            r12, r2, lsl #29        /* C = bit 3 (8 bytes), N = bit 2 (4 bytes) */
        bcc             .Lneon_skip8
        ldr             r3, [r1], #4
        ldr             r12, [r1], #4
        str             r3, [r0], #4
        str             r12, [r0], #4
.Lneon_skip8:
        bpl             .Lneon_lt4              /* loads/stores above leave the flags intact */
        ldr             r3, [r1], #4
        str             r3, [r0], #4
.Lneon_lt4:
        movs            r2, r2, lsl #31         /* C = bit 1 (2 bytes), N = bit 0 (1 byte) */
        bcc             .Lneon_lt2
        ldrh            r3, [r1], #2
        strh            r3, [r0], #2
.Lneon_lt2:
        bpl             .Lneon_exit
        ldrb            r12, [r1]
        strb            r12, [r0]

.Lneon_exit:
        pop             {r0}                    /* return the original dst */
        .cfi_def_cfa_offset 0                   /* stack is back at its entry value */
        .cfi_restore r0
        bx              lr

END(memcpy)