/***************************************************************************
 Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of The Linux Foundation nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***************************************************************************/

/* Assumes neon instructions and a cache line size of 64 bytes. */

/*
 * Copy body entered at .L_memcpy_base.
 *
 * Register use as seen in this fragment:
 *   r0  = destination pointer (post-incremented by every store)
 *   r1  = source pointer      (post-incremented by every load)
 *   r2  = byte count, treated as unsigned
 *   r3  = scratch (read-ahead loads and the sub-4-byte tail)
 *   r12 = number of 64-byte chunks still to copy in the current loop
 *   lr  = byte distance of the read-ahead pointer from the source pointer
 *   r9, r10 = pushed/popped here; r9 = first-loop trip count,
 *             r10 = 64-byte aligned read-ahead address
 *   q0-q3, q8-q11 = data in flight
 *
 * The fragment ends with "pop {r0, pc}" and overwrites lr, so the entry code
 * that precedes it (outside this fragment) is assumed to have pushed
 * {r0, lr} -- TODO confirm against the including file.
 *
 * Several branches consume flags produced by an EARLIER instruction on a
 * different path; those spots are marked below. Do not reorder them.
 */

#define PLDOFFS         (10)            /* read-ahead distance, in 64-byte lines */
#define PLDTHRESH       (PLDOFFS)       /* at or below this many chunks: no read-ahead */
#define BBTHRESH        (4096/64)       /* at or below this many chunks: fixed read-ahead */
#define PLDSIZE         (64)            /* bytes per read-ahead step */

#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif

#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

        .text
        .fpu    neon

.L_memcpy_base:
        /*
         * Dispatch on size class. The count is unsigned, so use blo rather
         * than blt: a signed test would treat counts >= 0x80000000 as
         * negative and fall into the < 4 byte tail, copying almost nothing.
         */
        cmp     r2, #4
        blo     .L_neon_lt4
        cmp     r2, #16
        blo     .L_neon_lt16
        cmp     r2, #32
        blo     .L_neon_16              /* target reuses N from this cmp */
        cmp     r2, #64
        blo     .L_neon_copy_32_a

        /* r12 = count / 64. Always < 2^26, so signed tests on r12 are safe. */
        mov     r12, r2, lsr #6
        cmp     r12, #PLDTHRESH
        ble     .L_neon_copy_64_loop_nopld

        push    {r9, r10}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r9, 0
        .cfi_rel_offset r10, 4

        cmp     r12, #BBTHRESH
        ble     .L_neon_prime_pump

        /*
         * lr = ((dst + 0x400 - (src + PLDOFFS*PLDSIZE)) & 0x7ff)
         *      + PLDOFFS*PLDSIZE
         * The lsl/lsr pair keeps only the low 11 bits of the difference.
         * NOTE(review): presumably this skews the read-ahead address relative
         * to the destination; the intent is not stated in this file.
         */
        add     lr, r0, #0x400
        add     r9, r1, #(PLDOFFS*PLDSIZE)
        sub     lr, lr, r9
        lsl     lr, lr, #21
        lsr     lr, lr, #21
        add     lr, lr, #(PLDOFFS*PLDSIZE)
        cmp     r12, lr, lsr #6
        ble     .L_neon_prime_pump      /* too few chunks for that distance */

        /* r9 = (lr / 64) - PLDOFFS; fall back to the fixed distance if <= 0. */
        itt     gt
        movgt   r9, #(PLDOFFS)
        rsbsgt  r9, r9, lr, lsr #6
        ble     .L_neon_prime_pump

        /* r10 = read-ahead address, rounded down to a 64-byte boundary. */
        add     r10, r1, lr
        bic     r10, #0x3F

        /* Hold back lr/64 chunks for the final loop that does no read-ahead. */
        sub     r12, r12, lr, lsr #6

        /* First loop runs min(r9, r12) times; r12 keeps what is left over. */
        cmp     r9, r12
        itee    le
        suble   r12, r12, r9
        movgt   r9, r12
        movgt   r12, #0

        pld     [r1, #((PLDOFFS-1)*PLDSIZE)]
.L_neon_copy_64_loop_outer_doublepld:
        /* 64 bytes per pass: pld near the source plus a far load at r10. */
        pld     [r1, #((PLDOFFS)*PLDSIZE)]
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        ldr     r3, [r10]               /* value unused; the load touches the line */
        subs    r9, r9, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        add     r10, #64
        bne     .L_neon_copy_64_loop_outer_doublepld
        cmp     r12, #0
        beq     .L_neon_pop_before_nopld

        /* 512 KiB or more still to go: use the pld-based loop. */
        cmp     r12, #(512*1024/64)
        blt     .L_neon_copy_64_loop_outer

.L_neon_copy_64_loop_ddr:
        /* 64 bytes per pass, reading ahead with pld instead of a real load. */
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        pld     [r10]
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        add     r10, #64
        bne     .L_neon_copy_64_loop_ddr
        b       .L_neon_pop_before_nopld

.L_neon_prime_pump:
        /* Fixed read-ahead distance of PLDOFFS lines; hold back PLDOFFS chunks. */
        mov     lr, #(PLDOFFS*PLDSIZE)
        add     r10, r1, #(PLDOFFS*PLDSIZE)
        bic     r10, #0x3F
        sub     r12, r12, #PLDOFFS
        ldr     r3, [r10, #(-1*PLDSIZE)]

.L_neon_copy_64_loop_outer:
        /* 64 bytes per pass with a far load at r10 (value unused). */
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        ldr     r3, [r10]
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        add     r10, #64
        bne     .L_neon_copy_64_loop_outer

.L_neon_pop_before_nopld:
        /* The chunks held back above (lr / 64) are copied without read-ahead. */
        mov     r12, lr, lsr #6
        pop     {r9, r10}
        .cfi_adjust_cfa_offset -8
        .cfi_restore r9
        .cfi_restore r10

.L_neon_copy_64_loop_nopld:
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .L_neon_copy_64_loop_nopld
        ands    r2, r2, #0x3f           /* r2 = bytes left after whole chunks */
        beq     .L_neon_exit

.L_neon_copy_32_a:
        /* lsl #27: C = bit 5 of count (32 bytes), N = bit 4 (16 bytes). */
        movs    r3, r2, lsl #27
        bcc     .L_neon_16
        vld1.32 {q0,q1}, [r1]!
        vst1.32 {q0,q1}, [r0]!

.L_neon_16:
        /*
         * N comes either from the movs above or, when entered from the
         * dispatch, from "cmp r2, #32" (negative for counts 16..31).
         */
        bpl     .L_neon_lt16
        vld1.32 {q8}, [r1]!
        vst1.32 {q8}, [r0]!
        ands    r2, r2, #0x0f
        beq     .L_neon_exit

.L_neon_lt16:
        /* lsl #29: C = bit 3 of count (8 bytes), N = bit 2 (4 bytes). */
        movs    r3, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:
        /*
         * bge tests N == V. The movs above does not write V, so this depends
         * on V being clear from the last cmp/subs executed on every path
         * into here.
         */
        bge     .L_neon_lt4
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!

.L_neon_lt4:
        /* lsl #31: C = bit 1 of count (halfword), N = bit 0 (byte). */
        movs    r2, r2, lsl #31
        itt     cs
        ldrhcs  r3, [r1], #2
        strhcs  r3, [r0], #2
        itt     mi
        ldrbmi  r3, [r1]
        strbmi  r3, [r0]

.L_neon_exit:
        /* Reload the saved r0 and return through the saved return address. */
        pop     {r0, pc}