1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * All rights reserved. 4 * Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * * Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * * Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in 13 * the documentation and/or other materials provided with the 14 * distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 19 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 20 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 23 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 24 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 26 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
*/

/*
 * NEON memcpy body (ARM, unified syntax, run through the C preprocessor).
 *
 * MEMCPY_BASE, ENTRY_PRIVATE and END are not defined in this file;
 * presumably the including file supplies them.
 *
 * In:     r0 = destination, r1 = source, r2 = byte count
 * Stack:  every exit is pop {r0, pc} and the CFI below records r0 at
 *         offset 0 and lr at offset 4 of an 8-byte frame, so the caller
 *         is expected to have pushed {r0, lr} before branching here.
 * Out:    r0 = value popped from the stack
 * Clobb:  r1, r2, r3, ip, q0, q1, condition flags
 */

#define CACHE_LINE_SIZE         (64)
#define PREFETCH_DISTANCE       (CACHE_LINE_SIZE*6)
/* Bytes moved by one pass of either main loop (two cache lines). */
#define MEMCPY_LOOP_BYTES       (CACHE_LINE_SIZE*2)
/* Remaining-byte count above which the aligned-source loop is bypassed. */
#define MEMCPY_THROTTLE_BYTES   (32768)

ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Nothing to do for a zero length or when dst == src. */
        cmp         r2, #0
        beq         .L_memcpy_done
        cmp         r0, r1
        beq         .L_memcpy_done

        /* Start pulling in the next source cache line. */
        pld         [r1, #CACHE_LINE_SIZE*1]

        /* Fewer than 32 bytes: go straight to the short tail. */
        cmp         r2, #32
        blo         .L_memcpy_lt_32bytes
        /* Fewer than one loop block: copy without aligning dst. */
        cmp         r2, #MEMCPY_LOOP_BYTES
        blo         .L_memcpy_lt_128bytes

        /*
         * Large copy: bring dst up to a 64-byte boundary first.
         * r3 = (-dst) & 63 = number of bytes needed to reach it.
         */
        pld         [r1, #CACHE_LINE_SIZE*2]
        rsb         r3, r0, #0
        ands        r3, r3, #(CACHE_LINE_SIZE-1)
        pld         [r1, #CACHE_LINE_SIZE*3]
        beq         .L_memcpy_dispatch
        sub         r2, r2, r3

        /* N <- bit 0 of r3 (1 byte), C <- bit 1 of r3 (2 bytes). */
        lsls        ip, r3, #31
        itt         mi
        ldrbmi      ip, [r1], #1
        strbmi      ip, [r0], #1
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2

        /* N <- bit 2 of r3 (4 bytes), C <- bit 3 of r3 (8 bytes). */
        lsls        ip, r3, #29
        itt         mi
        ldrmi       ip, [r1], #4
        strmi       ip, [r0], #4
        bcc         .L_memcpy_head_16
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!

.L_memcpy_head_16:
        /* N <- bit 4 of r3 (16 bytes), C <- bit 5 of r3 (32 bytes). */
        lsls        ip, r3, #27
        bpl         .L_memcpy_head_32
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!

.L_memcpy_head_32:
        bcc         .L_memcpy_dispatch
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

.L_memcpy_dispatch:
        /*
         * Bias the count down by one loop block so that the loops can
         * detect the nearly-done condition from the carry flag alone.
         * The alignment code above may already have left fewer than
         * one block, in which case undo the bias and finish in the tail.
         */
        subs        r2, r2, #MEMCPY_LOOP_BYTES
        blo         .L_memcpy_lt_128presub

        /*
         * Denver does better if both source and dest are aligned, so a
         * 16-byte aligned source gets its own loop even though the code
         * is virtually identical.
         */
        tst         r1, #0xF
        bne         .L_memcpy_neon_unalign_src_pld

        /*
         * DRAM memcpy should be throttled slightly to get full bandwidth:
         * with more than MEMCPY_THROTTLE_BYTES still to go (r2 is already
         * biased by one block here) use the loop without source
         * alignment hints.
         */
        cmp         r2, #MEMCPY_THROTTLE_BYTES
        bhi         .L_memcpy_neon_unalign_src_pld

        .p2align    4
.L_memcpy_neon_align_src_pld:
        /*
         * One loop block per pass, source 16-byte aligned.  Nothing
         * between this subs and the bhs below writes the flags.
         */
        subs        r2, r2, #MEMCPY_LOOP_BYTES

        /* first cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* second cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         .L_memcpy_neon_align_src_pld
        /* Remove the bias; a non-zero remainder goes to the tail. */
        adds        r2, r2, #MEMCPY_LOOP_BYTES
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

        .p2align    4
.L_memcpy_neon_unalign_src_pld:
        /*
         * Same loop as above, minus the alignment hint on the loads.
         * Stores still carry :256 because dst is 64-byte aligned here.
         */
        subs        r2, r2, #MEMCPY_LOOP_BYTES

        /* first cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* second cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         .L_memcpy_neon_unalign_src_pld
        /* Remove the bias; a non-zero remainder goes to the tail. */
        adds        r2, r2, #MEMCPY_LOOP_BYTES
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

.L_memcpy_lt_128presub:
        /* Undo the bias applied at .L_memcpy_dispatch. */
        add         r2, r2, #MEMCPY_LOOP_BYTES
.L_memcpy_lt_128bytes_align:
        /*
         * Tail for fewer than 128 bytes with dst 64-byte aligned on
         * entry.  Chunks go out largest first, so each store keeps the
         * alignment its qualifier promises.
         */
        /* C <- bit 6 of r2 (64 bytes), N <- bit 5 of r2 (32 bytes). */
        lsls        ip, r2, #26
        bcc         .L_memcpy_atail_32
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
.L_memcpy_atail_32:
        bpl         .L_memcpy_atail_16
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
.L_memcpy_atail_16:
        /* C <- bit 4 of r2 (16 bytes), N <- bit 3 of r2 (8 bytes). */
        lsls        ip, r2, #28
        bcc         .L_memcpy_atail_8
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
.L_memcpy_atail_8:
        bpl         .L_memcpy_atail_4
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
.L_memcpy_atail_4:
        /* 4 bytes if bit 2 of r2 is set. */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* C <- bit 1 of r2 (2 bytes), N <- bit 0 of r2 (1 byte). */
        lsls        ip, r2, #31
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* Final byte: no write-back needed, nothing follows. */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

        pop         {r0, pc}

.L_memcpy_lt_128bytes:
        /*
         * Tail for 32..127 bytes when dst was never aligned: same
         * bit-driven sequence, but the stores carry no alignment hints.
         */
        /* C <- bit 6 of r2 (64 bytes), N <- bit 5 of r2 (32 bytes). */
        lsls        ip, r2, #26
        bcc         .L_memcpy_short_32
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
.L_memcpy_short_32:
        bpl         .L_memcpy_lt_32bytes
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
.L_memcpy_lt_32bytes:
        /* C <- bit 4 of r2 (16 bytes), N <- bit 3 of r2 (8 bytes). */
        lsls        ip, r2, #28
        bcc         .L_memcpy_short_8
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0]!
.L_memcpy_short_8:
        bpl         .L_memcpy_short_4
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
.L_memcpy_short_4:
        /* 4 bytes if bit 2 of r2 is set. */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* C <- bit 1 of r2 (2 bytes), N <- bit 0 of r2 (1 byte). */
        lsls        ip, r2, #31
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* Final byte: no write-back needed, nothing follows. */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

.L_memcpy_done:
        pop         {r0, pc}
END(MEMCPY_BASE)