/* Home | History | Annotate | Download | only in bionic */
      1 /*
      2  * Copyright (C) 2008 The Android Open Source Project
      3  * All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  *  * Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  *  * Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in
     12  *    the documentation and/or other materials provided with the
     13  *    distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     18  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     19  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
     22  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
     23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
     25  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  * This code assumes it is running on a processor that supports all arm v7
     31  * instructions, that supports neon instructions, and that has a 32 byte
     32  * cache line.
     33  */
     34 
ENTRY_PRIVATE(MEMCPY_BASE)
        // NEON memcpy core: r0 = dst, r1 = src, r2 = byte count.
        // Contract (established by the ldmfd at the end): the caller has
        // already pushed {r0, lr} — the original dst and the return
        // address — so this routine returns with "ldmfd sp!, {r0, pc}",
        // handing the original dst back in r0 as memcpy's return value.
        // The CFI directives below describe that 8-byte caller frame.
        // Clobbers: r3, ip, lr, d0-d7 (this file assumes NEON and a
        // 32-byte cache line, per the header comment).
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // The alignment preamble needs at least 16 bytes to work with;
        // smaller copies go straight to the 0-15 byte tail copier at 5f.
        cmp         r2, #16
        blo         5f

        /* If dst and src are mutually 32-bit aligned (their low 2 address
         * bits agree), use the ARM-only ldm/stm version instead. */
        eor         r3, r0, r1
        ands        r3, r3, #0x3
        beq         MEMCPY_BASE_ALIGNED

        /* Only align the destination for large copies: below 224 bytes
         * the alignment preamble costs more than it saves with NEON
         * unaligned accesses. */
        cmp         r2, #224
        blo         3f

        /* align destination to 16 bytes for the write-buffer */
        rsb         r3, r0, #0              // r3 = bytes needed to reach
        ands        r3, r3, #0xF            //      a 16-byte dst boundary
        beq         3f

        /* Copy up to 15 bytes (count in r3).  "movs ip, r3, lsl #31"
         * moves bit 0 of r3 into N and bit 1 into C, so the conditional
         * byte copies below handle 1 byte (mi) and 2 bytes (cs). */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
        itt         mi
        ldrbmi      lr, [r1], #1
        strbmi      lr, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1
        movs        ip, r3, lsl #29         // bit 2 -> N, bit 3 -> C
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld         [r1, #0]
        pld         [r1, #(32 * 2)]
3:
        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need (32-byte lines assumed) */
        pld         [r1, #(32 * 4)]
        pld         [r1, #(32 * 6)]

1:      /* The main loop copies 64 bytes at a time, preloading 6 cache
         * lines (192 bytes) ahead of the advancing source pointer. */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(32 * 6)]
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0]!
        vst1.8      {d4 - d7}, [r0]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0]!
5:      /* Copy the final 0-15 bytes (count in r2).  lsl #29 puts bit 3
         * of r2 in C (8-byte chunk) and bit 2 in N (4-byte chunk);
         * lsl #31 then puts bit 1 in C (2 bytes) and bit 0 in N (1 byte). */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0]!
2:      movs        ip, r2, lsl #31
        itt         mi
        ldrbmi      r3, [r1], #1
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1

        // Pop the dst pointer the caller saved into r0 and return.
        ldmfd       sp!, {r0, pc}
END(MEMCPY_BASE)
    138 
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
        // ARM-only (ldm/stm) memcpy core, reached when dst and src are
        // mutually 32-bit aligned: r0 = dst, r1 = src, r2 = byte count.
        // Same frame contract as MEMCPY_BASE: the caller has already
        // pushed {r0, lr}, and the final "ldmfd sp!, {r0, pc}" returns
        // the original dst in r0.  The CFI below describes that frame.
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Simple arm-only copy loop to handle aligned copy operations */
        stmfd       sp!, {r4-r8}            // extra scratch for the 8-register ldm/stm below
        .cfi_adjust_cfa_offset 20
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        .cfi_rel_offset r6, 8
        .cfi_rel_offset r7, 12
        .cfi_rel_offset r8, 16
        pld         [r1, #(32 * 4)]

        /* Check alignment */
        rsb         r3, r1, #0              // r3 = bytes to the next
        ands        r3, #3                  //      32-bit src boundary
        beq         2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         * "movs r12, r3, lsl #31" puts bit 0 of r3 in N (copy 1 byte)
         * and bit 1 in C (copy 2 bytes).
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
        itt         mi
        ldrbmi      r3, [r1], #1
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      r4, [r1], #1
        ldrbcs      r5, [r1], #1
        strbcs      r4, [r0], #1
        strbcs      r5, [r0], #1

2:
        subs        r2, r2, #64
        blt         4f

3:      /* Main copy loop, copying 64 bytes at a time */
        pld         [r1, #(32 * 8)]
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, r2, #64
        bge         3b

4:      /* Check if there are > 32 bytes left */
        adds        r2, r2, #64             // undo the loop's last bias
        subs        r2, r2, #32
        blt         5f

        /* Copy 32 bytes */
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, #32

5:      /* Handle any remaining bytes (r2 biased by -32 at this point) */
        adds        r2, #32
        beq         6f

        /* Tail copy driven by the flag trick: lsl #28 puts bit 4 of r2
         * in C (16 bytes) and bit 3 in N (8 bytes); lsl #30 puts bit 2
         * in C (4 bytes) and bit 1 in N (2 bytes); tst handles bit 0. */
        movs        r12, r2, lsl #28
        itt         cs
        ldmiacs     r1!, {r3, r4, r5, r6}   /* 16 bytes */
        stmiacs     r0!, {r3, r4, r5, r6}
        itt         mi
        ldmiami     r1!, {r7, r8}           /*  8 bytes */
        stmiami     r0!, {r7, r8}
        movs        r12, r2, lsl #30
        itt         cs
        ldrcs       r3, [r1], #4            /*  4 bytes */
        strcs       r3, [r0], #4
        itt         mi
        ldrhmi      r4, [r1], #2            /*  2 bytes */
        strhmi      r4, [r0], #2
        tst         r2, #0x1
        itt         ne
        ldrbne      r3, [r1]                /*  last byte  */
        strbne      r3, [r0]
6:
        // Restore scratch registers, then pop the caller-saved dst into
        // r0 and return.
        ldmfd       sp!, {r4-r8}
        ldmfd       sp!, {r0, pc}
END(MEMCPY_BASE_ALIGNED)
    223