/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014, NVIDIA Corporation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#define CACHE_LINE_SIZE         (64)
#define PREFETCH_DISTANCE       (CACHE_LINE_SIZE*6)
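
/*
 * The main copy loops below issue preloads PREFETCH_DISTANCE bytes
 * (6 cache lines, 384 bytes) ahead of the current source pointer.
 */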

ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4
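        // The code that branches here is expected to have pushed {r0, lr}
        // (that is what the CFI offsets above describe); every exit below
        // pops {r0, pc}, so the original dest pointer is returned in r0.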

        /* trivial cases: zero-length copy, or dst == src */
        cmp         r2, #0
        beq         .L_memcpy_done
        cmp         r0, r1
        beq         .L_memcpy_done

        /* preload next cache line */
        pld         [r1, #CACHE_LINE_SIZE*1]

        /* Deal with very small blocks (< 32 bytes) asap */
        cmp         r2, #32
        blo         .L_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp         r2, #128
        blo         .L_memcpy_lt_128bytes

        /* large copy, align dest to a 64 byte boundary */
        pld         [r1, #CACHE_LINE_SIZE*2]
        rsb         r3, r0, #0
        ands        r3, r3, #0x3F
        pld         [r1, #CACHE_LINE_SIZE*3]
        beq         .L_memcpy_dispatch
        sub         r2, r2, r3
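        /*
         * r3 = (-dest) & 0x3F is the number of bytes needed to reach a
         * 64-byte aligned dest.  The shifted movs below move individual
         * bits of r3 into the N and C flags so each power-of-two chunk
         * can be copied conditionally:
         *   lsl #31: N = bit 0 (1 byte),   C = bit 1 (2 bytes)
         *   lsl #29: N = bit 2 (4 bytes),  C = bit 3 (8 bytes)
         *   lsl #27: N = bit 4 (16 bytes), C = bit 5 (32 bytes)
         */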
        /* copy 1 byte */
        movs        ip, r3, lsl #31
        itt         mi
        ldrbmi      ip, [r1], #1
        strbmi      ip, [r0], #1
        /* copy 2 bytes */
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 4 bytes */
        movs        ip, r3, lsl #29
        itt         mi
        ldrmi       ip, [r1], #4
        strmi       ip, [r0], #4
        /* copy 8 bytes */
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
1:      /* copy 16 bytes */
        movs        ip, r3, lsl #27
        bpl         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
1:      /* copy 32 bytes */
        bcc         .L_memcpy_dispatch
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

.L_memcpy_dispatch:
        // Pre-decrement by 128 to detect the nearly-done condition easily,
        // but also check whether fewer than 128 bytes are left at this
        // point because of the alignment code above.
        subs        r2, r2, #128
        blo         .L_memcpy_lt_128presub

        // Denver does better if both source and dest are aligned, so
        // we'll special-case that even though the code is virtually identical
        tst         r1, #0xF
        bne         .L_memcpy_neon_unalign_src_pld

        // DRAM copies (here, anything over 32 KiB) should be throttled
        // slightly to get full bandwidth, so send them to the loop without
        // the source alignment hint.
        cmp         r2, #32768
        bhi         .L_memcpy_neon_unalign_src_pld
        .align      4
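        // Main loop for an aligned source: both src and dest are 16-byte
        // aligned here, so the loads can use the :128 alignment hint.
        // Each iteration copies 128 bytes as four 32-byte NEON transfers
        // and preloads two cache lines.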
1:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         1b
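        /* loop done: undo the pre-decrement; 0..127 bytes may remain */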
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

        .align      4
.L_memcpy_neon_unalign_src_pld:
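        // Same 128-byte loop as above, but the source may not be 16-byte
        // aligned, so its loads carry no alignment hint.  The dest stores
        // keep the :256 hint because dest was aligned to 64 bytes earlier.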
1:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         1b
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

.L_memcpy_lt_128presub:
        add         r2, r2, #128
.L_memcpy_lt_128bytes_align:
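        /*
         * Tail copy for an aligned dest: fewer than 128 bytes remain in r2.
         * As in the alignment code above, shifted movs move bits of r2 into
         * the N and C flags to select each power-of-two chunk:
         *   lsl #26: C = bit 6 (64 bytes), N = bit 5 (32 bytes)
         *   lsl #28: C = bit 4 (16 bytes), N = bit 3 (8 bytes)
         *   lsl #31: C = bit 1 (2 bytes),  N = bit 0 (1 byte)
         * Dest is still 64-byte aligned on entry here, so the NEON stores
         * keep their alignment hints.
         */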
        /* copy 64 bytes */
        movs        ip, r2, lsl #26
        bcc         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
1:      /* copy 32 bytes */
        bpl         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
1:      /* copy 16 bytes */
        movs        ip, r2, lsl #28
        bcc         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
1:      /* copy 8 bytes */
        bpl         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
1:      /* copy 4 bytes */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* copy 2 bytes */
        movs        ip, r2, lsl #31
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 1 byte */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

        pop         {r0, pc}

.L_memcpy_lt_128bytes:
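        /*
         * Tail copy for lengths < 128 that skipped the alignment step:
         * the same bit-by-bit scheme as above, but neither src nor dest is
         * known to be aligned, so no alignment hints are used.
         */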
        /* copy 64 bytes */
        movs        ip, r2, lsl #26
        bcc         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
1:      /* copy 32 bytes */
        bpl         .L_memcpy_lt_32bytes
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
.L_memcpy_lt_32bytes:
        /* copy 16 bytes */
        movs        ip, r2, lsl #28
        bcc         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0]!
1:      /* copy 8 bytes */
        bpl         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      /* copy 4 bytes */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* copy 2 bytes */
        movs        ip, r2, lsl #31
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 1 byte */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

.L_memcpy_done:
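        /* restore the saved dest pointer into r0 (memcpy returns dest) and return */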
        pop         {r0, pc}
END(MEMCPY_BASE)