/* (code-browser header, not part of the source) Home | History | Annotate | Download | only in bionic */
/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014 NVIDIA Corporation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
     29 
#include <private/bionic_asm.h>

        .text
        .syntax unified
        .fpu    neon

/* Tuning knobs: CACHE_LINE_SIZE matches the pld stride below; the two
   BLOCK_SIZE thresholds select one of three main copy loops, which
   differ only in how far ahead they prefetch (the PREFETCH_DISTANCE_*
   values). */
#define CACHE_LINE_SIZE         (64)
#define MEMCPY_BLOCK_SIZE_SMALL (32768)
#define MEMCPY_BLOCK_SIZE_MID   (1048576)
#define PREFETCH_DISTANCE_NEAR  (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_MID   (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_FAR   (CACHE_LINE_SIZE*16)

/*
 * void* memmove(void* dst, const void* src, size_t n)
 *
 * In:  r0 = dst, r1 = src, r2 = n
 * Out: r0 = dst (original value)
 *
 * If dst <= src, or the buffers do not overlap (dst - src >= n), a
 * forward copy is safe and we tail-call memcpy.  Otherwise dst lies
 * inside [src, src+n), so we copy backwards -- highest address first --
 * reading every overlapping byte before it is overwritten.
 */
ENTRY(memmove)
        cmp         r2, #0                  /* nothing to do if n == 0 ... */
        cmpne       r0, r1                  /* ... or if dst == src */
        bxeq        lr
        subs        r3, r0, r1              /* r3 = dst - src (unsigned) */
        bls         .L_jump_to_memcpy       /* dst <= src: forward copy is safe */
        cmp         r2, r3
        bhi         .L_reversed_memcpy      /* n > dst - src: dst overlaps src's tail */

.L_jump_to_memcpy:
        b           memcpy                  /* tail call; memcpy returns dst in r0 */

.L_reversed_memcpy:
        /* Save dst (the return value) and lr; every exit below restores
           both with "pop {r0, pc}". */
        push        {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Point r0/r1 one past the end of each buffer; all copies below
           proceed with descending addresses. */
        add         r0, r0, r2
        add         r1, r1, r2

        /* preload next cache line */
        pld         [r1, #-CACHE_LINE_SIZE]
        pld         [r1, #-CACHE_LINE_SIZE*2]

.L_reversed_memcpy_align_dest:
        /* Deal with very small blocks (< 32bytes) asap */
        cmp         r2, #32
        blo         .L_reversed_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp         r2, #128
        blo         .L_reversed_memcpy_lt_128bytes
        /* align destination to 64 bytes (1 cache line) */
        ands        r3, r0, #0x3f           /* r3 = bytes to copy so dst (end ptr) becomes 64-aligned */
        beq         .L_reversed_memcpy_dispatch
        sub         r2, r2, r3
        /* The flag-setting shifts expose two bits of r3 at a time:
           "movs ip, r3, lsl #31" puts bit 0 into N and bit 1 into C, so
           the mi/cs conditional copies consume 1 and 2 bytes; lsl #29
           tests bits 2 (N) / 3 (C), and lsl #27 bits 4 (N) / 5 (C). */
0:      /* copy 1 byte */
        movs        ip, r3, lsl #31
        ldrbmi      ip, [r1, #-1]!
        strbmi      ip, [r0, #-1]!
1:      /* copy 2 bytes */
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
2:      /* copy 4 bytes */
        movs        ip, r3, lsl #29         /* N = bit 2, C = bit 3 */
        bpl         3f
        sub         r1, r1, #4
        sub         r0, r0, #4
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
3:      /* copy 8 bytes */
        bcc         4f                      /* C still holds bit 3 of r3 */
        sub         r1, r1, #8
        sub         r0, r0, #8
        vld1.8      {d0}, [r1]
        vst1.8      {d0}, [r0, :64]
4:      /* copy 16 bytes */
        movs        ip, r3, lsl #27         /* N = bit 4, C = bit 5 */
        bpl         5f
        sub         r1, r1, #16
        sub         r0, r0, #16
        vld1.8      {q0}, [r1]
        vst1.8      {q0}, [r0, :128]
5:      /* copy 32 bytes */
        bcc         .L_reversed_memcpy_dispatch
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0, :256]

.L_reversed_memcpy_dispatch:
        /* preload more cache lines */
        pld         [r1, #-CACHE_LINE_SIZE*3]
        pld         [r1, #-CACHE_LINE_SIZE*4]

        /* Pick a main loop by remaining size; the three loops copy the
           same 128 bytes per iteration but prefetch different distances
           ahead. */
        cmp         r2, #MEMCPY_BLOCK_SIZE_SMALL
        blo         .L_reversed_memcpy_neon_pld_near
        cmp         r2, #MEMCPY_BLOCK_SIZE_MID
        blo         .L_reversed_memcpy_neon_pld_mid
        b           .L_reversed_memcpy_neon_pld_far

.L_reversed_memcpy_neon_pld_near:
        /* less than 128 bytes? */
        subs        r2, r2, #128
        blo         1f
        sub         r1, r1, #32             /* pre-bias pointers for the post-indexed */
        sub         r0, r0, #32             /* r3 = -32 stepping used in the loop */
        mov         r3, #-32
        .align      4
0:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        bhs         0b                      /* loop while r2 - 128 did not borrow */
        add         r1, r1, #32             /* undo the pre-bias */
        add         r0, r0, #32
1:
        adds        r2, r2, #128            /* restore remainder (0..127) */
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}                /* done: return original dst */

.L_reversed_memcpy_neon_pld_mid:
        /* Only reached with r2 >= MEMCPY_BLOCK_SIZE_SMALL, so this
           first subtraction cannot borrow. */
        subs        r2, r2, #128
        sub         r1, r1, #32
        sub         r0, r0, #32
        mov         r3, #-32
        .align      4
0:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        bhs         0b
        add         r1, r1, #32
        add         r0, r0, #32
1:
        adds        r2, r2, #128
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_neon_pld_far:
        sub         r2, r2, #128
        sub         r0, r0, #128            /* bias down one full 128-byte chunk; the */
        sub         r1, r1, #128            /* loop reads/writes it with ascending "!" */
        .align      4
0:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
        /* read */
        vld1.8      {q0, q1}, [r1]!
        vld1.8      {q2, q3}, [r1]!
        vld1.8      {q8, q9}, [r1]!
        vld1.8      {q10, q11}, [r1]!
        /* write */
        vst1.8      {q0, q1}, [r0, :256]!
        vst1.8      {q2, q3}, [r0, :256]!
        vst1.8      {q8, q9}, [r0, :256]!
        vst1.8      {q10, q11}, [r0, :256]!

        sub         r0, r0, #256            /* +128 inside the loop, -256 here: */
        sub         r1, r1, #256            /* net step is -128 per iteration */
        bhs         0b
        add         r0, r0, #128            /* undo the initial bias */
        add         r1, r1, #128
1:
        adds        r2, r2, #128
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_lt_128bytes:
        /* Tail: r2 < 128.  Its bits are tested two at a time with the
           same flag-setting-shift idiom used for alignment: lsl #26
           puts bit 5 in N and bit 6 in C, lsl #28 bits 3/4, and
           lsl #31 bits 0/1; bit 2 is tested explicitly with ands. */
6:      /* copy 64 bytes */
        movs        ip, r2, lsl #26
        bcc         5f
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
5:      /* copy 32 bytes */
        bpl         4f
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
.L_reversed_memcpy_lt_32bytes:
4:      /* copy 16 bytes */
        movs        ip, r2, lsl #28
        bcc         3f
        sub         r1, r1, #16
        sub         r0, r0, #16
        vld1.8      {q0}, [r1]
        vst1.8      {q0}, [r0]
3:      /* copy 8 bytes */
        bpl         2f
        sub         r1, r1, #8
        sub         r0, r0, #8
        vld1.8      {d0}, [r1]
        vst1.8      {d0}, [r0]
2:      /* copy 4 bytes */
        ands        ip, r2, #0x4
        beq         1f
        sub         r1, r1, #4
        sub         r0, r0, #4
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]
1:      /* copy 2 bytes */
        movs        ip, r2, lsl #31
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
0:      /* copy 1 byte */
        ldrbmi      ip, [r1, #-1]!
        strbmi      ip, [r0, #-1]!

        pop         {r0, pc}                /* return original dst */

END(memmove)
    281