/*
 * Copyright (C) 2017 The Android Open Source Project
 * All rights reserved.
 *
 * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
     30 
     31 #include <private/bionic_asm.h>
     32 
     33 #define PLDOFFS (16)
     34 #define PLDSIZE (128) /* L2 cache line size */
     35 
     36         .code 32
     37 ENTRY(memcpy)
     38         push            {r0}
     39         .cfi_def_cfa_offset 4
     40         .cfi_rel_offset r0, 0
     41         cmp             r2, #4
     42         blt             .Lneon_lt4
     43         cmp             r2, #16
     44         blt             .Lneon_lt16
     45         cmp             r2, #32
     46         blt             .Lneon_16
     47         cmp              r2, #128
     48         blt              .Lneon_copy_32_a
     49         /* Copy blocks of 128-bytes (word-aligned) at a time*/
     50         /* Code below is optimized for PLDSIZE=128 only */
     51         mov             r12, r2, lsr #7
     52         cmp             r12, #PLDOFFS
     53         ble             .Lneon_copy_128_loop_nopld
     54         sub             r12, #PLDOFFS
     55         pld             [r1, #(PLDOFFS-1)*PLDSIZE]
     56 .Lneon_copy_128_loop_outer:
     57         pld             [r1, #(PLDOFFS*PLDSIZE)]
     58         pld             [r1, #(PLDOFFS)*(PLDSIZE)+64]
     59         vld1.32         {q0, q1}, [r1]!
     60         vld1.32         {q2, q3}, [r1]!
     61         vld1.32         {q8, q9}, [r1]!
     62         vld1.32         {q10, q11}, [r1]!
     63         subs            r12, r12, #1
     64         vst1.32         {q0, q1}, [r0]!
     65         vst1.32         {q2, q3}, [r0]!
     66         vst1.32         {q8, q9}, [r0]!
     67         vst1.32         {q10, q11}, [r0]!
     68         bne             .Lneon_copy_128_loop_outer
     69         mov             r12, #PLDOFFS
     70 .Lneon_copy_128_loop_nopld:
     71         vld1.32         {q0, q1}, [r1]!
     72         vld1.32         {q2, q3}, [r1]!
     73         vld1.32         {q8, q9}, [r1]!
     74         vld1.32         {q10, q11}, [r1]!
     75         subs            r12, r12, #1
     76         vst1.32         {q0, q1}, [r0]!
     77         vst1.32         {q2, q3}, [r0]!
     78         vst1.32         {q8, q9}, [r0]!
     79         vst1.32         {q10, q11}, [r0]!
     80         bne             .Lneon_copy_128_loop_nopld
     81         ands            r2, r2, #0x7f
     82         beq             .Lneon_exit
     83         cmp             r2, #32
     84         blt             .Lneon_16
     85         nop
     86         /* Copy blocks of 32-bytes (word aligned) at a time*/
     87 .Lneon_copy_32_a:
     88         mov             r12, r2, lsr #5
     89 .Lneon_copy_32_loop_a:
     90         vld1.32         {q0,q1}, [r1]!
     91         subs            r12, r12, #1
     92         vst1.32         {q0,q1}, [r0]!
     93         bne             .Lneon_copy_32_loop_a
     94         ands            r2, r2, #0x1f
     95         beq             .Lneon_exit
     96 .Lneon_16:
     97         subs            r2, r2, #16
     98         blt             .Lneon_lt16
     99         vld1.32         {q8}, [r1]!
    100         vst1.32         {q8}, [r0]!
    101         beq             .Lneon_exit
    102 .Lneon_lt16:
    103         movs            r12, r2, lsl #29
    104         bcc             .Lneon_skip8
    105         ldr             r3, [r1], #4
    106         ldr             r12, [r1], #4
    107         str             r3, [r0], #4
    108         str             r12, [r0], #4
    109 .Lneon_skip8:
    110         bpl             .Lneon_lt4
    111         ldr             r3, [r1], #4
    112         str             r3, [r0], #4
    113 .Lneon_lt4:
    114         movs            r2, r2, lsl #31
    115         bcc             .Lneon_lt2
    116         ldrh            r3, [r1], #2
    117         strh            r3, [r0], #2
    118 .Lneon_lt2:
    119         bpl             .Lneon_exit
    120         ldrb            r12, [r1]
    121         strb            r12, [r0]
    122 .Lneon_exit:
    123         pop             {r0}
    124         bx              lr
    125 
    126 END(memcpy)
    127