Home | History | Annotate | Download | only in bionic
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
      4  * All rights reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  *  * Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  *  * Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in
     13  *    the documentation and/or other materials provided with the
     14  *    distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     19  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     20  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     22  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
     23  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
     24  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     25  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
     26  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  */
     29 
     30 #include <private/bionic_asm.h>
     31 
     32         /*
     33          * Optimized memset() for ARM.
     34          *
     35          * memset() returns its first argument.
     36          */
     37 
     38         .cpu        cortex-a15      // core the assembler targets for this file
     39         .fpu        neon            // enables the NEON instructions (vdup/vmov/vst1/vstm) used below
     40         .syntax     unified         // unified (UAL) ARM/Thumb assembler syntax
     41 
     42 ENTRY(__memset_chk)     // checked entry point: compares r2 with r3 before handing over to memset
     43         cmp         r2, r3          // presumably r2 = byte count, r3 = destination capacity -- confirm against callers
     44         bls         memset          // r2 <= r3 (unsigned): branch to memset with r0-r2 untouched
     45 
     46         // Preserve lr for backtrace.
     47         push        {lr}
     48         .cfi_def_cfa_offset 4       // one register was pushed, so the CFA is now sp + 4
     49         .cfi_rel_offset lr, 0       // the saved lr lives at offset 0 from the current sp
     50 
     51         bl          __memset_chk_fail       // reached only when r2 > r3; no code follows, so presumably this never returns
     52 END(__memset_chk)
     53 
     54 ENTRY(memset)           // r0 = destination, r1 = fill value (low byte used), r2 = byte count; r0 is never written
     55         pldw        [r0]            // preload-for-write hint on the start of the destination
     56         mov         r3, r0          // work on a copy so r0 still holds the return value at exit
     57 
     58         // Duplicate the low byte of r1 into all four bytes of r1.
     59         mov         r1, r1, lsl #24         // byte -> bits 31:24; upper input bits are discarded
     60         orr         r1, r1, r1, lsr #8      // copy it into bits 23:16
     61         orr         r1, r1, r1, lsr #16     // copy the upper halfword into the lower halfword
     62 
     63         cmp         r2, #16
     64         blo         .L_less_than_16         // unsigned count < 16: scalar-only path
     65 
     66         // This section handles regions 16 bytes or larger
     67         //
     68         // Use aligned vst1.8 and vstm when possible.  Register values will be:
     69         //   ip is scratch
     70         //   q0, q1, and r1 contain the memset value
     71         //   r2 is the number of bytes to set
     72         //   r3 is the advancing destination pointer
     73         vdup.32     q0, r1                  // replicate the fill word into all four lanes of q0
     74 
     75         ands        ip, r3, 0xF             // ip = destination offset within a 16-byte block
     76         beq         .L_memset_aligned       // offset 0: already 16-byte aligned
     77 
     78         // Align dest pointer to 16-byte boundary.
     79         pldw        [r0, #64]
     80         rsb         ip, ip, #16             // ip = 16 - offset = bytes needed to reach the boundary (1..15)
     81 
     82         // Pre-adjust the byte count to reflect post-alignment value.  Expecting
     83         // 8-byte alignment to be rather common so we special case that one.
     84         sub         r2, r2, ip              // cannot underflow: r2 >= 16 and ip <= 15 on this path
     85 
     86         /* set 1 byte */
     87         tst         ip, #1
     88         it          ne
     89         strbne      r1, [r3], #1
     90         /* set 2 bytes */
     91         tst         ip, #2
     92         it          ne
     93         strhne      r1, [r3], #2
     94         /* set 4 bytes */
     95         movs        ip, ip, lsl #29         // N = old bit 2 (4 bytes), C = old bit 3 (8 bytes)
     96         it          mi
     97         strmi       r1, [r3], #4
     98         /* set 8 bytes */
     99         itt         cs
    100         strcs       r1, [r3], #4            // C from the movs above is still live: stores do not change flags
    101         strcs       r1, [r3], #4
    102 
    103 .L_memset_aligned:
    104         // Destination is now 16-byte aligned.  Determine how to handle
    105         // remaining bytes.
    106         vmov        q1, q0
    107         cmp         r2, #128
    108         blo         .L_less_than_128        // fewer than 128 bytes left: skip the bulk loop
    109 
    110         // We need to set a larger block of memory.  Use four Q regs to
    111         // set a full cache line in one instruction.  Pre-decrement
    112         // r2 to simplify end-of-loop detection
    113         vmov        q2, q0
    114         vmov        q3, q0
    115         pldw        [r0, #128]
    116         sub         r2, r2, #128            // bias: inside the loop r2 = bytes remaining - 128
    117         .align 4                            // 2^4 = 16-byte alignment for the loop head
    118 .L_memset_loop_128:
    119         pldw        [r3, #192]              // write-prefetch 192 bytes ahead of the store pointer
    120         vstm        r3!, {q0, q1, q2, q3}   // 64 bytes, r3 advances
    121         vstm        r3!, {q0, q1, q2, q3}   // another 64 bytes: 128 per iteration
    122         subs        r2, r2, #128
    123         bhs         .L_memset_loop_128      // no borrow: at least another 128 bytes remain
    124 
    125         // Un-bias r2 so it contains the number of bytes left.  Early
    126         // exit if we are done.
    127         adds        r2, r2, #128            // r2 is now 0..127; Z set when nothing is left
    128         beq         2f
    129 
    130         .align 4
    131 .L_less_than_128:
    132         // set 64 bytes
    133         movs        ip, r2, lsl #26         // C = count bit 6 (64), N = bit 5 (32), Z = bits 5:0 all clear
    134         bcc         1f
    135         vst1.8      {q0, q1}, [r3, :128]!
    136         vst1.8      {q0, q1}, [r3, :128]!
    137         beq         2f                      // nothing below 64 remains: done
    138 1:
    139         // set 32 bytes
    140         bpl         1f                      // N clear: bit 5 not set, no 32-byte chunk
    141         vst1.8      {q0, q1}, [r3, :128]!
    142 1:
    143         // set 16 bytes
    144         movs        ip, r2, lsl #28         // C = count bit 4 (16), N = bit 3 (8), Z = bits 3:0 all clear
    145         bcc         1f
    146         vst1.8      {q0}, [r3, :128]!
    147         beq         2f                      // nothing below 16 remains: done
    148 1:
    149         // set 8 bytes
    150         bpl         1f                      // N clear: bit 3 not set, no 8-byte chunk
    151         vst1.8      {d0}, [r3, :64]!
    152 1:
    153         // set 4 bytes
    154         tst         r2, #4
    155         it          ne
    156         strne       r1, [r3], #4
    157 1:                                          // no branch in this function targets this label instance
    158         // set 2 bytes
    159         movs        ip, r2, lsl #31         // C = count bit 1 (2 bytes), N = bit 0 (1 byte)
    160         it          cs
    161         strhcs      r1, [r3], #2
    162         // set 1 byte
    163         it          mi
    164         strbmi      r1, [r3]
    165 2:
    166         bx          lr                      // r0 still holds the original destination pointer
    167 
    168 .L_less_than_16:
    169         // Store up to 15 bytes without worrying about byte alignment
    170         movs        ip, r2, lsl #29         // C = count bit 3 (8), N = bit 2 (4), Z = bits 2:0 all clear
    171         bcc         1f
    172         str         r1, [r3], #4
    173         str         r1, [r3], #4
    174         beq         2f                      // count was exactly 8: done
    175 1:
    176         it          mi
    177         strmi       r1, [r3], #4            // N from the movs above is still live here
    178         movs        ip, r2, lsl #31         // N = count bit 0 (1 byte), C = bit 1 (2 bytes)
    179         it          mi
    180         strbmi      r1, [r3], #1
    181         itt         cs
    182         strbcs      r1, [r3], #1            // the 2-byte tail is written as two byte stores
    183         strbcs      r1, [r3]
    184 2:
    185         bx          lr                      // r0 still holds the original destination pointer
    186 END(memset)
    187