Home | History | Annotate | Download | only in bionic
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  * All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  *  * Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  *  * Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in
     12  *    the documentation and/or other materials provided with the
     13  *    distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     18  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     19  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
     22  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
     23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
     25  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 #include <private/bionic_asm.h>
     30 
     31         /*
     32          * Optimized memset() for ARM.
     33          *
     34          * memset() returns its first argument.
     35          */
     36 
     37         .fpu        neon
     38         .syntax     unified
     39 
     40         // To avoid warning about deprecated instructions, add an explicit
     41         // arch. The code generated is exactly the same.
     42         .arch armv7-a
     43 
     44 ENTRY(__memset_chk)
     45         cmp         r2, r3
     46         bls         memset
     47 
     48         // Preserve lr for backtrace.
     49         push        {lr}
     50         .cfi_def_cfa_offset 4
     51         .cfi_rel_offset lr, 0
     52 
     53         bl          __memset_chk_fail
     54 END(__memset_chk)
     55 
     56 ENTRY(memset)
     57         stmfd       sp!, {r0}
     58         .cfi_def_cfa_offset 4
     59         .cfi_rel_offset r0, 0
     60 
     61         // The new algorithm is slower for copies < 16 so use the old
     62         // neon code in that case.
     63         cmp         r2, #16
     64         blo         .L_set_less_than_16_unknown_align
     65 
     66         // Use strd which requires an even and odd register so move the
     67         // values so that:
     68         //   r0 and r1 contain the memset value
     69         //   r2 is the number of bytes to set
     70         //   r3 is the destination pointer
     71         mov         r3, r0
     72 
     73         // Copy the byte value in every byte of r1.
     74         mov         r1, r1, lsl #24
     75         orr         r1, r1, r1, lsr #8
     76         orr         r1, r1, r1, lsr #16
     77 
     78 .L_check_alignment:
     79         // Align destination to a double word to avoid the strd crossing
     80         // a cache line boundary.
     81         ands        ip, r3, #7
     82         bne         .L_do_double_word_align
     83 
     84 .L_double_word_aligned:
     85         mov         r0, r1
     86 
     87         subs        r2, #64
     88         blo         .L_set_less_than_64
     89 
     90 1:      // Main loop sets 64 bytes at a time.
     91         .irp        offset, #0, #8, #16, #24, #32, #40, #48, #56
     92         strd        r0, r1, [r3, \offset]
     93         .endr
     94 
     95         add         r3, #64
     96         subs        r2, #64
     97         bge         1b
     98 
     99 .L_set_less_than_64:
    100         // Restore r2 to the count of bytes left to set.
    101         add         r2, #64
    102         lsls        ip, r2, #27
    103         bcc         .L_set_less_than_32
    104         // Set 32 bytes.
    105         .irp        offset, #0, #8, #16, #24
    106         strd        r0, r1, [r3, \offset]
    107         .endr
    108         add         r3, #32
    109 
    110 .L_set_less_than_32:
    111         bpl         .L_set_less_than_16
    112         // Set 16 bytes.
    113         .irp        offset, #0, #8
    114         strd        r0, r1, [r3, \offset]
    115         .endr
    116         add         r3, #16
    117 
    118 .L_set_less_than_16:
    119         // Less than 16 bytes to set.
    120         lsls        ip, r2, #29
    121         bcc         .L_set_less_than_8
    122 
    123         // Set 8 bytes.
    124         strd        r0, r1, [r3], #8
    125 
    126 .L_set_less_than_8:
    127         bpl         .L_set_less_than_4
    128         // Set 4 bytes
    129         str         r1, [r3], #4
    130 
    131 .L_set_less_than_4:
    132         lsls        ip, r2, #31
    133         it          ne
    134         strbne      r1, [r3], #1
    135         itt         cs
    136         strbcs      r1, [r3], #1
    137         strbcs      r1, [r3]
    138 
    139         ldmfd       sp!, {r0}
    140         bx          lr
    141 
    142 .L_do_double_word_align:
    143         rsb         ip, ip, #8
    144         sub         r2, r2, ip
    145         movs        r0, ip, lsl #31
    146         it          mi
    147         strbmi      r1, [r3], #1
    148         itt         cs
    149         strbcs      r1, [r3], #1
    150         strbcs      r1, [r3], #1
    151 
    152         // Dst is at least word aligned by this point.
    153         cmp         ip, #4
    154         blo         .L_double_word_aligned
    155         str         r1, [r3], #4
    156         b           .L_double_word_aligned
    157 
    158 .L_set_less_than_16_unknown_align:
    159         // Set up to 15 bytes.
    160         vdup.8      d0, r1
    161         movs        ip, r2, lsl #29
    162         bcc         1f
    163         vst1.8      {d0}, [r0]!
    164 1:      bge         2f
    165         vst1.32     {d0[0]}, [r0]!
    166 2:      movs        ip, r2, lsl #31
    167         it          mi
    168         strbmi      r1, [r0], #1
    169         itt         cs
    170         strbcs      r1, [r0], #1
    171         strbcs      r1, [r0], #1
    172         ldmfd       sp!, {r0}
    173         bx          lr
    174 END(memset)
    175