Home | History | Annotate | Download | only in opts
      1 /***************************************************************************
      2  * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  ***************************************************************************/
      7 
/***************************************************************************
  Neon memset16: Fills a buffer with a 16-bit value, using Neon registers
  for the bulk of the stores when the buffer is large enough.
     Inputs:
        s: The buffer to write to
        c: The integer data to write to the buffer (the low 16 bits are
           stored to each item)
        n: The size_t count, in 16-bit items (not bytes).
     Outputs:
        r0 holds the original value of s on return.

***************************************************************************/
     17 
     18         .code 32
     19         .fpu neon
     20         .align 4
     21         .globl memset16_neon
     22         .func
     23 
     24 memset16_neon:
     25         cmp             r2, #0
     26         bxeq            lr
     27 
     28         /* Keep in mind that r2 -- the count argument -- is for the
     29          * number of 16-bit items to copy.
     30          */
     31         lsl             r2, r2, #1
     32 
     33         push            {r0}
     34 
     35         /* If we have < 8 bytes, just do a quick loop to handle that */
     36         cmp             r2, #8
     37         bgt             memset_gt4
     38 memset_smallcopy_loop:
     39         strh            r1, [r0], #2
     40         subs            r2, r2, #2
     41         bne             memset_smallcopy_loop
     42 memset_smallcopy_done:
     43         pop             {r0}
     44         bx              lr
     45 
     46 memset_gt4:
     47         /*
     48          * Duplicate the r1 lowest 16-bits across r1. The idea is to have
     49          * a register with two 16-bit-values we can copy. We do this by
     50          * duplicating lowest 16-bits of r1 to upper 16-bits.
     51          */
     52         orr             r1, r1, r1, lsl #16
     53         /*
     54          * If we're copying > 64 bytes, then we may want to get
     55          * onto a 16-byte boundary to improve speed even more.
     56          */
     57         cmp             r2, #64
     58         blt             memset_route
     59         ands            r12, r0, #0xf
     60         beq             memset_route
     61         /*
     62          * Determine the number of bytes to move forward to get to the 16-byte
     63          * boundary.  Note that this will be a multiple of 4, since we
     64          * already are word-aligned.
     65          */
     66         rsb             r12, r12, #16
     67         sub             r2, r2, r12
     68         lsls            r12, r12, #29
     69         strmi           r1, [r0], #4
     70         strcs           r1, [r0], #4
     71         strcs           r1, [r0], #4
     72         lsls            r12, r12, #2
     73         strcsh          r1, [r0], #2
     74 memset_route:
     75         /*
     76          * Decide where to route for the maximum copy sizes.  Note that we
     77          * build q0 and q1 depending on if we'll need it, so that's
     78          * interwoven here as well.
     79          */
     80         vdup.u32        d0, r1
     81         cmp             r2, #16
     82         blt             memset_8
     83         vmov            d1, d0
     84         cmp             r2, #64
     85         blt             memset_16
     86         vmov            q1, q0
     87         cmp             r2, #128
     88         blt             memset_32
     89 memset_128:
     90         mov             r12, r2, lsr #7
     91 memset_128_loop:
     92         vst1.64         {q0, q1}, [r0]!
     93         vst1.64         {q0, q1}, [r0]!
     94         vst1.64         {q0, q1}, [r0]!
     95         vst1.64         {q0, q1}, [r0]!
     96         subs            r12, r12, #1
     97         bne             memset_128_loop
     98         ands            r2, r2, #0x7f
     99         beq             memset_end
    100 memset_32:
    101         movs            r12, r2, lsr #5
    102         beq             memset_16
    103 memset_32_loop:
    104         subs            r12, r12, #1
    105         vst1.64         {q0, q1}, [r0]!
    106         bne             memset_32_loop
    107         ands            r2, r2, #0x1f
    108         beq             memset_end
    109 memset_16:
    110         movs            r12, r2, lsr #4
    111         beq             memset_8
    112 memset_16_loop:
    113         subs            r12, r12, #1
    114         vst1.32         {q0}, [r0]!
    115         bne             memset_16_loop
    116         ands            r2, r2, #0xf
    117         beq             memset_end
    118         /*
    119          * memset_8 isn't a loop, since we try to do our loops at 16
    120          * bytes and above.  We should loop there, then drop down here
    121          * to finish the <16-byte versions.  Same for memset_4 and
    122          * memset_1.
    123          */
    124 memset_8:
    125         cmp             r2, #8
    126         blt             memset_4
    127         subs            r2, r2, #8
    128         vst1.32         {d0}, [r0]!
    129 memset_4:
    130         cmp             r2, #4
    131         blt             memset_2
    132         subs            r2, r2, #4
    133         str             r1, [r0], #4
    134 memset_2:
    135         cmp             r2, #0
    136         ble             memset_end
    137         strh            r1, [r0], #2
    138 memset_end:
    139         pop             {r0}
    140         bx              lr
    141 
    142         .endfunc
    143         .end
    144