Home | History | Annotate | Download | only in opts
      1 /***************************************************************************
      2  * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  ***************************************************************************/
      7 
      /***************************************************************************
       * memset16_neon: fill a buffer with a repeated 16-bit value, using NEON
       * stores for the bulk of the work.
       *
       * Inputs (ARM procedure call standard register assignment):
       *    r0 = s: the buffer to write to; expected to be 2-byte aligned,
       *            since the data is written with halfword-sized stores
       *    r1 = c: the value to write; only the low 16 bits are used
       *    r2 = n: the number of 16-bit elements (not bytes) to write
       * Outputs:
       *    r0 = s, unchanged
       * Clobbers:
       *    r1, r2, r12, q0, q1 and the condition flags
       * Stack:
       *    one word (the saved r0); this is a leaf routine and makes no calls
       *
       * A count of zero, or one with bit 31 set (a negative int, or a length
       * whose byte size does not fit in 32 bits), stores nothing.
       ***************************************************************************/

        .syntax unified
        .arm
        .fpu    neon

        .text
        .p2align 4
        .globl  memset16_neon
        /* Mark the symbol as a function so the linker can apply ARM/Thumb
         * interworking to calls that target it. */
        .type   memset16_neon, %function

memset16_neon:
        cmp     r2, #0
        bxle    lr                      /* nothing to do for n <= 0 */

        /* From here on r2 counts bytes, not elements. It is always even
         * and is treated as unsigned. */
        lsl     r2, r2, #1

        /* r0 is used as the write cursor; keep the original for the return. */
        push    {r0}

        /* Up to 8 bytes: a plain halfword loop is cheapest. */
        cmp     r2, #8
        bhi     .Lm16_large
.Lm16_small_loop:
        strh    r1, [r0], #2
        subs    r2, r2, #2
        bne     .Lm16_small_loop
        pop     {r0}
        bx      lr

.Lm16_large:
        /*
         * Build a 32-bit pattern holding the value twice. Shifting first
         * discards whatever was in bits 31:16 of r1, so only the low 16
         * bits of the argument ever reach memory.
         */
        mov     r1, r1, lsl #16
        orr     r1, r1, r1, lsr #16

        /*
         * For 64 bytes or more, first advance to a 16-byte boundary so the
         * bulk loops run on an aligned destination.
         */
        cmp     r2, #64
        blo     .Lm16_route
        ands    r12, r0, #0xf
        beq     .Lm16_route

        /*
         * r12 = bytes needed to reach the boundary (2..14 for a halfword-
         * aligned buffer). The halfword goes first so that the word stores
         * which follow land on word-aligned addresses.
         */
        rsb     r12, r12, #16
        sub     r2, r2, r12
        tst     r12, #2
        strhne  r1, [r0], #2
        lsls    r12, r12, #29           /* N = bit 2 (4 bytes), C = bit 3 (8 bytes) */
        strmi   r1, [r0], #4
        strcs   r1, [r0], #4
        strcs   r1, [r0], #4

.Lm16_route:
        /*
         * Pick the widest loop the remaining length allows, filling d0, then
         * q0, then q1 with the pattern only as far as the chosen path needs.
         * NEON stores use 16-bit elements so a halfword-aligned address is
         * always naturally aligned for them.
         */
        vdup.32 d0, r1
        cmp     r2, #16
        blo     .Lm16_tail8
        vmov    d1, d0
        cmp     r2, #64
        blo     .Lm16_by16
        vmov    q1, q0
        cmp     r2, #128
        blo     .Lm16_by32

        /* 128 bytes per iteration. */
        mov     r12, r2, lsr #7
.Lm16_by128_loop:
        vst1.16 {d0-d3}, [r0]!
        vst1.16 {d0-d3}, [r0]!
        vst1.16 {d0-d3}, [r0]!
        vst1.16 {d0-d3}, [r0]!
        subs    r12, r12, #1
        bne     .Lm16_by128_loop
        ands    r2, r2, #0x7f
        beq     .Lm16_done

        /* 32 bytes per iteration. */
.Lm16_by32:
        movs    r12, r2, lsr #5
        beq     .Lm16_by16
.Lm16_by32_loop:
        subs    r12, r12, #1
        vst1.16 {d0-d3}, [r0]!          /* NEON store leaves the flags alone */
        bne     .Lm16_by32_loop
        ands    r2, r2, #0x1f
        beq     .Lm16_done

        /* 16 bytes per iteration. */
.Lm16_by16:
        movs    r12, r2, lsr #4
        beq     .Lm16_tail8
.Lm16_by16_loop:
        subs    r12, r12, #1
        vst1.16 {d0, d1}, [r0]!
        bne     .Lm16_by16_loop
        ands    r2, r2, #0xf
        beq     .Lm16_done

        /*
         * Fewer than 16 bytes remain. The 8-, 4- and 2-byte steps are
         * straight-line code; each runs at most once.
         */
.Lm16_tail8:
        cmp     r2, #8
        blo     .Lm16_tail4
        sub     r2, r2, #8
        vst1.16 {d0}, [r0]!
.Lm16_tail4:
        cmp     r2, #4
        blo     .Lm16_tail2
        sub     r2, r2, #4
        /* Two halfword stores: r0 may be only 2-byte aligned here. */
        strh    r1, [r0], #2
        strh    r1, [r0], #2
.Lm16_tail2:
        cmp     r2, #0
        strhne  r1, [r0]
.Lm16_done:
        pop     {r0}
        bx      lr

        .size   memset16_neon, . - memset16_neon

        .end
    144