Home | History | Annotate | Download | only in opts
      1 /***************************************************************************
      2  Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
      3 
      4  Licensed under the Apache License, Version 2.0 (the "License"); you
      5  may not use this file except in compliance with the License.  You may
      6  obtain a copy of the License at
      7 
      8  http://www.apache.org/licenses/LICENSE-2.0
      9 
     10  Unless required by applicable law or agreed to in writing, software
     11  distributed under the License is distributed on an "AS IS" BASIS,
     12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
     13  implied.  See the License for the specific language governing
     14  permissions and limitations under the License.
     15  ***************************************************************************/
     16 
/***************************************************************************
  Neon memset16: Fills a buffer with a 16-bit value, using Neon registers
                 for everything except very short fills.
     Inputs:
        s: The buffer to write to
        c: The 16-bit value to write to the buffer
        n: The number of 16-bit items (not bytes) to write
     Outputs:
        r0 still holds s on return.
 ***************************************************************************/
     26 
     27         .code 32
     28         .fpu neon
     29         .align 4
     30         .globl memset16_neon
     31         .func
     32 
     33 memset16_neon:
     34         cmp             r2, #0
     35         bxeq            lr
     36 
     37         /* Keep in mind that r2 -- the count argument -- is for the
     38          * number of 16-bit items to copy.
     39          */
     40         lsl             r2, r2, #1
     41 
     42         push            {r0}
     43 
     44         /* If we have < 8 bytes, just do a quick loop to handle that */
     45         cmp             r2, #8
     46         bgt             memset_gt4
     47 memset_smallcopy_loop:
     48         strh            r1, [r0], #2
     49         subs            r2, r2, #2
     50         bne             memset_smallcopy_loop
     51 memset_smallcopy_done:
     52         pop             {r0}
     53         bx              lr
     54 
     55 memset_gt4:
     56         /*
     57          * Duplicate the r1 lowest 16-bits across r1. The idea is to have
     58          * a register with two 16-bit-values we can copy. We do this by
     59          * duplicating lowest 16-bits of r1 to upper 16-bits.
     60          */
     61         orr             r1, r1, r1, lsl #16
     62         /*
     63          * If we're copying > 64 bytes, then we may want to get
     64          * onto a 16-byte boundary to improve speed even more.
     65          */
     66         cmp             r2, #64
     67         blt             memset_route
     68         ands            r12, r0, #0xf
     69         beq             memset_route
     70         /*
     71          * Determine the number of bytes to move forward to get to the 16-byte
     72          * boundary.  Note that this will be a multiple of 4, since we
     73          * already are word-aligned.
     74          */
     75         rsb             r12, r12, #16
     76         sub             r2, r2, r12
     77         lsls            r12, r12, #29
     78         strmi           r1, [r0], #4
     79         strcs           r1, [r0], #4
     80         strcs           r1, [r0], #4
     81         lsls            r12, r12, #2
     82         strcsh          r1, [r0], #2
     83 memset_route:
     84         /*
     85          * Decide where to route for the maximum copy sizes.  Note that we
     86          * build q0 and q1 depending on if we'll need it, so that's
     87          * interwoven here as well.
     88          */
     89         vdup.u32        d0, r1
     90         cmp             r2, #16
     91         blt             memset_8
     92         vmov            d1, d0
     93         cmp             r2, #64
     94         blt             memset_16
     95         vmov            q1, q0
     96         cmp             r2, #128
     97         blt             memset_32
     98 memset_128:
     99         mov             r12, r2, lsr #7
    100 memset_128_loop:
    101         vst1.64         {q0, q1}, [r0]!
    102         vst1.64         {q0, q1}, [r0]!
    103         vst1.64         {q0, q1}, [r0]!
    104         vst1.64         {q0, q1}, [r0]!
    105         subs            r12, r12, #1
    106         bne             memset_128_loop
    107         ands            r2, r2, #0x7f
    108         beq             memset_end
    109 memset_32:
    110         movs            r12, r2, lsr #5
    111         beq             memset_16
    112 memset_32_loop:
    113         subs            r12, r12, #1
    114         vst1.64         {q0, q1}, [r0]!
    115         bne             memset_32_loop
    116         ands            r2, r2, #0x1f
    117         beq             memset_end
    118 memset_16:
    119         movs            r12, r2, lsr #4
    120         beq             memset_8
    121 memset_16_loop:
    122         subs            r12, r12, #1
    123         vst1.32         {q0}, [r0]!
    124         bne             memset_16_loop
    125         ands            r2, r2, #0xf
    126         beq             memset_end
    127         /*
    128          * memset_8 isn't a loop, since we try to do our loops at 16
    129          * bytes and above.  We should loop there, then drop down here
    130          * to finish the <16-byte versions.  Same for memset_4 and
    131          * memset_1.
    132          */
    133 memset_8:
    134         cmp             r2, #8
    135         blt             memset_4
    136         subs            r2, r2, #8
    137         vst1.32         {d0}, [r0]!
    138 memset_4:
    139         cmp             r2, #4
    140         blt             memset_2
    141         subs            r2, r2, #4
    142         str             r1, [r0], #4
    143 memset_2:
    144         cmp             r2, #0
    145         ble             memset_end
    146         strh            r1, [r0], #2
    147 memset_end:
    148         pop             {r0}
    149         bx              lr
    150 
    151         .endfunc
    152         .end
    153