Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright (C) 2010 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /* Changes:
     18  * 2010-08-11 Steve McIntyre <steve.mcintyre (at) arm.com>
     19  *    Added small changes to the two functions to make them work on the
     20  *    specified number of 16- or 32-bit values rather than the original
     21  *    code which was specified as a count of bytes. More verbose comments
     22  *    to aid future maintenance.
     23  */
     24 
     25     .text
     26     .align
     27 
     28     .global arm_memset32
     29     .type   arm_memset32, %function
     30     .global arm_memset16
     31     .type   arm_memset16, %function
     32 
     33 /*
     34  * Optimized memset functions for ARM.
     35  *
     36  * void arm_memset16(uint16_t* dst, uint16_t value, int count);
     37  * void arm_memset32(uint32_t* dst, uint32_t value, int count);
     38  *
     39  */
     40 arm_memset16:
     41         .fnstart
     42         push        {lr}
     43 
     44         /* expand the data to 32 bits */
     45         orr         r1, r1, lsl #16
     46 
     47         /* align to 32 bits */
     48         tst         r0, #2
     49         strneh      r1, [r0], #2
     50         subne       r2, r2, #2
     51 
     52         /* Multiply count by 2 - go from the number of 16-bit shorts
     53          * to the number of bytes desired. */
     54         mov         r2, r2, lsl #1
     55 
     56         /* Now jump into the main loop below. */
     57         b           .Lwork_32
     58         .fnend
     59 
     60 arm_memset32:
     61         .fnstart
     62         push        {lr}
     63 
     64         /* Multiply count by 4 - go from the number of 32-bit words to
     65          * the number of bytes desired. */
     66         mov         r2, r2, lsl #2
     67 
     68 .Lwork_32:
     69         /* Set up registers ready for writing them out. */
     70         mov         ip, r1
     71         mov         lr, r1
     72 
     73         /* Try to align the destination to a cache line. Assume 32
     74          * byte (8 word) cache lines, it's the common case. */
     75         rsb         r3, r0, #0
     76         ands        r3, r3, #0x1C
     77         beq         .Laligned32
     78         cmp         r3, r2
     79         andhi       r3, r2, #0x1C
     80         sub         r2, r2, r3
     81 
     82         /* (Optionally) write any unaligned leading bytes.
     83          * (0-28 bytes, length in r3) */
     84         movs        r3, r3, lsl #28
     85         stmcsia     r0!, {r1, lr}
     86         stmcsia     r0!, {r1, lr}
     87         stmmiia     r0!, {r1, lr}
     88         movs        r3, r3, lsl #2
     89         strcs       r1, [r0], #4
     90 
     91         /* Now quickly loop through the cache-aligned data. */
     92 .Laligned32:
     93         mov         r3, r1
     94 1:      subs        r2, r2, #32
     95         stmhsia     r0!, {r1,r3,ip,lr}
     96         stmhsia     r0!, {r1,r3,ip,lr}
     97         bhs         1b
     98         add         r2, r2, #32
     99 
    100         /* (Optionally) store any remaining trailing bytes.
    101          * (0-30 bytes, length in r2) */
    102         movs        r2, r2, lsl #28
    103         stmcsia     r0!, {r1,r3,ip,lr}
    104         stmmiia     r0!, {r1,lr}
    105         movs        r2, r2, lsl #2
    106         strcs       r1, [r0], #4
    107         strmih      lr, [r0], #2
    108 
    109         pop         {pc}
    110         .fnend
    111