Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2010 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 /* Changes:
      9  * 2010-08-11 Steve McIntyre <steve.mcintyre (at) arm.com>
     10  *    Added small changes to the two functions to make them work on the
     11  *    specified number of 16- or 32-bit values rather than the original
     12  *    code which was specified as a count of bytes. More verbose comments
     13  *    to aid future maintenance.
     14  */
     15 
     16     .text
     17     .align
     18 
     19     .global arm_memset32
     20     .type   arm_memset32, %function
     21     .global arm_memset16
     22     .type   arm_memset16, %function
     23 
     24 /*
     25  * Optimized memset functions for ARM.
     26  *
     27  * void arm_memset16(uint16_t* dst, uint16_t value, int count);
     28  * void arm_memset32(uint32_t* dst, uint32_t value, int count);
     29  *
     30  */
     31 arm_memset16:
     32         .fnstart
     33         push        {lr}
     34 
     35         /* if count is equal to zero then abort */
     36         teq         r2, #0
     37         ble         .Lfinish
     38 
     39         /* Multiply count by 2 - go from the number of 16-bit shorts
     40          * to the number of bytes desired. */
     41         mov         r2, r2, lsl #1
     42 
     43         /* expand the data to 32 bits */
     44         orr         r1, r1, lsl #16
     45 
     46         /* align to 32 bits */
     47         tst         r0, #2
     48         strneh      r1, [r0], #2
     49         subne       r2, r2, #2
     50 
     51         /* Now jump into the main loop below. */
     52         b           .Lwork_32
     53         .fnend
     54 
     55 arm_memset32:
     56         .fnstart
     57         push        {lr}
     58 
     59         /* if count is equal to zero then abort */
     60         teq         r2, #0
     61         ble         .Lfinish
     62 
     63         /* Multiply count by 4 - go from the number of 32-bit words to
     64          * the number of bytes desired. */
     65         mov         r2, r2, lsl #2
     66 
     67 .Lwork_32:
     68         /* Set up registers ready for writing them out. */
     69         mov         ip, r1
     70         mov         lr, r1
     71 
     72         /* Try to align the destination to a cache line. Assume 32
     73          * byte (8 word) cache lines, it's the common case. */
     74         rsb         r3, r0, #0
     75         ands        r3, r3, #0x1C
     76         beq         .Laligned32
     77         cmp         r3, r2
     78         andhi       r3, r2, #0x1C
     79         sub         r2, r2, r3
     80 
     81         /* (Optionally) write any unaligned leading bytes.
     82          * (0-28 bytes, length in r3) */
     83         movs        r3, r3, lsl #28
     84         stmcsia     r0!, {r1, lr}
     85         stmcsia     r0!, {r1, lr}
     86         stmmiia     r0!, {r1, lr}
     87         movs        r3, r3, lsl #2
     88         strcs       r1, [r0], #4
     89 
     90         /* Now quickly loop through the cache-aligned data. */
     91 .Laligned32:
     92         mov         r3, r1
     93 1:      subs        r2, r2, #32
     94         stmhsia     r0!, {r1,r3,ip,lr}
     95         stmhsia     r0!, {r1,r3,ip,lr}
     96         bhs         1b
     97         add         r2, r2, #32
     98 
     99         /* (Optionally) store any remaining trailing bytes.
    100          * (0-30 bytes, length in r2) */
    101         movs        r2, r2, lsl #28
    102         stmcsia     r0!, {r1,r3,ip,lr}
    103         stmmiia     r0!, {r1,lr}
    104         movs        r2, r2, lsl #2
    105         strcs       r1, [r0], #4
    106         strmih      lr, [r0], #2
    107 
    108 .Lfinish:
    109         pop         {pc}
    110         .fnend
    111