/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/

/***************************************************************************
  Neon memset16: fills a buffer with a repeated 16-bit value, using NEON
  registers for the bulk of the work when the buffer is large enough.

  Target: ARM (A32) state, GNU as unified syntax, NEON required.

  Inputs:
     r0 = s: The buffer to write to. Must be at least 2-byte aligned.
     r1 = c: The data to write. Only the low 16 bits are used.
     r2 = n: The count of 16-bit items to write (NOT a byte count).
  Outputs:
     r0 = s, unchanged.
  Clobbers:
     r1, r2, r12, q0, q1, condition flags.
  Stack:
     4 bytes (saved r0), released before returning.
 ***************************************************************************/

    .syntax unified

    .text
    .code 32
    .fpu neon
    .align 4
    .globl memset16_neon
    .type memset16_neon, %function

memset16_neon:
    /* Nothing to do for an empty buffer; r0 is still s. */
    cmp         r2, #0
    bxeq        lr

    /*
     * r2 arrives as a count of 16-bit items. From here on it holds the
     * number of BYTES that remain to be written (always even).
     */
    lsl         r2, r2, #1

    /* r0 is used as the post-incremented write cursor; save s for return. */
    push        {r0}

    /* 8 bytes or fewer: a plain halfword loop is all we need. */
    cmp         r2, #8
    bhi         .Lmemset16_bulk
.Lmemset16_small_loop:
    strh        r1, [r0], #2
    subs        r2, r2, #2
    bne         .Lmemset16_small_loop
    pop         {r0}
    bx          lr

.Lmemset16_bulk:
    /*
     * Build r1 = c16:c16 so that word stores and vdup.32 write two items
     * at once. Shifting left first throws away whatever the caller left
     * in bits 31:16, so only the low 16 bits of c ever reach memory.
     */
    lsl         r1, r1, #16
    orr         r1, r1, r1, lsr #16

    /*
     * The buffer is only guaranteed halfword alignment. Emit one halfword
     * if needed so that every wider store below is at least word aligned.
     * r2 was >= 10 on entry to this path, so it is still >= 8 afterwards.
     */
    tst         r0, #2
    strhne      r1, [r0], #2
    subne       r2, r2, #2

    /*
     * If we still have >= 64 bytes, get onto a 16-byte boundary so the
     * 32-byte NEON stores below are fully aligned.
     */
    cmp         r2, #64
    blo         .Lmemset16_route
    ands        r12, r0, #0xf
    beq         .Lmemset16_route

    /*
     * r12 = bytes to the next 16-byte boundary. r0 is word aligned here,
     * so r12 is 4, 8 or 12. The shift drops bit 2 into N and bit 3 into C:
     *   N set -> write 4 bytes, C set -> write 8 bytes.
     */
    rsb         r12, r12, #16
    sub         r2, r2, r12
    lsls        r12, r12, #29
    strmi       r1, [r0], #4
    strcs       r1, [r0], #4
    strcs       r1, [r0], #4

.Lmemset16_route:
    /*
     * Pick the largest store size that fits. q0/q1 are only filled in as
     * far as the chosen path needs them.
     * Invariant: r2 >= 64 here implies r0 is 16-byte aligned.
     */
    vdup.32     d0, r1
    cmp         r2, #16
    blo         .Lmemset16_tail8
    vmov        d1, d0
    cmp         r2, #64
    blo         .Lmemset16_by16
    vmov        q1, q0
    cmp         r2, #128
    blo         .Lmemset16_by32

    /* 128 bytes per iteration; r12 = iteration count (>= 1 here). */
    lsr         r12, r2, #7
.Lmemset16_by128_loop:
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    subs        r12, r12, #1
    bne         .Lmemset16_by128_loop
    ands        r2, r2, #0x7f
    beq         .Lmemset16_done

.Lmemset16_by32:
    /* 32 bytes per iteration. The NEON store leaves the flags alone. */
    lsrs        r12, r2, #5
    beq         .Lmemset16_by16
.Lmemset16_by32_loop:
    subs        r12, r12, #1
    vst1.64     {q0, q1}, [r0]!
    bne         .Lmemset16_by32_loop
    ands        r2, r2, #0x1f
    beq         .Lmemset16_done

.Lmemset16_by16:
    /* 16 bytes per iteration; r0 may be only word aligned on this path. */
    lsrs        r12, r2, #4
    beq         .Lmemset16_tail8
.Lmemset16_by16_loop:
    subs        r12, r12, #1
    vst1.32     {q0}, [r0]!
    bne         .Lmemset16_by16_loop
    ands        r2, r2, #0xf
    beq         .Lmemset16_done

    /*
     * Fewer than 16 bytes remain. These are straight-line steps rather
     * than loops: at most one 8-byte, one 4-byte and one 2-byte store.
     */
.Lmemset16_tail8:
    cmp         r2, #8
    blo         .Lmemset16_tail4
    sub         r2, r2, #8
    vst1.32     {d0}, [r0]!
.Lmemset16_tail4:
    cmp         r2, #4
    blo         .Lmemset16_tail2
    sub         r2, r2, #4
    str         r1, [r0], #4
.Lmemset16_tail2:
    /* r2 is 0 or 2 here. */
    cmp         r2, #0
    strhne      r1, [r0]

.Lmemset16_done:
    pop         {r0}
    bx          lr

    .size memset16_neon, . - memset16_neon

    .end