/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/

/***************************************************************************
  memset16_neon: fill a buffer with a repeated 16-bit value, using NEON
  stores for the bulk of the work where the size allows it.

  Syntax/target: GNU as, ARM (A32) state, pre-UAL mnemonics, NEON.

     Inputs:
        r0 (s): The buffer to write to.
                NOTE(review): assumed to be at least 2-byte aligned, as a
                pointer to 16-bit items would be -- confirm with callers.
        r1 (c): The data to write to the buffer. Only the low 16 bits are
                used; the upper 16 bits are ignored.
        r2 (n): The count, in 16-bit items (NOT bytes).
     Outputs:
        r0:     The original value of s.
     Clobbers:
        r1, r2, r12, q0, q1 (d0-d3), condition flags.
     Stack:
        One word (the saved r0), released before returning.
***************************************************************************/

    .code 32
    .fpu neon
    .align 4
    .globl memset16_neon
    /* .func takes the function name; it only matters for stabs debug info. */
    .func memset16_neon

memset16_neon:
    cmp         r2, #0
    bxeq        lr                      /* nothing to do for n == 0 */

    /* Keep in mind that r2 -- the count argument -- is the number of
     * 16-bit items to write. From here on r2 is a byte count.
     */
    lsl         r2, r2, #1

    push        {r0}                    /* saved so s can be returned in r0 */

    /* If we have <= 8 bytes, just do a quick halfword loop to handle that.
     * The length is unsigned, so use an unsigned branch.
     */
    cmp         r2, #8
    bhi         memset_gt4
memset_smallcopy_loop:
    strh        r1, [r0], #2
    subs        r2, r2, #2
    bne         memset_smallcopy_loop
memset_smallcopy_done:
    pop         {r0}
    bx          lr

memset_gt4:
    /*
     * Build a word holding the 16-bit value twice, so that word and NEON
     * stores write the same pattern as the halfword stores do. The value
     * is rebuilt from the low 16 bits only: whatever the caller left in
     * the upper half of r1 is shifted out first, so it cannot be OR-ed
     * into the stored data.
     */
    lsl         r1, r1, #16             /* r1 = c16 : 0    */
    orr         r1, r1, r1, lsr #16     /* r1 = c16 : c16  */
    /*
     * If we're writing >= 64 bytes, then we may want to get
     * onto a 16-byte boundary to improve speed even more.
     */
    cmp         r2, #64
    blo         memset_route
    ands        r12, r0, #0xf
    beq         memset_route
    /*
     * r12 = number of bytes to move forward to reach the 16-byte
     * boundary. For a 2-byte-aligned destination this is an even value
     * in 2..14, i.e. some combination of 2 + 4 + 8. The pieces are
     * written smallest first so that each store lands on an address that
     * is naturally aligned for its size: the halfword store brings r0 to
     * a 4-byte boundary, the word store to an 8-byte boundary, and the
     * final pair of word stores to the 16-byte boundary.
     * str/strh do not modify the flags, so one tst covers both of the
     * conditional stores in the 8-byte step.
     */
    rsb         r12, r12, #16
    sub         r2, r2, r12
    tst         r12, #2
    strneh      r1, [r0], #2
    tst         r12, #4
    strne       r1, [r0], #4
    tst         r12, #8
    strne       r1, [r0], #4
    strne       r1, [r0], #4
memset_route:
    /*
     * Decide where to route for the maximum store sizes. Note that we
     * build q0 and q1 depending on if we'll need it, so that's
     * interwoven here as well.
     *
     * Sizes below 64 bytes arrive here without the alignment step above,
     * so the stores below can still be issued to an address that is only
     * 2-byte aligned; that is unchanged from the original behaviour.
     */
    vdup.u32    d0, r1
    cmp         r2, #16
    blo         memset_8
    vmov        d1, d0                  /* q0 = pattern x 8 */
    cmp         r2, #64
    blo         memset_16
    vmov        q1, q0                  /* q1 = pattern x 8 */
    cmp         r2, #128
    blo         memset_32
memset_128:
    mov         r12, r2, lsr #7         /* r12 = number of 128-byte chunks (>= 1) */
memset_128_loop:
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    subs        r12, r12, #1
    bne         memset_128_loop
    ands        r2, r2, #0x7f           /* r2 = bytes left over (< 128) */
    beq         memset_end
memset_32:
    movs        r12, r2, lsr #5         /* r12 = number of 32-byte chunks */
    beq         memset_16
memset_32_loop:
    subs        r12, r12, #1            /* vst1 leaves the flags alone for bne */
    vst1.64     {q0, q1}, [r0]!
    bne         memset_32_loop
    ands        r2, r2, #0x1f           /* r2 = bytes left over (< 32) */
    beq         memset_end
memset_16:
    movs        r12, r2, lsr #4         /* r12 = number of 16-byte chunks */
    beq         memset_8
memset_16_loop:
    subs        r12, r12, #1
    vst1.32     {q0}, [r0]!
    bne         memset_16_loop
    ands        r2, r2, #0xf            /* r2 = bytes left over (< 16) */
    beq         memset_end
    /*
     * memset_8 isn't a loop, since we try to do our loops at 16
     * bytes and above. We should loop there, then drop down here
     * to finish the <16-byte versions. Same for memset_4 and
     * memset_2. r2 is below 16 on every path into memset_8, so the
     * signed branches in this tail behave the same as unsigned ones.
     */
memset_8:
    cmp         r2, #8
    blt         memset_4
    subs        r2, r2, #8
    vst1.32     {d0}, [r0]!
memset_4:
    cmp         r2, #4
    blt         memset_2
    subs        r2, r2, #4
    str         r1, [r0], #4
memset_2:
    cmp         r2, #0
    ble         memset_end
    strh        r1, [r0], #2
memset_end:
    pop         {r0}                    /* return the original s */
    bx          lr

    .endfunc
    .end