@
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ lattice_neon.s
@
@ Contains a function for the core loop in the normalized lattice MA
@ filter routine for iSAC codec, optimized for ARM Neon platform.
@
@ C signature (AAPCS argument mapping: r0 = input0, r1 = input1,
@ r2 = input2, r3 = ptr0; ptr1 and ptr2 arrive on the stack):
@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
@                                     int16_t input1,
@                                     int32_t input2,
@                                     int32_t* ptr0,
@                                     int32_t* ptr1,
@                                     int32_t* __restrict ptr2);
@ For each of the (HALF_SUBFRAMELEN - 1) samples it calculates
@   *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
@   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
@ in Q15 domain.
@
@ Reference code in lattice.c.
@ Output is not bit-exact with the reference C code, due to the replacement
@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
@ instructions, smulwb, and smull. Speech quality was not degraded by
@ testing speech and tone vectors.

#include "webrtc/system_wrappers/interface/asm_defines.h"
#include "settings.h"

GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
.align  2
DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
  push {r4-r8}

  @ Broadcast each scalar coefficient into both 32-bit lanes of a D
  @ register so the main loop can process four samples per iteration.
  vdup.32 d28, r0                 @ Initialize Neon register with input0
  vdup.32 d29, r1                 @ Initialize Neon register with input1
  vdup.32 d30, r2                 @ Initialize Neon register with input2
  @ After push {r4-r8} (5 words = 20 bytes), the 5th and 6th C arguments
  @ sit at sp + 20 and sp + 24.
  ldr r4, [sp, #20]               @ ptr1
  ldr r12, [sp, #24]              @ ptr2

  @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2
  @ Leftover samples after the loop, in r6:
  @   r6 = (HALF_SUBFRAMELEN - 1) - (HALF_SUBFRAMELEN - 1) >> 2 << 2
  mov r6, #HALF_SUBFRAMELEN
  sub r6, #1
  lsr r5, r6, #2
  sub r6, r5, lsl #2              @ r6 = r6 - (r5 << 2)

@ First r5 iterations in a loop, four samples at a time.

LOOP:
  vld1.32 {d0, d1}, [r3]!         @ *ptr0  (4 samples, post-incremented)

  @ tmp32a = input0 * (*ptr0) >> 15 with rounding: full 32x32 multiply
  @ into 64-bit lanes, then rounding-narrow by 15 bits (replaces
  @ WEBRTC_SPL_MUL_16_32_RSFT15 from the C reference; not bit-exact).
  vmull.s32 q10, d0, d28          @ tmp32a = input0 * (*ptr0)
  vmull.s32 q11, d1, d28          @ tmp32a = input0 * (*ptr0)
  vmull.s32 q12, d0, d29          @ input1 * (*ptr0)
  vmull.s32 q13, d1, d29          @ input1 * (*ptr0)

  vrshrn.i64 d4, q10, #15
  vrshrn.i64 d5, q11, #15

  vld1.32 {d2, d3}, [r12]         @ *ptr2
  vadd.i32 q3, q2, q1             @ tmp32b = *ptr2 + tmp32a

  vrshrn.i64 d0, q12, #15

  @ New *ptr2 = (input2 * tmp32b) >> 16 with rounding (replaces
  @ LATTICE_MUL_32_32_RSFT16 from the C reference).
  vmull.s32 q10, d6, d30          @ input2 * tmp32b
  vmull.s32 q11, d7, d30          @ input2 * tmp32b

  vrshrn.i64 d16, q10, #16
  vrshrn.i64 d17, q11, #16

  @ d16/d17 now hold the updated *ptr2 values; use them for *ptr1.
  vmull.s32 q10, d16, d28         @ input0 * (*ptr2)
  vmull.s32 q11, d17, d28         @ input0 * (*ptr2)

  vrshrn.i64 d1, q13, #15
  vrshrn.i64 d18, q10, #15
  vrshrn.i64 d19, q11, #15

  vst1.32 {d16, d17}, [r12]!      @ *ptr2

  vadd.i32 q9, q0, q9             @ *ptr1 = input1*(*ptr0) + input0*(*ptr2)
  subs r5, #1
  vst1.32 {d18, d19}, [r4]!       @ *ptr1

  bgt LOOP

  @ Check how many samples still need to be processed.
  subs r6, #2
  blt LAST_SAMPLE

  @ Process two more samples (same computation as the main loop, on one
  @ D register pair instead of two):
  vld1.32 d0, [r3]!               @ *ptr0

  vmull.s32 q11, d0, d28          @ tmp32a = input0 * (*ptr0)
  vmull.s32 q13, d0, d29          @ input1 * (*ptr0)

  vld1.32 d18, [r12]              @ *ptr2
  vrshrn.i64 d4, q11, #15

  vadd.i32 d7, d4, d18            @ tmp32b = *ptr2 + tmp32a
  vmull.s32 q11, d7, d30          @ input2 * tmp32b
  vrshrn.i64 d16, q11, #16

  vmull.s32 q11, d16, d28         @ input0 * (*ptr2)
  vst1.32 d16, [r12]!             @ *ptr2

  vrshrn.i64 d0, q13, #15
  vrshrn.i64 d19, q11, #15
  vadd.i32 d19, d0, d19           @ *ptr1 = input1*(*ptr0) + input0*(*ptr2)

  vst1.32 d19, [r4]!              @ *ptr1

@ If there's still one more sample, process it here with scalar code
@ (smulwb takes (r7 * bottom 16 bits of rN) >> 16; the extra lsl #1
@ restores the Q15 >> 15 scaling used by the Neon path).

LAST_SAMPLE:
  cmp r6, #1
  bne END

  @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));

  ldr r7, [r3]                    @ *ptr0
  ldr r8, [r12]                   @ *ptr2

  smulwb r5, r7, r0               @ tmp32a = *ptr0 * input0 >> 16
  add r8, r8, r5, lsl #1          @ tmp32b = *ptr2 + (tmp32a << 1)
  smull r5, r6, r8, r2            @ tmp32b * input2, in 64 bits
  lsl r6, #16
  add r6, r5, lsr #16             @ Only take the middle 32 bits
  str r6, [r12]                   @ Output (*ptr2, as 32 bits)

  @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);

  smulwb r5, r7, r1               @ tmp32a = *ptr0 * input1 >> 16
  smulwb r6, r6, r0               @ tmp32b = *ptr2 * input0 >> 16
  lsl r5, r5, #1
  add r5, r6, lsl #1              @ *ptr1 = (tmp32a << 1) + (tmp32b << 1)
  str r5, [r4]                    @ Output (*ptr1)

END:
  pop {r4-r8}
  bx lr