@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for
@ ARMv7 platform. The description header can be found in
@ signal_processing_library.h
@
@ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and
@ the reference C code at end of this file.

@ Assumptions:
@ (1) data_length > 0
@ (2) coefficients_length > 1
@
@ Notes:
@ - data_out[] is read at negative offsets (down to
@   data_out[-(coefficients_length - 1)]), so the caller must provide that
@   much valid filter history in front of data_out.
@ - data_length is loaded as a full 32-bit word and the outer loop counter is
@   compared as a signed value.

@ Register usage:
@
@ r0: &data_in[i]
@ r1: &data_out[i], for result output
@ r2: &coefficients[0]
@ r3: coefficients_length
@ r4: Iteration counter for the outer loop.
@ r5: data_out[j] as multiplication inputs
@ r6: Calculated value for output data_out[]; iteration counter for inner loop
@ r7: Partial sum of a filtering multiplication results
@ r8: Partial sum of a filtering multiplication results
@ r9: &data_out[], for filtering input; data_in[i]
@ r10: coefficients[j]
@ r11: Scratch
@ r12: &coefficients[j]

#include "webrtc/system_wrappers/include/asm_defines.h"

GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
.align  2
DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
  push {r4-r11}

  @ The fifth argument (data_length, a size_t in the reference C code) is
  @ passed on the stack; the push above stored 8 registers (32 bytes) below
  @ it. Load the whole 32-bit word: a sign-extending halfword load would
  @ truncate any length above 32767.
  ldr r12, [sp, #32]           @ data_length
  subs r4, r12, #1
  beq ODD_LENGTH               @ jump if data_length == 1

@ Main loop: produces two output samples (data_out[i], data_out[i + 1]) per
@ iteration, accumulating sum1 in r7 and sum2 in r8.
LOOP_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                  @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                   @ &data_out[i - coefficients_length + 1]
  ldr r5, [r9], #4             @ data_out[i - coefficients_length + {1,2}]

  mov r7, #0                   @ sum1
  mov r8, #0                   @ sum2
  subs r6, r3, #3              @ Iteration counter for inner loop.
  beq ODD_A_LENGTH             @ branch if coefficients_length == 3
  blt POST_LOOP_A_LENGTH       @ branch if coefficients_length == 2

@ Inner loop: consumes two coefficients (one 32-bit load) per iteration.
LOOP_A_LENGTH:
  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
  subs r6, #2
  smlatt r8, r10, r5, r8       @ sum2 += coefficients[j] * data_out[i - j + 1];
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r7, r10, r5, r7       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  ldr r5, [r9], #4             @ data_out[i - j + 2],  data_out[i - j + 3]
  smlabb r8, r10, r5, r8       @ sum2 += coefficients[j - 1] * data_out[i - j + 2];
  bgt LOOP_A_LENGTH
  blt POST_LOOP_A_LENGTH       @ Even coefficients_length: nothing left over.

@ Odd coefficients_length: coefficients[2] is left over and handled here.
ODD_A_LENGTH:
  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[2]
  sub r12, #2                  @ &coefficients[0]
  smlabb r7, r10, r5, r7       @ sum1 += coefficients[2] * data_out[i - 2];
  smlabt r8, r10, r5, r8       @ sum2 += coefficients[2] * data_out[i - 1];
  ldr r5, [r9, #-2]            @ data_out[i - 1],  data_out[i]

POST_LOOP_A_LENGTH:
  ldr r10, [r12]               @ coefficients[0], coefficients[1]
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];

  ldr r9, [r0], #4             @ data_in[i], data_in[i + 1]
  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                   @ output1 -= sum1;

  @ Round and saturate output1 to a 16-bit result:
  @   r11 = bits [27:12] of output1, sign-extended (plain output1 >> 12
  @         truncated to 16 bits);
  @   r7  = (output1 >> 12) saturated to 16 bits.
  @ When the two agree (in particular whenever no saturation occurred), the
  @ rounding term 2048 (0.5 in Q12) is added; the final ssat shifts and
  @ clamps again, so the stored value is always within int16 range.
  sbfx r11, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r11
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2            @ Store data_out[i]

  @ The freshly computed data_out[i] (still in r6) feeds the second output.
  smlatb r8, r10, r6, r8       @ sum2 += coefficients[1] * data_out[i];
  smulbt r6, r10, r9           @ output2 = coefficients[0] * data_in[i + 1];
  sub r6, r8                   @ output2 -= sum2;

  @ Same round-and-saturate sequence as above, applied to output2.
  sbfx r11, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r11
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2            @ Store data_out[i + 1]

  subs r4, #2
  bgt LOOP_LENGTH
  blt END                      @ For even data_length, it's done. Jump to END.

@ Process i = data_length -1, for the case of an odd length.
@ Only one output is produced here; sum1 is accumulated in two halves
@ (r7 and r8) that are both subtracted from output1 at the end.
ODD_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                  @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                   @ &data_out[i - coefficients_length + 1]
  mov r7, #0                   @ sum1, first partial accumulator
  mov r8, #0                   @ sum1, second partial accumulator
  subs r6, r3, #2              @ inner loop counter
  beq EVEN_A_LENGTH            @ branch if coefficients_length == 2

LOOP2_A_LENGTH:
  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
  ldr r5, [r9], #4             @ data_out[i - j], data_out[i - j + 1]
  subs r6, #2
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r8, r10, r5, r8       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  bgt LOOP2_A_LENGTH
  @ Odd coefficients_length: the post-decrement left r12 one element before
  @ coefficients[0]; step forward so that r12 == &coefficients[0].
  addlt r12, #2
  blt POST_LOOP2_A_LENGTH

@ Even coefficients_length: coefficients[1] is left over and handled here.
EVEN_A_LENGTH:
  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[1]
  ldrsh r5, [r9]               @ data_out[i - 1]
  smlabb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];

POST_LOOP2_A_LENGTH:
  ldrsh r10, [r12]             @ Filter coefficients coefficients[0]
  ldrsh r9, [r0]               @ data_in[i]
  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                   @ output1 -= sum1 (first partial accumulator);
  sub r6, r8                   @ output1 -= sum1 (second partial accumulator);
  @ Same round-and-saturate sequence as in the main loop, with r8 as scratch.
  sbfx r8, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r8
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1]                @ Store the data_out[i]

END:
  pop {r4-r11}
  bx  lr

@Reference C code:
@
@void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
@                               int16_t* data_out,
@                               int16_t* __restrict coefficients,
@                               size_t coefficients_length,
@                               size_t data_length) {
@  size_t i = 0;
@  size_t j = 0;
@
@  assert(data_length > 0);
@  assert(coefficients_length > 1);
@
@  for (i = 0; i < data_length - 1; i += 2) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@    int32_t output2 = 0;
@    int32_t sum2 = 0;
@
@    for (j = coefficients_length - 1; j > 2; j -= 2) {
@      sum1 += coefficients[j] * data_out[i - j];
@      sum1 += coefficients[j - 1] * data_out[i - j + 1];
@      sum2 += coefficients[j] * data_out[i - j + 1];
@      sum2 += coefficients[j - 1] * data_out[i - j + 2];
@    }
@
@    if (j == 2) {
@      sum1 += coefficients[2] * data_out[i - 2];
@      sum2 += coefficients[2] * data_out[i - 1];
@    }
@
@    sum1 += coefficients[1] * data_out[i - 1];
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@
@    sum2 += coefficients[1] * data_out[i];
@    output2 = coefficients[0] * data_in[i + 1];
@    output2 -= sum2;
@    // Saturate and store the output.
@    output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728);
@    data_out[i + 1] = (int16_t)((output2 + 2048) >> 12);
@  }
@
@  if (i == data_length - 1) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@
@    for (j = coefficients_length - 1; j > 1; j -= 2) {
@      sum1 += coefficients[j] * data_out[i - j];
@      sum1 += coefficients[j - 1] * data_out[i - j + 1];
@    }
@
@    if (j == 1) {
@      sum1 += coefficients[1] * data_out[i - 1];
@    }
@
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@  }
@}