1 /* 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <stddef.h> 12 13 #include "webrtc/modules/audio_coding/codecs/isac/fix/source/settings.h" 14 #include "webrtc/typedefs.h" 15 16 // Filter ar_g_Q0[] and ar_f_Q0[] through an AR filter with coefficients 17 // cth_Q15[] and sth_Q15[]. 18 void WebRtcIsacfix_FilterArLoop(int16_t* ar_g_Q0, // Input samples 19 int16_t* ar_f_Q0, // Input samples 20 int16_t* cth_Q15, // Filter coefficients 21 int16_t* sth_Q15, // Filter coefficients 22 size_t order_coef) { // order of the filter 23 int n = 0; 24 25 for (n = 0; n < HALF_SUBFRAMELEN - 1; n++) { 26 int count = (int)(order_coef - 1); 27 int offset; 28 #if !defined(MIPS_DSP_R1_LE) 29 int16_t* tmp_cth; 30 int16_t* tmp_sth; 31 int16_t* tmp_arg; 32 int32_t max_q16 = 0x7fff; 33 int32_t min_q16 = 0xffff8000; 34 #endif 35 // Declare variables used as temporary registers. 36 int32_t r0, r1, r2, t0, t1, t2, t_ar; 37 38 __asm __volatile ( 39 ".set push \n\t" 40 ".set noreorder \n\t" 41 "bltz %[count], 2f \n\t" 42 " lh %[t_ar], 0(%[tmp]) \n\t" 43 // Inner loop 44 "1: \n\t" 45 "sll %[offset], %[count], 1 \n\t" 46 #if defined(MIPS_DSP_R1_LE) 47 "lhx %[r0], %[offset](%[cth_Q15]) \n\t" 48 "lhx %[r1], %[offset](%[sth_Q15]) \n\t" 49 "lhx %[r2], %[offset](%[ar_g_Q0]) \n\t" 50 #else 51 "addu %[tmp_cth], %[cth_Q15], %[offset] \n\t" 52 "addu %[tmp_sth], %[sth_Q15], %[offset] \n\t" 53 "addu %[tmp_arg], %[ar_g_Q0], %[offset] \n\t" 54 "lh %[r0], 0(%[tmp_cth]) \n\t" 55 "lh %[r1], 0(%[tmp_sth]) \n\t" 56 "lh %[r2], 0(%[tmp_arg]) \n\t" 57 #endif 58 "mul %[t0], %[r0], %[t_ar] \n\t" 59 "mul %[t1], %[r1], %[t_ar] \n\t" 60 "mul %[t2], %[r1], %[r2] \n\t" 61 "mul %[r0], %[r0], %[r2] \n\t" 62 "subu %[t0], %[t0], %[t2] \n\t" 63 "addu %[t1], %[t1], %[r0] \n\t" 64 #if defined(MIPS_DSP_R1_LE) 65 "shra_r.w %[t1], %[t1], 15 \n\t" 66 "shra_r.w %[t0], %[t0], 15 \n\t" 67 #else 68 "addiu %[t1], %[t1], 0x4000 \n\t" 69 "sra %[t1], %[t1], 15 \n\t" 70 "addiu %[t0], %[t0], 0x4000 \n\t" 71 "sra %[t0], %[t0], 15 \n\t" 72 #endif 73 "addiu %[offset], %[offset], 2 \n\t" 74 #if defined(MIPS_DSP_R1_LE) 75 "shll_s.w %[t1], %[t1], 16 \n\t" 76 "shll_s.w %[t_ar], %[t0], 16 \n\t" 77 #else 78 "slt %[r0], %[t1], %[max_q16] \n\t" 79 "slt %[r1], %[t0], %[max_q16] \n\t" 80 "movz %[t1], %[max_q16], %[r0] \n\t" 81 "movz %[t0], %[max_q16], %[r1] \n\t" 82 #endif 83 "addu %[offset], %[offset], %[ar_g_Q0] \n\t" 84 #if defined(MIPS_DSP_R1_LE) 85 "sra %[t1], %[t1], 16 \n\t" 86 "sra %[t_ar], %[t_ar], 16 \n\t" 87 #else 88 "slt %[r0], %[t1], %[min_q16] \n\t" 89 "slt %[r1], %[t0], %[min_q16] \n\t" 90 "movn %[t1], %[min_q16], %[r0] \n\t" 91 "movn %[t0], %[min_q16], %[r1] \n\t" 92 "addu %[t_ar], $zero, %[t0] \n\t" 93 #endif 94 "sh %[t1], 0(%[offset]) \n\t" 95 "bgtz %[count], 1b \n\t" 96 " addiu %[count], %[count], -1 \n\t" 97 "2: \n\t" 98 "sh %[t_ar], 0(%[tmp]) \n\t" 99 "sh %[t_ar], 0(%[ar_g_Q0]) \n\t" 100 ".set pop \n\t" 101 : [t_ar] "=&r" (t_ar), [count] "+r" (count), [offset] "=&r" (offset), 102 [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [t0] "=&r" (t0), 103 #if !defined(MIPS_DSP_R1_LE) 104 [tmp_cth] "=&r" (tmp_cth), [tmp_sth] "=&r" (tmp_sth), 105 [tmp_arg] "=&r" (tmp_arg), 106 #endif 107 [t1] "=&r" (t1), [t2] "=&r" (t2) 108 : [tmp] "r" (&ar_f_Q0[n+1]), [cth_Q15] "r" (cth_Q15), 109 #if !defined(MIPS_DSP_R1_LE) 110 [max_q16] "r" (max_q16), [min_q16] "r" (min_q16), 111 #endif 112 [sth_Q15] "r" (sth_Q15), [ar_g_Q0] "r" (ar_g_Q0) 113 : "memory", "hi", "lo" 114 ); 115 } 116 } 117 118 // MIPS optimization of the inner loop used for function 119 // WebRtcIsacfix_NormLatticeFilterMa(). It does: 120 // 121 // for 0 <= n < HALF_SUBFRAMELEN - 1: 122 // *ptr2 = input2 * (*ptr2) + input0 * (*ptr0)); 123 // *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); 124 // 125 // Note, function WebRtcIsacfix_FilterMaLoopMIPS and WebRtcIsacfix_FilterMaLoopC 126 // are not bit-exact. The accuracy of the MIPS function is same or better. 127 void WebRtcIsacfix_FilterMaLoopMIPS(int16_t input0, // Filter coefficient 128 int16_t input1, // Filter coefficient 129 int32_t input2, // Inverse coeff (1/input1) 130 int32_t* ptr0, // Sample buffer 131 int32_t* ptr1, // Sample buffer 132 int32_t* ptr2) { // Sample buffer 133 #if defined(MIPS_DSP_R2_LE) 134 // MIPS DSPR2 version. 4 available accumulators allows loop unrolling 4 times. 135 // This variant is not bit-exact with WebRtcIsacfix_FilterMaLoopC, since we 136 // are exploiting 64-bit accumulators. The accuracy of the MIPS DSPR2 function 137 // is same or better. 138 int n = (HALF_SUBFRAMELEN - 1) >> 2; 139 int m = (HALF_SUBFRAMELEN - 1) & 3; 140 141 int r0, r1, r2, r3; 142 int t0, t1, t2, t3; 143 int s0, s1, s2, s3; 144 145 __asm __volatile ( 146 ".set push \n\t" 147 ".set noreorder \n\t" 148 "1: \n\t" 149 "lw %[r0], 0(%[ptr0]) \n\t" 150 "lw %[r1], 4(%[ptr0]) \n\t" 151 "lw %[r2], 8(%[ptr0]) \n\t" 152 "lw %[r3], 12(%[ptr0]) \n\t" 153 "mult $ac0, %[r0], %[input0] \n\t" 154 "mult $ac1, %[r1], %[input0] \n\t" 155 "mult $ac2, %[r2], %[input0] \n\t" 156 "mult $ac3, %[r3], %[input0] \n\t" 157 "lw %[t0], 0(%[ptr2]) \n\t" 158 "extr_rs.w %[s0], $ac0, 15 \n\t" 159 "extr_rs.w %[s1], $ac1, 15 \n\t" 160 "extr_rs.w %[s2], $ac2, 15 \n\t" 161 "extr_rs.w %[s3], $ac3, 15 \n\t" 162 "lw %[t1], 4(%[ptr2]) \n\t" 163 "lw %[t2], 8(%[ptr2]) \n\t" 164 "lw %[t3], 12(%[ptr2]) \n\t" 165 "addu %[t0], %[t0], %[s0] \n\t" 166 "addu %[t1], %[t1], %[s1] \n\t" 167 "addu %[t2], %[t2], %[s2] \n\t" 168 "addu %[t3], %[t3], %[s3] \n\t" 169 "mult $ac0, %[t0], %[input2] \n\t" 170 "mult $ac1, %[t1], %[input2] \n\t" 171 "mult $ac2, %[t2], %[input2] \n\t" 172 "mult $ac3, %[t3], %[input2] \n\t" 173 "addiu %[ptr0], %[ptr0], 16 \n\t" 174 "extr_rs.w %[t0], $ac0, 16 \n\t" 175 "extr_rs.w %[t1], $ac1, 16 \n\t" 176 "extr_rs.w %[t2], $ac2, 16 \n\t" 177 "extr_rs.w %[t3], $ac3, 16 \n\t" 178 "addiu %[n], %[n], -1 \n\t" 179 "mult $ac0, %[r0], %[input1] \n\t" 180 "mult $ac1, %[r1], %[input1] \n\t" 181 "mult $ac2, %[r2], %[input1] \n\t" 182 "mult $ac3, %[r3], %[input1] \n\t" 183 "sw %[t0], 0(%[ptr2]) \n\t" 184 "extr_rs.w %[s0], $ac0, 15 \n\t" 185 "extr_rs.w %[s1], $ac1, 15 \n\t" 186 "extr_rs.w %[s2], $ac2, 15 \n\t" 187 "extr_rs.w %[s3], $ac3, 15 \n\t" 188 "sw %[t1], 4(%[ptr2]) \n\t" 189 "sw %[t2], 8(%[ptr2]) \n\t" 190 "sw %[t3], 12(%[ptr2]) \n\t" 191 "mult $ac0, %[t0], %[input0] \n\t" 192 "mult $ac1, %[t1], %[input0] \n\t" 193 "mult $ac2, %[t2], %[input0] \n\t" 194 "mult $ac3, %[t3], %[input0] \n\t" 195 "addiu %[ptr2], %[ptr2], 16 \n\t" 196 "extr_rs.w %[t0], $ac0, 15 \n\t" 197 "extr_rs.w %[t1], $ac1, 15 \n\t" 198 "extr_rs.w %[t2], $ac2, 15 \n\t" 199 "extr_rs.w %[t3], $ac3, 15 \n\t" 200 "addu %[t0], %[t0], %[s0] \n\t" 201 "addu %[t1], %[t1], %[s1] \n\t" 202 "addu %[t2], %[t2], %[s2] \n\t" 203 "addu %[t3], %[t3], %[s3] \n\t" 204 "sw %[t0], 0(%[ptr1]) \n\t" 205 "sw %[t1], 4(%[ptr1]) \n\t" 206 "sw %[t2], 8(%[ptr1]) \n\t" 207 "sw %[t3], 12(%[ptr1]) \n\t" 208 "bgtz %[n], 1b \n\t" 209 " addiu %[ptr1], %[ptr1], 16 \n\t" 210 "beq %[m], %0, 3f \n\t" 211 " nop \n\t" 212 "2: \n\t" 213 "lw %[r0], 0(%[ptr0]) \n\t" 214 "lw %[t0], 0(%[ptr2]) \n\t" 215 "addiu %[ptr0], %[ptr0], 4 \n\t" 216 "mult $ac0, %[r0], %[input0] \n\t" 217 "mult $ac1, %[r0], %[input1] \n\t" 218 "extr_rs.w %[r1], $ac0, 15 \n\t" 219 "extr_rs.w %[t1], $ac1, 15 \n\t" 220 "addu %[t0], %[t0], %[r1] \n\t" 221 "mult $ac0, %[t0], %[input2] \n\t" 222 "extr_rs.w %[t0], $ac0, 16 \n\t" 223 "sw %[t0], 0(%[ptr2]) \n\t" 224 "mult $ac0, %[t0], %[input0] \n\t" 225 "addiu %[ptr2], %[ptr2], 4 \n\t" 226 "addiu %[m], %[m], -1 \n\t" 227 "extr_rs.w %[t0], $ac0, 15 \n\t" 228 "addu %[t0], %[t0], %[t1] \n\t" 229 "sw %[t0], 0(%[ptr1]) \n\t" 230 "bgtz %[m], 2b \n\t" 231 " addiu %[ptr1], %[ptr1], 4 \n\t" 232 "3: \n\t" 233 ".set pop \n\t" 234 : [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), 235 [r3] "=&r" (r3), [t0] "=&r" (t0), [t1] "=&r" (t1), 236 [t2] "=&r" (t2), [t3] "=&r" (t3), [s0] "=&r" (s0), 237 [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3), 238 [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), [m] "+r" (m), 239 [ptr2] "+r" (ptr2), [n] "+r" (n) 240 : [input0] "r" (input0), [input1] "r" (input1), 241 [input2] "r" (input2) 242 : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", 243 "$ac2lo", "$ac3hi", "$ac3lo" 244 ); 245 #else 246 // Non-DSPR2 version of the function. Avoiding the accumulator usage due to 247 // large latencies. This variant is bit-exact with C code. 248 int n = HALF_SUBFRAMELEN - 1; 249 int32_t t16a, t16b; 250 int32_t r0, r1, r2, r3, r4; 251 252 __asm __volatile ( 253 ".set push \n\t" 254 ".set noreorder \n\t" 255 "sra %[t16a], %[input2], 16 \n\t" 256 "andi %[t16b], %[input2], 0xFFFF \n\t" 257 #if defined(MIPS32R2_LE) 258 "seh %[t16b], %[t16b] \n\t" 259 "seh %[input0], %[input0] \n\t" 260 "seh %[input1], %[input1] \n\t" 261 #else 262 "sll %[t16b], %[t16b], 16 \n\t" 263 "sra %[t16b], %[t16b], 16 \n\t" 264 "sll %[input0], %[input0], 16 \n\t" 265 "sra %[input0], %[input0], 16 \n\t" 266 "sll %[input1], %[input1], 16 \n\t" 267 "sra %[input1], %[input1], 16 \n\t" 268 #endif 269 "addiu %[r0], %[t16a], 1 \n\t" 270 "slt %[r1], %[t16b], $zero \n\t" 271 "movn %[t16a], %[r0], %[r1] \n\t" 272 "1: \n\t" 273 "lw %[r0], 0(%[ptr0]) \n\t" 274 "lw %[r1], 0(%[ptr2]) \n\t" 275 "addiu %[ptr0], %[ptr0], 4 \n\t" 276 "sra %[r2], %[r0], 16 \n\t" 277 "andi %[r0], %[r0], 0xFFFF \n\t" 278 "mul %[r3], %[r2], %[input0] \n\t" 279 "mul %[r4], %[r0], %[input0] \n\t" 280 "mul %[r2], %[r2], %[input1] \n\t" 281 "mul %[r0], %[r0], %[input1] \n\t" 282 "addiu %[ptr2], %[ptr2], 4 \n\t" 283 "sll %[r3], %[r3], 1 \n\t" 284 "sra %[r4], %[r4], 1 \n\t" 285 "addiu %[r4], %[r4], 0x2000 \n\t" 286 "sra %[r4], %[r4], 14 \n\t" 287 "addu %[r3], %[r3], %[r4] \n\t" 288 "addu %[r1], %[r1], %[r3] \n\t" 289 "sra %[r3], %[r1], 16 \n\t" 290 "andi %[r4], %[r1], 0xFFFF \n\t" 291 "sra %[r4], %[r4], 1 \n\t" 292 "mul %[r1], %[r1], %[t16a] \n\t" 293 "mul %[r3], %[r3], %[t16b] \n\t" 294 "mul %[r4], %[r4], %[t16b] \n\t" 295 "sll %[r2], %[r2], 1 \n\t" 296 "sra %[r0], %[r0], 1 \n\t" 297 "addiu %[r0], %[r0], 0x2000 \n\t" 298 "sra %[r0], %[r0], 14 \n\t" 299 "addu %[r0], %[r0], %[r2] \n\t" 300 "addiu %[n], %[n], -1 \n\t" 301 "addu %[r1], %[r1], %[r3] \n\t" 302 "addiu %[r4], %[r4], 0x4000 \n\t" 303 "sra %[r4], %[r4], 15 \n\t" 304 "addu %[r1], %[r1], %[r4] \n\t" 305 "sra %[r2], %[r1], 16 \n\t" 306 "andi %[r3], %[r1], 0xFFFF \n\t" 307 "mul %[r3], %[r3], %[input0] \n\t" 308 "mul %[r2], %[r2], %[input0] \n\t" 309 "sw %[r1], -4(%[ptr2]) \n\t" 310 "sra %[r3], %[r3], 1 \n\t" 311 "addiu %[r3], %[r3], 0x2000 \n\t" 312 "sra %[r3], %[r3], 14 \n\t" 313 "addu %[r0], %[r0], %[r3] \n\t" 314 "sll %[r2], %[r2], 1 \n\t" 315 "addu %[r0], %[r0], %[r2] \n\t" 316 "sw %[r0], 0(%[ptr1]) \n\t" 317 "bgtz %[n], 1b \n\t" 318 " addiu %[ptr1], %[ptr1], 4 \n\t" 319 ".set pop \n\t" 320 : [t16a] "=&r" (t16a), [t16b] "=&r" (t16b), [r0] "=&r" (r0), 321 [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3), 322 [r4] "=&r" (r4), [ptr0] "+r" (ptr0), [ptr1] "+r" (ptr1), 323 [ptr2] "+r" (ptr2), [n] "+r" (n) 324 : [input0] "r" (input0), [input1] "r" (input1), 325 [input2] "r" (input2) 326 : "hi", "lo", "memory" 327 ); 328 #endif 329 } 330