@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ nsx_core_neon.s
@ This file contains some functions in NS, optimized for ARM Neon
@ platforms. Reference C code is in file nsx_core.c. Bit-exact.

.syntax unified

#include "nsx_core_neon_offsets.h"
#include "webrtc/modules/audio_processing/ns/nsx_defines.h"
#include "webrtc/system_wrappers/interface/asm_defines.h"

GLOBAL_FUNCTION WebRtcNsx_NoiseEstimationNeon
GLOBAL_FUNCTION WebRtcNsx_PrepareSpectrumNeon
GLOBAL_FUNCTION WebRtcNsx_SynthesisUpdateNeon
GLOBAL_FUNCTION WebRtcNsx_AnalysisUpdateNeon
GLOBAL_LABEL WebRtcNsx_kLogTable
GLOBAL_LABEL WebRtcNsx_kCounterDiv
GLOBAL_LABEL WebRtcNsx_kLogTableFrac

@ The three tables below are addressed with "adr" from the code in this file,
@ so they must stay in the same section and within adr range of their users.

@ 256 halfword entries, indexed by the 8-bit fraction extracted from the
@ normalized magnitude in LOOP_SET_LMAGN.
.align 2
WebRtcNsx_kLogTableFrac:
_WebRtcNsx_kLogTableFrac:
.short 0, 1, 3, 4, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, 26
.short 28, 29, 30, 32, 33, 34, 36, 37, 38, 40, 41, 42, 44, 45, 46, 47, 49, 50
.short 51, 52, 54, 55, 56, 57, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 71, 72
.short 73, 74, 75, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 92, 93
.short 94, 95, 96, 97, 98, 99, 100, 102, 103, 104, 105, 106, 107, 108, 109, 110
.short 111, 112, 113, 114, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
.short 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141
.short 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 155
.short 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 169
.short 170, 171, 172, 173, 174, 175, 176, 177, 178, 178, 179, 180, 181, 182, 183
.short 184, 185, 185, 186, 187, 188, 189, 190, 191, 192, 192, 193, 194, 195, 196
.short 197, 198, 198, 199, 200, 201, 202, 203, 203, 204, 205, 206, 207, 208, 208
.short 209, 210, 211, 212, 212, 213, 214, 215, 216, 216, 217, 218, 219, 220, 220
.short 221, 222, 223, 224, 224, 225, 226, 227, 228, 228, 229, 230, 231, 231, 232
.short 233, 234, 234, 235, 236, 237, 238, 238, 239, 240, 241, 241, 242, 243, 244
.short 244, 245, 246, 247, 247, 248, 249, 249, 250, 251, 252, 252, 253, 254, 255
.short 255

@ 201 halfword entries, indexed by inst->noiseEstCounter[s]; the values are
@ approximately 32768 / (index + 1), with the first entry capped at 32767.
.align 2
WebRtcNsx_kCounterDiv:
_WebRtcNsx_kCounterDiv:
.short 32767, 16384, 10923, 8192, 6554, 5461, 4681, 4096, 3641, 3277, 2979
.short 2731, 2521, 2341, 2185, 2048, 1928, 1820, 1725, 1638, 1560, 1489
.short 1425, 1365, 1311, 1260, 1214, 1170, 1130, 1092, 1057, 1024, 993, 964
.short 936, 910, 886, 862, 840, 819, 799, 780, 762, 745, 728, 712, 697, 683
.short 669, 655, 643, 630, 618, 607, 596, 585, 575, 565, 555, 546, 537, 529
.short 520, 512, 504, 496, 489, 482, 475, 468, 462, 455, 449, 443, 437, 431
.short 426, 420, 415, 410, 405, 400, 395, 390, 386, 381, 377, 372, 368, 364
.short 360, 356, 352, 349, 345, 341, 338, 334, 331, 328, 324, 321, 318, 315
.short 312, 309, 306, 303, 301, 298, 295, 293, 290, 287, 285, 282, 280, 278
.short 275, 273, 271, 269, 266, 264, 262, 260, 258, 256, 254, 252, 250, 248
.short 246, 245, 243, 241, 239, 237, 236, 234, 232, 231, 229, 228, 226, 224
.short 223, 221, 220, 218, 217, 216, 214, 213, 211, 210, 209, 207, 206, 205
.short 204, 202, 201, 200, 199, 197, 196, 195, 194, 193, 192, 191, 189, 188
.short 187, 186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174
.short 173, 172, 172, 171, 170, 169, 168, 167, 166, 165, 165, 164, 163

@ 9 halfword entries, indexed by |inst->stages - inst->normData|.
@ NOTE(review): the index is not range-checked here; callers presumably
@ guarantee |tabind| <= 8 -- confirm against nsx_core.c.
.align 2
WebRtcNsx_kLogTable:
_WebRtcNsx_kLogTable:
.short 0, 177, 355, 532, 710, 887, 1065, 1242, 1420

@ void NoiseEstimationNeon(NsxInst_t* inst,
@                          uint16_t* magn,
@                          uint32_t* noise,
@                          int16_t* q_noise);
@
@ In:  r0 = inst, r1 = magn, r2 = noise, r3 = q_noise.
@ Out: noise[0 .. magnLen-1] receives the sign-extended
@      inst->noiseEstQuantile[] values; *q_noise receives the low halfword of
@      inst->qNoise. inst->noiseEstLogQuantile[], inst->noiseEstDensity[] and
@      inst->noiseEstCounter[] are updated in place.
@ Saves and restores r4-r12, lr and d8-d15.

@ Register usage (across major loops of NoiseEstimationNeon()):
@ r0-r3: function arguments, and scratch registers.
@ r4: &inst
@ r5: &noiseEstLogQuantile[]
@ r6: inst->magnLen
@ r7: offset
@ r8: s, the loop counter for the LOOP_SIMULT
@ r9: &inst->noiseEstDensity[]
@ r10: &inst->noiseEstCounter[]
@ r11: countDiv
@ r12: i, the loop counter for LOOP_NOISEESTIMATION_MAGNLEN_INNER
@
@ NEON constants that stay live across LOOP_SIMULT (the callee
@ UpdateNoiseEstimateNeon is documented as touching only q0-q3, q8-q13):
@ q5:  width_factor (21845) in every 16-bit lane
@ q14: WIDTH_Q8 in every 16-bit lane
@ q15: logval in every 16-bit lane

.align 2
DEFINE_FUNCTION WebRtcNsx_NoiseEstimationNeon
  push {r4-r12, r14}            @ Make sure 8-byte stack alignment.
  vpush {d8-d15}
  sub sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)

@ [sp, #0]: logval
@ [sp, #4]: noise
@ [sp, #8]: q_noise
@ [sp, #12]: factor
@ [sp, #16 ~ #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)]: lmagn[HALF_ANAL_BLOCKL]

  str r2, [sp, #4]              @ noise
  str r3, [sp, #8]              @ q_noise
  movw r4, #offset_nsx_normData
  ldr r2, [r0, #offset_nsx_stages]   @ inst->stages
  ldr r4, [r0, r4]              @ inst->normData
  adr r12, WebRtcNsx_kLogTable
  subs r3, r2, r4               @ tabind = inst->stages - inst->normData;
  ldr r5, [r0, #offset_nsx_magnLen]  @ magnLen
  rsblt r3, #0                  @ tabind = -tabind when negative.
  lsl r3, #1                    @ Halfword index -> byte offset.
  ldrh r3, [r12, r3]            @ logval = WebRtcNsx_kLogTable[tabind];
  add r12, sp, #16              @ lmagn[]
  @ The flags from "subs" above are still valid here: none of the
  @ instructions in between is flag-setting.
  rsblt r3, #0                  @ logval = -WebRtcNsx_kLogTable[-tabind];
  str r3, [sp]
  vdup.16 q15, r3

  adr r9, WebRtcNsx_kLogTableFrac

@ Fill lmagn[i] for i = 0 .. magnLen-1 (r5 counts down, r1 walks magn[],
@ r12 walks lmagn[]). A zero magnitude yields logval directly.
LOOP_SET_LMAGN:
  ldrh r2, [r1], #2             @ magn[i]
  cmp r2, #0
  strheq r3, [r12], #2          @ lmagn[i] = logval;
  beq CHECK_LMAGN_COUNTER

  clz r6, r2
  mov r4, r6                    @ zeros
  rsb r6, #31                   @ 31 - zeros
  lsl r2, r4                    @ Normalize: leading one moves to bit 31.
  ubfx r4, r2, #23, #8          @ frac = the 8 bits below the leading one.
  mov r2, r4, lsl #1            @ Halfword index -> byte offset.
  ldrh r4, [r9, r2]             @ WebRtcNsx_kLogTableFrac[frac]
  add r7, r4, r6, lsl #8        @ log2
  movw r2, #22713               @ log2_const
  smulbb r2, r7, r2
  add r2, r3, r2, lsr #15       @ logval + ((log2 * log2_const) >> 15)
  strh r2, [r12], #2            @ lmagn[i]

CHECK_LMAGN_COUNTER:
  subs r5, #1
  bgt LOOP_SET_LMAGN

  movw r3, #21845               @ width_factor
  vdup.16 q5, r3
  vmov.s16 q14, #WIDTH_Q8

  movw r5, #offset_nsx_noiseEstLogQuantile
  movw r7, #offset_nsx_blockIndex
  movw r9, #offset_nsx_noiseEstDensity
  add r5, r0
  ldr r6, [r0, #offset_nsx_magnLen]
  ldr r7, [r0, r7]
  add r9, r0
  cmp r7, #END_STARTUP_LONG
  movw r10, #offset_nsx_noiseEstCounter
  add r10, r0
  movge r7, #FACTOR_Q7
  movlt r7, #FACTOR_Q7_STARTUP
  mov r4, r0
  str r7, [sp, #12]             @ factor
  mov r8, #SIMULT
  mov r7, #0

@ One pass per simultaneous estimate s (r8 counts SIMULT down to 0).
LOOP_SIMULT:
  ldrsh r1, [r10]               @ inst->noiseEstCounter[s]
  adr r3, WebRtcNsx_kCounterDiv
  mov r11, r1, lsl #1           @ counter
  ldrh r11, [r3, r11]           @ countDiv = WebRtcNsx_kCounterDiv[counter];
  sub r12, r6, #1               @ Loop counter.
  smulbb r3, r1, r11            @ countProd
  vdup.16 q11, r11

  vqrdmulh.s16 q11, q5, q11     @ WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                                @   width_factor, countDiv, 15);
  vdup.16 d24, r11              @ d24 lanes = countDiv
  vdup.16 d25, r3               @ d25 lanes = countProd

  ldr r3, [sp, #12]             @ factor
  add r1, sp, #16               @ &lmagn[0]
  vdup.16 q9, r3
  vmov.i16 q13, #512
  vmov.i16 q7, #15
  vmov.i32 q6, #FACTOR_Q16

@ Vector body: eight spectrum bins per iteration; r12 starts at magnLen - 1.
@ NOTE(review): this presumably relies on (magnLen - 1) being a multiple of 8
@ so that exactly one bin is left for the scalar tail below -- confirm.
LOOP_NOISEESTIMATION_MAGNLEN_INNER:
  vld1.16 {q0}, [r9]            @ noiseEstDensity[offset + i]

  @ Compute delta in the next two blocks.
  vclz.i16 q4, q0
  vsub.i16 q4, q4, q7           @ Value of the shift factors; likely negative.
  vmovl.s16 q3, d8
  vmovl.s16 q2, d9

  vshl.s32 q1, q6, q3
  vmovn.i32 d8, q1              @ d8 holds shifted FACTOR_Q16.
  vshl.s32 q1, q6, q2
  vcgt.s16 q3, q0, q13          @ Compare noiseEstDensity to 512.
  vmovn.i32 d9, q1              @ d9 holds shifted FACTOR_Q16.
  vmov.i16 q1, q9
  vbit.s16 q1, q4, q3           @ If bigger than 512, delta = shifted FACTOR_Q16.

  vmull.s16 q8, d3, d24         @ delta * countDiv (upper four lanes)
  vmull.s16 q4, d2, d24         @ delta * countDiv (lower four lanes)
  vshrn.i32 d2, q4, #14         @ q1 = tmp16 = (delta * countDiv) >> 14
  vshrn.i32 d3, q8, #14

  vrshr.s16 q3, q1, #1          @ (tmp16 + 1) >> 1, used when lmagn <= quantile
  vrshr.s16 q8, q1, #2          @ (tmp16 + 2) >> 2, used when lmagn > quantile
  @ d28 is the low half of q14, i.e. WIDTH_Q8 in every lane.
  @ NOTE(review): the scalar tail multiplies by the literal 3 at this point,
  @ so this relies on WIDTH_Q8 == 3 -- confirm against nsx_defines.h.
  vmull.s16 q4, d7, d28
  vmull.s16 q3, d6, d28
  vld1.16 {q10}, [r5]           @ inst->noiseEstLogQuantile[offset + i]
  vshrn.i32 d4, q3, #1
  vshrn.i32 d5, q4, #1

  vld1.16 {q3}, [r1]!           @ lmagn[i]
  vsub.i16 q4, q10, q2          @ Candidate when lmagn <= quantile.
  vadd.i16 q8, q10, q8          @ Candidate when lmagn > quantile.
  vsub.i16 q2, q3, q10
  vmax.s16 q4, q4, q15          @ Never let the quantile drop below logval.
  vcgt.s16 q1, q2, #0           @ Mask: lmagn > quantile.
  vbit q10, q8, q1
  vbif q10, q4, q1

  vsub.i16 q1, q3, q10
  vst1.16 {q10}, [r5]!          @ inst->noiseEstLogQuantile[offset + i]
  vabs.s16 q4, q1               @ |lmagn - quantile|
  vqrdmulh.s16 d2, d0, d25      @ Rounded (density * countProd) >> 15.
  vqrdmulh.s16 d3, d1, d25
  vcgt.s16 q4, q14, q4          @ Mask: |lmagn - quantile| < WIDTH_Q8.
  vadd.i16 q1, q1, q11
  vbit q0, q1, q4               @ Update the density only where the mask is set.
  subs r12, #8
  vst1.16 {q0}, [r9]!           @ noiseEstDensity[offset + i]
  bgt LOOP_NOISEESTIMATION_MAGNLEN_INNER

@
@ Last iteration over magnitude spectrum.
@ Scalar version of the vector body for the one remaining bin; r1, r5 and r9
@ point at that bin's lmagn, log-quantile and density entries.
@

COMPUTE_DELTA:
  ldrsh r2, [r9]                @ inst->noiseEstDensity[offset + i]
  cmp r2, #512
  bgt COMPUTE_DELTA_BIGGER_DENSITY

  movw r2, #offset_nsx_blockIndex
  ldr r0, [r4, r2]
  cmp r0, #END_STARTUP_LONG
  movge r0, #FACTOR_Q7          @ delta
  movlt r0, #FACTOR_Q7_STARTUP  @ delta
  b UPDATE_LOG_QUANTILE_ESTIMATE

COMPUTE_DELTA_BIGGER_DENSITY:
  clz r2, r2
  rsb r0, r2, #31               @ 14 - factor
  mov r2, #FACTOR_Q16
  mov r0, r2, lsr r0            @ FACTOR_Q16 >> (14 - factor)

UPDATE_LOG_QUANTILE_ESTIMATE:
  smulbb r12, r0, r11           @ delta * countDiv
  ldrsh r1, [r1]                @ lmagn[i]
  ubfx r12, r12, #14, #16       @ tmp16
  ldrsh r2, [r5]                @ inst->noiseEstLogQuantile[offset + i]
  cmp r1, r2
  bgt UPDATE_LOG_QUANTILE_ESTIMATE_BIGGER_LMAGN

  add r12, #1
  ldr r3, [sp]                  @ logval
  mov r0, r12, lsr #1           @ tmp16no1
  mov r12, #3
  smulbb r12, r0, r12           @ tmp16no2
  sub r2, r2, r12, lsr #1
  cmp r3, r2
  @ Clamp the quantile to logval. logval is already in r3, so copy it rather
  @ than reloading it from the stack.
  movgt r2, r3
  b UPDATE_LOG_QUANTILE_ESTIMATE_STORE

UPDATE_LOG_QUANTILE_ESTIMATE_BIGGER_LMAGN:
  add r3, r12, #2
  add r2, r2, r3, lsr #2

UPDATE_LOG_QUANTILE_ESTIMATE_STORE:
  vmov.s16 r0, d25[0]           @ countProd
  strh r2, [r5]
  add r5, #2                    @ increment &noiseEstLogQuantile[offset + i]

UPDATE_DENSITY_ESTIMATE:
  subs r12, r1, r2
  rsblt r12, #0                 @ |lmagn[i] - quantile|
  cmp r12, #WIDTH_Q8
  bge UPDATE_DENSITY_ESTIMATE_CHECK_COUNTER

  movw r3, #21845               @ width_factor
  ldrh r12, [r9]                @ inst->noiseEstDensity[offset + i]
  smulbb r2, r3, r11            @ width_factor * countDiv
  smulbb r1, r12, r0            @ density * countProd
  add r0, r2, #1 << 14          @ Rounding
  add r12, r1, #1 << 14
  mov r1, r12, lsr #15
  add r3, r1, r0, lsr #15
  strh r3, [r9]                 @ inst->noiseEstDensity[offset + i]

UPDATE_DENSITY_ESTIMATE_CHECK_COUNTER:
  add r9, #2                    @ update &noiseEstDensity[offset + i]
  ldrsh r3, [r10]               @ inst->noiseEstCounter[s]
  cmp r3, #END_STARTUP_LONG
  blt POST_UPDATE_DENSITY_ESTIMATE

  @ The counter reached END_STARTUP_LONG: reset it, and refresh the noise
  @ estimate from this quantile set once the start-up phase is over.
  movw r2, #offset_nsx_blockIndex
  mov r12, #0
  ldr r2, [r4, r2]
  strh r12, [r10]
  cmp r2, #END_STARTUP_LONG
  blt POST_UPDATE_DENSITY_ESTIMATE

  mov r0, r4
  mov r1, r7
  CALL_FUNCTION UpdateNoiseEstimateNeon

POST_UPDATE_DENSITY_ESTIMATE:
  ldrh r3, [r10]
  add r3, #1
  strh r3, [r10], #2            @ inst->noiseEstCounter[s]++, then advance to s + 1.
  subs r8, #1
  add r7, r6                    @ offset += inst->magnLen;
  bgt LOOP_SIMULT

  @ During start-up, always refresh the noise estimate from the last
  @ quantile set (r7 has already been advanced past it, hence r7 - r6).
  movw r2, #offset_nsx_blockIndex
  ldr r2, [r4, r2]
  cmp r2, #END_STARTUP_LONG
  bge UPDATE_NOISE

  sub r1, r7, r6
  mov r0, r4
  CALL_FUNCTION UpdateNoiseEstimateNeon

UPDATE_NOISE:
  movw r1, #offset_nsx_noiseEstQuantile
  add r1, r4
  ldr r2, [sp, #4]              @ noise

@ Initial value of loop counter r6 = inst->magnLen.
LOOP_UPDATE_NOISE:
  ldrsh r0, [r1], #2            @ inst->noiseEstQuantile[i], sign-extended
  subs r6, #1
  str r0, [r2], #4              @ noise[i]
  bgt LOOP_UPDATE_NOISE

UPDATE_Q_NOISE:
  movw r2, #offset_nsx_qNoise
  ldr r1, [sp, #8]              @ q_noise
  ldrh r2, [r4, r2]
  strh r2, [r1]                 @ *q_noise = low halfword of inst->qNoise

  add sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
  vpop {d8-d15}
  pop {r4-r12, pc}

@ static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset);
@ Neon registers touched: q0-q3, q8-q13.
.align 2
DEFINE_FUNCTION UpdateNoiseEstimateNeon
  @ In:  r0 = inst, r1 = offset (index into inst->noiseEstLogQuantile[]).
  @ Out: inst->qNoise and inst->noiseEstQuantile[0 .. magnLen-1] rewritten.
  @ Saves and restores r4-r6; r0-r3 and r12 are used as scratch.
  push {r4, r5, r6, r14}
  mov r5, r0

  vmov.i32 q10, #21
  vmov.i32 q11, #0x1FFFFF
  vmov.i32 q9, #0x200000

  movw r0, #offset_nsx_noiseEstLogQuantile
  movw r6, #offset_nsx_magnLen
  add r0, r5                    @ &inst->noiseEstLogQuantile
  add r4, r0, r1, lsl #1        @ &inst->noiseEstLogQuantile[offset]
  @ Halfword load of magnLen. The other routines in this file read the same
  @ field with a word load, so this relies on little-endian layout.
  ldrsh r6, [r5, r6]            @ inst->magnLen

  mov r0, r4
  mov r1, r6
  CALL_FUNCTION WebRtcSpl_MaxValueW16Neon

  sub r12, r6, #1               @ Loop counter: inst->magnLen - 1.

  movw r6, #11819               @ kExp2Const in Q13
  movw r2, #offset_nsx_noiseEstQuantile
  vdup.16 d16, r6
  smulbb r3, r6, r0             @ kExp2Const * (maximum returned in r0)
  add r0, r3, #1 << 20          @ Round
  movw r1, #offset_nsx_qNoise
  mov r0, r0, lsr #21
  rsb r0, r0, #14               @ 14 - (round(kExp2Const * tmp16) >> 21)
  add r2, r5                    @ &inst->noiseEstQuantile
  vdup.32 q13, r0
  str r0, [r5, r1]              @ inst->qNoise

@ Vector body: eight quantiles per iteration. For each element,
@   tmp32no2 = kExp2Const * noiseEstLogQuantile[offset + i]
@   result   = sat16((0x200000 | (tmp32no2 & 0x1FFFFF))
@                    shifted by ((tmp32no2 >> 21) - 21 + qNoise))
@ where vshl with a negative count shifts right.
LOOP_UPDATE:
  vld1.16 {d0, d1}, [r4]!       @ &inst->noiseEstLogQuantile[offset + i]
  vmull.s16 q1, d0, d16
  vmull.s16 q0, d1, d16
  vshr.s32 q3, q1, #21
  vshr.s32 q2, q0, #21
  vand q1, q1, q11
  vand q0, q0, q11
  vsub.i32 q3, q3, q10
  vsub.i32 q2, q2, q10
  vorr q1, q1, q9
  vorr q0, q0, q9
  vadd.i32 q3, q3, q13
  vadd.i32 q2, q2, q13
  vshl.s32 q1, q1, q3
  vshl.s32 q0, q0, q2
  @ Order matters: narrowing q0 into d1 (its own upper half) must happen
  @ before d0 (its lower half) is overwritten with the narrowed q1.
  vqmovn.s32 d1, q0
  vqmovn.s32 d0, q1
  subs r12, #8
  vst1.16 {d0, d1}, [r2]!
  bgt LOOP_UPDATE

@ Scalar version of the loop body for the one remaining element.
@ r0 still holds inst->qNoise and r6 still holds kExp2Const.
POST_LOOP_MAGNLEN:
  ldrh r1, [r4]
  smulbb r3, r6, r1             @ kExp2Const * ptr_noiseEstLogQuantile[offset + i]
  mov r12, #0x00200000
  bfi r12, r3, #0, #21          @ tmp32no1 = 0x00200000 | (tmp32no2 & 0x001FFFFF);
  rsb r0, #21                   @ 21 - inst->qNoise
  sub r14, r0, r3, lsr #21      @ -tmp16
  @ Only the right-shift case is implemented here.
  @ NOTE(review): presumably -tmp16 is never negative because qNoise is
  @ derived from the maximum quantile -- confirm against nsx_core.c.
  mov r0, r12, lsr r14
  ssat r3, #16, r0
  strh r3, [r2]

  pop {r4, r5, r6, pc}

@ void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf);
@
@ In:  r0 = inst, r1 = freq_buf.
@ Out: inst->real[] and inst->imag[] are scaled in place by
@      inst->noiseSupFilter[] (>> 14); freq_buf[] receives the interleaved
@      pairs real[i], -imag[i].
@ Leaf routine; saves and restores r4-r9.
.align 2
DEFINE_FUNCTION WebRtcNsx_PrepareSpectrumNeon
  push {r4-r9}

  movw r2, #offset_nsx_real
  movw r12, #offset_nsx_noiseSupFilter
  movw r4, #offset_nsx_imag
  movw r5, #offset_nsx_magnLen

  add r2, r0                    @ &inst->real[0]
  add r4, r0                    @ &inst->imag[0]
  mov r9, r4                    @ &inst->imag[0]
  mov r3, r2                    @ &inst->real[0]
  ldr r5, [r0, r5]              @ inst->magnLen
  add r6, r4, #2                @ &inst->imag[1]
  sub r5, #1
  add r12, r0                   @ &inst->noiseSupFilter[0]
  add r5, r2, r5, lsl #1        @ &inst->real[inst->magnLen - 1]

LOOP_MAGNLEN:
  @ Filter the elements.
  vld1.16 {d20, d21}, [r2]      @ inst->real[]
  vld1.16 {d24, d25}, [r12]!    @ inst->noiseSupFilter[]
  vld1.16 {d22, d23}, [r4]      @ inst->imag[]
  vmull.s16 q0, d20, d24
  vmull.s16 q1, d21, d25
  vmull.s16 q2, d22, d24
  vmull.s16 q3, d23, d25
  vshrn.s32 d0, q0, #14
  vshrn.s32 d1, q1, #14
  vshrn.s32 d2, q2, #14
  vshrn.s32 d3, q3, #14
  vst1.16 {d0, d1}, [r2]!
  vst1.16 {d2, d3}, [r4]!
  cmp r2, r5
  bcc LOOP_MAGNLEN              @ Unsigned compare: r2 and r5 are addresses.

  @ Last real/imag pair to filter (the element the vector loop stopped at).
  ldrh r7, [r2]
  ldrh r8, [r12]
  ldrh r5, [r4]
  smulbb r7, r7, r8
  smulbb r5, r5, r8
  @ A logical shift is sufficient: only the low 16 bits are stored, and
  @ those are bits 14..29 of the product for either kind of shift.
  mov r7, r7, lsr #14
  mov r8, r5, lsr #14
  strh r7, [r2]
  strh r8, [r4]

  ldr r5, [r0, #offset_nsx_anaLen2]  @ inst->anaLen2
  lsr r5, #3                    @ inst->anaLen2 / 8
  sub r5, #1                    @ Loop counter.

  @ Process and write the first 2 samples into freq_buf[].
  ldrh r2, [r3], #2             @ inst->real[0]
  ldrh r0, [r9]                 @ inst->imag[0]
  strh r2, [r1], #2             @ Store to freq_buf[0]
  rsb r0, r0, #0
  strh r0, [r1], #2             @ Store to freq_buf[1]. Now r1 -> &freq_buf[2]

  @ Process and write (inst->anaLen2 * 4 - 32) bytes into freq_buf[]:
  @ each iteration interleaves 8 real values with 8 negated imag values and
  @ stores the resulting 16 halfwords.
LOOP_ANALEN2:
  vld1.16 d5, [r6]!             @ inst->imag[], starting from inst->imag[1]
  vld1.16 d7, [r6]!
  vneg.s16 d5, d5
  vld1.16 d4, [r3]!             @ inst->real[], starting from inst->real[1]
  vneg.s16 d7, d7
  vld1.16 d6, [r3]!
  vzip.16 d4, d5
  vzip.16 d6, d7
  subs r5, #1
  vst1.16 {d4, d5, d6, d7}, [r1]!
  bgt LOOP_ANALEN2

  @ Process and write the final 32 bytes into freq_buf[] (same body as the
  @ loop above, stored without advancing r1).
  @ NOTE(review): the original comment says the pointers must be adjusted
  @ "to overwrite the 2 starting samples in the back half of the buffer";
  @ no such adjustment is visible in this code -- confirm against nsx_core.c.
  vld1.16 d5, [r6]!             @ inst->imag[], continuing
  vld1.16 d7, [r6]!
  vneg.s16 d5, d5
  vld1.16 d4, [r3]!             @ inst->real[], continuing
  vneg.s16 d7, d7
  vld1.16 d6, [r3]!
  vzip.16 d4, d5
  vzip.16 d6, d7
  vst1.16 {d4, d5, d6, d7}, [r1]

  pop {r4-r9}
  bx r14

@ void SynthesisUpdateNeon(NsxInst_t* inst,
@                          int16_t* out_frame,
@                          int16_t gain_factor);
@
@ In:  r0 = inst, r1 = out_frame, r2 = gain_factor.
@ Out: inst->synthesisBuffer[] is overlap-added with the windowed, gain-scaled
@      inst->real[]; its first blockLen10ms values are copied to out_frame[];
@      the buffer is then shifted down by blockLen10ms and its tail zeroed.
@ Leaf routine; saves and restores r4-r5.
@ All loop-termination tests below compare addresses, so they use the
@ unsigned conditions lo/hs; a signed test would misbehave for a buffer
@ that straddles address 0x80000000.
.align 2
DEFINE_FUNCTION WebRtcNsx_SynthesisUpdateNeon
  push {r4, r5}

  vdup.16 d31, r2               @ gain_factor in every lane

  movw r2, #offset_nsx_anaLen
  movw r4, #offset_nsx_real
  movw r12, #offset_nsx_synthesisBuffer

  ldrsh r5, [r0, r2]            @ inst->anaLen
  add r12, r0                   @ &inst->synthesisBuffer[0];
  ldr r3, [r0, #offset_nsx_window]  @ &inst->window[0]
  add r4, r0                    @ &inst->real[0]
  add r5, r12, r5, lsl #1       @ &inst->synthesisBuffer[inst->anaLen]

  mov r2, r12                   @ &inst->synthesisBuffer[0];

LOOP_SYNTHESIS:
  vld1.16 {d0, d1}, [r4]!       @ inst->real[]
  vld1.16 {d2, d3}, [r3]!       @ inst->window[]
  vld1.16 {d4, d5}, [r2]        @ inst->synthesisBuffer[];
  vmull.s16 q3, d0, d2
  vmull.s16 q8, d1, d3
  vrshrn.i32 d0, q3, #14        @ Rounded (real * window) >> 14.
  vrshrn.i32 d1, q8, #14
  vmull.s16 q3, d31, d0
  vmull.s16 q8, d31, d1
  vqrshrn.s32 d0, q3, #13       @ Rounded, saturated (... * gain_factor) >> 13.
  vqrshrn.s32 d1, q8, #13
  vqadd.s16 d4, d0              @ Saturating overlap-add.
  vqadd.s16 d5, d1
  vst1.16 {d4, d5}, [r2]!
  cmp r2, r5
  blo LOOP_SYNTHESIS

POST_LOOP_SYNTHESIS:
  movw r3, #offset_nsx_blockLen10ms
  ldr r2, [r0, r3]
  mov r3, r12                   @ &inst->synthesisBuffer[0];
  add r0, r12, r2, lsl #1       @ &inst->synthesisBuffer[inst->blockLen10ms]

@ Copies 16 halfwords per iteration.
LOOP_BLOCKLEN10MS:
  vld1.16 {q0, q1}, [r3]!       @ inst->synthesisBuffer[];
  cmp r3, r0
  vst1.16 {q0, q1}, [r1]!       @ out_frame[]
  blo LOOP_BLOCKLEN10MS

  cmp r0, r5
  bhs POST_LOOP_MEMCPY

LOOP_MEMCPY:
  vld1.16 {q0, q1}, [r0]!       @ inst->synthesisBuffer[i + inst->blockLen10ms]
  cmp r0, r5
  vst1.16 {q0, q1}, [r12]!      @ inst->synthesisBuffer[i]
  blo LOOP_MEMCPY

POST_LOOP_MEMCPY:
  cmp r12, r5
  vmov.i16 q10, #0              @ vmov leaves the flags from cmp untouched.
  vmov.i16 q11, #0
  bhs EXIT_SYNTHESISUPDATE

LOOP_ZEROSARRAY:
  vst1.16 {q10, q11}, [r12]!    @ inst->synthesisBuffer[i + inst->anaLen]
  cmp r12, r5
  blo LOOP_ZEROSARRAY

EXIT_SYNTHESISUPDATE:
  pop {r4, r5}
  bx r14

@ void AnalysisUpdateNeon(NsxInst_t* inst, int16_t* out, int16_t* new_speech);
@
@ In:  r0 = inst, r1 = out, r2 = new_speech.
@ Out: inst->analysisBuffer[] is shifted down by blockLen10ms and refilled at
@      the end from new_speech[]; out[0 .. anaLen-1] receives the rounded
@      (analysisBuffer[i] * window[i]) >> 14.
@ Leaf routine; saves and restores r4-r6.
@ All loop-termination tests below compare addresses, so they use the
@ unsigned conditions lo/hs; a signed test would misbehave for a buffer
@ that straddles address 0x80000000.
.align 2
DEFINE_FUNCTION WebRtcNsx_AnalysisUpdateNeon
  push {r4-r6}

  movw r3, #offset_nsx_analysisBuffer
  movw r4, #offset_nsx_anaLen
  movw r12, #offset_nsx_blockLen10ms
  add r3, r0                    @ &inst->analysisBuffer[0]
  ldrsh r4, [r0, r4]            @ inst->anaLen
  ldr r12, [r0, r12]            @ inst->blockLen10ms
  sub r6, r4, r12
  add r6, r3, r6, lsl #1        @ &inst->analysisBuffer[inst->anaLen
                                @     - inst->blockLen10ms]
  cmp r3, r6
  mov r5, r3                    @ mov leaves the flags from cmp untouched.
  bhs POST_LOOP_MEMCPY_1

  add r12, r3, r12, lsl #1      @ &inst->analysisBuffer[inst->blockLen10ms]

@ Copies 16 halfwords per iteration.
LOOP_MEMCPY_1:
  vld1.16 {q10, q11}, [r12]!    @ inst->analysisBuffer[i + inst->blockLen10ms]
  vst1.16 {q10, q11}, [r5]!     @ inst->analysisBuffer[i]
  cmp r5, r6
  blo LOOP_MEMCPY_1

POST_LOOP_MEMCPY_1:
  add r12, r3, r4, lsl #1       @ &inst->analysisBuffer[inst->anaLen]
  cmp r5, r12
  bhs POST_LOOP_MEMCPY_2

LOOP_MEMCPY_2:
  vld1.16 {q10, q11}, [r2]!     @ new_speech[i]
  vst1.16 {q10, q11}, [r5]!     @ inst->analysisBuffer[
                                @     i + inst->anaLen - inst->blockLen10ms]
  cmp r5, r12
  blo LOOP_MEMCPY_2

POST_LOOP_MEMCPY_2:
  add r4, r1, r4, lsl #1        @ &out[inst->anaLen]
  cmp r1, r4
  ldr r2, [r0, #offset_nsx_window]  @ &inst->window[0]
  bhs POST_LOOP_WINDOW_DATA

LOOP_WINDOW_DATA:
  vld1.16 {d4, d5}, [r3]!       @ inst->analysisBuffer[]
  vld1.16 {d6, d7}, [r2]!       @ inst->window[]
  vmull.s16 q0, d4, d6
  vmull.s16 q1, d5, d7
  vrshrn.i32 d4, q0, #14
  vrshrn.i32 d5, q1, #14
  vst1.16 {d4, d5}, [r1]!       @ out[]
  cmp r1, r4
  blo LOOP_WINDOW_DATA

POST_LOOP_WINDOW_DATA:
  pop {r4-r6}
  bx r14