Home | History | Annotate | Download | only in ns
      1 @
      2 @ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3 @
      4 @ Use of this source code is governed by a BSD-style license
      5 @ that can be found in the LICENSE file in the root of the source
      6 @ tree. An additional intellectual property rights grant can be found
      7 @ in the file PATENTS.  All contributing project authors may
      8 @ be found in the AUTHORS file in the root of the source tree.
      9 @
     10 
     11 @ nsx_core_neon.s
     12 @ This file contains some functions in NS, optimized for ARM Neon
     13 @ platforms. Reference C code is in file nsx_core.c. Bit-exact.
     14 
     15 .syntax unified
     16 
     17 #include "nsx_core_neon_offsets.h"
     18 #include "webrtc/modules/audio_processing/ns/nsx_defines.h"
     19 #include "webrtc/system_wrappers/interface/asm_defines.h"
     20 
     21 GLOBAL_FUNCTION WebRtcNsx_NoiseEstimationNeon
     22 GLOBAL_FUNCTION WebRtcNsx_PrepareSpectrumNeon
     23 GLOBAL_FUNCTION WebRtcNsx_SynthesisUpdateNeon
     24 GLOBAL_FUNCTION WebRtcNsx_AnalysisUpdateNeon
     25 GLOBAL_LABEL WebRtcNsx_kLogTable
     26 GLOBAL_LABEL WebRtcNsx_kCounterDiv
     27 GLOBAL_LABEL WebRtcNsx_kLogTableFrac
     28 
     29 .align 2
        @ Fractional-log lookup table: 256 halfword entries rising from 0 to 255.
        @ WebRtcNsx_NoiseEstimationNeon indexes it with the 8-bit "frac" field it
        @ extracts from the normalized magnitude (see LOOP_SET_LMAGN).
        @ NOTE(review): the values look like round(256 * log2(1 + i / 256));
        @ confirm against the reference table in nsx_core.c.
        @ The underscore-prefixed label below names the same address.
     30 WebRtcNsx_kLogTableFrac:
     31 _WebRtcNsx_kLogTableFrac:
     32 .short 0, 1, 3, 4, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, 26
     33 .short 28, 29, 30, 32, 33, 34, 36, 37, 38, 40, 41, 42, 44, 45, 46, 47, 49, 50
     34 .short 51, 52, 54, 55, 56, 57, 59, 60, 61, 62, 63, 65, 66, 67, 68, 69, 71, 72
     35 .short 73, 74, 75, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 92, 93
     36 .short 94, 95, 96, 97, 98, 99, 100, 102, 103, 104, 105, 106, 107, 108, 109, 110
     37 .short 111, 112, 113, 114, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
     38 .short 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141
     39 .short 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 155
     40 .short 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 169
     41 .short 170, 171, 172, 173, 174, 175, 176, 177, 178, 178, 179, 180, 181, 182, 183
     42 .short 184, 185, 185, 186, 187, 188, 189, 190, 191, 192, 192, 193, 194, 195, 196
     43 .short 197, 198, 198, 199, 200, 201, 202, 203, 203, 204, 205, 206, 207, 208, 208
     44 .short 209, 210, 211, 212, 212, 213, 214, 215, 216, 216, 217, 218, 219, 220, 220
     45 .short 221, 222, 223, 224, 224, 225, 226, 227, 228, 228, 229, 230, 231, 231, 232
     46 .short 233, 234, 234, 235, 236, 237, 238, 238, 239, 240, 241, 241, 242, 243, 244
     47 .short 244, 245, 246, 247, 247, 248, 249, 249, 250, 251, 252, 252, 253, 254, 255
     48 .short 255
     49 
     50 .align 2
        @ Reciprocal table: 201 halfword entries, read in LOOP_SIMULT of
        @ WebRtcNsx_NoiseEstimationNeon as kCounterDiv[inst->noiseEstCounter[s]].
        @ The values follow round(32768 / (i + 1)), except entry 0, which is 32767
        @ (presumably clamped so that it fits a signed halfword -- confirm).
        @ NOTE(review): the lookup is not range-checked here; callers must keep the
        @ counter within 0..200.
        @ The underscore-prefixed label below names the same address.
     51 WebRtcNsx_kCounterDiv:
     52 _WebRtcNsx_kCounterDiv:
     53 .short 32767, 16384, 10923, 8192, 6554, 5461, 4681, 4096, 3641, 3277, 2979
     54 .short 2731, 2521, 2341, 2185, 2048, 1928, 1820, 1725, 1638, 1560, 1489
     55 .short 1425, 1365, 1311, 1260, 1214, 1170, 1130, 1092, 1057, 1024, 993, 964
     56 .short 936, 910, 886, 862, 840, 819, 799, 780, 762, 745, 728, 712, 697, 683
     57 .short 669, 655, 643, 630, 618, 607, 596, 585, 575, 565, 555, 546, 537, 529
     58 .short 520, 512, 504, 496, 489, 482, 475, 468, 462, 455, 449, 443, 437, 431
     59 .short 426, 420, 415, 410, 405, 400, 395, 390, 386, 381, 377, 372, 368, 364
     60 .short 360, 356, 352, 349, 345, 341, 338, 334, 331, 328, 324, 321, 318, 315
     61 .short 312, 309, 306, 303, 301, 298, 295, 293, 290, 287, 285, 282, 280, 278
     62 .short 275, 273, 271, 269, 266, 264, 262, 260, 258, 256, 254, 252, 250, 248
     63 .short 246, 245, 243, 241, 239, 237, 236, 234, 232, 231, 229, 228, 226, 224
     64 .short 223, 221, 220, 218, 217, 216, 214, 213, 211, 210, 209, 207, 206, 205
     65 .short 204, 202, 201, 200, 199, 197, 196, 195, 194, 193, 192, 191, 189, 188
     66 .short 187, 186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174
     67 .short 173, 172, 172, 171, 170, 169, 168, 167, 166, 165, 165, 164, 163
     68 
     69 .align 2
        @ Nine halfword entries, read by WebRtcNsx_NoiseEstimationNeon as
        @ kLogTable[|inst->stages - inst->normData|] to form logval.
        @ The values are consistent with round(i * 256 * ln(2)) for i = 0..8.
        @ NOTE(review): the lookup is not range-checked; confirm the index can
        @ never exceed 8.
        @ The underscore-prefixed label below names the same address.
     70 WebRtcNsx_kLogTable:
     71 _WebRtcNsx_kLogTable:
     72 .short  0, 177, 355, 532, 710, 887, 1065, 1242, 1420
     73 
     74 @ void NoiseEstimationNeon(NsxInst_t* inst,
     75 @                          uint16_t* magn,
     76 @                          uint32_t* noise,
     77 @                          int16_t* q_noise);
        @
        @ In:  r0 = inst, r1 = magn, r2 = noise, r3 = q_noise.
        @ Flow:
        @  1. LOOP_SET_LMAGN fills the stack array lmagn[] with one entry per
        @     magnitude bin (inst->magnLen entries): logval when magn[i] == 0,
        @     otherwise logval + ((log2 * 22713) >> 15).
        @  2. LOOP_SIMULT runs SIMULT passes. Each pass updates a run of
        @     inst->noiseEstLogQuantile[] and inst->noiseEstDensity[] entries, in
        @     NEON groups of 8 while a counter started at magnLen - 1 stays
        @     positive, then one final entry in scalar code, and finally bumps
        @     inst->noiseEstCounter[s] (calling UpdateNoiseEstimateNeon when the
        @     counter and blockIndex thresholds are reached).
        @  3. inst->noiseEstQuantile[] is copied, sign-extended to 32 bits, into
        @     noise[], and the low halfword of inst->qNoise is stored to *q_noise.
        @ Saves and restores r4-r12, r14 and d8-d15.
     78 
     79 @ Register usage (across major loops of NoiseEstimationNeon()):
     80 @ r0-r3: function arguments, and scratch registers.
     81 @ r4: &inst
     82 @ r5: &noiseEstLogQuantile[]
     83 @ r6: inst->magnLen
     84 @ r7: offset
     85 @ r8: s, the loop counter for the LOOP_SIMULT
     86 @ r9: &inst->noiseEstDensity[]
     87 @ r10: &inst->noiseEstCounter[]
     88 @ r11: countDiv
     89 @ r12: i, the loop counter for LOOP_NOISEESTIMATION_MAGNLEN_INNER
        @
        @ NEON values set once and read in every LOOP_SIMULT pass:
        @ q5 = width_factor, q14 = WIDTH_Q8, q15 = logval.
        @ NOTE(review): these must survive the calls to UpdateNoiseEstimateNeon.
        @ Its header lists q0-q3 and q8-q13 as touched; confirm that
        @ WebRtcSpl_MaxValueW16Neon, which it calls, also leaves q14/q15 intact.
     90 
     91 .align 2
     92 DEFINE_FUNCTION WebRtcNsx_NoiseEstimationNeon
     93   push {r4-r12, r14}          @ Make sure 8-byte stack alignment.
     94   vpush {d8-d15}              @ q4-q7 are used below.
     95   sub sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
     96 
     97 @ [sp, #0]: logval
     98 @ [sp, #4]: noise
     99 @ [sp, #8]: q_noise
    100 @ [sp, #12]: factor
    101 @ [sp, #16 ~ #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)]: lmagn[HALF_ANAL_BLOCKL]
    102 
    103   str r2, [sp, #4]            @ noise
    104   str r3, [sp, #8]            @ q_noise
    105   movw r4, #offset_nsx_normData
    106   ldr r2, [r0, #offset_nsx_stages]            @ inst->stages
    107   ldr r4, [r0, r4]            @ inst->normData
    108   adr r12, WebRtcNsx_kLogTable
    109   subs r3, r2, r4             @ tabind = inst->stages - inst->normData;
    110   ldr r5, [r0, #offset_nsx_magnLen]            @ magnLen
    111   rsblt r3, #0                @ tabind = -tabind when negative (flags from subs).
    112   lsl r3, #1                  @ Halfword index -> byte offset.
    113   ldrh r3, [r12, r3]          @ logval = WebRtcNsx_kLogTable[tabind];
    114   add r12, sp, #16            @ lmagn[]
        @ Still conditional on the flags set by the subs above; nothing in between
        @ updates them.
    115   rsblt r3, #0                @ logval = -WebRtcNsx_kLogTable[-tabind];
    116   str r3, [sp]
    117   vdup.16 q15, r3             @ q15 = logval in every lane (lower clamp below).
    118 
    119   adr r9, WebRtcNsx_kLogTableFrac
    120 
        @ r1 walks magn[], r12 walks lmagn[], r5 counts down from magnLen.
    121 LOOP_SET_LMAGN:
    122   ldrh r2, [r1], #2           @ magn[i]
    123   cmp r2, #0
    124   strheq r3, [r12], #2        @ lmagn[i] = logval;
    125   beq CHECK_LMAGN_COUNTER
    126 
    127   clz r6, r2
    128   mov r4, r6                  @ zeros
    129   rsb r6, #31                 @ 31 - zeros
    130   lsl r2, r4                  @ Normalize: leading one moves to bit 31.
    131   ubfx r4, r2, #23, #8        @ frac = bits 30..23 of the normalized value.
    132   mov r2, r4, lsl #1          @ Halfword index -> byte offset.
    133   ldrh r4, [r9, r2]           @ WebRtcNsx_kLogTableFrac[frac]
    134   add r7, r4, r6, lsl #8      @ log2
    135   movw r2, #22713             @ log2_const
    136   smulbb r2, r7, r2
    137   add r2, r3, r2, lsr #15     @ logval + ((log2 * log2_const) >> 15)
    138   strh r2, [r12], #2          @ lmagn[i]
    139 
    140 CHECK_LMAGN_COUNTER:
    141   subs r5, #1
    142   bgt LOOP_SET_LMAGN
    143 
    144   movw r3, #21845             @ width_factor
    145   vdup.16 q5, r3              @ q5 = width_factor in every lane.
    146   vmov.s16 q14, #WIDTH_Q8     @ q14 = WIDTH_Q8 in every lane.
    147 
    148   movw r5, #offset_nsx_noiseEstLogQuantile
    149   movw r7, #offset_nsx_blockIndex
    150   movw r9, #offset_nsx_noiseEstDensity
    151   add r5, r0
    152   ldr r6, [r0, #offset_nsx_magnLen]
    153   ldr r7, [r0, r7]
    154   add r9, r0
    155   cmp r7, #END_STARTUP_LONG
    156   movw r10, #offset_nsx_noiseEstCounter
    157   add r10, r0
        @ factor depends on whether blockIndex has reached END_STARTUP_LONG.
    158   movge r7, #FACTOR_Q7
    159   movlt r7, #FACTOR_Q7_STARTUP
    160   mov r4, r0
    161   str r7, [sp, #12]           @ factor
    162   mov r8, #SIMULT
    163   mov r7, #0                  @ offset = 0
    164 
        @ r5 and r9 keep advancing across passes, so pass s works on the entries
        @ that follow those of pass s - 1.
    165 LOOP_SIMULT:
    166   ldrsh r1, [r10]             @ inst->noiseEstCounter[s]
    167   adr r3, WebRtcNsx_kCounterDiv
    168   mov r11, r1, lsl #1         @ counter
    169   ldrh r11, [r3, r11]         @ countDiv = WebRtcNsx_kCounterDiv[counter];
    170   sub r12, r6, #1             @ Loop counter.
    171   smulbb r3, r1, r11          @ countProd
    172   vdup.16 q11, r11
    173 
    174   vqrdmulh.s16 q11, q5, q11   @ WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    175                               @   width_factor, countDiv, 15);
    176   vdup.16 d24, r11            @ d24 = countDiv
    177   vdup.16 d25, r3             @ d25 = countProd
    178 
    179   ldr r3, [sp, #12]           @ factor
    180   add r1, sp, #16             @ &lmagn[0]
    181   vdup.16 q9, r3              @ q9 = factor
    182   vmov.i16 q13, #512          @ Density threshold.
    183   vmov.i16 q7, #15
    184   vmov.i32 q6, #FACTOR_Q16
    185 
        @ Eight entries per pass. Lane-wise equivalent of the scalar tail below
        @ (COMPUTE_DELTA .. UPDATE_DENSITY_ESTIMATE).
    186 LOOP_NOISEESTIMATION_MAGNLEN_INNER:
    187   vld1.16 {q0}, [r9]          @ noiseEstDensity[offset + i]
    188 
    189   @ Compute delta in the next two blocks.
    190   vclz.i16 q4, q0
    191   vsub.i16 q4, q4, q7         @ Value of the shift factors; likely negative.
    192   vmovl.s16 q3, d8            @ Widen shift amounts to 32 bits.
    193   vmovl.s16 q2, d9
    194 
    195   vshl.s32 q1, q6, q3         @ Negative amounts shift right.
    196   vmovn.i32 d8, q1            @ d8 holds shifted FACTOR_Q16.
    197   vshl.s32 q1, q6, q2
    198   vcgt.s16 q3, q0, q13        @ Compare noiseEstDensity to 512.
    199   vmovn.i32 d9, q1            @ d9 holds shifted FACTOR_Q16.
    200   vmov.i16 q1, q9             @ Default delta = factor.
    201   vbit.s16 q1, q4, q3         @ If bigger than 512, delta = shifted FACTOR_Q16.
    202 
    203   vmull.s16 q8, d3, d24       @ delta * countDiv
    204   vmull.s16 q4, d2, d24
    205   vshrn.i32 d2, q4, #14       @ q1 = tmp16 = (delta * countDiv) >> 14
    206   vshrn.i32 d3, q8, #14
    207 
    208   vrshr.s16 q3, q1, #1        @ (tmp16 + 1) >> 1, used when lmagn <= quantile.
    209   vrshr.s16 q8, q1, #2        @ (tmp16 + 2) >> 2, used when lmagn > quantile.
        @ d28 is the low half of q14 (WIDTH_Q8). The scalar tail multiplies by the
        @ literal 3 at this step. NOTE(review): this relies on WIDTH_Q8 == 3.
    210   vmull.s16 q4, d7, d28
    211   vmull.s16 q3, d6, d28
    212   vld1.16 {q10}, [r5]         @ inst->noiseEstLogQuantile[offset + i]
    213   vshrn.i32 d4, q3, #1        @ q2 = decrement for the "not bigger" case.
    214   vshrn.i32 d5, q4, #1
    215 
    216   vld1.16 {q3}, [r1]!         @ lmagn[i]
    217   vsub.i16 q4, q10, q2        @ Candidate: quantile - decrement.
    218   vadd.i16 q8, q10, q8        @ Candidate: quantile + increment.
    219   vsub.i16 q2, q3, q10        @ lmagn - quantile
    220   vmax.s16 q4, q4, q15        @ Clamp the decreased value at logval.
    221   vcgt.s16 q1, q2, #0         @ Mask: lmagn > quantile.
    222   vbit q10, q8, q1            @ Masked lanes take the increased value,
    223   vbif q10, q4, q1            @ the others the decreased one.
    224 
    225   vsub.i16 q1, q3, q10        @ lmagn - updated quantile
    226   vst1.16 {q10}, [r5]!        @ inst->noiseEstLogQuantile[offset + i]
    227   vabs.s16 q4, q1
    228   vqrdmulh.s16 d2, d0, d25    @ Rounded (density * countProd) >> 15.
    229   vqrdmulh.s16 d3, d1, d25
    230   vcgt.s16 q4, q14, q4        @ Mask: |lmagn - quantile| < WIDTH_Q8.
    231   vadd.i16 q1, q1, q11        @ Add rounded (width_factor * countDiv) >> 15.
    232   vbit q0, q1, q4             @ Only masked lanes get the new density.
    233   subs r12, #8
    234   vst1.16 {q0}, [r9]!         @ noiseEstDensity[offset + i]
    235   bgt LOOP_NOISEESTIMATION_MAGNLEN_INNER
    236 
    237 @
    238 @ Last iteration over magnitude spectrum.
    239 @
        @ Scalar code for the one entry left after the NEON loop. r1, r5 and r9
        @ point at its lmagn, log-quantile and density slots respectively.
    240 
    241 COMPUTE_DELTA:
    242   ldrsh r2, [r9]              @ inst->noiseEstDensity[offset + i]
    243   cmp r2, #512
    244   bgt COMPUTE_DELTA_BIGGER_DENSITY
    245 
    246   movw r2, #offset_nsx_blockIndex
    247   ldr r0, [r4, r2]
    248   cmp r0, #END_STARTUP_LONG
    249   movge r0, #FACTOR_Q7          @ delta
    250   movlt r0, #FACTOR_Q7_STARTUP  @ delta
    251   b UPDATE_LOG_QUANTILE_ESTIMATE
    252 
    253 COMPUTE_DELTA_BIGGER_DENSITY:
    254   clz r2, r2
    255   rsb r0, r2, #31             @ 14 - factor
    256   mov r2, #FACTOR_Q16
    257   mov r0, r2, lsr r0          @ FACTOR_Q16 >> (14 - factor)
    258 
    259 UPDATE_LOG_QUANTILE_ESTIMATE:
    260   smulbb r12, r0, r11         @ delta * countDiv
    261   ldrsh r1, [r1]              @ lmagn[i]
    262   ubfx r12, r12, #14, #16     @ tmp16
    263   ldrsh r2, [r5]              @ inst->noiseEstLogQuantile[offset + i]
    264   cmp r1, r2
    265   bgt UPDATE_LOG_QUANTILE_ESTIMATE_BIGGER_LMAGN
    266 
        @ lmagn <= quantile: decrease the quantile, but not below logval.
    267   add r12, #1
    268   ldr r3, [sp]                @ logval
    269   mov r0, r12, lsr #1         @ tmp16no1
    270   mov r12, #3
    271   smulbb r12, r0, r12         @ tmp16no2
    272   sub r2, r2, r12, lsr #1
    273   cmp r3, r2
    274   ldrgt r2, [sp]              @ Clamp to logval.
    275   ldrgt r3, [sp]              @ Redundant: r3 already holds logval.
    276   b UPDATE_LOG_QUANTILE_ESTIMATE_STORE
    277 
    278 UPDATE_LOG_QUANTILE_ESTIMATE_BIGGER_LMAGN:
    279   add r3, r12, #2
    280   add r2, r2, r3, lsr #2      @ quantile += (tmp16 + 2) >> 2
    281 
    282 UPDATE_LOG_QUANTILE_ESTIMATE_STORE:
    283   vmov.s16 r0, d25[0]         @ countProd
    284   strh r2, [r5]
    285   add r5, #2                  @ increment &noiseEstLogQuantile[offset + i]
    286 
    287 UPDATE_DENSITY_ESTIMATE:
    288   subs r12, r1, r2
    289   rsblt r12, #0               @ |lmagn[i] - quantile|
    290   cmp r12, #WIDTH_Q8
    291   bge UPDATE_DENSITY_ESTIMATE_CHECK_COUNTER
    292 
    293   movw r3, #21845             @ width_factor
    294   ldrh r12, [r9]              @ inst->noiseEstDensity[offset + i]
    295   smulbb r2, r3, r11          @ width_factor * countDiv
    296   smulbb r1, r12, r0          @ density * countProd
    297   add r0, r2, #1 << 14        @ Rounding
    298   add r12, r1, #1 << 14
    299   mov r1, r12, lsr #15
    300   add r3, r1, r0, lsr #15
    301   strh r3, [r9]               @ inst->noiseEstDensity[offset + i]
    302 
    303 UPDATE_DENSITY_ESTIMATE_CHECK_COUNTER:
    304   add r9, #2                  @ update &noiseEstDensity[offset + i]
    305   ldrsh r3, [r10]             @ inst->noiseEstCounter[s]
    306   cmp r3, #END_STARTUP_LONG
    307   blt POST_UPDATE_DENSITY_ESTIMATE
    308 
        @ Counter reached END_STARTUP_LONG: reset it, and refresh the noise
        @ estimate only when blockIndex has reached END_STARTUP_LONG as well.
    309   movw r2, #offset_nsx_blockIndex
    310   mov r12, #0
    311   ldr r2, [r4, r2]
    312   strh r12, [r10]
    313   cmp r2, #END_STARTUP_LONG
    314   blt POST_UPDATE_DENSITY_ESTIMATE
    315 
    316   mov r0, r4
    317   mov r1, r7
    318   CALL_FUNCTION UpdateNoiseEstimateNeon
    319 
    320 POST_UPDATE_DENSITY_ESTIMATE:
    321   ldrh r3, [r10]
    322   add r3, #1
    323   strh r3, [r10], #2          @ noiseEstCounter[s]++, then advance to s + 1.
    324   subs r8, #1
    325   add r7, r6                  @ offset += inst->magnLen;
    326   bgt LOOP_SIMULT             @ add (no S) leaves the subs flags intact.
    327 
        @ During start-up (blockIndex < END_STARTUP_LONG) refresh the estimate
        @ from the last pass; r7 was already advanced, hence r7 - magnLen.
    328   movw r2, #offset_nsx_blockIndex
    329   ldr r2, [r4, r2]
    330   cmp r2, #END_STARTUP_LONG
    331   bge UPDATE_NOISE
    332 
    333   sub r1, r7, r6
    334   mov r0, r4
    335   CALL_FUNCTION UpdateNoiseEstimateNeon
    336 
    337 UPDATE_NOISE:
    338   movw r1, #offset_nsx_noiseEstQuantile
    339   add r1, r4
    340   ldr r2, [sp, #4]            @ noise
    341 
    342 @ Initial value of loop counter r6 = inst->magnLen.
    343 LOOP_UPDATE_NOISE:
    344   ldrsh r0, [r1], #2          @ inst->noiseEstQuantile[i], sign-extended.
    345   subs r6, #1
    346   str r0, [r2], #4            @ noise[i]
    347   bgt LOOP_UPDATE_NOISE
    348 
    349 UPDATE_Q_NOISE:
    350   movw r2, #offset_nsx_qNoise
    351   ldr r1, [sp, #8]            @ q_noise
    352   ldrh r2, [r4, r2]           @ Low halfword of inst->qNoise.
    353   strh r2, [r1]               @ *q_noise
    354 
    355   add sp, #(16 + (HALF_ANAL_BLOCKL + 3) / 4 * 8)
    356   vpop {d8-d15}
    357   pop {r4-r12, pc}
    358 
    359 @ static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset);
    360 @ Neon registers touched: q0-q3, q8-q13.
        @
        @ In:  r0 = inst, r1 = offset (element index into noiseEstLogQuantile[]).
        @ Effect: sets inst->qNoise from the maximum of
        @ noiseEstLogQuantile[offset .. offset + magnLen - 1], then fills
        @ inst->noiseEstQuantile[] from those log-quantiles: 8 entries per NEON
        @ pass while a counter started at magnLen - 1 stays positive, then one
        @ final entry in scalar code.
        @ Saves r4-r6 and r14; calls WebRtcSpl_MaxValueW16Neon, so the usual
        @ caller-saved registers are also clobbered.
    361 .align 2
    362 DEFINE_FUNCTION UpdateNoiseEstimateNeon
    363   push {r4, r5, r6, r14}
    364   mov r5, r0                  @ r5 = inst
    365 
    366   vmov.i32 q10, #21
    367   vmov.i32 q11, #0x1FFFFF     @ Fraction mask (low 21 bits).
    368   vmov.i32 q9, #0x200000      @ Implicit leading one (bit 21).
    369 
    370   movw r0, #offset_nsx_noiseEstLogQuantile
    371   movw r6, #offset_nsx_magnLen
    372   add r0, r5                  @ &inst->noiseEstLogQuantile
    373   add r4, r0, r1, lsl #1      @ &inst->noiseEstLogQuantile[offset]
        @ magnLen is read as a full word, as at every other site in this file.
    374   ldr r6, [r5, r6]            @ inst->magnLen
    375 
    376   mov r0, r4
    377   mov r1, r6
    378   CALL_FUNCTION WebRtcSpl_MaxValueW16Neon
    379 
    380   sub r12, r6, #1             @ Loop counter: inst->magnLen - 1.
    381 
    382   movw r6, #11819             @ kExp2Const in Q13
    383   movw r2, #offset_nsx_noiseEstQuantile
    384   vdup.16 d16, r6
    385   smulbb r3, r6, r0           @ kExp2Const * max value (r0 = call result)
    386   add r0, r3, #1 << 20        @ Round
    387   movw r1, #offset_nsx_qNoise
        @ NOTE(review): logical shift; matches an arithmetic shift only while the
        @ rounded product is non-negative -- confirm the maximum cannot be negative.
    388   mov r0, r0, lsr #21
    389   rsb r0, r0, #14             @ 14 - (round(kExp2Const * tmp16) >> 21)
    390   add r2, r5                  @ &inst->noiseEstQuantile
    391   vdup.32 q13, r0             @ q13 = qNoise in every lane.
    392   str r0, [r5, r1]            @ inst->qNoise
    393 
        @ Per lane: p = kExp2Const * logQuantile;
        @ out = sat16((0x200000 | (p & 0x1FFFFF)) shifted by (p >> 21) - 21 + qNoise),
        @ where a negative amount shifts right.
    394 LOOP_UPDATE:
    395   vld1.16 {d0, d1}, [r4]!     @ &inst->noiseEstLogQuantile[offset + i]
    396   vmull.s16 q1, d0, d16
    397   vmull.s16 q0, d1, d16
    398   vshr.s32 q3, q1, #21
    399   vshr.s32 q2, q0, #21
    400   vand q1, q1, q11
    401   vand q0, q0, q11
    402   vsub.i32 q3, q3, q10
    403   vsub.i32 q2, q2, q10
    404   vorr q1, q1, q9
    405   vorr q0, q0, q9
    406   vadd.i32 q3, q3, q13
    407   vadd.i32 q2, q2, q13
    408   vshl.s32 q1, q1, q3
    409   vshl.s32 q0, q0, q2
    410   vqmovn.s32 d1, q0
    411   vqmovn.s32 d0, q1
    412   subs r12, #8
    413   vst1.16 {d0, d1}, [r2]!
    414   bgt LOOP_UPDATE
    415 
        @ Scalar code for the final entry. r0 still holds qNoise and r6 kExp2Const.
    416 POST_LOOP_MAGNLEN:
    417   ldrh r1, [r4]
    418   smulbb r3, r6, r1           @ kExp2Const * ptr_noiseEstLogQuantile[offset + i]
    419   mov r12, #0x00200000
    420   bfi r12, r3, #0, #21        @ tmp32no1 = 0x00200000 | (tmp32no2 & 0x001FFFFF);
    421   rsb r0, #21                 @ 21 - inst->qNoise
        @ NOTE(review): only a right shift is implemented here, and the product is
        @ shifted logically; confirm the shift amount is never negative.
    422   sub r14, r0, r3, lsr #21    @ -tmp16
    423   mov r0, r12, lsr r14
    424   ssat r3, #16, r0
    425   strh r3, [r2]
    426 
    427   pop {r4, r5, r6, pc}
    428 
    429 @ void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf);
        @
        @ In:  r0 = inst, r1 = freq_buf.
        @ Step 1: scales inst->real[] and inst->imag[] in place by
        @   inst->noiseSupFilter[] ((value * filter) >> 14). LOOP_MAGNLEN handles 8
        @   entries per pass until the real[] pointer reaches
        @   &real[inst->magnLen - 1]; one scalar entry follows.
        @ Step 2: writes interleaved (real[k], -imag[k]) pairs to freq_buf[]: the
        @   k = 0 pair in scalar code, then 8 pairs per NEON pass
        @   (inst->anaLen2 / 8 - 1 counted passes plus one peeled final pass).
        @ Leaf function: saves and restores r4-r9, makes no calls.
    430 .align 2
    431 DEFINE_FUNCTION WebRtcNsx_PrepareSpectrumNeon
    432   push {r4-r9}
    433 
    434   movw r2, #offset_nsx_real
    435   movw r12, #offset_nsx_noiseSupFilter
    436   movw r4, #offset_nsx_imag
    437   movw r5, #offset_nsx_magnLen
    438 
    439   add r2, r0                  @ &inst->real[0]
    440   add r4, r0                  @ &inst->imag[0]
    441   mov r9, r4                  @ &inst->imag[0]
    442   mov r3, r2                  @ &inst->real[0]
    443   ldr r5, [r0, r5]            @ inst->magnLen
    444   add r6, r4, #2              @ &inst->imag[1]
    445   sub r5, #1
    446   add r12, r0                 @ &inst->noiseSupFilter[0]
    447   add r5, r2, r5, lsl #1      @ &inst->real[inst->magnLen - 1]
    448 
    449 LOOP_MAGNLEN:
    450   @ Filter the elements.
    451   vld1.16 {d20, d21}, [r2]    @ inst->real[]
    452   vld1.16 {d24, d25}, [r12]!  @ inst->noiseSupFilter[]
    453   vld1.16 {d22, d23}, [r4]    @ inst->imag[]
    454   vmull.s16 q0, d20, d24
    455   vmull.s16 q1, d21, d25
    456   vmull.s16 q2, d22, d24
    457   vmull.s16 q3, d23, d25
    458   vshrn.s32 d0, q0, #14
    459   vshrn.s32 d1, q1, #14
    460   vshrn.s32 d2, q2, #14
    461   vshrn.s32 d3, q3, #14
    462   vst1.16 {d0, d1}, [r2]!
    463   vst1.16 {d2, d3}, [r4]!
    464   cmp r2, r5
    465   bcc LOOP_MAGNLEN            @ Unsigned compare: these are addresses.
    466 
    467   @ Last two elements to filter: one real[] and one imag[] entry, both
        @ scaled by the same noiseSupFilter[] entry.
    468   ldrh r7, [r2]
    469   ldrh r8, [r12]
    470   ldrh r5, [r4]
    471   smulbb r7, r7, r8
    472   smulbb r5, r5, r8
        @ Only the low halfword is stored, so a logical shift gives the same
        @ stored bits as an arithmetic one.
    473   mov r7, r7, lsr #14
    474   mov r8, r5, lsr #14
    475   strh r7, [r2]
    476   strh r8, [r4]
    477 
    478   ldr r5, [r0, #offset_nsx_anaLen2]           @ inst->anaLen2
    480   lsr r5, #3                  @ inst->anaLen2 / 8
    481   sub r5, #1                  @ Loop counter.
    482 
    483 @ Process and write the first 2 samples into freq_buf[].
    484   ldrh r2, [r3], #2           @ inst->real[0]
    485   ldrh r0, [r9]               @ inst->imag[0]
    486   strh r2, [r1], #2           @ Store to freq_buf[0]
    487   rsb r0, r0, #0              @ -inst->imag[0]
    488   strh r0, [r1], #2           @ Store to freq_buf[1]. Now r1 -> &freq_buf[2]
    489 
    490 @ Each pass interleaves 8 (real, -imag) pairs: 16 halfwords, i.e. 32 bytes.
        @ The loop therefore writes (inst->anaLen2 * 4 - 32) bytes into freq_buf[].
    491 LOOP_ANALEN2:
    492   vld1.16 d5, [r6]!     @ inst->imag[], starting from inst->imag[1]
    493   vld1.16 d7, [r6]!
    494   vneg.s16 d5, d5
    495   vld1.16 d4, [r3]!     @ inst->real[], starting from inst->real[1]
    496   vneg.s16 d7, d7
    497   vld1.16 d6, [r3]!
    498   vzip.16 d4, d5        @ d4, d5 = real/-imag pairs 0..3 of this pass.
    499   vzip.16 d6, d7        @ d6, d7 = real/-imag pairs 4..7 of this pass.
    500   subs r5, #1
    501   vst1.16 {d4, d5, d6, d7}, [r1]!
    502   bgt LOOP_ANALEN2
    503 
    504 @ Final pass, peeled out of the loop: writes the last 32 bytes (8 pairs)
    505 @ into freq_buf[] without advancing r1.
        @ NOTE(review): an earlier comment here mentioned adjusting the pointers to
        @ overwrite 2 samples in the back half of the buffer; no such adjustment
        @ is performed by this code.
    506   vld1.16 d5, [r6]!     @ inst->imag[], continuing from the loop.
    507   vld1.16 d7, [r6]!
    508   vneg.s16 d5, d5
    509   vld1.16 d4, [r3]!     @ inst->real[], continuing from the loop.
    510   vneg.s16 d7, d7
    511   vld1.16 d6, [r3]!
    512   vzip.16 d4, d5
    513   vzip.16 d6, d7
    514   vst1.16 {d4, d5, d6, d7}, [r1]
    515 
    516   pop {r4-r9}
    517   bx r14
    518 
    519 @ void SynthesisUpdateNeon(NsxInst_t* inst,
    520 @                          int16_t* out_frame,
    521 @                          int16_t gain_factor);
        @
        @ In:  r0 = inst, r1 = out_frame, r2 = gain_factor.
        @ Steps:
        @  1. LOOP_SYNTHESIS: for 8 samples per pass over inst->anaLen samples,
        @     synthesisBuffer[i] = sat(synthesisBuffer[i] +
        @         sat(round(gain_factor * round(real[i] * window[i] >> 14) >> 13))).
        @  2. LOOP_BLOCKLEN10MS: copies synthesisBuffer[0 .. blockLen10ms) to
        @     out_frame[], 16 halfwords per pass.
        @  3. LOOP_MEMCPY: moves synthesisBuffer[blockLen10ms .. anaLen) down to the
        @     start of the buffer; LOOP_ZEROSARRAY zeroes what remains up to
        @     &synthesisBuffer[anaLen].
        @ All loops are do-while shaped and move whole 16- or 32-byte chunks.
        @ NOTE(review): this assumes anaLen and blockLen10ms are multiples of the
        @ chunk sizes -- confirm against the callers.
        @ Every loop bound is an address, so the compares use unsigned
        @ conditions (blo/bhs).
        @ Leaf function: saves and restores r4, r5.
    522 .align 2
    523 DEFINE_FUNCTION WebRtcNsx_SynthesisUpdateNeon
    524   push {r4, r5}
    525 
    526   vdup.16 d31, r2             @ d31 = gain_factor in every lane.
    527 
    528   movw r2, #offset_nsx_anaLen
    529   movw r4, #offset_nsx_real
    530   movw r12, #offset_nsx_synthesisBuffer
    531 
        @ anaLen is read as a full word, matching the ldr of the same field in
        @ WebRtcNsx_PrepareSpectrumNeon.
    532   ldr r5, [r0, r2]            @ inst->anaLen
    533   add r12, r0                 @ &inst->synthesisBuffer[0];
    534   ldr r3, [r0, #offset_nsx_window]            @ &inst->window[0]
    535   add r4, r0                  @ &inst->real[0]
    536   add r5, r12, r5, lsl #1     @ &inst->synthesisBuffer[inst->anaLen]
    537 
    538   mov r2, r12                 @ &inst->synthesisBuffer[0];
    539 
    540 LOOP_SYNTHESIS:
    541   vld1.16 {d0, d1}, [r4]!     @ inst->real[]
    542   vld1.16 {d2, d3}, [r3]!     @ inst->window[]
    543   vld1.16 {d4, d5}, [r2]      @ inst->synthesisBuffer[];
    544   vmull.s16 q3, d0, d2
    545   vmull.s16 q8, d1, d3
    546   vrshrn.i32 d0, q3, #14      @ round(real * window >> 14)
    547   vrshrn.i32 d1, q8, #14
    548   vmull.s16 q3, d31, d0
    549   vmull.s16 q8, d31, d1
    550   vqrshrn.s32 d0, q3, #13     @ sat(round(gain_factor * tmp >> 13))
    551   vqrshrn.s32 d1, q8, #13
    552   vqadd.s16 d4, d0            @ Saturating accumulate.
    553   vqadd.s16 d5, d1
    554   vst1.16 {d4, d5}, [r2]!
    555   cmp r2, r5
    556   blo LOOP_SYNTHESIS          @ Unsigned: r2 and r5 are addresses.
    557 
    558 POST_LOOP_SYNTHESIS:
    559   movw r3, #offset_nsx_blockLen10ms
    560   ldr r2, [r0, r3]
    561   mov r3, r12                 @ &inst->synthesisBuffer[0];
    562   add r0, r12, r2, lsl #1     @ &inst->synthesisBuffer[inst->blockLen10ms]
    563 
    564 LOOP_BLOCKLEN10MS:
    565   vld1.16 {q0, q1}, [r3]!     @ inst->synthesisBuffer[];
    566   cmp r3, r0
    567   vst1.16 {q0, q1}, [r1]!     @ out_frame[]
    568   blo LOOP_BLOCKLEN10MS       @ Flags from the cmp above; vst1 leaves them.
    569 
    570   cmp r0, r5
    571   bhs POST_LOOP_MEMCPY        @ Nothing to move when blockLen10ms >= anaLen.
    572 
    573 LOOP_MEMCPY:
    574   vld1.16 {q0, q1}, [r0]!     @ inst->synthesisBuffer[i + inst->blockLen10ms]
    575   cmp r0, r5
    576   vst1.16 {q0, q1}, [r12]!    @ inst->synthesisBuffer[i]
    577   blo LOOP_MEMCPY
    578 
    579 POST_LOOP_MEMCPY:
    580   cmp r12, r5
    581   vmov.i16 q10, #0
    582   vmov.i16 q11, #0
    583   bhs EXIT_SYNTHESISUPDATE    @ Flags from the cmp above; vmov leaves them.
    584 
    585 LOOP_ZEROSARRAY:
    586   vst1.16 {q10, q11}, [r12]!  @ Zero the tail, up to &synthesisBuffer[anaLen].
    587   cmp r12, r5
    588   blo LOOP_ZEROSARRAY
    589 
    590 EXIT_SYNTHESISUPDATE:
    591   pop {r4, r5}
    592   bx r14
    593 
    594 @ void AnalysisUpdateNeon(NsxInst_t* inst, int16_t* out, int16_t* new_speech);
        @
        @ In:  r0 = inst, r1 = out, r2 = new_speech.
        @ Steps:
        @  1. LOOP_MEMCPY_1: moves analysisBuffer[blockLen10ms .. anaLen) down to
        @     the start of inst->analysisBuffer[].
        @  2. LOOP_MEMCPY_2: copies new_speech[] into the remainder, up to
        @     &analysisBuffer[anaLen].
        @  3. LOOP_WINDOW_DATA: out[i] = round(analysisBuffer[i] * window[i] >> 14)
        @     for inst->anaLen samples, 8 per pass.
        @ The copy loops move 32 bytes per pass and the window loop 16 bytes.
        @ NOTE(review): this assumes the lengths are multiples of those chunk
        @ sizes -- confirm against the callers.
        @ Every loop bound is an address, so the compares use unsigned
        @ conditions (blo/bhs).
        @ Leaf function: saves and restores r4-r6.
    595 .align 2
    596 DEFINE_FUNCTION WebRtcNsx_AnalysisUpdateNeon
    597   push {r4-r6}
    598 
    599   movw r3, #offset_nsx_analysisBuffer
    600   movw r4, #offset_nsx_anaLen
    601   movw r12, #offset_nsx_blockLen10ms
    602   add r3, r0                  @ &inst->analysisBuffer[0]
        @ anaLen is read as a full word, matching the ldr of the same field in
        @ WebRtcNsx_PrepareSpectrumNeon.
    603   ldr r4, [r0, r4]            @ inst->anaLen
    604   ldr r12, [r0, r12]          @ inst->blockLen10ms
    605   sub r6, r4, r12
    606   add r6, r3, r6, lsl #1      @ &inst->analysisBuffer[inst->anaLen
    607                               @     - inst->blockLen10ms]
    608   cmp r3, r6
    609   mov r5, r3                  @ r5 = write cursor into analysisBuffer[].
    610   bhs POST_LOOP_MEMCPY_1      @ Unsigned: addresses. mov leaves the flags.
    611 
    612   add r12, r3, r12, lsl #1    @ &inst->analysisBuffer[inst->blockLen10ms]
    613 
    614 LOOP_MEMCPY_1:
    615   vld1.16 {q10, q11}, [r12]!  @ inst->analysisBuffer[i + inst->blockLen10ms]
    616   vst1.16 {q10, q11}, [r5]!   @ inst->analysisBuffer[i]
    617   cmp r5, r6
    618   blo LOOP_MEMCPY_1
    619 
    620 POST_LOOP_MEMCPY_1:
    621   add r12, r3, r4, lsl #1     @ &inst->analysisBuffer[inst->anaLen]
    622   cmp r5, r12
    623   bhs POST_LOOP_MEMCPY_2
    624 
    625 LOOP_MEMCPY_2:
    626   vld1.16 {q10, q11}, [r2]!   @ new_speech[i]
    627   vst1.16 {q10, q11}, [r5]!   @ inst->analysisBuffer[
    628                               @     i + inst->anaLen - inst->blockLen10ms]
    629   cmp r5, r12
    630   blo LOOP_MEMCPY_2
    631 
    632 POST_LOOP_MEMCPY_2:
    633   add r4, r1, r4, lsl #1      @ &out[inst->anaLen]
    634   cmp r1, r4
    635   ldr r2, [r0, #offset_nsx_window]            @ &inst->window[0]
    636   bhs POST_LOOP_WINDOW_DATA   @ Flags from the cmp above; ldr leaves them.
    637 
    638 LOOP_WINDOW_DATA:
    639   vld1.16 {d4, d5}, [r3]!     @ inst->analysisBuffer[]
    640   vld1.16 {d6, d7}, [r2]!     @ inst->window[]
    641   vmull.s16 q0, d4, d6
    642   vmull.s16 q1, d5, d7
    643   vrshrn.i32 d4, q0, #14      @ round(analysisBuffer * window >> 14)
    644   vrshrn.i32 d5, q1, #14
    645   vst1.16 {d4, d5}, [r1]!     @ out[]
    646   cmp r1, r4
    647   blo LOOP_WINDOW_DATA
    648 
    649 POST_LOOP_WINDOW_DATA:
    650   pop {r4-r6}
    651   bx r14
    652