Home | History | Annotate | Download | only in source
      1 @
      2 @ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
      3 @
      4 @ Use of this source code is governed by a BSD-style license
      5 @ that can be found in the LICENSE file in the root of the source
      6 @ tree. An additional intellectual property rights grant can be found
      7 @ in the file PATENTS.  All contributing project authors may
      8 @ be found in the AUTHORS file in the root of the source tree.
      9 @
     10 
     11 @ lattice_neon.s
     12 @
     13 @ Contains a function for the core loop in the normalized lattice MA
     14 @ filter routine for iSAC codec, optimized for ARM Neon platform.
     15 @ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
     16 @                                     int16_t input1,
     17 @                                     int32_t input2,
     18 @                                     int32_t* ptr0,
     19 @                                     int32_t* ptr1,
     20 @                                     int32_t* __restrict ptr2);
     21 @ It calculates
     22 @   *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
     23 @   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
     24 @ in Q15 domain.
     25 @
     26 @ Reference code in lattice.c.
     27 @ Output is not bit-exact with the reference C code, due to the replacement
     28 @ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
     29 @ instructions, smulwb, and smull. Tests with speech and tone vectors showed
     30 @ no degradation in speech quality.
     31 
     32 #include "webrtc/system_wrappers/interface/asm_defines.h"
     33 #include "settings.h"
     34 
     35 GLOBAL_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
     36 .align  2
     37 DEFINE_FUNCTION WebRtcIsacfix_FilterMaLoopNeon
     38   push        {r4-r8}
     39 
     40   vdup.32     d28, r0             @ Initialize Neon register with input0
     41   vdup.32     d29, r1             @ Initialize Neon register with input1
     42   vdup.32     d30, r2             @ Initialize Neon register with input2
     43   ldr         r4, [sp, #20]       @ ptr1
     44   ldr         r12, [sp, #24]      @ ptr2
     45 
     46   @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2
     47   @ Leftover samples after the loop, in r6:
     48   @    r6 = (HALF_SUBFRAMELEN - 1) - (HALF_SUBFRAMELEN - 1) >> 2 << 2
     49   mov         r6, #HALF_SUBFRAMELEN
     50   sub         r6, #1
     51   lsr         r5, r6, #2
     52   sub         r6, r5, lsl #2
     53 
     54   @ First r5 iterations in a loop.
     55 
     56 LOOP:
     57   vld1.32     {d0, d1}, [r3]!     @ *ptr0
     58 
     59   vmull.s32   q10, d0, d28        @ tmp32a = input0 * (*ptr0)
     60   vmull.s32   q11, d1, d28        @ tmp32a = input0 * (*ptr0)
     61   vmull.s32   q12, d0, d29        @ input1 * (*ptr0)
     62   vmull.s32   q13, d1, d29        @ input1 * (*ptr0)
     63 
     64   vrshrn.i64  d4, q10, #15
     65   vrshrn.i64  d5, q11, #15
     66 
     67   vld1.32     {d2, d3}, [r12]     @ *ptr2
     68   vadd.i32    q3, q2, q1          @ tmp32b = *ptr2 + tmp32a
     69 
     70   vrshrn.i64  d0, q12, #15
     71 
     72   vmull.s32   q10, d6, d30        @ input2 * (*ptr2 + tmp32b)
     73   vmull.s32   q11, d7, d30        @ input2 * (*ptr2 + tmp32b)
     74 
     75   vrshrn.i64  d16, q10, #16
     76   vrshrn.i64  d17, q11, #16
     77 
     78   vmull.s32   q10, d16, d28       @ input0 * (*ptr2)
     79   vmull.s32   q11, d17, d28       @ input0 * (*ptr2)
     80 
     81   vrshrn.i64  d1, q13, #15
     82   vrshrn.i64  d18, q10, #15
     83   vrshrn.i64  d19, q11, #15
     84 
     85   vst1.32     {d16, d17}, [r12]!  @ *ptr2
     86 
     87   vadd.i32    q9, q0, q9
     88   subs        r5, #1
     89   vst1.32     {d18, d19}, [r4]!   @ *ptr1
     90 
     91   bgt         LOOP
     92 
     93   @ Check how many samples still need to be processed.
     94   subs        r6, #2
     95   blt         LAST_SAMPLE
     96 
     97   @ Process two more samples:
     98   vld1.32     d0, [r3]!           @ *ptr0
     99 
    100   vmull.s32   q11, d0, d28        @ tmp32a = input0 * (*ptr0)
    101   vmull.s32   q13, d0, d29        @ input1 * (*ptr0)
    102 
    103   vld1.32     d18, [r12]          @ *ptr2
    104   vrshrn.i64  d4, q11, #15
    105 
    106   vadd.i32    d7, d4, d18         @ tmp32b = *ptr2 + tmp32a
    107   vmull.s32   q11, d7, d30        @ input2 * (*ptr2 + tmp32b)
    108   vrshrn.i64  d16, q11, #16
    109 
    110   vmull.s32   q11, d16, d28       @ input0 * (*ptr2)
    111   vst1.32     d16, [r12]!         @ *ptr2
    112 
    113   vrshrn.i64  d0, q13, #15
    114   vrshrn.i64  d19, q11, #15
    115   vadd.i32    d19, d0, d19
    116 
    117   vst1.32     d19, [r4]!          @ *ptr1
    118 
    119   @ If there's still one more sample, process it here.
    120 LAST_SAMPLE:
    121   cmp         r6, #1
    122   bne         END
    123 
    124   @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
    125 
    126   ldr         r7, [r3]            @ *ptr0
    127   ldr         r8, [r12]           @ *ptr2
    128 
    129   smulwb      r5, r7, r0          @ tmp32a = *ptr0 * input0 >> 16
    130   add         r8, r8, r5, lsl #1  @ tmp32b = *ptr2 + (tmp32a << 1)
    131   smull       r5, r6, r8, r2      @ tmp32b * input2, in 64 bits
    132   lsl         r6, #16
    133   add         r6, r5, lsr #16     @ Only take the middle 32 bits
    134   str         r6, [r12]           @ Output (*ptr2, as 32 bits)
    135 
    136   @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
    137 
    138   smulwb      r5, r7, r1          @ tmp32a = *ptr0 * input1 >> 16
    139   smulwb      r6, r6, r0          @ tmp32b = *ptr2 * input0 >> 16
    140   lsl         r5, r5, #1
    141   add         r5, r6, lsl #1
    142   str         r5, [r4]            @ Output (*ptr1)
    143 
    144 END:
    145   pop         {r4-r8}
    146   bx          lr
    147