Home | History | Annotate | Download | only in source
      1 @
      2 @ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
      3 @
      4 @ Use of this source code is governed by a BSD-style license
      5 @ that can be found in the LICENSE file in the root of the source
      6 @ tree. An additional intellectual property rights grant can be found
      7 @ in the file PATENTS.  All contributing project authors may
      8 @ be found in the AUTHORS file in the root of the source tree.
      9 @
     10 
     11 @ lattice_neon.s
     12 @
     13 @ Contains a function for the core loop in the normalized lattice MA
     14 @ filter routine for iSAC codec, optimized for ARM Neon platform.
     15 @ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
     16 @                                     int16_t input1,
     17 @                                     int32_t input2,
     18 @                                     int32_t* ptr0,
     19 @                                     int32_t* ptr1,
     20 @                                     int32_t* __restrict ptr2);
     21 @ It calculates
     22 @   *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
     23 @   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
     24 @ in Q15 domain.
     25 @
     26 @ Reference code in lattice.c.
     27 @ Output is not bit-exact with the reference C code, due to the replacement
     28 @ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
     29 @ instructions, smulwb, and smull. No degradation in speech quality was
     30 @ observed when testing with speech and tone vectors.
     31 
     32 .arch armv7-a
     33 .fpu neon
     34 
     35 #include "settings.h"
     36 
     37 .global WebRtcIsacfix_FilterMaLoopNeon
     38 
     39 .align  2
     40 
     41 WebRtcIsacfix_FilterMaLoopNeon:
     42 .fnstart
     43 
     44 .save {r4-r8}
     45   push        {r4-r8}
     46 
     47   vdup.32     d28, r0             @ Initialize Neon register with input0
     48   vdup.32     d29, r1             @ Initialize Neon register with input1
     49   vdup.32     d30, r2             @ Initialize Neon register with input2
     50   ldr         r4, [sp, #20]       @ ptr1
     51   ldr         r12, [sp, #24]      @ ptr2
     52 
     53   @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2
     54   @ Leftover samples after the loop, in r6:
     55   @    r6 = (HALF_SUBFRAMELEN - 1) - (HALF_SUBFRAMELEN - 1) >> 2 << 2
     56   mov         r6, #HALF_SUBFRAMELEN
     57   sub         r6, #1
     58   lsr         r5, r6, #2
     59   sub         r6, r5, lsl #2
     60 
     61   @ First r5 iterations in a loop.
     62 
     63 LOOP:
     64   vld1.32     {d0, d1}, [r3]!     @ *ptr0
     65 
     66   vmull.s32   q10, d0, d28        @ tmp32a = input0 * (*ptr0)
     67   vmull.s32   q11, d1, d28        @ tmp32a = input0 * (*ptr0)
     68   vmull.s32   q12, d0, d29        @ input1 * (*ptr0)
     69   vmull.s32   q13, d1, d29        @ input1 * (*ptr0)
     70 
     71   vrshrn.i64  d4, q10, #15
     72   vrshrn.i64  d5, q11, #15
     73 
     74   vld1.32     {d2, d3}, [r12]     @ *ptr2
     75   vadd.i32    q3, q2, q1          @ tmp32b = *ptr2 + tmp32a
     76 
     77   vrshrn.i64  d0, q12, #15
     78 
     79   vmull.s32   q10, d6, d30        @ input2 * (*ptr2 + tmp32b)
     80   vmull.s32   q11, d7, d30        @ input2 * (*ptr2 + tmp32b)
     81 
     82   vrshrn.i64  d16, q10, #16
     83   vrshrn.i64  d17, q11, #16
     84 
     85   vmull.s32   q10, d16, d28       @ input0 * (*ptr2)
     86   vmull.s32   q11, d17, d28       @ input0 * (*ptr2)
     87 
     88   vrshrn.i64  d1, q13, #15
     89   vrshrn.i64  d18, q10, #15
     90   vrshrn.i64  d19, q11, #15
     91 
     92   vst1.32     {d16, d17}, [r12]!  @ *ptr2
     93 
     94   vadd.i32    q9, q0, q9
     95   subs        r5, #1
     96   vst1.32     {d18, d19}, [r4]!   @ *ptr1
     97 
     98   bgt         LOOP
     99 
    100   @ Check how many samples still need to be processed.
    101   subs        r6, #2
    102   blt         LAST_SAMPLE
    103 
    104   @ Process two more samples:
    105   vld1.32     d0, [r3]!           @ *ptr0
    106 
    107   vmull.s32   q11, d0, d28        @ tmp32a = input0 * (*ptr0)
    108   vmull.s32   q13, d0, d29        @ input1 * (*ptr0)
    109 
    110   vld1.32     d18, [r12]          @ *ptr2
    111   vrshrn.i64  d4, q11, #15
    112 
    113   vadd.i32    d7, d4, d18         @ tmp32b = *ptr2 + tmp32a
    114   vmull.s32   q11, d7, d30        @ input2 * (*ptr2 + tmp32b)
    115   vrshrn.i64  d16, q11, #16
    116 
    117   vmull.s32   q11, d16, d28       @ input0 * (*ptr2)
    118   vst1.32     d16, [r12]!         @ *ptr2
    119 
    120   vrshrn.i64  d0, q13, #15
    121   vrshrn.i64  d19, q11, #15
    122   vadd.i32    d19, d0, d19
    123 
    124   vst1.32     d19, [r4]!          @ *ptr1
    125 
    126   @ If there's still one more sample, process it here.
    127 LAST_SAMPLE:
    128   cmp         r6, #1
    129   bne         END
    130 
    131   @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
    132 
    133   ldr         r7, [r3]            @ *ptr0
    134   ldr         r8, [r12]           @ *ptr2
    135 
    136   smulwb      r5, r7, r0          @ tmp32a = *ptr0 * input0 >> 16
    137   add         r8, r8, r5, lsl #1  @ tmp32b = *ptr2 + (tmp32a << 1)
    138   smull       r5, r6, r8, r2      @ tmp32b * input2, in 64 bits
    139   lsl         r6, #16
    140   add         r6, r5, lsr #16     @ Only take the middle 32 bits
    141   str         r6, [r12]           @ Output (*ptr2, as 32 bits)
    142 
    143   @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
    144 
    145   smulwb      r5, r7, r1          @ tmp32a = *ptr0 * input1 >> 16
    146   smulwb      r6, r6, r0          @ tmp32b = *ptr2 * input0 >> 16
    147   lsl         r5, r5, #1
    148   add         r5, r6, lsl #1
    149   str         r5, [r4]            @ Output (*ptr1)
    150 
    151 END:
    152   pop         {r4-r8}
    153   bx          lr
    154 
    155 .fnend
    156