Home | History | Annotate | Download | only in signal_processing
      1 @
      2 @ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3 @
      4 @ Use of this source code is governed by a BSD-style license
      5 @ that can be found in the LICENSE file in the root of the source
      6 @ tree. An additional intellectual property rights grant can be found
      7 @ in the file PATENTS.  All contributing project authors may
      8 @ be found in the AUTHORS file in the root of the source tree.
      9 @
     10 
     11 @ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for
     12 @ the ARMv7 platform. The description header can be found in
     13 @ signal_processing_library.h
     14 @
     15 @ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and
     16 @ the reference C code at the end of this file.
     17 
     18 @ Assumptions:
     19 @ (1) data_length > 0
     20 @ (2) coefficients_length > 1
     21 
     22 @ Register usage:
     23 @
     24 @ r0:  &data_in[i]
     25 @ r1:  &data_out[i], for result output
     26 @ r2:  &coefficients[0]
     27 @ r3:  coefficients_length
     28 @ r4:  Iteration counter for the outer loop.
     29 @ r5:  data_out[j] as multiplication inputs
     30 @ r6:  Calculated value for output data_out[]; iteration counter for inner loop
     31 @ r7:  Partial sum of the filtering multiplication results (sum1)
     32 @ r8:  Partial sum of the filtering multiplication results (sum2)
     33 @ r9:  &data_out[], for filtering input; data_in[i]
     34 @ r10: coefficients[j]
     35 @ r11: Scratch
     36 @ r12: &coefficients[j]
     37 
     38 #include "webrtc/system_wrappers/include/asm_defines.h"
     39 
     40 GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
     41 .align  2
     42 DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
     43   push {r4-r11}                @ Save r4-r11 (8 words = 32 bytes); restored at END.
     44 
     45   ldr r12, [sp, #32]           @ data_length: size_t stack argument, so load the full word
     46   subs r4, r12, #1             @ r4 = data_length - 1 (outer loop counter)
     47   beq ODD_LENGTH               @ jump if data_length == 1
     48 
     49 LOOP_LENGTH:
     50   add r12, r2, r3, lsl #1
     51   sub r12, #4                  @ &coefficients[coefficients_length - 2]
     52   sub r9, r1, r3, lsl #1
     53   add r9, #2                   @ &data_out[i - coefficients_length + 1]
     54   ldr r5, [r9], #4             @ data_out[i - coefficients_length + {1,2}]
     55 
     56   mov r7, #0                   @ sum1
     57   mov r8, #0                   @ sum2
     58   subs r6, r3, #3              @ Iteration counter for inner loop.
     59   beq ODD_A_LENGTH             @ branch if coefficients_length == 3
     60   blt POST_LOOP_A_LENGTH       @ branch if coefficients_length == 2
     61 
     62 LOOP_A_LENGTH:
     63   ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
     64   subs r6, #2                  @ Sets the flags tested by bgt/blt at the loop end.
     65   smlatt r8, r10, r5, r8       @ sum2 += coefficients[j] * data_out[i - j + 1];
     66   smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
     67   smlabt r7, r10, r5, r7       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
     68   ldr r5, [r9], #4             @ data_out[i - j + 2],  data_out[i - j + 3]
     69   smlabb r8, r10, r5, r8       @ sum2 += coefficients[j - 1] * data_out[i - j + 2];
     70   bgt LOOP_A_LENGTH            @ More coefficient pairs left.
     71   blt POST_LOOP_A_LENGTH       @ Even coefficients_length: no coefficients[2] step.
     72 
     73 ODD_A_LENGTH:
     74   ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[2]
     75   sub r12, #2                  @ &coefficients[0]
     76   smlabb r7, r10, r5, r7       @ sum1 += coefficients[2] * data_out[i - 2];
     77   smlabt r8, r10, r5, r8       @ sum2 += coefficients[2] * data_out[i - 1];
     78   ldr r5, [r9, #-2]            @ data_out[i - 1],  data_out[i]
     79 
     80 POST_LOOP_A_LENGTH:
     81   ldr r10, [r12]               @ coefficients[0], coefficients[1]
     82   smlatb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];
     83 
     84   ldr r9, [r0], #4             @ data_in[i], data_in[i + 1]
     85   smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
     86   sub r6, r7                   @ output1 -= sum1;
     87 
     88   sbfx r11, r6, #12, #16       @ r11 = bits [27:12] of output1, sign-extended
     89   ssat r7, #16, r6, asr #12    @ r7 = (output1 >> 12) saturated to 16 bits
     90   cmp r7, r11                  @ Equal when (output1 >> 12) already fits in 16 bits;
     91   addeq r6, r6, #2048          @ in that case add the rounding offset 2048.
     92   ssat r6, #16, r6, asr #12    @ r6 = (output1 >> 12) saturated to 16 bits
     93   strh r6, [r1], #2            @ Store data_out[i]
     94 
     95   smlatb r8, r10, r6, r8       @ sum2 += coefficients[1] * data_out[i];
     96   smulbt r6, r10, r9           @ output2 = coefficients[0] * data_in[i + 1];
     97   sub r6, r8                   @ output2 -= sum2;
     98 
     99   sbfx r11, r6, #12, #16       @ r11 = bits [27:12] of output2, sign-extended
    100   ssat r7, #16, r6, asr #12    @ r7 = (output2 >> 12) saturated to 16 bits
    101   cmp r7, r11                  @ Equal when (output2 >> 12) already fits in 16 bits;
    102   addeq r6, r6, #2048          @ in that case add the rounding offset 2048.
    103   ssat r6, #16, r6, asr #12    @ r6 = (output2 >> 12) saturated to 16 bits
    104   strh r6, [r1], #2            @ Store data_out[i + 1]
    105 
    106   subs r4, #2                  @ Two output samples were produced this iteration.
    107   bgt LOOP_LENGTH
    108   blt END                      @ For even data_length, it's done. Jump to END.
    109 
    110 @ Process i = data_length -1, for the case of an odd length.
    111 ODD_LENGTH:
    112   add r12, r2, r3, lsl #1
    113   sub r12, #4                  @ &coefficients[coefficients_length - 2]
    114   sub r9, r1, r3, lsl #1
    115   add r9, #2                   @ &data_out[i - coefficients_length + 1]
    116   mov r7, #0                   @ sum1, first partial sum
    117   mov r8, #0                   @ sum1, second partial sum
    118   subs r6, r3, #2              @ inner loop counter
    119   beq EVEN_A_LENGTH            @ branch if coefficients_length == 2
    120 
    121 LOOP2_A_LENGTH:
    122   ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
    123   ldr r5, [r9], #4             @ data_out[i - j],  data_out[i - j + 1]
    124   subs r6, #2
    125   smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
    126   smlabt r8, r10, r5, r8       @ sum1 (in r8) += coefficients[j - 1] * data_out[i - j + 1];
    127   bgt LOOP2_A_LENGTH
    128   addlt r12, #2                @ Odd coefficients_length: step back to &coefficients[0]
    129   blt POST_LOOP2_A_LENGTH
    130 
    131 EVEN_A_LENGTH:
    132   ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[1]
    133   ldrsh r5, [r9]               @ data_out[i - 1]
    134   smlabb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];
    135 
    136 POST_LOOP2_A_LENGTH:
    137   ldrsh r10, [r12]             @ Filter coefficients coefficients[0]
    138   ldrsh r9, [r0]               @ data_in[i]
    139   smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
    140   sub r6, r7                   @ output1 -= sum1 (first partial sum);
    141   sub r6, r8                   @ output1 -= sum1 (second partial sum);
    142   sbfx r8, r6, #12, #16        @ r8 = bits [27:12] of output1, sign-extended
    143   ssat r7, #16, r6, asr #12    @ r7 = (output1 >> 12) saturated to 16 bits
    144   cmp r7, r8                   @ Equal when (output1 >> 12) already fits in 16 bits;
    145   addeq r6, r6, #2048          @ in that case add the rounding offset 2048.
    146   ssat r6, #16, r6, asr #12    @ r6 = (output1 >> 12) saturated to 16 bits
    147   strh r6, [r1]                @ Store the data_out[i]
    148 
    149 END:
    150   pop {r4-r11}
    151   bx  lr
    152 
    153 @Reference C code:
    154 @
    155 @void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
    156 @                               int16_t* data_out,
    157 @                               int16_t* __restrict coefficients,
    158 @                               size_t coefficients_length,
    159 @                               size_t data_length) {
    160 @  size_t i = 0;
    161 @  size_t j = 0;
    162 @
    163 @  assert(data_length > 0);
    164 @  assert(coefficients_length > 1);
    165 @
    166 @  for (i = 0; i < data_length - 1; i += 2) {
    167 @    int32_t output1 = 0;
    168 @    int32_t sum1 = 0;
    169 @    int32_t output2 = 0;
    170 @    int32_t sum2 = 0;
    171 @
    172 @    for (j = coefficients_length - 1; j > 2; j -= 2) {
    173 @      sum1 += coefficients[j]      * data_out[i - j];
    174 @      sum1 += coefficients[j - 1]  * data_out[i - j + 1];
    175 @      sum2 += coefficients[j]     * data_out[i - j + 1];
    176 @      sum2 += coefficients[j - 1] * data_out[i - j + 2];
    177 @    }
    178 @
    179 @    if (j == 2) {
    180 @      sum1 += coefficients[2] * data_out[i - 2];
    181 @      sum2 += coefficients[2] * data_out[i - 1];
    182 @    }
    183 @
    184 @    sum1 += coefficients[1] * data_out[i - 1];
    185 @    output1 = coefficients[0] * data_in[i];
    186 @    output1 -= sum1;
    187 @    // Saturate and store the output.
    188 @    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
    189 @    data_out[i] = (int16_t)((output1 + 2048) >> 12);
    190 @
    191 @    sum2 += coefficients[1] * data_out[i];
    192 @    output2 = coefficients[0] * data_in[i + 1];
    193 @    output2 -= sum2;
    194 @    // Saturate and store the output.
    195 @    output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728);
    196 @    data_out[i + 1] = (int16_t)((output2 + 2048) >> 12);
    197 @  }
    198 @
    199 @  if (i == data_length - 1) {
    200 @    int32_t output1 = 0;
    201 @    int32_t sum1 = 0;
    202 @
    203 @    for (j = coefficients_length - 1; j > 1; j -= 2) {
    204 @      sum1 += coefficients[j]      * data_out[i - j];
    205 @      sum1 += coefficients[j - 1]  * data_out[i - j + 1];
    206 @    }
    207 @
    208 @    if (j == 1) {
    209 @      sum1 += coefficients[1] * data_out[i - 1];
    210 @    }
    211 @
    212 @    output1 = coefficients[0] * data_in[i];
    213 @    output1 -= sum1;
    214 @    // Saturate and store the output.
    215 @    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
    216 @    data_out[i] = (int16_t)((output1 + 2048) >> 12);
    217 @  }
    218 @}
    219