Home | History | Annotate | Download | only in signal_processing
      1 @
      2 @ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3 @
      4 @ Use of this source code is governed by a BSD-style license
      5 @ that can be found in the LICENSE file in the root of the source
      6 @ tree. An additional intellectual property rights grant can be found
      7 @ in the file PATENTS.  All contributing project authors may
      8 @ be found in the AUTHORS file in the root of the source tree.
      9 @
     10 
     11 @ This file contains the function WebRtcSpl_DownsampleFastNeon(), optimized for
     12 @ the ARM NEON platform. The function's description header can be found in
     13 @ signal_processing_library.h.
     14 @
     15 @ The reference C code is in downsample_fast.c; this code is bit-exact with it.
     16 
     17 #include "webrtc/system_wrappers/interface/asm_defines.h"
     18 
     19 GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
     20 .align  2
        @
        @ WebRtcSpl_DownsampleFastNeon: for i = delay, delay + factor, ... < endpos
        @ it stores sat16((2048 + sum_j coefficients[j] * data_in[i - j]) >> 12),
        @ j = 0 .. coefficients_length - 1, to consecutive data_out entries.
        @
        @ Register and stack usage, as read from the code below:
        @   r0   in: data_in. Reused for coefficients_length once &data_in[delay]
        @        is in r11. out: 0 on success, -1 if data_out_length <= 0,
        @        coefficients_length <= 0 or data_in_length < endpos.
        @   r1   in: data_in_length. Reused as data pointer (NEON loops) and as
        @        accumulator (scalar loop).
        @   r2   in: data_out. Post-incremented write pointer.
        @   r3   in: data_out_length. Then the loop bound, derived from
        @        endpos = delay + factor * (data_out_length - 1) + 1.
        @   [sp, #32], [sp, #36], [sp, #40], [sp, #44] (after the 32-byte push):
        @        &coefficients[0], coefficients_length, factor, delay.
        @   r5   factor (temporarily factor * 2 in the general unrolled loop).
        @   r8   &coefficients[coefficients_length].
        @   r9   i, the index of the input sample currently being filtered.
        @   r11  &data_in[i].
        @   r4, r6, r7, r10, r12: loop-local steps, pointers and counters.
        @
        @ Every inner loop starts at j = coefficients_length - 1 and counts down
        @ to 0, walking the coefficient pointer down and the data pointer up.
        @ Accumulators start at 2048 (1 << 11) so that the final shift right by
        @ 12 rounds instead of truncating.
        @
     21 DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
     22   push {r4-r11}                             @ 8 registers = 32 bytes.
     23 
     24   cmp r3, #0                                @ data_out_length <= 0?
     25   movle r0, #-1                             @ Return value -1.
     26   ble END
     27 
        @ NOTE(review): ldrsh and smulbb below use only the low 16 bits of delay,
        @ factor and data_out_length - 1 -- confirm callers stay in that range.
     28   ldrsh r12, [sp, #44]                      @ r12: delay
     29   ldr r5, [sp, #40]                         @ r5: factor
     30   add r4, r12, #1                           @ r4: delay + 1
     31   sub r3, r3, #1                            @ r3: data_out_length - 1
     32   smulbb r3, r5, r3                         @ factor * (data_out_length - 1)
     33   ldr r8, [sp, #32]                         @ &coefficients[0]
     34   mov r9, r12                               @ r9: i = delay, outer-loop index.
     35   add r3, r4                                @ delay + factor * (out_length-1) + 1
     36 
     37   cmp r3, r1                                @ data_in_length < endpos?
     38   movgt r0, #-1
     39   bgt END
     40 
     41   @ Set up the loop bound and the pointers for the unrolled loops.
     42   sub r3, r5, asl #3                        @ endpos - factor * 8
     43   add r11, r0, r12, asl #1                  @ &data_in[delay]
     44   ldr r0, [sp, #36]                         @ r0: coefficients_length
     45   add r3, r5                                @ endpos - factor * 7
     46 
     47   cmp r0, #0                                @ coefficients_length <= 0 ?
     48   movle r0, #-1
     49   ble END
     50 
     51   add r8, r0, asl #1                        @ &coefficients[coefficients_length]
     52   cmp r9, r3                                @ i >= endpos - factor * 7 ?
     53   bge POST_LOOP_ENDPOS                      @ Branch if < 8 outputs remain.
     54 
     55 @
     56 @ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others)
     57 @
     58   mov r4, #-2                               @ r4: coefficient step, -2 bytes.
     59 
     60   @ Dispatch to the loop variant that matches factor.
     61 
     62   @ r10 is the post-index of the second vld2 in the loop. The first vld2 has
     63   @ advanced the pointer by 16 bytes, so -14 leaves it 2 bytes, i.e. one
     64   @ sample, past the position where the inner iteration started.
     65   cmp r5, #2
     66   moveq r10, #-14
     67   beq LOOP_ENDPOS_FACTOR2                   @ Branch when factor == 2
     68 
     69   @ Similar here: the first vld4 advances the pointer by 32 bytes, so -30
     70   @ leaves it one sample past the position where the inner iteration started.
     71   cmp r5, #4
     72   moveq r10, #-30
     73   beq LOOP_ENDPOS_FACTOR4                   @ Branch when factor == 4
     74 
     75   @ For r10, we need to move the pointer back to original after advancing
     76   @ (factor * 7 * 2) bytes, then move 2 bytes forward to increment one sample.
     77   mov r10, r5, asl #4                       @ r10 = factor * 16
     78   rsb r10, #2                               @ r10 = 2 - factor * 16
     79   add r10, r5, asl #1                       @ r10 = 2 - factor * 14
     80   lsl r5, #1                                @ r5 = factor * sizeof(data_in)
     81 
     82 @ The general case (factor != 2 && factor != 4)
     83 LOOP_ENDPOS_GENERAL:
     84   @ Per-iteration setup: accumulators, coefficient and data pointers.
     85   vmov.i32 q2, #2048                        @ q2: sums for outputs 0-3,
     86   vmov.i32 q3, #2048                        @ q3: for outputs 4-7.
     87   sub r7, r8, #2                            @ r7: &coefficients[length - 1]
     88   sub r12, r0, #1                           @ r12: j = coefficients_length - 1
     89   sub r1, r11, r12, asl #1                  @ &data_in[i - j]
     90 
     91 LOOP_COEFF_LENGTH_GENERAL:
     92   vld1.16 {d2[], d3[]}, [r7], r4            @ coefficients[j]
     93   vld1.16 d0[0], [r1], r5                   @ data_in[i - j]
     94   vld1.16 d0[1], [r1], r5                   @ data_in[i + factor - j]
     95   vld1.16 d0[2], [r1], r5                   @ data_in[i + factor * 2 - j]
     96   vld1.16 d0[3], [r1], r5                   @ data_in[i + factor * 3 - j]
     97   vld1.16 d1[0], [r1], r5                   @ data_in[i + factor * 4 - j]
     98   vld1.16 d1[1], [r1], r5                   @ data_in[i + factor * 5 - j]
     99   vld1.16 d1[2], [r1], r5                   @ data_in[i + factor * 6 - j]
    100   vld1.16 d1[3], [r1], r10                  @ data_in[i + factor * 7 - j]
    101   subs r12, #1                              @ j--, sets flags for the bge.
    102   vmlal.s16 q2, d0, d2                      @ Outputs 0-3 += data * coeff.
    103   vmlal.s16 q3, d1, d3                      @ Outputs 4-7 += data * coeff.
    104   bge LOOP_COEFF_LENGTH_GENERAL             @ Repeat while j >= 0.
    105 
    106   @ Shift, saturate, and store the result.
    107   vqshrn.s32 d0, q2, #12
    108   vqshrn.s32 d1, q3, #12
    109   vst1.16 {d0, d1}, [r2]!
    110 
    111   add r11, r5, asl #3                       @ r11 -> &data_in[i + factor * 8]
    112   add r9, r5, asl #2                        @ i += factor * 8 (r5 is factor * 2).
    113   cmp r9, r3                                @ i < endpos - factor * 7 ?
    114   blt LOOP_ENDPOS_GENERAL
    115   asr r5, #1                                @ Restore r5 to the value of factor.
    116   b POST_LOOP_ENDPOS
    117 
    118 @ The case for factor == 2.
        @ NOTE(review): the two vld2 loads read 16 consecutive samples starting
        @ at &data_in[i - j], although only the even offsets up to +14 are used.
        @ The loop runs while i + 14 < endpos, so for j == 0 the second load can
        @ touch data_in[endpos]; the entry check only guarantees
        @ data_in_length >= endpos. Confirm that callers' buffers tolerate this.
    119 LOOP_ENDPOS_FACTOR2:
    120   @ Per-iteration setup: accumulators, coefficient and data pointers.
    121   vmov.i32 q2, #2048                        @ q2: sums for outputs 0-3,
    122   vmov.i32 q3, #2048                        @ q3: for outputs 4-7.
    123   sub r7, r8, #2                            @ r7: &coefficients[length - 1]
    124   sub r12, r0, #1                           @ r12: j = coefficients_length - 1
    125   sub r1, r11, r12, asl #1                  @ &data_in[i - j]
    126 
    127 LOOP_COEFF_LENGTH_FACTOR2:
    128   vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j]
    129   vld2.16 {d0, d1}, [r1]!                   @ d0: data_in[i-j + {0,2,4,6}]
    130   vld2.16 {d2, d3}, [r1], r10               @ d2: data_in[i-j + {8,10,12,14}]
    131   subs r12, #1                              @ j--, sets flags for the bge.
    132   vmlal.s16 q2, d0, d16                     @ Outputs 0-3 += data * coeff.
    133   vmlal.s16 q3, d2, d17                     @ Outputs 4-7 += data * coeff.
    134   bge LOOP_COEFF_LENGTH_FACTOR2             @ Repeat while j >= 0.
    135 
    136   @ Shift, saturate, and store the result.
    137   vqshrn.s32 d0, q2, #12
    138   vqshrn.s32 d1, q3, #12
    139   vst1.16 {d0, d1}, [r2]!
    140 
    141   add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
    142   add r9, r5, asl #3                        @ i += factor * 8.
    143   cmp r9, r3                                @ i < endpos - factor * 7 ?
    144   blt LOOP_ENDPOS_FACTOR2
    145   b POST_LOOP_ENDPOS
    146 
    147 @ The case for factor == 4.
        @ NOTE(review): the two vld4 loads read 32 consecutive samples starting
        @ at &data_in[i - j], although only every fourth one up to +28 is used.
        @ The loop runs while i + 28 < endpos, so for j == 0 the second load can
        @ touch data_in[endpos] .. data_in[endpos + 2]; the entry check only
        @ guarantees data_in_length >= endpos. Confirm callers tolerate this.
    148 LOOP_ENDPOS_FACTOR4:
    149   @ Per-iteration setup: accumulators, coefficient and data pointers.
    150   vmov.i32 q2, #2048                        @ q2: sums for outputs 0-3,
    151   vmov.i32 q3, #2048                        @ q3: for outputs 4-7.
    152   sub r7, r8, #2                            @ r7: &coefficients[length - 1]
    153   sub r12, r0, #1                           @ r12: j = coefficients_length - 1
    154   sub r1, r11, r12, asl #1                  @ &data_in[i - j]
    155 
    156 LOOP_COEFF_LENGTH_FACTOR4:
    157   vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j]
    158   vld4.16 {d0, d1, d2, d3}, [r1]!           @ d0: data_in[i-j + {0,4,8,12}]
    159   vld4.16 {d18, d19, d20, d21}, [r1], r10   @ d18: data_in[i-j + {16,20,24,28}]
    160   subs r12, #1                              @ j--, sets flags for the bge.
    161   vmlal.s16 q2, d0, d16                     @ Outputs 0-3 += data * coeff.
    162   vmlal.s16 q3, d18, d17                    @ Outputs 4-7 += data * coeff.
    163   bge LOOP_COEFF_LENGTH_FACTOR4             @ Repeat while j >= 0.
    164 
    165   add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
    166   add r9, r5, asl #3                        @ i += factor * 8.
    167 
    168   @ Shift, saturate, and store the result.
    169   vqshrn.s32 d0, q2, #12
    170   vqshrn.s32 d1, q3, #12
    171   cmp r9, r3                                @ i < endpos - factor * 7 ?
    172   vst1.16 {d0, d1}, [r2]!
    173 
    174   blt LOOP_ENDPOS_FACTOR4                   @ Else fall through to the tail.
    175 
    176 @
    177 @ Second part, do the rest iterations (if any).
    178 @
    179 
    180 POST_LOOP_ENDPOS:
    181   add r3, r5, asl #3                        @ r3 += factor * 8
    182   sub r3, r5                                @ Restore r3 to endpos.
    183   cmp r9, r3                                @ i >= endpos: nothing left.
    184   movge r0, #0                              @ Return value 0.
    185   bge END
    186 
    187 LOOP2_ENDPOS:
    188   @ Per-output setup for the scalar loop (one output per iteration).
    189   mov r7, r8                                @ r7: &coefficients[length]
    190   sub r12, r0, #1                           @ coefficients_length - 1
    191   sub r6, r11, r12, asl #1                  @ &data_in[i - j]
    192 
    193   mov r1, #2048                             @ r1: accumulator, preset to 2048.
    194 
    195 LOOP2_COEFF_LENGTH:
    196   ldrsh r4, [r7, #-2]!                      @ coefficients[j]
    197   ldrsh r10, [r6], #2                       @ data_in[i - j]
    198   smlabb r1, r4, r10, r1                    @ r1 += coefficients[j] * data_in[i-j]
    199   subs r12, #1                              @ j--
    200   bge LOOP2_COEFF_LENGTH                    @ Repeat while j >= 0.
    201 
    202   @ Shift, saturate, and store the result.
    203   ssat r1, #16, r1, asr #12                 @ (r1 >> 12) saturated to 16 bits.
    204   strh r1, [r2], #2                         @ Store it, advance data_out.
    205 
    206   add r11, r5, asl #1                       @ r11 -> &data_in[i + factor]
    207   add r9, r5                                @ i += factor.
    208   cmp r9, r3                                @ i < endpos?
    209   blt LOOP2_ENDPOS
    210 
    211   mov r0, #0                                @ Return value 0.
    212 
    213 END:
    214   pop {r4-r11}                              @ Restore the registers pushed on entry.
    215   bx  lr
    216