@/*
@ ** Copyright 2003-2010, VisualOn, Inc.
@ **
@ ** Licensed under the Apache License, Version 2.0 (the "License");
@ ** you may not use this file except in compliance with the License.
@ ** You may obtain a copy of the License at
@ **
@ **     http://www.apache.org/licenses/LICENSE-2.0
@ **
@ ** Unless required by applicable law or agreed to in writing, software
@ ** distributed under the License is distributed on an "AS IS" BASIS,
@ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ ** See the License for the specific language governing permissions and
@ ** limitations under the License.
@ */
@
@*void Convolve (
@*    Word16 x[],        /* (i)     : input vector                           */
@*    Word16 h[],        /* (i)     : impulse response                       */
@*    Word16 y[],        /* (o)     : output vector                          */
@*    Word16 L           /* (i)     : vector size                            */
@*)
@
@ r0 --- x[]
@ r1 --- h[]
@ r2 --- y[]
@ r3 --- L
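@
@ For reference, a minimal C sketch of what this routine computes (assuming
@ the usual AMR-WB fixed-point types, Word16 = int16_t and Word32 = int32_t;
@ note the assembly ignores the L argument and hard-codes L = 64, and the
@ final left shift is plain, non-saturating, matching the code below):
@
@     void Convolve(Word16 x[], Word16 h[], Word16 y[], Word16 L)
@     {
@         Word32 n, i, s;
@         for (n = 0; n < 64; n++) {
@             s = 0;
@             for (i = 0; i <= n; i++)
@                 s += x[i] * h[n - i];
@             y[n] = (Word16)(((s << 1) + 0x8000) >> 16);  /* extract_h with rounding */
@         }
@     }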

        .section  .text
        .global   Convolve_asm

Convolve_asm:

        STMFD          r13!, {r4 - r12, r14}             @ save callee-saved registers and LR
        MOV            r3,  #0                           @ n = 0 (the L argument in r3 is ignored)
        MOV            r11, #0x8000                      @ rounding constant for extract_h

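@ Outer loop, unrolled by four: each pass computes y[n] .. y[n+3].  The
@ correlation length (n + 1) grows by one per output sample, so each of the
@ four sub-blocks below first handles the (n + 1) mod 4 leftover products
@ with scalar MUL/MLA, then runs the remaining multiple-of-four products
@ through a NEON multiply-accumulate loop.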
LOOP:
        @ Sub-block 1: n mod 4 == 0; one leftover product before the NEON loop.
        @MOV            r8, #0                            @ s = 0
        ADD            r4, r1, r3, LSL #1                @ tmpH = &h[n]
        ADD            r5, r3, #1                        @ i = n + 1
        MOV            r6, r0                            @ tmpX = x
        LDRSH          r9,  [r6], #2                     @ *tmpX++
        LDRSH          r10, [r4]                         @ *tmpH--
        SUB            r5, r5, #1                        @ one product handled here
        VMOV.S32       Q10, #0                           @ clear the vector accumulator
        MUL            r8,  r9, r10                      @ s = x[0] * h[n]

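@ NEON inner loop: load four x[] samples forward and four h[] samples
@ backward (VREV64.16 reverses the lane order so each lane pairs x[i]
@ with h[n-i]), widening the 16x16 products into the 32-bit lanes of Q10.
@ The same pattern repeats in LOOP2 .. LOOP4 below.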
LOOP1:
        CMP            r5, #0
        BLE            L1
        SUB            r4, r4, #8                        @ step tmpH back four samples
        MOV            r9, r4
        VLD1.S16       D0, [r6]!                         @ next four x[] samples
        VLD1.S16       D1, [r9]!                         @ four h[] samples, ascending order
        VREV64.16      D1, D1                            @ reverse so lanes pair x[i] with h[n-i]
        SUBS           r5, r5, #4
        VMLAL.S16      Q10, D0, D1                       @ Q10 += x * h, widening MAC
        B              LOOP1
L1:
        VADD.S32       D20, D20, D21                     @ fold four lane sums to two
        VPADD.S32      D20, D20, D20                     @ ... and two to one
        VMOV.S32       r5, D20[0]
        ADD            r5, r5, r8                        @ add the scalar partial sum
        ADD            r5, r11, r5, LSL #1               @ (s << 1) + 0x8000
        MOV            r5, r5, LSR #16                   @ extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r5, [r2], #2                      @ y[n]


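        @ Sub-block 2: n mod 4 == 1; two leftover products before the NEON loop.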
        @MOV            r8, #0
        ADD            r4, r1, r3, LSL #1                @ tmpH = &h[n]
        ADD            r5, r3, #1                        @ i = n + 1
        MOV            r6, r0                            @ tmpX = x
        LDRSH          r9,  [r6], #2                     @ *tmpX++
        LDRSH          r10, [r4], #-2                    @ *tmpH--
        LDRSH          r12, [r6], #2
        LDRSH          r14, [r4]

        MUL            r8, r9, r10                       @ s = x[0] * h[n]
        SUB            r5, r5, #2                        @ two products handled here
        MLA            r8, r12, r14, r8                  @ s += x[1] * h[n-1]

        VMOV.S32       Q10, #0                           @ clear the vector accumulator
LOOP2:
        CMP            r5, #0
        BLE            L2
        SUB            r4, r4, #8
        MOV            r9, r4
        VLD1.S16       D0, [r6]!
        VLD1.S16       D1, [r9]!
        SUBS           r5, r5, #4
        VREV64.16      D1, D1
        VMLAL.S16      Q10, D0, D1
        B              LOOP2
L2:
        VADD.S32       D20, D20, D21
        VPADD.S32      D20, D20, D20
        VMOV.S32       r5, D20[0]
        ADD            r8, r8, r5                        @ add the scalar partial sum
        ADD            r8, r11, r8, LSL #1               @ (s << 1) + 0x8000
        MOV            r8, r8, LSR #16                   @ extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r8, [r2], #2                      @ y[n]


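        @ Sub-block 3: n mod 4 == 2; three leftover products before the NEON loop.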
        @MOV            r8, #0
        ADD            r4, r1, r3, LSL #1                @ tmpH = &h[n]
        ADD            r5, r3, #1                        @ i = n + 1
        MOV            r6, r0                            @ tmpX = x
        LDRSH          r9,  [r6], #2
        LDRSH          r10, [r4], #-2
        LDRSH          r12, [r6], #2
        LDRSH          r14, [r4], #-2
        MUL            r8, r9, r10                       @ s = x[0] * h[n]
        LDRSH          r9,  [r6], #2
        LDRSH          r10, [r4]
        MLA            r8, r12, r14, r8                  @ s += x[1] * h[n-1]
        SUB            r5, r5, #3                        @ three products handled here
        MLA            r8, r9, r10, r8                   @ s += x[2] * h[n-2]

        VMOV.S32       Q10, #0                           @ clear the vector accumulator
LOOP3:
        CMP            r5, #0
        BLE            L3
        SUB            r4, r4, #8
        MOV            r9, r4
        VLD1.S16       D0, [r6]!
        VLD1.S16       D1, [r9]!
        VREV64.16      D1, D1
        SUBS           r5, r5, #4
        VMLAL.S16      Q10, D0, D1
        B              LOOP3

L3:
        VADD.S32       D20, D20, D21
        VPADD.S32      D20, D20, D20
        VMOV.S32       r5, D20[0]
        ADD            r8, r8, r5                        @ add the scalar partial sum
        ADD            r8, r11, r8, LSL #1               @ (s << 1) + 0x8000
        MOV            r8, r8, LSR #16                   @ extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r8, [r2], #2                      @ y[n]

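        @ Sub-block 4: n mod 4 == 3, so (n + 1) is a multiple of four and the
        @ whole correlation runs in the NEON loop with no scalar leftovers.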
        ADD            r5, r3, #1                        @ i = n + 1
        ADD            r4, r1, r5, LSL #1                @ tmpH = &h[n+1]; the loop pre-decrements
        MOV            r6, r0                            @ tmpX = x
        VMOV.S32       Q10, #0                           @ clear the vector accumulator
LOOP4:
        CMP            r5, #0
        BLE            L4
        SUB            r4, r4, #8
        MOV            r9, r4
        VLD1.S16       D0, [r6]!
        VLD1.S16       D1, [r9]!
        VREV64.16      D1, D1
        SUBS           r5, r5, #4
        VMLAL.S16      Q10, D0, D1
        B              LOOP4
L4:
        VADD.S32       D20, D20, D21
        VPADD.S32      D20, D20, D20
        VMOV.S32       r5,  D20[0]                       @ no scalar partial sum in this sub-block
        ADD            r5, r11, r5, LSL #1               @ (s << 1) + 0x8000
        MOV            r5, r5, LSR #16                   @ extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r5, [r2], #2                      @ y[n]

        CMP            r3, #64                           @ L is hard-coded to 64 samples
        BLT            LOOP

Convolve_asm_end:

        LDMFD      r13!, {r4 - r12, r15}                 @ restore registers; loading pc returns

        @ENDFUNC
        .END

    179