Home | History | Annotate | Download | only in armv7
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This is a modification of
     11 @//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S to support float
     12 @//  instead of SC32.
     13 @//
     14 
     15 @//
     16 @// Description:
      17 @// Compute the "preTwiddleRadix2" stage prior to the call to the complex FFT.
      18 @// It computes Z(k) = Feven(k) + j*W^(-k)*Fodd(k) for k = 0, 1, 2, ..., N/2-1.
      19 @// It implements the "scaled" (by 1/2) version of the above formula.
     20 @//
     21 @//
     22 
     23 
     24 @// Include standard headers
     25 
     26 #include "dl/api/arm/armCOMM_s.h"
     27 #include "dl/api/arm/omxtypes_s.h"
     28 
     29 @//        M_VARIANTS ARM1136JS
     30 
     31 @// Import symbols required from other files
     32 @// (For example tables)
     33 
     34 
     35 @// Set debugging level
     36 @//DEBUG_ON    SETL {TRUE}
     37 
     38 
     39 
     40 @// Guarding implementation by the processor name
     41 
      42 @//    IF  ARM1136JS
     43 
      44 @// Input registers (arguments on entry)
      45 @//   pSrc     (r0): input data, read by FFTSTAGE as {re, im} float pairs
      46 #define pSrc            r0
      47 #define pDst            r1
      48 #define pFFTSpec        r2
      49 @//   pFFTSpec (r2): spec structure; N, pTwiddle and pBuf are loaded from it
      50 @//   pDst     (r1): not referenced by FFTSTAGE; r1 is reused as argTwiddle
      51 @// Output registers
      52 #define result          r0
      53 
      54 @// Local scratch registers
      55 @// NOTE: several names below share one core register (r2, r3, r4, r6, r7,
      56 @// r12, r14), so at most one name per register can be live at any time.
      57 #define argTwiddle      r1
      58 #define argDst          r2
      59 #define argScale        r4
      60 #define pTwiddle        r4
      61 #define pOut            r5
      62 #define subFFTSize      r7
      63 #define subFFTNum       r6
      64 #define N               r6
      65 #define order           r14
      66 #define diff            r9
      67 #define count           r8
      68 #define diffMinusOne    r2
      69 #define round           r3
      70 @// Further scratch names used by FFTSTAGE (pOut1 shares r2 with pFFTSpec)
      71 #define pOut1           r2
      72 #define size            r7
      73 #define step            r3
      74 #define step1           r6
      75 #define twStep          r12
      76 #define pTwiddleTmp     r14
      77 #define t0              r12
      78 @// VFP single-precision names; y0r/y0i, w1r/w1i and y1r/y1i all map to s6/s7
      79 #define x0r     s0
      80 #define x0i     s1
      81 #define x1r     s2
      82 #define x1i     s3
      83 #define w0r     s4
      84 #define w0i     s5
      85 #define y0r     s6
      86 #define y0i     s7
      87 #define w1r     s6
      88 #define w1i     s7
      89 #define y1r     s6              /*@// w1r,w1i*/
      90 #define y1i     s7
      91 #define st0     s8
      92 #define st1     s9
      93 #define st2     s10
      94 #define st3     s11
      95 #define st4     s12
      96 #define st5     s13
      97 //@ half holds the constant 0.5; it is loaded at the start of FFTSTAGE
      98 #define half    s15
      99 @// FFTSTAGE scaled, inverse, name
     100 @//   Emits the pre-twiddle code inline. "name" is appended to the local labels;
     101 @//   "scaled" and "inverse" are not referenced inside the macro body.
     102 @//   In:  pSrc (r0) = input {re, im} pairs, pFFTSpec (r2) = N, pTwiddle, pBuf.
     103 @//   Modifies r0-r7, r12, r14 (lr), s0-s13, s15 and the condition flags.
     104         .macro FFTSTAGE scaled, inverse,name
     105 
     106         @// Build 0.5f (bit pattern 0x3f000000) in a core register first.
     107         movw    N, #0x0000
     108         movt    N, #0x3f00
     109         vmov.f32 half, N                @// half = 0.5
     110 
     111         @// Read the transform size N from the spec structure
     112         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
     113 
     114         @// Read the pTwiddle and pBuf pointers from the spec structure
     115         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
     116         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
     117 
     118 
     119         MOV     size,N,ASR #1           @// size = N/2; N itself is unchanged
     120 
     121         MOV     step,size,LSL #3        @// step = N/2 * 8 bytes
     122         ADD     pTwiddleTmp,pTwiddle,#8 @// pTwiddleTmp = pTwiddle + 8 bytes (W^2)
     123 
     124         ADD     pOut1,pOut,step         @// pOut1 = pOut + N/2*8 bytes (r2: pFFTSpec is overwritten)
     125         @// twStep = 3N/8 * 8 bytes pointing to W^1
     126         SUB     twStep,step,size,LSL #1
     127         MOV     step1,size,LSL #2       @// step1 = N/4 * 8 = N/2*4 bytes (r6: N is overwritten)
     128         SUB     step1,step1,#8          @// (N/4-1)*8 bytes
     129         ADD     argTwiddle,pTwiddle,twStep      @// W^1
     130 
     131         @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
     132         @// Note: W^(k) is stored negated in the table, and the table
     133         @// values also need to be conjugated
     134 
     135         @// Z(0) : no need of twiddle multiply
     136         @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
     137 
     138         @// Load (c, d) from pSrc + step and (a, b) from pSrc; pSrc then advances by 8
     139         add      pSrc, step             @// step = N/2*8 bytes
     140         vldm.f32 pSrc, {x1r, x1i}       @// {x1r, x1i} = [pSrc, step]
     141         sub      pSrc, step
     142         vldm.f32 pSrc!, {x0r, x0i}
     143 
     144         SUBS    size,size,#2
     145         @// The flags from SUBS are tested by BLT/BEQ after the Z(0) store
     146         vadd.f32 st0, x0r, x1r          @// a+c
     147         vsub.f32 st1, x0r, x1r          @// a-c
     148         vmov.f32 x0r, st0
     149         vmov.f32 x1r, st1
     150         vsub.f32 st0, x0i, x1i          @// b-d
     151         vadd.f32 x1i, x0i, x1i          @// b+d
     152         vmov.f32 x0i, st0
     153 
     154         @// Z(0) = ((a+c) - (b+d), (b-d) + (a-c)), scaled by half below
     155         vsub.f32     x0r,x0r,x1i        @// Z(0).r
     156         vadd.f32     x0i,x0i,x1r        @// Z(0).i
     157 
     158         vmul.f32 x0r, half
     159         vmul.f32 x0i, half
     160         vstm.f32 pOut1!, {x0r, x0i}     @// store at pOut + N/2*8 bytes, pOut1 += 8
     161 
     162         BLT     end\name
     163         BEQ     lastElement\name
     164         @// Each loop pass consumes two inputs and produces two outputs
     165         ASR     size,size,#1
     166 evenOddButterflyLoop\name:
     167 
     168         SUB     step,step,#16           @// first pass: (N/2-2)*8 bytes; 16 less on every pass
     169 
     170         add      pSrc, step             @// first pass: (N/2-1)*8 bytes from the input start
     171         vldm.f32 pSrc, {x1r, x1i}       @// {x1r, x1i} = [pSrc, step]
     172         sub      pSrc, step
     173         vldm.f32 pSrc!, {x0r, x0i}
     174         add      argTwiddle, step1
     175         vldm.f32 argTwiddle, {w1r, w1i} @// {w1r, w1i} = [argTwiddle, step1]
     176         sub      argTwiddle, step1
     177         vldm.f32 argTwiddle!, {w0r, w0i}
     178 
     179         SUB     step1,step1,#8
     180         SUBS    size,size,#1
     181         @// The flags from SUBS are tested by BGT at the bottom of the loop
     182 
     183         vsub.f32     st2,x0r,x1r        @// a-c
     184         vadd.f32     st3,x0i,x1i        @// b+d
     185         vadd.f32     st0,x0r,x1r        @// a+c
     186         vsub.f32     st1,x0i,x1i        @// b-d
     187         @// (x1r, x1i) = (w1r*st2 - w1i*st3, w1r*st3 + w1i*st2)
     188         vmul.f32  x1r,w1r,st2
     189         vmul.f32  x1i,w1r,st3
     190         vmls.f32  x1r,w1i,st3
     191         vmla.f32  x1i,w1i,st2
     192 
     193         vadd.f32     y1r,st0,x1i        @// F(N/2 -1)
     194         vsub.f32     y1i,x1r,st1        @// y1r,y1i same as w1r, w1i
     195 
     196         @// (x0r, x0i) = (w0r*st2 + w0i*st3, w0r*st3 - w0i*st2)
     197         vmul.f32  x0r,w0r,st2
     198         vmul.f32  x0i,w0r,st3
     199         vmla.f32  x0r,w0i,st3
     200         vmls.f32  x0i,w0i,st2
     201 
     202 
     203         vadd.f32     st4,st0,x0i        @// F(1)
     204         vsub.f32     st5,st1,x0r
     205 
     206         @// Scale both results by 1/2 and store them step bytes apart
     207         vmul.f32 y1r, half
     208         vmul.f32 y1i, half
     209         vmul.f32 st4, half
     210         vmul.f32 st5, half
     211         add      pOut1, step            @// same step as used for the input pair
     212         vstm.f32 pOut1, {y1r, y1i}      @// [pOut1, step] = {y1r, y1i}
     213         sub      pOut1, step
     214         vstm.f32 pOut1!, {st4, st5}
     215 
     216         MOV     t0,argTwiddle           @// swap ptr for even and odd twiddles
     217         MOV     argTwiddle,pTwiddleTmp
     218         MOV     pTwiddleTmp,t0
     219 
     220         BGT     evenOddButterflyLoop\name
     221 
     222 
     223         @// Last element can be expanded as follows
     224         @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)]
     225         @// (since W^k is stored as -ve)
     226         @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
     227         @// 1/2[2a+j0] + j (c-jd) [0+j2b]
     228         @// (a+bc, -bd)
     229         @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
     230 
     231 lastElement\name:
     232         vldm.f32 pSrc, {x0r, x0i}
     233         @// Store (a, -b): only the imaginary part is negated
     234         vneg.f32 x0i, x0i
     235         vstm.f32 pOut1, {x0r, x0i}
     236 end\name:
     237 
     238 
     239         .endm
    240 
    241 
     242 @ Byte offsets of the FFTSpec fields; FFTSTAGE loads N, pTwiddle and pBuf
     243         .set    ARMsFFTSpec_N, 0                @// NOTE(review): 4-byte spacing presumably mirrors the C struct - confirm
     244         .set    ARMsFFTSpec_pBitRev, 4          @// defined here but not referenced by FFTSTAGE
     245         .set    ARMsFFTSpec_pTwiddle, 8
     246         .set    ARMsFFTSpec_pBuf, 12
     247 @// Entry point: one FFTSTAGE expansion (label suffix Inv) between M_START and M_END.
     248 @// NOTE(review): FFTSTAGE also writes r5-r7, r12 and r14; confirm M_START with r4 and the callers account for that.
     249         M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp,r4
     250              FFTSTAGE "FALSE","TRUE",Inv
     251         M_END
    252 
    253 @//    ENDIF                                           @//ARM1136JS
    254 
    255 
    256       @// Guarding implementation by the processor name
    257 
    258 
    259 
    260     .end
    261