Home | History | Annotate | Download | only in armv7
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
     11 @//  to support float instead of SC32.
     12 @//
     13 
     14 @//
     15 @// Description:
     16 @// Compute FFT for a real signal
     17 @//
     18 @//
     19 
     20 
     21 @// Include standard headers
     22 
     23 #include "dl/api/arm/armCOMM_s.h"
     24 #include "dl/api/arm/omxtypes_s.h"
     25 
     26 @//        M_VARIANTS ARM1136JS
     27 
     28 @// Import symbols required from other files
     29 @// (For example tables)
     30 
     31         .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
     32         .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
     33         .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
     34         .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
     35 
     36 @// Set debugging level
     37 @//DEBUG_ON    SETL {TRUE}
     38 
     39 
     40 
     41 @// Guarding implementation by the processor name
     42 
     43 @//    IF  ARM1136JS
     44 
     45 @//Input Registers
     46 
     47 #define pSrc            r0
     48 #define pDst            r1
     49 #define pFFTSpec        r2
     50 
     51 
     52 @// Output registers
     53 #define result          r0
     54 
     55 @//Local Scratch Registers
     56 
     57 @// N=1 case
     58 #define scaleMinusOne   r2
     59 #define rnd             r2
     60 #define zero            r8
     61 #define Zero            r9
     62 
     63 
     64 #define argTwiddle      r1
     65 #define argDst          r2
     66 #define argScale        r4
     67 #define pTwiddle        r4
     68 #define pOut            r5
     69 #define subFFTSize      r7
     70 #define subFFTNum       r6
     71 #define N               r6
     72 #define order           r14
     73 #define diff            r9
     74 #define count           r8
     75 #define diffMinusOne    r10
     76 #define round           r3
     77 
     78 #define step            r3
     79 #define step1           r6
     80 #define twStep          r12
     81 #define pTwiddleTmp     r14
     82 #define t0              r12
     83 #define t1              r14              /*@// pTwiddleTmp*/
     84 #define t2              r0
     85 #define t3              r1               /*@// pSrc,argTwiddle*/
     86 #define t4              r6
     87 #define t5              r7               /*@// step1,subFFTSize*/
     88 
     89 #define x0r     s0
     90 #define x0i     s1
     91 #define y0r     s2
     92 #define y0i     s3
     93 #define x1r     s4
     94 #define x1i     s5
     95 #define w1r     s2
     96 #define w1i     s3
     97 #define w0r     s6
     98 #define w0i     s7
     99 #define y1r     s2              /*@// w1r,w1i*/
    100 #define y1i     s3
    101 #define st0     s8
    102 #define st1     s9
    103 #define st2     s10
    104 #define st3     s11
    105 #define st4     s12
    106 #define st5     s13
    107 #define half    s15
    108 
    109 
    110 
    111 
    112     @// Allocate stack memory required by the function
    113 
    114 
    115 
    116     @// Write function header
    117         M_START     omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11
    118 
    119 @ Structure offsets for FFTSpec
    120         .set    ARMsFFTSpec_N, 0
    121         .set    ARMsFFTSpec_pBitRev, 4
    122         .set    ARMsFFTSpec_pTwiddle, 8
    123         .set    ARMsFFTSpec_pBuf, 12
    124 
    125         @// Define stack arguments
    126 
    127         @// Setup half value
    128         movw    N, #0                   @// Use N as a temp.
    129         movt    N, #0x3f00
    130         vmov.f32 half, N
    131 
    132         @// Read the size from structure and take log
    133         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
    134 
    135         @// Read other structure parameters
    136         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
    137         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
    138 
    139         @//  N=1 Treat seperately
    140         CMP     N,#1
    141         BGT     sizeGreaterThanOne
    142         // N<=1 is not supported
    143         @// Set return value
    144         MOV     result, #OMX_Sts_NoErr
    145         B       FunctionEnd
    146 
    147 sizeGreaterThanOne:
    148         @// Do a N/2 point complex FFT including the scaling
    149 
    150         MOV     N,N,ASR #1              @// N/2 point complex FFT
    151         CLZ     order,N                 @// N = 2^order
    152         RSB     order,order,#31
    153         MOV     subFFTSize,#1
    154         @//MOV     subFFTNum,N
    155 
    156 
    157         CMP     order,#1
    158         BGT     orderGreaterthan1       @// order > 1
    159         vldmlt.f32 pSrc, {x0r, x0i}
    160         vstmlt.f32 pOut, {x0r, x0i}
    161         MOVLT   pSrc,pOut
    162         MOVLT   argDst,pDst
    163         BLT     FFTEnd
    164 
    165         MOV     argDst,pOut             @// Set input args to fft stages
    166         MOV     pOut,pDst               @// Set input args to fft stages
    167         MOV     argTwiddle,pTwiddle
    168 
    169         BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
    170         B     finalComplexToRealFixup
    171 
    172 orderGreaterthan1:
    173 
    174         TST     order, #2               @// Set input args to fft stages
    175         MOVEQ   argDst,pDst
    176         MOVNE   argDst,pOut
    177         MOVNE   pOut,pDst               @// Pass the first stage dest in RN5
    178         MOV     argTwiddle,pTwiddle
    179 
    180         @//check for even or odd order
    181 
    182         @// NOTE: The following combination of BL's would work fine
    183         @// eventhough the first BL would corrupt the flags. This is
    184         @// because the end of the "grpZeroSetLoop" loop inside
    185         @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
    186         @// the Z flag to EQ
    187 
    188         TST     order,#0x00000001
    189         BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
    190         BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
    191 
    192 unscaledRadix4Loop:
    193         CMP        subFFTNum,#1
    194          BEQ        FFTEnd
    195          BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
    196          B        unscaledRadix4Loop
    197 
    198 FFTEnd:
    199 finalComplexToRealFixup:
    200 
    201         @// step = N/2 * 8 bytes
    202         MOV     step,subFFTSize,LSL #3
    203         @// twStep = 3N/8 * 8 bytes pointing to W^1
    204         SUB     twStep,step,subFFTSize,LSL #1
    205         @// step1 = N/4 * 8 = N/2*4 bytes
    206         MOV     step1,subFFTSize,LSL #2
    207         @// (N/4-1)*8 bytes
    208         SUB     step1,step1,#8
    209 
    210         @// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
    211         @// 1/2 [(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
    212         @// 1/2 [2a+j0] - j [0+j2b]
    213         @// (a+b, 0)
    214 
    215         @// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
    216         @// 1/2 [(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
    217         @// 1/2 [2a+j0] + j [0+j2b]
    218         @// (a-b, 0)
    219 
    220         @// F(0) and F(N/2)
    221         vldm.f32 pSrc!, {x0r, x0i}
    222         vadd.f32 y0r,x0r,x0i            @// F(0) = (2(Z0.r+Z0.i) , 0)
    223         vsub.f32 x0r,x0r,x0i            @// F(N/2) = (2(Z0.r-Z0.i) , 0)
    224         vsub.f32 y0i, y0i               @ y0i and x0i set to 0.0
    225         vsub.f32 x0i, x0i
    226 
    227         add      argDst, step
    228         vstm.f32 argDst, {x0r, x0i}     @// {x0r,x0i}->[argDst, step]
    229         sub      argDst, step
    230         vstm.f32 argDst!, {y0r, y0i}
    231 
    232         SUBS    subFFTSize,subFFTSize,#2
    233 
    234         ADD     pTwiddleTmp,argTwiddle,#8       @// W^2
    235         ADD     argTwiddle,argTwiddle,twStep    @// W^1
    236         BLT     End
    237         BEQ     lastElement
    238 
    239 
    240         @// F(k) = 1/2 [Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
    241         @// Process 2 elements at a time. E.g: F(1) and F(N/2-1) since
    242         @// both of them require Z(1) and Z(N/2-1)
    243 
    244         ASR     subFFTSize,subFFTSize,#1
    245 evenOddButterflyLoop:
    246 
    247         SUB     step,step,#16           @// (N/2-2)*8 bytes
    248 
    249         add      pSrc, step
    250         vldm.f32 pSrc, {x1r, x1i}       @// {x1r, x1i} = [pSrc, step]
    251         sub      pSrc, step
    252         vldm.f32 pSrc!, {x0r, x0i}
    253         add      argTwiddle, step1
    254         vldm.f32 argTwiddle, {w1r, w1i}  @// {w1r, w1i} = [argTwiddle, step1]
    255         sub      argTwiddle, step1
    256         vldm.f32 argTwiddle!, {w0r, w0i} @// {w0r, w0i} = [argTwiddle], #8
    257 
    258         SUB     step1,step1,#8
    259         SUBS    subFFTSize,subFFTSize,#1
    260 
    261         vsub.f32 st2,x0r,x1r            @// a-c
    262         vadd.f32 st3,x0i,x1i            @// b+d
    263         vadd.f32 st0,x0r,x1r            @// a+c
    264         vsub.f32 st1,x0i,x1i            @// b-d
    265 
    266         vmul.f32 x1r,w1r,st2
    267         vmul.f32 x1i,w1r,st3
    268         vmla.f32 x1r,w1i,st3            @// x1r = w1r*st2 + w1i*st3
    269         @//RSB     x1r,x1r,#0
    270         vmls.f32 x1i,w1i,st2            @// x1i = w1r*st3 - wli*st2
    271 
    272         vsub.f32 y1r, st0, x1i
    273         vadd.f32 y1i, x1r, st1
    274         vneg.f32 y1i, y1i
    275 
    276         vmul.f32  x0r,w0r,st2
    277         vmul.f32  x0i,w0r,st3
    278         vmls.f32  x0r,w0i,st3           @// x0r = w0r*st2 - w0i*st3
    279         vmla.f32  x0i,w0i,st2           @// x0i = w0r*st3 + x0i*st1
    280 
    281         vsub.f32   st4,st0,x0i          @// F(1)
    282         vadd.f32   st5,x0r,st1
    283 
    284 
    285         vmul.f32 y1r, half
    286         vmul.f32 y1i, half
    287         vmul.f32 st4, half
    288         vmul.f32 st5, half
    289 
    290         add      argDst, step
    291         vstm.f32 argDst, {y1r, y1i}     @// {y1r,y1i} -> [argDst,step]
    292         sub      argDst, step
    293         vstm.f32 argDst!, {st4, st5}
    294 
    295 
    296         MOV     t0,argTwiddle           @// swap ptr for even and odd twiddles
    297         MOV     argTwiddle,pTwiddleTmp
    298         MOV     pTwiddleTmp,t0
    299 
    300         BGT     evenOddButterflyLoop
    301 
    302         @// Last element can be expanded as follows
    303         @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
    304         @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
    305         @// 1/2[2a+j0] + j (c+jd) [0+j2b]
    306         @// (a-bc, -bd)
    307 
    308 lastElement:
    309         vldm.f32 pSrc, {x0r, x0i}
    310         vneg.f32 x0i, x0i
    311         vstm.f32 argDst, {x0r, x0i}
    312 
    313 End:
    314         @// Set return value
    315         MOV     result, #OMX_Sts_NoErr
    316 
    317 FunctionEnd:
    318         @// Write function tail
    319         M_END
    320 
    321 @//    ENDIF                                           @//ARM1136JS
    322 
    323 
    324     @// Guarding implementation by the processor name
    325 
    326 
    327 
    328     .end
    329