Home | History | Annotate | Download | only in armv7
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
     11 @//  to support float instead of SC32.
     12 @//
     13 
     14 @//
     15 @// Description:
     16 @// Compute a radix-4 FFT stage for an N-point complex signal
     17 @//
     18 @//
     19 
     20 
     21 @// Include standard headers
     22 
     23 #include "dl/api/arm/armCOMM_s.h"
     24 #include "dl/api/arm/omxtypes_s.h"
     25 
     26 @//        M_VARIANTS ARM1136JS
     27 
     28 @// Import symbols required from other files
     29 @// (For example tables)
     30 
     31 
     32 
     33 
     34 @// Set debugging level
     35 @//DEBUG_ON    SETL {TRUE}
     36 
     37 
     38 
     39 @// Guarding implementation by the processor name
     40 
     41 @//    IF  ARM1136JS
     42 
     43 @//Input Registers
     44 @// Read on entry to FFTSTAGE.
     45 #define pSrc            r0                   /*@// input points, read with vldm*/
     46 #define pDst            r2                   /*@// output points, written with vstm*/
     47 #define pTwiddle        r1                   /*@// coefficient table base, restored every butterfly*/
     48 #define subFFTNum       r6                   /*@// shifted right by 2 inside FFTSTAGE*/
     49 #define subFFTSize      r7                   /*@// shifted left by 2 inside FFTSTAGE*/
     50 
     51 
     52 
     53 @//Output Registers
     54 @// None are named separately: FFTSTAGE leaves its results in pSrc and pDst
     55 @// (exchanged with each other), subFFTNum and subFFTSize.
     56 @//Local Scratch Registers
     57 @// r12 and r3 each carry two names, the second name is only used after the first is dead.
     58 #define grpCount        r12
     59 #define step            r12                  /*@// Reuse grpCount*/
     60 #define outPointStep    r3
     61 #define setCount        r8
     62 #define diff            r9
     63 #define pointStep       r14
     64 
     65 #define t1              r3                 /*@// Reuse outPointStep*/
     66 
     67 @// Real and Imaginary parts used in the inner grp loop
     68 #define x0r s0
     69 #define x0i s1
     70 #define x1r s2
     71 #define x1i s3
     72 #define x2r s4
     73 #define x2i s5
     74 #define x3r s6
     75 #define x3i s7
     76 
     77 @// Temporary reg to hold the twiddle multiplies
     78 @// t0 accumulates each complex product, t2 holds one butterfly output, sr and si hold doubled values.
     79 #define t0r s8
     80 #define t0i s9
     81 #define t2r s10
     82 #define t2i s11
     83 #define sr  s12
     84 #define si  s13
     85 
     86 
     87 
     88 
     89         .macro FFTSTAGE scaled, inverse , name
     90         @// One radix-4 butterfly stage over complex float data (8 bytes per point).
     91         @// scaled is never referenced in this body. inverse == TRUE uses conjugated
     92         @// coefficient multiplies and swaps the registers stored to output slots 1 and 3.
     93         @// name is appended to the two loop labels so each expansion gets its own labels.
     94         @// On exit subFFTSize has been shifted left by 2, subFFTNum shifted right by 2 and
     95         @// pSrc and pDst are exchanged. Writes r3, r8, r9, r12, r14 and s0-s13 as scratch.
     96 
     97         LSL     grpCount,subFFTSize,#2                      @// grpCount = 4 * subFFTSize
     98         lsr     subFFTNum, subFFTNum, #2                    @// subFFTNum = subFFTNum >> 2
     99         mov     subFFTSize, grpCount                        @// subFFTSize = 4 * incoming subFFTSize
    100 
    101 
    102         @// pointStep starts as 2 * subFFTNum (the already reduced value). It only becomes
    103         @// a byte stride (8 * subFFTNum) after the multiply below has consumed it.
    104         mov     pointStep, subFFTNum, lsl #1
    105 
    106 
    107         @// outPointStep = low word of grpCount * pointStep, that is
    108         @// 8 * (incoming subFFTSize) * (reduced subFFTNum) modulo 2^32. It is the byte
    109         @// spacing of the four output slots written per butterfly.
    110 
    111         @// smull has two destinations: setCount takes the unused high word and is set properly below.
    112         smull   outPointStep, setCount, grpCount, pointStep
    113 
    114         LSL     pointStep,pointStep,#2                      @// pointStep = 8 * subFFTNum (bytes)
    115 
    116 
    117         MOV     setCount,pointStep,LSR #3                   @// setCount = subFFTNum
    118 
    119         @// Interchange grpLoop and setLoop
    120         @// Outer loop: runs setCount times, pSrc and pDst advance by one point (8 bytes) per pass.
    121 setLoop\name:
    122 
    123         MOV     step,#0                                     @// step shares r12 with grpCount, which is no longer read
    124         @// Set pSrc and pDst for the grpLoop
    125 
    126         SUB      diff,outPointStep,pointStep                @// diff = outPointStep - pointStep
    127 
    128         @// Start at the far end: the group loop below moves pSrc, pDst and step downwards.
    129 
    130         ADD      pSrc,pSrc,diff,LSL #2  @// pSrc += 4 * diff
    131         ADD      pDst,pDst,diff         @// pDst += diff
    132         ADD      step,step,diff         @// step = diff (step was zeroed above)
    133 
    134 
    135 
    136         @// Loop on the grps
    137 
    138 grpLoop\name:
    139 
    140         @// Per pass: read 4 points spaced pointStep bytes apart starting at pSrc, multiply
    141         @// three of them by the coefficients at pTwiddle + step, + 2 step and + 3 step,
    142         @// then write 4 results spaced outPointStep bytes apart starting at pDst.
    143         add         pSrc, pointStep
    144         vldm.f32    pSrc, {x3r, x3i}                    @// data[1]
    145         add         pTwiddle, step
    146         vldm.f32    pTwiddle, {x1r, x1i}                @// coef[1]
    147         add         pTwiddle, step
    148         vldm.f32    pTwiddle, {x2r, x2i}                @// coef[2]
    149         add         pSrc, pointStep
    150         vldm.f32    pSrc, {x0r, x0i}                    @// data[2]
    151 
    152         @// first complex multiply: t0 = data[1] (in x3) times coef[1] (in x1), conjugated coef when inverse
    153         vmul.f32 t0r, x3r, x1r
    154         vmul.f32 t0i, x3i, x1r
    155 
    156         .ifeqs  "\inverse", "TRUE"
    157             vmla.f32 t0r, x3i, x1i
    158             vmls.f32 t0i, x3r, x1i
    159             vmov.f32 x1r, t0r                   @// x1 now holds the product
    160             vmov.f32 x1i, t0i
    161         .else
    162             vmls.f32 t0r, x3i, x1i
    163             vmla.f32 t0i, x3r, x1i
    164             vmov.f32 x1r, t0r                   @// x1 now holds the product
    165             vmov.f32 x1i, t0i
    166         .endif
    167 
    168         add     pTwiddle, pTwiddle, step
    169         vldm    pTwiddle, {x3r, x3i}                    @// coef[3]
    170         sub     pTwiddle, pTwiddle, step                @// back to the coef[2] position
    171 
    172         @// second complex multiply: t0 = data[2] (in x0) times coef[2] (in x2), conjugated coef when inverse
    173         vmul.f32 t0r, x0r, x2r
    174         vmul.f32 t0i, x0i, x2r
    175 
    176         .ifeqs  "\inverse", "TRUE"
    177             vmla.f32 t0r, x0i, x2i
    178             vmls.f32 t0i, x0r, x2i
    179             vmov.f32 x2r, t0r                   @// x2 now holds the product
    180             vmov.f32 x2i, t0i
    181         .else
    182             vmls.f32 t0r, x0i, x2i
    183             vmla.f32 t0i, x0r, x2i
    184             vmov.f32 x2r, t0r                   @// x2 now holds the product
    185             vmov.f32 x2i, t0i
    186         .endif
    187 
    188         add     pSrc, pointStep
    189         vldm    pSrc, {x0r, x0i}                @// data[3]
    190         sub     pSrc, pointStep                 @// back to the data[2] position
    191 
    192         SUB     pTwiddle,pTwiddle,step,LSL #1   @// reset pTwiddle
    193         SUBS    step,step,pointStep             @// step -= pointStep, flags are tested by SUBGE and BGE at the loop end
    194 
    195         @// third complex multiply: t0 = data[3] (in x0) times coef[3] (in x3), conjugated coef when inverse
    196         SUB     pSrc,pSrc,pointStep,LSL #1      @// reset pSrc to data[0]
    197         vmul.f32 t0r, x0r, x3r
    198         vmul.f32 t0i, x0i, x3r
    199 
    200         .ifeqs  "\inverse", "TRUE"
    201             vmla.f32 t0r, x0i, x3i
    202             vmls.f32 t0i, x0r, x3i
    203             vmov.f32 x3r, t0r                   @// x3 now holds the product
    204             vmov.f32 x3i, t0i
    205         .else
    206             vmls.f32 t0r, x0i, x3i
    207             vmla.f32 t0i, x0r, x3i
    208             vmov.f32 x3r, t0r                   @// x3 now holds the product
    209             vmov.f32 x3i, t0i
    210         .endif
    211 
    212         vldm    pSrc, {x0r, x0i}                @// data[0]
    213 
    214         @// finish first stage of 4 point FFT
    215         vadd.f32     x0r,x0r,x2r                @// x0 = x0 + x2 (u0)
    216         vadd.f32     x0i,x0i,x2i
    217 
    218         vadd.f32     sr, x2r, x2r               @// s = 2 * x2
    219         vadd.f32     si, x2i, x2i
    220         vsub.f32     x2r,x0r,sr                 @// x2 = x0 - x2 (u1)
    221         vsub.f32     x2i,x0i,si
    222 
    223         vadd.f32     x1r,x1r,x3r                @// x1 = x1 + x3 (u2), no halving in the float version
    224         vadd.f32     x1i,x1i,x3i
    225 
    226         vadd.f32     sr, x3r, x3r               @// s = 2 * x3
    227         vadd.f32     si, x3i, x3i
    228         vsub.f32     x3r,x1r,sr                 @// x3 = x1 - x3 (u3), no halving in the float version
    229         vsub.f32     x3i,x1i,si
    230 
    231 
    232         @// finish second stage of 4 point FFT
    233 
    234         @// y0 = u1-u2 since twiddles are stored as -ve values
    235         vsub.f32     x2r,x2r,x1r
    236         vsub.f32     x2i,x2i,x1i
    237 
    238         vadd.f32     sr, x1r, x1r               @// s = 2 * u2
    239         vadd.f32     si, x1i, x1i
    240         vadd.f32     x1r,x2r,sr                 @// y2 = u1+u2
    241         vadd.f32     x1i,x2i,si
    242         vstm    pDst, {x2r, x2i}                @// store y0
    243 
    244         vsub.f32     x0r,x0r,x3i                @// x0 = u0+ju3
    245         vadd.f32     x0i,x0i,x3r
    246 
    247         vadd.f32     sr, x3r, x3r               @// s = 2 * u3
    248         vadd.f32     si, x3i, x3i
    249         vadd.f32     t2r,x0r,si                 @// t2 = u0-ju3
    250         vsub.f32     t2i,x0i,sr                 @// t2 is s10/s11, separate from x2 (s4/s5)
    251 
    252         .ifeqs  "\inverse", "TRUE"
    253             add     pDst, outPointStep
    254             vstm    pDst, {t2r, t2i}            @// slot 1 = u0-ju3
    255             add     pDst, outPointStep
    256             vstm    pDst, {x1r, x1i}            @// slot 2 = y2
    257             add     pDst, outPointStep
    258             vstm    pDst, {x0r, x0i}            @// slot 3 = u0+ju3
    259             sub     pDst, outPointStep
    260         .else
    261             add     pDst, outPointStep
    262             vstm    pDst, {x0r, x0i}            @// slot 1 = u0+ju3
    263             add     pDst, outPointStep
    264             vstm    pDst, {x1r, x1i}            @// slot 2 = y2
    265             add     pDst, outPointStep
    266             vstm    pDst, {t2r, t2i}            @// slot 3 = u0-ju3
    267             sub     pDst, outPointStep
    268         .endif
    269 
    270         SUB     pDst,pDst,outPointStep, LSL #1  @// reset pDst
    271         @// update the pDst for the next grp
    272         SUBGE   pDst,pDst,pointStep
    273         @// update the pSrc for the next grp
    274         SUBGE   pSrc,pSrc,pointStep,LSL #2
    275 
    276 
    277         BGE     grpLoop\name
    278 
    279         ADD     pSrc,pSrc,#8                    @// pSrc += 1; for the next set
    280         ADD     pDst,pDst,#8                    @// pDst += 1; for the next set
    281 
    282         SUBS    setCount,setCount,#1            @// decrement loop counter
    283 
    284 
    285         BGT     setLoop\name
    286 
    287         @// Reset and Swap pSrc and pDst for the next stage
    288         MOV     t1,pDst                         @// t1 shares r3 with outPointStep, which is no longer read
    289         SUB     pDst,pSrc,subFFTNum,LSL #3      @// new pDst = old pSrc minus the 8 * subFFTNum bytes added by setLoop
    290         SUB     pSrc,t1,subFFTNum,LSL #3        @// new pSrc = old pDst minus the same amount
    291 
    292         .endm
    293         @// Forward stage entry point: expands FFTSTAGE with inverse = FALSE and label suffix FWD.
    294         @// NOTE(review): M_START and M_END are defined outside this file, confirm what the r4 argument saves.
    295         M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4
    296         FFTSTAGE "FALSE","FALSE",FWD
    297         M_END
    298         @// Inverse stage entry point: expands FFTSTAGE with inverse = TRUE and label suffix INV.
    299         M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4
    300         FFTSTAGE "FALSE","TRUE",INV
    301         M_END
    302 
    303 
    304 @//    ENDIF                                                           @//ARM1136JS
    305 
    306 
    307 
    308 @// Guarding implementation by the processor name
    309 
    310     .end
    311