Home | History | Annotate | Download | only in armv7
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This is a modification of armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
     11 @//  to support float instead of SC32.
     12 @//
     13 
     14 @//
     15 @// Description:
     16 @// Compute the first-stage radix-8 FFT of an N-point complex signal
     17 @//
     18 @//
     19 
     20 
     21 @// Include standard headers
     22 
     23 #include "dl/api/arm/armCOMM_s.h"
     24 #include "dl/api/arm/omxtypes_s.h"
     25 
     26 @//        M_VARIANTS ARM1136JS
     27 
     28 @// Import symbols required from other files
     29 @// (For example tables)
     30 
     31 
     32 @// Set debugging level
     33 @//DEBUG_ON    SETL {TRUE}
     34 
     35 
     36 
     37 @// Guarding implementation by the processor name
     38 
     39 @//    IF  ARM1136JS
     40 
     41 @//Input Registers
     42 
     43 #define pSrc            r0              /*@// Input pointer; read with vldm, 8 bytes per complex value*/
     44 #define pDst            r2              /*@// Output pointer; written with vstm, 8 bytes per complex value*/
     45 #define pTwiddle        r1              /*@// Not referenced by FFTSTAGE in this file*/
     46 #define subFFTNum       r6              /*@// Shifted right by 3 at the start of FFTSTAGE*/
     47 #define subFFTSize      r7              /*@// Set to 8 by FFTSTAGE*/
     48 #define pPingPongBuf    r5              /*@// Copied into pDst at the end of FFTSTAGE*/
     49 
     50 
     51 @//Output Registers
     52 
     53 
     54 @//Local Scratch Registers
                @// NOTE(review): r14 is the link register; this assumes M_START/M_END
                @// save and restore it - confirm in armCOMM_s.h.
     55 
     56 #define grpSize         r14             /*@// subFFTNum/8; same register as setCount*/
     57 #define step1           r3              /*@// 16*grpSize bytes = 2*pointStep*/
     58 #define step2           r8              /*@// 7*pointStep bytes, used to rewind pSrc and pDst*/
     59 #define setCount        r14             /*@// Reuse grpSize as setCount*/
     60 #define pointStep       r12             /*@// 8*grpSize bytes*/
     61 
     62 #define t0              r4              /*@// Scratch: stack address in M_VSTM/M_VLDM, bit pattern of roothalf*/
     63 @// Real and Imaginary parts
                @// Each r/i pair sits in two consecutive s registers and is moved
                @// as a unit by the two-register vldm/vstm lists in this file.
     64 
     65 #define x0r             s0
     66 #define x0i             s1
     67 #define x1r             s2
     68 #define x1i             s3
     69 #define x2r             s4
     70 #define x2i             s5
     71 #define x3r             s6
     72 #define x3i             s7
     73 #define t3r             s8              /*@// Scratch complex pair (s8/s9), separate from x3r/x3i*/
     74 #define t3i             s9
     75 #define t1r             s4              /*@// Alias of x2r (both are s4)*/
     76 #define t1i             s5              /*@// Alias of x2i (both are s5)*/
     77 #define sr              s10             /*@// Scratch: holds twice a real part inside a butterfly*/
     78 #define si              s11             /*@// Scratch: holds twice an imaginary part inside a butterfly*/
     79 #define roothalf        s12             /*@// Loaded with 0x3f3504f3, sqrt(1/2) as a float*/
     80 
     81 @// Define macros to load/store two float regs from/to the stack.
                @// M_VSTM r0, r1, p : store the float registers r0 and r1 (in that
                @// order) at sp + _Workspace + <p>_F. Overwrites t0 with that address
                @// and redefines the assembler symbol _Offset. Uses a plain add, so
                @// the condition flags are left untouched.
                @// NOTE(review): _Workspace and <p>_F are presumably provided by
                @// M_ALLOC8/M_START in armCOMM_s.h - confirm there.
     82         .macro M_VSTM r0, r1, p
     83         .set    _Offset, _Workspace + \p\()_F
     84         add     t0, sp, #_Offset                @// t0 = address of the stack slot
     85         vstm.f32 t0, {\r0, \r1}
     86         .endm
     87 
     88         .macro M_VLDM r0, r1, p
                @// Load the float registers r0 and r1 (in that order) from
                @// sp + _Workspace + <p>_F. Overwrites t0 with that address and
                @// redefines the assembler symbol _Offset. Uses a plain add, so
                @// the condition flags are left untouched.
                @// NOTE(review): _Workspace and <p>_F are presumably provided by
                @// M_ALLOC8/M_START in armCOMM_s.h - confirm there.
     89         .set    _Offset, _Workspace + \p\()_F
     90         add     t0, sp, #_Offset                @// t0 = address of the stack slot
     91         vldm.f32 t0, {\r0, \r1}
     92         .endm
     93 
     94 @// Define constants
     95 
     96         .macro FFTSTAGE scaled, inverse , name
     97         @// First-stage radix-8 FFT pass: one 8-point butterfly per set, with
                @// no twiddle multiplies (see the grp = 0 note below).
                @//   scaled  - not referenced anywhere in this macro body
                @//   inverse - TRUE selects the inverse output ordering
                @//   name    - suffix that makes the loop label unique per expansion
                @// Reads pSrc, pDst, subFFTNum and pPingPongBuf.
                @// Overwrites t0, step1, step2, pointStep, r14, subFFTNum,
                @// subFFTSize, s0-s12 and the stack slots pU0, pU1, pU3, pU4,
                @// pU5 and pU7. Exit values of pSrc and pDst are described at
                @// the end of the macro.
     98         @// Define stack arguments
     99 
    100 
    101         @// Set subFFTSize to 8 and derive grpSize = subFFTNum/8 right away;
    102         @// subFFTNum is then overwritten with the same value.
    103 
    104         mov     subFFTSize, #8
    105         lsr     grpSize, subFFTNum, #3
    106         mov     subFFTNum, grpSize
    107 
    108 
    109         @// One complex float is 8 bytes, so pointStep = 8*grpSize is the byte
    110         @// distance between successive inputs x(k) and x(k+1) of one butterfly.
    111         @// Note: setCount shares r14 with grpSize, so the loop below is
    112         @// counted down from grpSize.
    113         MOV     pointStep,grpSize,LSL #3
    114 
    115 
    116         @// step1 = 16*grpSize = 2*pointStep; step2 = 7*pointStep (rewind amount)
    117         MOV     step1,grpSize,LSL #4
    118         MOV     step2,pointStep,LSL #3
    119         SUB     step2,step2,pointStep           @// step2 = 7*pointStep
    120 
    121 
    122         @// grp = 0 is a special case since all the twiddle factors are 1
    123         @// Loop on the sets
    124 
    125         movw    t0,#0x04f3                      @// t0 = 0x3f3504f3 (float bit pattern)
    126         movt    t0,#0x3f35
    127         vmov.f32 roothalf, t0                   @// roothalf = sqrt(1/2)
    128 
    129 grpZeroSetLoop\name:
    130 
    131         vldm.f32 pSrc, {x0r, x0i}               @// x0
    132         add      pSrc, step1
    133         vldm.f32 pSrc, {x1r, x1i}               @// x2
    134         add      pSrc, step1
    135         vldm.f32 pSrc, {x2r, x2i}               @// x4
    136         add      pSrc, step1
    137         vldm.f32 pSrc, {x3r, x3i}               @// x6
    138         add      pSrc, step1
    139 
    140         SUB     pSrc, pSrc, step2               @// pSrc = set base + pointStep (x1)
    141 
    142         @// finish first stage of 8 point FFT and save on stack
    143 
    144         vadd.f32     x0r,x0r,x2r                @// u0
    145         vadd.f32     x0i,x0i,x2i
    146 
    147         vadd.f32     sr, x2r, x2r               @// sr/si = 2*b, so (a+b) - 2*b = a-b;
    148         vadd.f32     si, x2i, x2i               @// the same pattern repeats below
    149         vsub.f32     x2r,x0r,sr                 @// u1
    150         vsub.f32     x2i,x0i,si
    151 
    152         M_VSTM   x0r,x0i, pU0
    153         M_VSTM   x2r,x2i, pU1
    154 
    155         vadd.f32     x1r,x1r,x3r                @// u4
    156         vadd.f32     x1i,x1i,x3i
    157 
    158         vadd.f32     sr, x3r, x3r
    159         vadd.f32     si, x3i, x3i
    160         vsub.f32     x3r,x1r,sr                 @// u5
    161         vsub.f32     x3i,x1i,si
    162 
    163         M_VSTM   x1r,x1i, pU4
    164         M_VSTM   x3r,x3i, pU5
    165 
    166 
    167         vldm    pSrc, {x0r, x0i}                @// x1
    168         add     pSrc, step1
    169         vldm    pSrc, {x1r, x1i}                @// x3
    170         add     pSrc, step1
    171         vldm    pSrc, {x2r, x2i}                @// x5
    172         add     pSrc, step1
    173         vldm    pSrc, {x3r, x3i}                @// x7
    174         add     pSrc, #8                        @// one complex value further: next set
    175 
    176         SUB     pSrc, pSrc, step2               @// pSrc = base of this set + 8
    177 
    178         vadd.f32     x0r,x0r,x2r                @// u2
    179         vadd.f32     x0i,x0i,x2i
    180 
    181         vadd.f32         sr, x2r, x2r
    182         vadd.f32         si, x2i, x2i
    183         vsub.f32     x2r,x0r,sr                 @// u3
    184         vsub.f32     x2i,x0i,si
    185 
    186         M_VSTM   x2r,x2i, pU3
    187 
    188         vadd.f32     x1r,x1r,x3r                @// u6
    189         vadd.f32     x1i,x1i,x3i
    190 
    191         vadd.f32         sr, x3r, x3r
    192         vadd.f32         si, x3i, x3i
    193         vsub.f32     x3r,x1r,sr                 @// u7
    194         vsub.f32     x3i,x1i,si
    195 
    196         @// finish second and third stage of 8 point FFT
                @// (u2 and u6 are still live in x0 and x1; they are not spilled)
    197 
    198         M_VSTM  x3r,x3i, pU7
    199         M_VLDM  x2r,x2i, pU0
    200 
    201         @// Decrement setCount; the flags are consumed by the BGT at the loop end
    202         SUBS    setCount,setCount,#1
    203         M_VLDM  x3r,x3i, pU4
    204 
    205         vadd.f32     x0r,x0r,x1r                @// v4
    206         vadd.f32     x0i,x0i,x1i
    207 
    208         vadd.f32     sr, x1r, x1r
    209         vadd.f32     si, x1i, x1i
    210         vsub.f32     x1r,x0r,sr                 @// v6
    211         vsub.f32     x1i,x0i,si
    212 
    213         vadd.f32     x2r,x2r,x3r                @// v0
    214         vadd.f32     x2i,x2i,x3i
    215 
    216         vadd.f32     sr, x3r, x3r
    217         vadd.f32     si, x3i, x3i
    218         vsub.f32     x3r,x2r,sr                 @// v2
    219         vsub.f32     x3i,x2i,si
    220 
    221 
    222 
    223         vadd.f32     x2r,x2r,x0r                @// y0
    224         vadd.f32     x2i,x2i,x0i
    225 
    226         vadd.f32     sr, x0r, x0r
    227         vadd.f32     si, x0i, x0i
    228         vsub.f32     x0r,x2r,sr                 @// y4
    229         vsub.f32     x0i,x2i,si
    230 
    231         vstm    pDst, {x2r, x2i}                @// store y0
    232         add     pDst, step1
    233 
                @// The y2/y6 labels on the arithmetic below follow the inverse
                @// store order; the forward branch stores the same two results
                @// in swapped positions.
    234         vadd.f32     x3r,x3r,x1i                @// y6
    235         vsub.f32     x3i,x3i,x1r
    236 
    237         vadd.f32     sr, x1r, x1r
    238         vadd.f32     si, x1i, x1i
    239         vsub.f32     t1r,x3r,si                 @// t1r=x2r reg;t1i=x2i reg
    240         vadd.f32     t1i,x3i,sr                 @// y2
    241 
    242         .ifeqs  "\inverse", "TRUE"
    243             vstm        pDst, {t1r, t1i}        @// store y2
    244             add pDst, step1
    245             vstm        pDst, {x0r, x0i}        @// store y4
    246             add pDst, step1
    247             vstm        pDst, {x3r, x3i}        @// store y6
    248             add pDst, step1
    249         .else
    250             vstm        pDst, {x3r, x3i}        @// store y2
    251             add pDst, step1
    252             vstm        pDst, {x0r, x0i}        @// store y4
    253             add pDst, step1
    254             vstm        pDst, {t1r, t1i}        @// store y6
    255             add pDst, step1
    256         .endif
    257 
    258         SUB     pDst, pDst, step2               @// set pDst to y1
    259 
    260 
    261         M_VLDM  x0r,x0i,pU1                     @// Load u1,u5,u7 (u3 follows below)
    262         M_VLDM  x1r,x1i,pU5
    263         M_VLDM  x3r,x3i,pU7
    264 
    265         vsub.f32     x0r,x0r,x1i                @// v1
    266         vadd.f32     x0i,x0i,x1r
    267         vadd.f32     sr, x1r, x1r
    268         vadd.f32     si, x1i, x1i
    269         vadd.f32     t1r,x0r,si                 @// t1r=x2r reg;t1i=x2i reg
    270         vsub.f32     t1i,x0i,sr                 @// v3
    271 
    272         M_VLDM  x1r,x1i,pU3
    273 
    274         vsub.f32     x1r,x1r,x3i                @// v5
    275         vadd.f32     x1i,x1i,x3r
    276 
    277         vadd.f32     sr, x3r, x3r
    278         vadd.f32     si, x3i, x3i
    279         vadd.f32     t3r,x1r,si                 @// t3r/t3i are the scratch pair s8/s9
    280         vsub.f32     t3i,x1i,sr                 @// v7
    281 
    282         @// store v5  as (v5.r - v5.i,v5.r + v5.i)
    283         @// store v7  as (v7.i + v7.r,v7.i - v7.r)
    284 
    285         vadd.f32     x3r,t3i,t3r                @// v7
    286         vsub.f32     x3i,t3i,t3r
    287 
    288         vsub.f32     x1r,x1r,x1i                @// v5
    289         vadd.f32     x1i, x1i                   @// two-operand form: x1i = 2*v5.i
    290         vadd.f32     x1i,x1r,x1i                @// x1i = v5.r + v5.i
    291 
    292         vmul.f32  x3r, x3r, roothalf            @// (v7.i + v7.r)*(1/sqrt(2))
    293         vmul.f32  x3i, x3i, roothalf            @// (v7.i - v7.r)*(1/sqrt(2))
    294         vmul.f32  x1r, x1r, roothalf            @// (v5.r - v5.i)*(1/sqrt(2))
    295         vmul.f32  x1i, x1i, roothalf            @// (v5.r + v5.i)*(1/sqrt(2))
    296 
                @// The y1/y3/y5/y7 labels below follow the inverse store order;
                @// the forward branch swaps y1 with y7 and y3 with y5.
    297         vadd.f32     x2r,x2r,x3r                @// y7
    298         vadd.f32     x2i,x2i,x3i
    299 
    300         vadd.f32     sr, x3r, x3r
    301         vadd.f32     si, x3i, x3i
    302         vsub.f32     x3r,x2r,sr                 @// y3
    303         vsub.f32     x3i,x2i,si
    304 
    305 
    306         vsub.f32     x0r,x0r,x1r                @// y5
    307         vsub.f32     x0i,x0i,x1i
    308 
    309         vadd.f32     sr, x1r, x1r
    310         vadd.f32     si, x1i, x1i
    311         vadd.f32     x1r,x0r,sr                 @// y1
    312         vadd.f32     x1i,x0i,si
    313 
    314         .ifeqs  "\inverse", "TRUE"
    315             vstm    pDst, {x1r, x1i}            @// store y1
    316             add pDst, step1
    317             vstm    pDst, {x3r, x3i}            @// store y3
    318             add pDst, step1
    319             vstm    pDst, {x0r, x0i}            @// store y5
    320             add pDst, step1
    321             vstm    pDst, {x2r, x2i}            @// store y7
    322             add pDst, #8
    323         .else
    324             vstm    pDst, {x2r, x2i}            @// store y1
    325             add pDst, step1
    326             vstm    pDst, {x0r, x0i}            @// store y3
    327             add pDst, step1
    328             vstm    pDst, {x3r, x3i}            @// store y5
    329             add pDst, step1
    330             vstm    pDst, {x1r, x1i}            @// store y7
    331             add pDst, #8
    332         .endif
    333 
    334         SUB     pDst, pDst, step2               @// update pDst for the next set
    335 
    336 
    337         BGT     grpZeroSetLoop\name             @// loop while setCount > 0 (flags from SUBS)
    338 
    339 
    340         @// reset pSrc to pDst for the next stage
                @// Each pass through the loop moved pDst forward by 8 bytes net, so
                @// after grpSize passes pDst - pointStep is the value pDst had on
                @// entry. pDst itself is then redirected to pPingPongBuf.
    341         SUB     pSrc,pDst,pointStep             @// pSrc = pDst - pointStep (8*grpSize bytes)
    342         mov     pDst, pPingPongBuf
    343 
    344 
    345         .endm
    346 
    347 
    348 
    349 
    350 
    351         @// Allocate stack memory required by the function
                @// Forward variant: one 8-byte slot for each of the six
                @// intermediates (u0, u1, u3, u4, u5, u7) that FFTSTAGE spills
                @// with M_VSTM and reloads with M_VLDM.
    352 
    353         @// Ensure 8 byte alignment to use M_VLDM
    354         M_ALLOC8    pU0, 8
    355         M_ALLOC8    pU1, 8
    356         M_ALLOC8    pU3, 8
    357         M_ALLOC8    pU4, 8
    358         M_ALLOC8    pU5, 8
    359         M_ALLOC8    pU7, 8
    360 
                @// Function body is a single FFTSTAGE expansion with
                @// scaled = FALSE, inverse = FALSE and label suffix FWD.
                @// Arguments arrive in the registers listed under Input Registers
                @// at the top of this file.
                @// NOTE(review): the r4 argument presumably tells M_START which
                @// registers to save (t0 is r4) - confirm in armCOMM_s.h.
    361         M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4
    362             FFTSTAGE "FALSE","FALSE",FWD
    363         M_END
    364 
    365         @// Allocate stack memory required by the function
                @// Inverse variant: one 8-byte slot for each of the six
                @// intermediates (u0, u1, u3, u4, u5, u7) that FFTSTAGE spills
                @// with M_VSTM and reloads with M_VLDM.
    366 
    367         @// Ensure 8 byte alignment to use M_VLDM
    368         M_ALLOC8    pU0, 8
    369         M_ALLOC8    pU1, 8
    370         M_ALLOC8    pU3, 8
    371         M_ALLOC8    pU4, 8
    372         M_ALLOC8    pU5, 8
    373         M_ALLOC8    pU7, 8
    374 
                @// Function body is a single FFTSTAGE expansion with
                @// scaled = FALSE, inverse = TRUE and label suffix INV, which
                @// selects the inverse output ordering in the two .ifeqs blocks.
                @// Arguments arrive in the registers listed under Input Registers
                @// at the top of this file.
                @// NOTE(review): the r4 argument presumably tells M_START which
                @// registers to save (t0 is r4) - confirm in armCOMM_s.h.
    375         M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4
    376             FFTSTAGE "FALSE","TRUE",INV
    377         M_END
    378 
    379 @//    ENDIF        @//ARM1136JS
    380 
    381 
    382 
    383 @// Guarding implementation by the processor name
    384 
    385 
    386     .end
    387