Home | History | Annotate | Download | only in neon
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
     11 @//  to support float instead of SC32.
     12 @//
     13 
     14 @//
     15 @// Description:
     16 @// Compute FFT for a real signal
     17 @//
     18 @//
     19 
     20 
     21 @// Include standard headers
     22 
     23 #include "dl/api/arm/armCOMM_s.h"
     24 #include "dl/api/arm/omxtypes_s.h"
     25 
     26 
     27 @// Import symbols required from other files
     28 @// (For example tables)
     29 
     30         .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
     31         .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
     32         .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
     33         .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
     34         .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
     35         .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
     36         .extern  armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
     37 
     38 @// Set debugging level
     39 @//DEBUG_ON    SETL {TRUE}
     40 
     41 
     42 
     43 @// Guarding implementation by the processor name
     44 
     45 
     46 
     47     @// Guarding implementation by the processor name
     48 
     49 @// Import symbols required from other files
     50 @// (For example tables)
     51         .extern  armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
     52         .extern  armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
     53 
     54 
     55 @//Input Registers
        @// Names for the argument registers r0-r3 as used by this file.
        @// "scale" is never read by the code in this file; r3 is reused below
        @// under the name "step".
     56 
     57 #define pSrc            r0
     58 #define pDst            r1
     59 #define pFFTSpec        r2
     60 #define scale           r3
     61 
     62 
     63 @// Output registers
        @// r0 receives the status code written at label End.
     64 #define result          r0
     65 
     66 @//Local Scratch Registers
        @// Many of these names alias the same physical register, and some alias
        @// the input registers above:
        @//   r1: pDst, argTwiddle
        @//   r2: pFFTSpec, argDst, diffMinusOne
        @//   r3: scale, step
        @//   r4: argScale, tmpOrder, pTwiddle, x0r, step1
        @//   r5: pOut, x0i, pTwiddleTmp
        @//   r6: subFFTNum, N, subFFTSizeTmp
        @//   r8: count, twStep
        @//   r9: diff, zero
        @// Only one name per register can hold a live value at any point.
        @// "order" is r14, the link register, so every BL overwrites it; the
        @// code in this file reads order only before the calls it guards.
        @// argScale, tmpOrder, diff, count, x0r, x0i, diffMinusOne,
        @// subFFTSizeTmp and t0 are not referenced by the code in this file
        @// (presumably left over from the fixed-point file this one was
        @// derived from -- TODO confirm before removing).
     67 
     68 #define argTwiddle      r1
     69 #define argDst          r2
     70 #define argScale        r4
     71 #define tmpOrder        r4
     72 #define pTwiddle        r4
     73 #define pOut            r5
     74 #define subFFTSize      r7
     75 #define subFFTNum       r6
     76 #define N               r6
     77 #define order           r14
     78 #define diff            r9
     79 @// Total number of radix stages required to complete the FFT
     80 #define count           r8
     81 #define x0r             r4
     82 #define x0i             r5
     83 #define diffMinusOne    r2
     84 #define subFFTSizeTmp   r6
     85 #define step            r3
     86 #define step1           r4
     87 #define twStep          r8
     88 #define zero            r9
     89 #define pTwiddleTmp     r5
     90 #define t0              r10
     91 
     92 @// Neon registers
        @// Every name below is a 64-bit D register viewed as two f32 lanes
        @// (dY0rS64/dY0iS64 are s64 views of d14/d15). That includes qT0-qT3:
        @// despite the "q" prefix they are defined as d10, d12, d18 and d20.
        @// As with the core registers, several names share one register:
        @//   d0:  dX0, half            d2:  dZero, dX0r
        @//   d3:  dShift, dX0i, dX1    d4:  dX1r, dW0
        @//   d5:  dX1i, dW1            d10: qT0, dY0
        @//   d12: qT1, dY2             d14: dW0r, dY0r, dY0rS64
        @//   d15: dW0i, dY0i           d16: dW1r, dY1r
        @//   d17: dW1i, dY1i
     93 
     94 #define dX0       d0.f32
     95 #define dzero     d1.f32
     96 #define dZero     d2.f32
     97 #define dShift    d3.f32
     98 #define dX0r      d2.f32
     99 #define dX0i      d3.f32
    100 #define dX1r      d4.f32
    101 #define dX1i      d5.f32
    102 #define dT0       d6.f32
    103 #define dT1       d7.f32
    104 #define dT2       d8.f32
    105 #define dT3       d9.f32
    106 #define qT0       d10.f32
    107 #define qT1       d12.f32
    108 #define dW0r      d14.f32
    109 #define dW0i      d15.f32
    110 #define dW1r      d16.f32
    111 #define dW1i      d17.f32
    112 #define dY0r      d14.f32
    113 #define dY0i      d15.f32
    114 #define dY1r      d16.f32
    115 #define dY1i      d17.f32
    116 #define dY0rS64   d14.s64
    117 #define dY0iS64   d15.s64
    118 #define qT2       d18.f32
    119 #define qT3       d20.f32
    120 @// Last three elements
        @// (dX1, dW0, dW1 and dY0-dY3 are not referenced by the code in this file)
    121 #define dX1       d3.f32
    122 #define dW0       d4.f32
    123 #define dW1       d5.f32
    124 #define dY0       d10.f32
    125 #define dY1       d11.f32
    126 #define dY2       d12.f32
    127 #define dY3       d13.f32
    128 
        @// Constant 0.5 in both lanes once loaded by VMOV in the fixup code;
        @// shares d0 with dX0.
    129 #define half      d0.f32
    130 
    131     @// Allocate stack memory required by the function
    132 
                @//
                @// omxSP_FFTFwd_RToCCS_F32_Sfs
                @//
                @// Forward FFT of a real float signal (see the file header).
                @// The work is done in two phases:
                @//   1. The input is treated as N/2 complex values and passed
                @//      through the external armSP_FFTFwd_CToC_FC32_* stages.
                @//   2. finalComplexToRealFixup combines Z(k) and Z(N/2-k) with
                @//      the twiddle table and writes the result through argDst.
                @//
                @// In:   r0 = pSrc, r1 = pDst, r2 = pFFTSpec. The spec fields
                @//       N, pTwiddle and pBuf are read at the byte offsets
                @//       defined below; pBitRev is not read in this file.
                @// Out:  r0 = OMX_Sts_NoErr on every path through this file.
                @//
                @// NOTE(review): the register contract of the armSP_* stage
                @// routines (which of pSrc, argDst, argTwiddle, pOut, subFFTNum
                @// and subFFTSize they consume and update) is defined outside
                @// this file. The comments below describe only what this file
                @// itself does with those registers.
                @//
    133     @// Write function header
                @// M_START comes from armCOMM_s.h. Presumably r11 and d15 name
                @// the highest core and NEON registers the prologue preserves
                @// -- TODO confirm against the macro definition.
    134         M_START     omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15
    135 
    136 @ Structure offsets for the FFTSpec
    137         .set    ARMsFFTSpec_N, 0
    138         .set    ARMsFFTSpec_pBitRev, 4
    139         .set    ARMsFFTSpec_pTwiddle, 8
    140         .set    ARMsFFTSpec_pBuf, 12
    141 
    142         @// Define stack arguments
    143 
    144         @// Read the size from structure and take log
    145         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
    146 
    147         @// Read other structure parameters
    148         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
    149         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
    150 
    151         @//  N=1 Treat separately
                @// Any N <= 1 falls through the BGT. One float is loaded into
                @// lane 0 of d0, lane 0 of d1 and d2 is set to zero, and VST3
                @// writes the three 32-bit words (x, 0, 0) to pDst.
    152         CMP     N,#1
    153         BGT     sizeGreaterThanOne
    154         VLD1    dX0[0],[pSrc]
    155         MOV     zero,#0
    156         VMOV    dzero[0],zero
    157         VMOV    dZero[0],zero
    158         VST3    {dX0[0],dzero[0],dZero[0]},[pDst]
    159 
    160         B       End
    161 
    162 
    163 
    164 sizeGreaterThanOne:
    165         @// Do a N/2 point complex FFT including the scaling
    166 
    167         MOV     N,N,ASR #1                          @// N/2 point complex FFT
    168 
                @// order = 31 - CLZ(N), i.e. the index of the highest set bit
                @// of the halved size.
    169         CLZ     order,N                             @// N = 2^order
    170         RSB     order,order,#31
    171         MOV     subFFTSize,#1
    172         @//MOV     subFFTNum,N
    173 
    174         CMP     order,#3
    175         BGT     orderGreaterthan3                   @// order > 3
    176 
    177         CMP     order,#1
    178         BGE     orderGreaterthan0                   @// order > 0
                @// order == 0: copy one D register (8 bytes) from pSrc to the
                @// pBuf scratch buffer, then read from that buffer and write
                @// to pDst in the fixup. None of the instructions between the
                @// CMP above and the BLT below change the flags, so the BLT is
                @// always taken on this path.
    179         VLD1    dX0,[pSrc]
    180         VST1    dX0,[pOut]
    181         MOV     pSrc,pOut
    182         MOV     argDst,pDst
    183         BLT     FFTEnd
    184 
    185 orderGreaterthan0:
    186         @// set the buffers appropriately for various orders
                @// order == 2:      argDst = pDst, pOut unchanged (pBuf)
                @// order == 1 or 3: argDst = pBuf, pOut = pDst
    187         CMP     order,#2
    188         MOVEQ   argDst,pDst
    189         MOVNE   argDst,pOut
    190         @// Pass the first stage destination in RN5
    191         MOVNE   pOut,pDst
                @// argTwiddle shares r1 with pDst; pDst has already been copied
                @// into argDst or pOut above, so it is safe to overwrite here.
    192         MOV     argTwiddle,pTwiddle
    193 
    194         CMP     order,#1
    195         BGT     orderGreaterthan1
    196         @// order = 1
    197         BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
    198         B       FFTEnd
    199 
    200 orderGreaterthan1:
    201         CMP     order,#2
    202         BGT     orderGreaterthan2
    203         @// order =2
    204         BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
    205         BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
    206         B       FFTEnd
    207 
    208 orderGreaterthan2:@// order =3
    209         BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
    210         BL      armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
    211         BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
    212 
    213         B       FFTEnd
    214 
    215 
    216 
    217 orderGreaterthan3:
    218 specialScaleCase:
    219 
    220         @// Set input args to fft stages
                @// Bit 1 of order clear: argDst = pDst, pOut unchanged (pBuf)
                @// Bit 1 of order set:   argDst = pBuf, pOut = pDst
    221         TST     order, #2
    222         MOVEQ   argDst,pDst
    223         MOVNE   argDst,pOut
    224         @// Pass the first stage destination in RN5
    225         MOVNE   pOut,pDst
    226         MOV     argTwiddle,pTwiddle
    227 
    228         @//check for even or odd order
    229         @// NOTE: The following combination of BL's would work fine even though
    230         @// the first BL would corrupt the flags. This is because the end of
    231         @// the "grpZeroSetLoop" loop inside
    232         @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
    233         @// to EQ
    234 
                @// Even order starts with a radix-4 first stage, odd order with
                @// a radix-8 first stage. order lives in r14, so it is no longer
                @// valid after these calls and is not read again.
    235         TST     order,#0x00000001
    236         BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
    237         BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
    238 
                @// subFFTNum < 4: no further stage. subFFTNum == 4: run the
                @// last-stage routine. Otherwise run a middle radix-4 stage and
                @// compare again (subFFTNum is presumably updated by the stage
                @// routines -- confirm against their definitions).
    239         CMP        subFFTNum,#4
    240         BLT     FFTEnd
    241 
    242 
    243 unscaledRadix4Loop:
    244         BEQ        lastStageUnscaledRadix4
    245          BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
    246          CMP        subFFTNum,#4
    247          B        unscaledRadix4Loop
    248 
    249 lastStageUnscaledRadix4:
    250         BL      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
    251         B        FFTEnd
    252 
    253 
    254 FFTEnd:
    255 finalComplexToRealFixup:
    256 
    257 
    258         @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
    259         @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
    260         @// 1/2[2a+j0] - j [0+j2b]
    261         @// (a+b, 0)
    262 
    263         @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
    264         @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
    265         @// 1/2[2a+j0] + j [0+j2b]
    266         @// (a-b, 0)
    267 
    268         @// F(0) and F(N/2)
                @// Z(0).r and Z(0).i are loaded into lane 0 of d2 and d3 (pSrc
                @// advances by 8 bytes); lane 1 of both is cleared so that the
                @// sums below carry a zero imaginary part.
    269         VLD2    {dX0r[0],dX0i[0]},[pSrc]!
    270         MOV     zero,#0
    271         VMOV    dX0r[1],zero
    272         MOV     step,subFFTSize,LSL #3            @// step = N/2 * 8 bytes
    273         VMOV    dX0i[1],zero
    274         @// twStep = 3N/8 * 8 bytes pointing to W^1
    275         SUB     twStep,step,subFFTSize,LSL #1
    276 
    277         VADD    dY0r,dX0r,dX0i                    @// F(0) = ((Z0.r+Z0.i) , 0)
    278         MOV     step1,subFFTSize,LSL #2           @// step1 = N/2 * 4 bytes
    279         VSUB    dY0i,dX0r,dX0i                    @// F(N/2) = ((Z0.r-Z0.i) , 0)
                @// The flags set here select the path at the BLT/BEQ below;
                @// no instruction in between modifies them.
    280         SUBS    subFFTSize,subFFTSize,#2
    281 
                @// F(0) is stored at argDst and F(N/2) at argDst + step. After
                @// the two post-increments and the SUB below, argDst points 8
                @// bytes past its starting value, i.e. at the slot for F(1).
    282         VST1    dY0r,[argDst],step
    283         ADD     pTwiddleTmp,argTwiddle,#8         @// W^2
    284         VST1    dY0i,[argDst]!
    285         ADD     argTwiddle,argTwiddle,twStep      @// W^1
    286 
    287         VDUP    dzero,zero
    288         SUB     argDst,argDst,step
    289 
                @// subFFTSize was below 2: only F(0) and F(N/2) exist.
                @// subFFTSize was exactly 2: only the single middle element
                @// remains. Otherwise run the butterfly loop.
    290         BLT     End
    291         BEQ     lastElement
    292         SUB     step,step,#24
    293         SUB     step1,step1,#8                    @// (N/4-1)*8 bytes
    294 
    295         @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
    296         @// Note: W^k is stored as negative values in the table
    297         @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
    298         @// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
    299 
    300         VMOV    half, #0.5
    301 
    302 evenOddButterflyLoop:
    303 
    304 
                @// Each pointer below is used in the same pattern: load or
                @// store at the low end, jump forward by a stride to the
                @// mirrored high end, access it with a post-increment, then
                @// subtract the stride again. The net effect per iteration is
                @// +8 bytes for the two twiddle pointers and +16 bytes for pSrc
                @// and argDst, while the strides shrink (step1 by 16, step by
                @// 32) as the two ends move toward each other.
    305         VLD1    dW0r,[argTwiddle],step1
    306         VLD1    dW1r,[argTwiddle]!
    307 
    308         VLD2    {dX0r,dX0i},[pSrc],step
    309         SUB     argTwiddle,argTwiddle,step1
    310         VLD2    {dX1r,dX1i},[pSrc]!
    311 
    312 
    313 
    314         SUB     step1,step1,#8                    @// (N/4-2)*8 bytes
    315         VLD1    dW0i,[pTwiddleTmp],step1
    316         VLD1    dW1i,[pTwiddleTmp]!
    317         SUB     pSrc,pSrc,step
    318 
    319         SUB     pTwiddleTmp,pTwiddleTmp,step1
                @// Swap the two lanes of the high-end inputs so that each lane
                @// pairs element k with its mirrored element.
    320         VREV64  dX1r,dX1r
    321         VREV64  dX1i,dX1i
                @// Loop counter: the flags set here are consumed by the BGT at
                @// the bottom of the loop; nothing in between modifies them.
    322         SUBS    subFFTSize,subFFTSize,#4
    323 
    324 
    325 
    326         VSUB    dT2,dX0r,dX1r                     @// a-c
    327         SUB     step1,step1,#8
    328         VADD    dT0,dX0r,dX1r                     @// a+c
    329         VSUB    dT1,dX0i,dX1i                     @// b-d
    330         VADD    dT3,dX0i,dX1i                     @// b+d
                @// dT0 = (a+c)/2 and dT1 = (b-d)/2
    331         VMUL   dT0,dT0,half[0]
    332         VMUL   dT1,dT1,half[0]
    333         VZIP    dW1r,dW1i
    334         VZIP    dW0r,dW0i
    335 
    336 
                @// Products with the twiddles (the qT* names are D registers):
                @//   qT0 = W1r*(a-c) + W1i*(b+d)
                @//   qT1 = W1r*(b+d) - W1i*(a-c)
                @//   qT2 = W0r*(a-c) - W0i*(b+d)
                @//   qT3 = W0r*(b+d) + W0i*(a-c)
    337         VMUL   qT0,dW1r,dT2
    338         VMUL   qT1,dW1r,dT3
    339         VMUL   qT2,dW0r,dT2
    340         VMUL   qT3,dW0r,dT3
    341 
    342         VMLA   qT0,dW1i,dT3
    343         VMLS   qT1,dW1i,dT2
    344 
    345         VMLS   qT2,dW0i,dT3
    346         VMLA   qT3,dW0i,dT2
    347 
    348 
                @// High-end outputs: real = dT0 - qT1/2, imag = -(dT1 + qT0/2),
                @// with the lanes swapped back before the store.
    349         VMUL  dX1r,qT0,half[0]
    350         VMUL  dX1i,qT1,half[0]
    351 
    352         VSUB    dY1r,dT0,dX1i                     @// F(N/2 -1)
    353         VADD    dY1i,dT1,dX1r
    354         VNEG    dY1i,dY1i
    355 
    356         VREV64  dY1r,dY1r
    357         VREV64  dY1i,dY1i
    358 
    359 
                @// Low-end outputs: real = dT0 - qT3/2, imag = dT1 + qT2/2.
    360         VMUL  dX0r,qT2,half[0]
    361         VMUL  dX0i,qT3,half[0]
    362 
    363         VSUB    dY0r,dT0,dX0i                     @// F(1)
    364         VADD    dY0i,dT1,dX0r
    365 
    366 
    367         VST2    {dY0r,dY0i},[argDst],step
    368         VST2    {dY1r,dY1i},[argDst]!
    369         SUB     argDst,argDst,step
    370         SUB     step,step,#32                     @// (N/2-4)*8 bytes
    371 
    372 
                @// Repeat while the counter decremented above is still positive.
    373         BGT     evenOddButterflyLoop
    374 
    375         @// set both the ptrs to the last element
    376         SUB     pSrc,pSrc,#8
    377         SUB     argDst,argDst,#8
    378 
    379 
    380 
    381         @// Last element can be expanded as follows
    382         @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
    383         @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
    384         @// 1/2[2a+j0] + j (c+jd) [0+j2b]
    385         @// (a-bc, -bd)
    386         @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
    387 
    388 lastElement:
                @// Load (a, b), store a, negate the register, then store -b.
                @// Each lane store advances argDst by 4 bytes.
    389         VLD1    dX0r,[pSrc]
    390 
    391         VST1    dX0r[0],[argDst]!
    392         VNEG    dX0r,dX0r
    393         VST1    dX0r[1],[argDst]!
    394 
    395 End:
    396         @// Set return value
    397         MOV     result, #OMX_Sts_NoErr
    398 
    399         @// Write function tail
    400         M_END
    401 
    402         .end
    403