Home | History | Annotate | Download | only in arm64
      1 //
      2 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 //
      4 //  Use of this source code is governed by a BSD-style license
      5 //  that can be found in the LICENSE file in the root of the source
      6 //  tree. An additional intellectual property rights grant can be found
      7 //  in the file PATENTS.  All contributing project authors may
      8 //  be found in the AUTHORS file in the root of the source tree.
      9 //
     10 //  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
     11 //  to support float instead of SC32.
     12 //
     13 
     14 //
     15 // Description:
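     16 // Fixup pass used when computing the FFT of a real signal: combines the
     16 // complex FFT output Z(k) with conj(Z(N/2-k)) and the twiddle factors
     16 // to form the real-signal spectrum F(k).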
     17 //
     18 //
     19 
     20 
     21 // Include standard headers
     22 
     23 #include "dl/api/arm/arm64COMM_s.h"
     24 #include "dl/api/arm/omxtypes_s.h"
     25 
     26 
     27 // Import symbols required from other files
     28 // (For example tables)
     29 
     30 // Set debugging level
     31 //DEBUG_ON    SETL {TRUE}
     32 
     33 
     34 
     35 // Guarding implementation by the processor name
     36 
     37 
     38 
     39     // Guarding implementation by the processor name
     40 
     41 // Import symbols required from other files
     42 
     43 
     44 // Input registers (symbolic names for the general-purpose argument registers)
     45 // NOTE: x3 (pOut) is written through its pTwiddleTmp alias before it is ever read in the routine below.
     46 #define pSrc            x0
     47 #define pDst            x1
     48 #define pTwiddle        x2
     49 #define	pOut		x3
     50 #define	subFFTNum	x4
     51 
     52 // Output registers
     53 // (no names are defined in this section)
     54 // Local scratch registers
     55 // Aliases to keep in mind: N is subFFTNum (x4), step1 is pTwiddle (x2), pTwiddleTmp is pOut (x3).
     56 #define argTwiddle      x5
     57 #define argDst          x6
     58 #define subFFTSize      x7
     59 #define N               subFFTNum
     60 #define order           x14
     61 #define step            x8
     62 #define step1           pTwiddle
     63 #define twStep          x9
     64 #define zero            w10
     65 #define pTwiddleTmp     pOut
     66 
     67 // Neon registers
     68 // Names ending in s or 8b are the .s / .8b views of the same vector register as the base name.
     68 // dY0r/dY0i/dY1r/dY1i reuse the registers of dW0r/dW0i/dW1r/dW1i (v14-v17).
     68 // qT0-qT3 are two-lane (.2s) views despite the q prefix.
     69 #define dX0       v0.2s
     70 #define dX0s      v0.s
     71 #define dX0r      v2.2s
     72 #define dX0rs     v2.s
     73 #define dX0i      v3.2s
     74 #define dX0is     v3.s
     75 #define dX1r      v4.2s
     76 #define dX1i      v5.2s
     77 #define dT0       v6.2s
     78 #define dT1       v7.2s
     79 #define dT2       v8.2s
     80 #define dT3       v9.2s
     81 #define qT0       v10.2s
     82 #define qT1       v12.2s
     83 #define dW0r      v14.2s
     84 #define dW0r8b    v14.8b
     85 #define dW0i      v15.2s
     86 #define dW1r      v16.2s
     87 #define dW1r8b    v16.8b
     88 #define dW1i      v17.2s
     89 #define dY0r      v14.2s
     90 #define dY0i      v15.2s
     91 #define dY1r      v16.2s
     92 #define dY1i      v17.2s
     93 #define qT2       v18.2s
     94 #define qT3       v20.2s
     95 // half shares v0 with dX0; the routine below loads it with 0.5. dZip is the temporary for the zip1/zip2 interleave.
     96 #define half      v0.2s
     97 #define dZip      v21.2s
     98 #define dZip8b    v21.8b
     99 
    100     // ComplexToRealFixup: reads the complex values Z(k) at pSrc (x0) and writes the
    101     // real-signal spectrum F(k) to pDst (x1) using the twiddle table at pTwiddle (x2); x4 carries the size.
    102     // Function header. NOTE(review): the d15 argument presumably makes M_START save d8-d15 (v8-v15 are used below) - confirm in arm64COMM_s.h.
    103         M_START     ComplexToRealFixup,,d15
    104 
    105         asr     N, N, #1                        // halve the incoming count in place (N and subFFTNum are both x4)
    106 
    107         clz     order, subFFTNum                    // N = 2^order
    108         // NOTE(review): RSB is not an A64 mnemonic, presumably a reverse-subtract macro from the included headers; order is not read again below.
    109         RSB     order,order,#63
    110         MOV     subFFTSize,subFFTNum            // subFFTSize = N/2
    111         //MOV     subFFTNum,N
    112         mov     argDst, pDst                    // working output pointer
    113         mov     argTwiddle, pTwiddle            // copy the table base before x2 is reused as step1
    114 
    115         // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
    116         // 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
    117         // 1/2[2a+j0] - j [0+j2b]
    118         // (a+b, 0)
    119 
    120         // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
    121         // 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
    122         // 1/2[2a+j0] + j [0+j2b]
    123         // (a-b, 0)
    124 
    125         // F(0) and F(N/2)
    126         ld2     {dX0rs,dX0is}[0],[pSrc], #8     // lane 0: dX0r = Z(0).re, dX0i = Z(0).im; pSrc += 8
    127         MOV     zero,#0
    128         mov    dX0rs[1],zero                    // clear lane 1 so the stored imaginary part is 0
    129         lsl     step,subFFTSize, #3               // step = N/2 * 8 bytes
    130         mov    dX0i[1],zero                     // NOTE(review): expands to v3.2s[1]; the neighbouring accesses use the dX0is (v3.s) alias
    131         // twStep = 3N/8 * 8 bytes pointing to W^1
    132         SUB     twStep,step,subFFTSize,LSL #1
    133 
    134         fadd    dY0r,dX0r,dX0i                    // F(0) = ((Z0.r+Z0.i) , 0)
    135         lsl     step1,subFFTSize, #2              // step1 = N/2 * 4 bytes
    136         fsub    dY0i,dX0r,dX0i                    // F(N/2) = ((Z0.r-Z0.i) , 0)
    137         SUBS    subFFTSize,subFFTSize,#2          // flags from here are tested by BLT/BEQ below
    138 
    139         st1     {dY0r},[argDst],step              // store F(0) at pDst, then argDst += step
    140         ADD     pTwiddleTmp,argTwiddle,#8         // W^2
    141         st1     {dY0i},[argDst], #8               // store F(N/2) at pDst + step
    142         ADD     argTwiddle,argTwiddle,twStep      // W^1
    143 
    144 //        dup     dzero,zero
    145         SUB     argDst,argDst,step                // argDst = pDst + 8, the slot after F(0)
    146 
    147         BLT     End                               // subFFTSize was below 2: nothing else to compute
    148         BEQ     lastElement                       // subFFTSize was exactly 2: only one element remains
    149         SUB     step,step,#24                     // (N/2-3)*8 bytes
    150         SUB     step1,step1,#8                    // (N/4-1)*8 bytes
    151 
    152         // F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
    153         // Note: W^k is stored as negative values in the table
    154         // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
    155         // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
    156 
    157         fmov     half, #0.5                       // scale factor, applied below as half[0]
    158 
    159 evenOddButterflyLoop:
    160 
    161         // Twiddle loads: dW0r at argTwiddle, dW1r step1 bytes further on
    162         ld1     {dW0r},[argTwiddle],step1
    163         ld1     {dW1r},[argTwiddle], #8
    164 
    165         ld2     {dX0r,dX0i},[pSrc],step           // two inputs from the front, de-interleaved into re/im
    166         SUB     argTwiddle,argTwiddle,step1       // net effect of the pair of loads: argTwiddle += 8
    167         ld2     {dX1r,dX1i},[pSrc], #16           // two inputs from the back
    168 
    169 
    170 
    171         SUB     step1,step1,#8                    // (N/4-2)*8 bytes on the first pass
    172         ld1     {dW0i},[pTwiddleTmp],step1
    173         ld1     {dW1i},[pTwiddleTmp], #8
    174         SUB     pSrc,pSrc,step                    // net effect of the pair of loads: pSrc += 16
    175 
    176         SUB     pTwiddleTmp,pTwiddleTmp,step1     // net effect of the pair of loads: pTwiddleTmp += 8
    177         rev64   dX1r,dX1r                         // swap the two lanes so the back pair lines up with the front pair
    178         rev64   dX1i,dX1i
    179         SUBS    subFFTSize,subFFTSize,#4          // loop counter; flags are tested by BGT at the loop end
    180 
    181 
    182 
    183         fsub    dT2,dX0r,dX1r                     // a-c
    184         SUB     step1,step1,#8
    185         fadd    dT0,dX0r,dX1r                     // a+c
    186         fsub    dT1,dX0i,dX1i                     // b-d
    187         fadd    dT3,dX0i,dX1i                     // b+d
    188         fmul   dT0,dT0,half[0]
    189         fmul   dT1,dT1,half[0]
    190         // VZIP    dW1r,dW1i
    191         // VZIP    dW0r,dW0i
    191         // (the zip1/zip2/mov triples below interleave each register pair in place, using dZip as the temporary)
    192         zip1    dZip, dW1r, dW1i
    193         zip2    dW1i, dW1r, dW1i
    194         mov     dW1r8b, dZip8b
    195         zip1    dZip, dW0r, dW0i
    196         zip2    dW0i, dW0r, dW0i
    197         mov     dW0r8b, dZip8b
    198 
    199         fmul   qT0,dW1r,dT2
    200         fmul   qT1,dW1r,dT3
    201         fmul   qT2,dW0r,dT2
    202         fmul   qT3,dW0r,dT3
    203 
    204         fmla   qT0,dW1i,dT3
    205         fmls   qT1,dW1i,dT2
    206 
    207         fmls   qT2,dW0i,dT3
    208         fmla   qT3,dW0i,dT2
    209 
    210 
    211         fmul  dX1r,qT0,half[0]
    212         fmul  dX1i,qT1,half[0]
    213 
    214         fsub    dY1r,dT0,dX1i                     // F(N/2 -1)
    215         fadd    dY1i,dT1,dX1r
    216         fneg    dY1i,dY1i
    217 
    218         rev64   dY1r,dY1r                         // swap lanes back into memory order for the back pair
    219         rev64   dY1i,dY1i
    220 
    221 
    222         fmul  dX0r,qT2,half[0]
    223         fmul  dX0i,qT3,half[0]
    224 
    225         fsub    dY0r,dT0,dX0i                     // F(1)
    226         fadd    dY0i,dT1,dX0r
    227 
    228 
    229         st2     {dY0r,dY0i},[argDst],step         // front pair, re/im re-interleaved
    230         st2     {dY1r,dY1i},[argDst], #16         // back pair
    231         SUB     argDst,argDst,step                // net effect of the pair of stores: argDst += 16
    232         SUB     step,step,#32                     // step shrinks by 4 elements (32 bytes) per pass
    233 
    234 
    235         BGT     evenOddButterflyLoop
    236 
    237         // set both the ptrs to the last element
    238         SUB     pSrc,pSrc,#8
    239         SUB     argDst,argDst,#8
    240 
    241 
    242 
    243         // Last element can be expanded as follows
    244         // 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
    245         // 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
    246         // 1/2[2a+j0] + j (c+jd) [0+j2b]
    247         // (a-bc, -bd)
    248         // Since (c,d) = (0,1) for the last element, result is just (a,-b)
    249 
    250 lastElement:
    251         ld1     {dX0r},[pSrc]                     // load (a, b)
    252 
    253         st1     {dX0rs}[0],[argDst], #4           // store a unchanged
    254         fneg    dX0r,dX0r
    255         st1     {dX0rs}[1],[argDst], #4           // store -b
    256 End:
    257 
    258         // Write function tail
    259         M_END
    260 
    261         .end
    262