Home | History | Annotate | Download | only in arm64
      1 //
      2 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 //
      4 //  Use of this source code is governed by a BSD-style license
      5 //  that can be found in the LICENSE file in the root of the source
      6 //  tree. An additional intellectual property rights grant can be found
      7 //  in the file PATENTS.  All contributing project authors may
      8 //  be found in the AUTHORS file in the root of the source tree.
      9 //
     10 //  This is a modification of
     11 //  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
     12 //  instead of SC32.
     13 //
     14 
     15 //
     16 // Description:
     17 // Compute the "preTwiddleRadix2" stage prior to the call to the complex FFT.
     18 // It computes Z(k) = Feven(k) + j*W^(-k)*Fodd(k) for k = 0, 1, 2, ..., N/2-1.
     19 //
     20 //
     21 
     22 
     23 // Include standard headers
     24 
     25 #include "dl/api/arm/arm64COMM_s.h"
     26 #include "dl/api/arm/omxtypes_s.h"
     27 
     28 
     29 // Import symbols required from other files
     30 // (For example tables)
     31 
     32 
     33 // Set debugging level
     34 //DEBUG_ON    SETL {TRUE}
     35 
     36 
     37 
     38 // Guarding implementation by the processor name
     39 
     40 
     41 
     42       // Guarding implementation by the processor name
     43 
     44 
     45 
     46 // Input registers: x0-x3, i.e. the first four AAPCS64 integer argument registers.
        // The stage code loads through pSrc and pTwiddle, derives its store
        // pointer from pOut, and derives all sizes from subFFTNum.
     47 
     48 #define pSrc            x0
     49 #define pTwiddle        x1
     50 #define	pOut		x2
     51 #define	subFFTNum	x3
     52 
     53 // Output registers (none are defined)
     54 
     55 // Local scratch registers.
        // Several names share one register: subFFTSize and size are both x7, and
        // N is just another name for subFFTNum. argTwiddle, argDst, subFFTSize
        // and N are not referenced by the code visible in this file.
     56 
     57 #define argTwiddle      x5
     58 #define argDst          x6
     59 #define subFFTSize      x7
     60 #define N               subFFTNum
     61 
     62 #define pOut1           x13
     63 
     64 #define size            x7
     65 #define step            x8
     66 #define step1           x9
     67 #define twStep          x10
     68 #define pTwiddleTmp     x11
     69 #define argTwiddle1     x12
     70 
     71 // Neon registers.
        // Every name is a cpp alias of one of v0-v19 plus an arrangement suffix:
        // .2s (two 32-bit lanes), .s (one lane, used together with an index) or
        // .8b (the same 64 bits viewed as bytes, used for register-to-register
        // mov). Many names overlap: dX0/dX0r are v0, dShift/dX1/dX0i are v1,
        // dY0/dX1r are v2, dY1/dX1i are v3, and v4-v7 carry dW0r..dW1i as well
        // as dY0r..dY1i. Code that uses two aliases of one register cannot keep
        // both values live at the same time.
        // qT0-qT3 are declared with the 64-bit .2s arrangement despite the q
        // prefix. They, dShift, dY2, dY3, dW0, dW1, dW0Tmp and dW1Neg are not
        // referenced by the code visible in this file.
     72 
     73 #define dX0     v0.2s
     74 #define dX0s    v0.s
     75 #define dShift  v1.2s
     76 #define dX1     v1.2s
     77 #define dX1s    v1.s
     78 #define dY0     v2.2s
     79 #define dY08b   v2.8b
     80 #define dY1     v3.2s
     81 #define dX0r    v0.2s
     82 #define dX0rs   v0.s
     83 #define dX0i    v1.2s
     84 #define dX1r    v2.2s
     85 #define dX1i    v3.2s
     86 #define dW0r    v4.2s
     87 #define dW0r8b  v4.8b
     88 #define dW0i    v5.2s
     89 #define dW1r    v6.2s
     90 #define dW1r8b  v6.8b
     91 #define dW1i    v7.2s
     92 #define dT0     v8.2s
     93 #define dT1     v9.2s
     94 #define dT2     v10.2s
     95 #define dT3     v11.2s
     96 #define qT0     v12.2s
     97 #define qT1     v14.2s
     98 #define qT2     v16.2s
     99 #define qT3     v18.2s
    100 #define dY0r    v4.2s
    101 #define dY0i    v5.2s
    102 #define dY1r    v6.2s
    103 #define dY1i    v7.2s
    104 
    105 #define dY2     v4.2s
    106 #define dY3     v5.2s
    107 #define dW0     v6.2s
    108 #define dW1     v7.2s
    109 #define dW0Tmp  v10.2s
    110 #define dW1Neg  v11.2s
    111 
    112 // dZip is the temporary used by the zip1/zip2/mov sequences; half is loaded
        // with the constant 0.5 at the start of the stage.
    112 #define dZip    v19.2s
    113 #define dZip8b  v19.8b
    114 #define half    v13.2s
    115 
    116         .MACRO FFTSTAGE scaled, inverse, name
    117         // FFTSTAGE scaled, inverse, name
                //
                // Emits the radix-2 pre-twiddle pass described in the file header.
                // Complex floats (real, imaginary; 8 bytes each) are read through
                // pSrc, twiddle values are read through pTwiddle, and results are
                // stored through pOut1, which starts at pOut + N*4 bytes
                // (N = subFFTNum).
                //
                //   scaled, inverse : accepted but not referenced in the body.
                //   name            : suffix appended to every label, presumably so
                //                     that separate expansions get distinct labels.
                //
                // Registers written: x0 (pSrc), x7-x13, v0-v11, v13, v19 and the
                // condition flags. pTwiddle, pOut and subFFTNum are only read.
                // v8-v11 and v13 lie in the AAPCS64 callee-saved range (low 64
                // bits), so the enclosing function has to preserve them.
                //
                // Paths: N/2 < 2 stores Z(0) only; N/2 == 2 stores Z(0) and the
                // last element; larger sizes also run the butterfly loop first.
    118         fmov    half, 0.5                     // both lanes of half = 0.5
    119 
    120         asr     size, subFFTNum, #1           // size = N/2; N = subFFTNum itself is preserved
    121         lsl     step, subFFTNum, #2           // step = N/2 * 8 bytes
    122 
    123 
    124         // Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
    125         // Note: W^(k) is stored negated in the table, and the table values
    126         // also need to be conjugated
    127 
    128         // Z(0) : no need of twiddle multiply
    129         // Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
    130         // Below, F(0) = a + jb and F(N/2) = c + jd; lanes are shown [lane 1 | lane 0].
    131         ld1     {dX0},[pSrc],step             // dX0 = F(0); pSrc -> F(N/2)
    132         ADD     pOut1,pOut,step               // pOut1 = pOut+ N/2*8 bytes
    133 
    134         ld1     {dX1},[pSrc], #8              // dX1 = F(N/2); pSrc -> F(N/2 + 1)
    135         // twStep = 3N/8 * 8 bytes pointing to W^1
    136         SUB     twStep,step,size,LSL #1
    137 
    138         lsl     step1,size, #2                // step1 = N/4 * 8 = N/2*4 bytes
    139         SUB     step1,step1,#8                // (N/4-1)*8 bytes
    140 
    141         fadd    dY0,dX0,dX1                   // [b+d | a+c]
    142         fsub    dY1,dX0,dX1                   // [b-d | a-c]
    143         fmul    dY0, dY0, half[0]             // apply the 1/2 factor
    144         fmul    dY1, dY1, half[0]
    145         // Interleave dY0 and dY1. zip1 writes to the dZip temporary so that dY0 is still intact when zip2 reads it.
    146         // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
    147         // VZIP    dY0,dY1
    148         zip1    dZip,dY0,dY1
    149         zip2    dY1,dY0,dY1
    150         mov     dY08b, dZip8b
    151 
    152         fsub   dX0,dY0,dY1                    // lane 0 = ((a+c) - (b+d))/2 = Re Z(0)
    153         SUBS   size,size,#2                   // flags are consumed by BLT/BEQ below; nothing in between changes them
    154         fadd   dX1,dY0,dY1                    // lane 1 = ((a-c) + (b-d))/2 = Im Z(0)
    155 
    156         SUB     pSrc,pSrc,step                // undo the earlier += step: pSrc -> F(1)
    157 
    158         st1     {dX0s}[0],[pOut1], #4         // store Re Z(0)
    159         ADD     pTwiddleTmp,pTwiddle,#8       // W^2
    160         st1     {dX1s}[1],[pOut1], #4         // store Im Z(0); pOut1 -> next output element
    161         ADD     argTwiddle1,pTwiddle,twStep   // W^1
    162 
    163 
    164         BLT     decrementScale\name           // N/2 < 2: nothing left to compute
    165         BEQ     lastElement\name              // N/2 == 2: only the last element remains
    166 
    167 
    168         // Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
    169         // Note: W^k is stored as negative values in the table and also
    170         // need to conjugate the values from the table.
    171         //
    172         // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
    173         // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
    174 
    175         // step becomes (N/2-3)*8: the byte offset from the front pair F(k),F(k+1) to the back pair F(N/2-k-1),F(N/2-k).
    176         SUB     step,step,#24
    177 evenOddButterflyLoop\name :
    178 
    179         // Two twiddle loads step1 + 8 bytes apart; the SUB below leaves argTwiddle1 advanced by 8 bytes per pass.
    180         ld1     {dW0r},[argTwiddle1],step1
    181         ld1     {dW1r},[argTwiddle1], #8
    182 
    183         ld2     {dX0r,dX0i},[pSrc],step       // front pair, de-interleaved: dX0r = real parts, dX0i = imaginary parts
    184         SUB     argTwiddle1,argTwiddle1,step1
    185         ld2     {dX1r,dX1i},[pSrc], #16       // back pair, de-interleaved the same way
    186 
    187         SUB     step1,step1,#8                // step1 -= 8 ((N/4-2)*8 bytes on the first pass)
    188         ld1     {dW0i},[pTwiddleTmp],step1
    189         ld1     {dW1i},[pTwiddleTmp], #8
    190         SUB     pSrc,pSrc,step                // net effect: pSrc advanced by 16 bytes (two elements)
    191 
    192         SUB     pTwiddleTmp,pTwiddleTmp,step1 // net effect: pTwiddleTmp advanced by 8 bytes
    193         rev64   dX1r,dX1r                     // swap lanes so lane 0 pairs F(k) with F(N/2-k)
    194         rev64   dX1i,dX1i                     // and lane 1 pairs F(k+1) with F(N/2-k-1)
    195         SUBS    size,size,#4                  // 4 elements per pass; flags are consumed by BGT at the loop end
    196 
    197         // With front = a + jb and back = c + jd in each lane:
    198         fsub    dT2,dX0r,dX1r                 // a-c
    199         fadd    dT3,dX0i,dX1i                 // b+d
    200         fadd    dT0,dX0r,dX1r                 // a+c
    201         fsub    dT1,dX0i,dX1i                 // b-d
    202         SUB     step1,step1,#8                // second decrement: step1 shrinks by 16 bytes per pass
    203 
    204         fmul    dT2, dT2, half[0]             // apply the 1/2 factor to all four terms
    205         fmul    dT3, dT3, half[0]
    206 
    207         fmul    dT0, dT0, half[0]
    208         fmul    dT1, dT1, half[0]
    209         // Interleave the twiddle registers, again via the dZip temporary:
    210         // VZIP    dW1r,dW1i
    211         // VZIP    dW0r,dW0i
    212         zip1    dZip, dW1r,dW1i
    213         zip2    dW1i,dW1r,dW1i
    214         mov     dW1r8b, dZip8b
    215         zip1    dZip,dW0r,dW0i
    216         zip2    dW0i,dW0r,dW0i
    217         mov     dW0r8b, dZip8b
    218         // Twiddle products; each fmul below is completed by the matching fmla/fmls.
    219         fmul   dX1r,dW1r,dT2
    220         fmul   dX1i,dW1r,dT3
    221         fmul   dX0r,dW0r,dT2
    222         fmul   dX0i,dW0r,dT3
    223 
    224         fmls   dX1r,dW1i,dT3                  // dX1r = dW1r*dT2 - dW1i*dT3
    225         fmla   dX1i,dW1i,dT2                  // dX1i = dW1r*dT3 + dW1i*dT2
    226 
    227         fmla   dX0r,dW0i,dT3                  // dX0r = dW0r*dT2 + dW0i*dT3
    228         fmls   dX0i,dW0i,dT2                  // dX0i = dW0r*dT3 - dW0i*dT2
    229 
    230         // dY0r..dY1i alias v4-v7, which held the twiddles; every twiddle read is complete by this point.
    231         fadd    dY1r,dT0,dX1i                 // F(N/2 -1)
    232         fsub    dY1i,dX1r,dT1
    233 
    234         rev64   dY1r,dY1r                     // undo the lane swap so the back pair is stored in memory order
    235         rev64   dY1i,dY1i
    236 
    237 
    238         fadd    dY0r,dT0,dX0i                 // F(1)
    239         fsub    dY0i,dT1,dX0r
    240 
    241         // Stores mirror the loads: front pair, jump by step to the back pair, then return.
    242         st2     {dY0r,dY0i},[pOut1],step
    243         st2     {dY1r,dY1i},[pOut1], #16
    244         SUB     pOut1,pOut1,step              // net effect: pOut1 advanced by 16 bytes (two elements)
    245         SUB     step,step,#32                 // front moved up 2 elements and back moved down 2: gap shrinks by 4*8 bytes
    246 
    247 
    248         BGT     evenOddButterflyLoop\name     // repeat while the SUBS above left size > 0
    249 
    250 
    251         // set both the ptrs to the last element
    252         SUB     pSrc,pSrc,#8
    253         SUB     pOut1,pOut1,#8
    254 
    255         // Last element can be expanded as follows
    256         // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
    257         // -ve)
    258         // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
    259         // 1/2[2a+j0] - j (c-jd) [0+j2b]
    260         // (a+bc, -bd)
    261         // Since (c,d) = (0,1) for the last element, result is just (a,-b)
    262 
    263 lastElement\name :
    264         ld1     {dX0r},[pSrc]                 // dX0r = (a, b)
    265 
    266         st1     {dX0rs}[0],[pOut1], #4        // real part is stored unchanged
    267         fneg    dX0r,dX0r
    268         st1     {dX0rs}[1],[pOut1]            // imaginary part is stored negated
    269 
    270 
    271         // Common exit label; no instructions follow it inside the macro.
    272 decrementScale\name :
    273 
    274         .endm
    275 // Entry point armSP_FFTInv_CCSToR_F32_preTwiddleRadix2: one expansion of FFTSTAGE.
        // The first two FFTSTAGE arguments are not referenced by the macro body;
        // the third, Inv, becomes the suffix of every label it defines.
        // M_START and M_END are not defined in this file. Presumably they come
        // from arm64COMM_s.h and emit the function prologue and epilogue --
        // confirm there.
        // NOTE(review): the d15 argument presumably asks M_START to preserve the
        // FP registers up to d15. FFTSTAGE writes v8-v11 and v13, which lie in
        // the AAPCS64 callee-saved range, so that save is needed -- confirm
        // against the M_START definition.
    276         M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
    277             FFTSTAGE "FALSE","TRUE",Inv
    278         M_END
    279 
    280         .end
    281