Home | History | Annotate | Download | only in neon
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This is a modification of
     11 @//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
     12 @//  instead of SC32.
     13 @//
     14 
     15 @//
     16 @// Description:
     17 @// Compute the "preTwiddleRadix2" stage prior to the call to the complex FFT.
     18 @// It computes Z(k) = Feven(k) + jW^(-k) FOdd(k) for k = 0, 1, 2, ..., N/2-1.
     19 @//
     20 @//
     21 
     22 
     23 @// Include standard headers
     24 
     25 #include "dl/api/arm/armCOMM_s.h"
     26 #include "dl/api/arm/omxtypes_s.h"
     27 
     28 
     29 @// Import symbols required from other files
     30 @// (For example tables)
     31 
     32 
     33 @// Set debugging level
     34 @//DEBUG_ON    SETL {TRUE}
     35 
     36 
     37 
     38 @// Guarding implementation by the processor name
     39 
     40 
     41 
     42       @// Guarding implementation by the processor name
     43 
     44 
     45 
     46 @// Input registers
     47 @// (pDst and scale are not referenced by the code shown in this file)
     48 #define pSrc            r0
     49 #define pDst            r1
     50 #define pFFTSpec        r2
     51 #define scale           r3
     52 
     53 
     54 @// Output registers
     55 #define result          r0
     56 
     57 @// Local scratch registers
     58 @// Several aliases share one physical register (pOut1, argDst, diffMinusOne and the pFFTSpec input are all r2), so only one alias per register can be live at a time.
     59 #define argTwiddle      r1
     60 #define argDst          r2
     61 #define argScale        r4
     62 #define tmpOrder        r4
     63 #define pTwiddle        r4
     64 #define pOut            r5
     65 #define subFFTSize      r7
     66 #define subFFTNum       r6
     67 #define N               r6
     68 #define order           r14
     69 #define diff            r9
     70 @// Total num of radix stages required to complete the FFT
     71 #define count           r8
     72 #define x0r             r4
     73 #define x0i             r5
     74 #define diffMinusOne    r2
     75 #define round           r3
     76 
     77 #define pOut1           r2
     78 #define size            r7
     79 #define step            r8
     80 #define step1           r9
     81 #define twStep          r10
     82 #define pTwiddleTmp     r11
     83 #define argTwiddle1     r12
     84 #define zero            r14
     85 
     86 @// Neon registers
     87 @// The D-register aliases overlap too: dX0r/dX0i/dX1r/dX1i reuse D0-D3 (dX0, dX1, dY0, dY1), and dY0r..dY1i reuse D4-D7 alongside dW0r..dW1i.
     88 #define dX0     D0.F32
     89 #define dShift  D1.F32
     90 #define dX1     D1.F32
     91 #define dY0     D2.F32
     92 #define dY1     D3.F32
     93 #define dX0r    D0.F32
     94 #define dX0i    D1.F32
     95 #define dX1r    D2.F32
     96 #define dX1i    D3.F32
     97 #define dW0r    D4.F32
     98 #define dW0i    D5.F32
     99 #define dW1r    D6.F32
    100 #define dW1i    D7.F32
    101 #define dT0     D8.F32
    102 #define dT1     D9.F32
    103 #define dT2     D10.F32
    104 #define dT3     D11.F32
    105 #define qT0     D12.F32
    106 #define qT1     D14.F32
    107 #define qT2     D16.F32
    108 #define qT3     D18.F32
    109 #define dY0r    D4.F32
    110 #define dY0i    D5.F32
    111 #define dY1r    D6.F32
    112 #define dY1i    D7.F32
    113 @// NOTE(review): qT0-qT3 name D registers despite the q prefix, and none of them is used by the code shown in this file - confirm intent.
    114 #define dY2     D4.F32
    115 #define dY3     D5.F32
    116 #define dW0     D6.F32
    117 #define dW1     D7.F32
    118 #define dW0Tmp  D10.F32
    119 #define dW1Neg  D11.F32
    120 @// half receives the constant 0.5 (VMOV at the top of FFTSTAGE) and is the multiplier in the VMUL ..., half[0] instructions.
    121 #define half    D13.F32
    122 
    123 @ Structure offsets for the FFTSpec (byte offsets used as LDR immediates; consecutive 4-byte fields)
    124         .set    ARMsFFTSpec_N, 0                  @// loaded into N by FFTSTAGE
    125         .set    ARMsFFTSpec_pBitRev, 4            @// not read by the code shown in this file
    126         .set    ARMsFFTSpec_pTwiddle, 8           @// loaded into pTwiddle by FFTSTAGE
    127         .set    ARMsFFTSpec_pBuf, 12              @// loaded into pOut by FFTSTAGE (output base)
    128 
    129         .MACRO FFTSTAGE scaled, inverse, name
    130         @// Pre-twiddle body. Only the name argument is referenced below (as a label suffix); scaled and inverse are accepted but unused.
    131         @// Read the FFT size N from the FFTSpec structure (no log is taken here)
    132         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
    133 
    134         @// Read the twiddle table pointer and pBuf, which serves as the output base
    135         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
    136         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
    137 
    138         VMOV    half, 0.5                     @// constant for the 1/2 factor in Z(k)
    139 
    140 
    141         MOV     size,N,ASR #1                 @// size = N/2; N itself is left untouched
    142         MOV     step,N,LSL #2                 @// step = N/2 * 8 bytes
    143 
    144 
    145         @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
    146         @// Note: W^(k) is stored as negated value and also need to
    147         @// conjugate the values from the table
    148 
    149         @// Z(0) : no need of twiddle multiply
    150         @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
    151 
    152         VLD1    dX0,[pSrc],step               @// dX0 = F(0); pSrc += step bytes
    153         ADD     pOut1,pOut,step               @// pOut1 = pOut+ N/2*8 bytes
    154         @// pOut1 shares r2 with pFFTSpec; every FFTSpec field used by this macro was loaded above.
    155         VLD1    dX1,[pSrc]!                   @// dX1 = F(N/2); pSrc += 8 bytes
    156         @// twStep = 3N/8 * 8 bytes pointing to W^1
    157         SUB     twStep,step,size,LSL #1
    158 
    159         MOV     step1,size,LSL #2             @// step1 = N/4 * 8 = N/2*4 bytes
    160         SUB     step1,step1,#8                @// (N/4-1)*8 bytes
    161 
    162         VADD    dY0,dX0,dX1                   @// [b+d | a+c]
    163         VSUB    dY1,dX0,dX1                   @// [b-d | a-c]
    164         VMUL    dY0, dY0, half[0]             @// apply the 1/2 factor
    165         VMUL    dY1, dY1, half[0]
    166 
    167         @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
    168         VZIP    dY0,dY1
    169 
    170         VSUB   dX0,dY0,dY1
    171         SUBS   size,size,#2                   @// flags set here are tested by BLT/BEQ below
    172         VADD   dX1,dY0,dY1
    173 
    174         SUB     pSrc,pSrc,step                @// pSrc is now 8 bytes past its entry value, i.e. at F(1)
    175 
    176         VST1    dX0[0],[pOut1]!               @// first word of Z(0): lane 0 of dX0
    177         ADD     pTwiddleTmp,pTwiddle,#8       @// W^2
    178         VST1    dX1[1],[pOut1]!               @// second word of Z(0): lane 1 of dX1
    179         ADD     argTwiddle1,pTwiddle,twStep   @// W^1
    180         @// Flags come from the SUBS above: N/2 < 2 skips everything else, N/2 == 2 leaves only the last element.
    181 
    182         BLT     decrementScale\name
    183         BEQ     lastElement\name
    184 
    185 
    186         @// Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
    187         @// Note: W^k is stored as negative values in the table and also
    188         @// need to conjugate the values from the table.
    189         @//
    190         @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
    191         @// since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
    192 
    193 
    194         SUB     step,step,#24                 @// (N/2-3)*8 bytes: offset from F(1) to F(N/2-2)
    195 evenOddButterflyLoop\name :
    196 
    197 
    198         VLD1    dW0r,[argTwiddle1],step1      @// load 8 bytes, then argTwiddle1 += step1
    199         VLD1    dW1r,[argTwiddle1]!           @// load 8 bytes, then argTwiddle1 += 8
    200 
    201         VLD2    {dX0r,dX0i},[pSrc],step       @// de-interleave 2 complex values; pSrc += step
    202         SUB     argTwiddle1,argTwiddle1,step1 @// net effect: argTwiddle1 advances 8 bytes per pass
    203         VLD2    {dX1r,dX1i},[pSrc]!           @// mirrored pair; pSrc += 16
    204 
    205         SUB     step1,step1,#8                @// (N/4-2)*8 bytes
    206         VLD1    dW0i,[pTwiddleTmp],step1      @// load 8 bytes, then pTwiddleTmp += step1
    207         VLD1    dW1i,[pTwiddleTmp]!           @// load 8 bytes, then pTwiddleTmp += 8
    208         SUB     pSrc,pSrc,step                @// net effect: pSrc advances 16 bytes per pass
    209 
    210         SUB     pTwiddleTmp,pTwiddleTmp,step1 @// net effect: pTwiddleTmp advances 8 bytes per pass
    211         VREV64  dX1r,dX1r                     @// swap lanes so the mirrored pair lines up with the front pair
    212         VREV64  dX1i,dX1i
    213         SUBS    size,size,#4                  @// 4 values consumed; flags are tested by BGT at the loop end
    214 
    215 
    216         VSUB    dT2,dX0r,dX1r                 @// a-c
    217         VADD    dT3,dX0i,dX1i                 @// b+d
    218         VADD    dT0,dX0r,dX1r                 @// a+c
    219         VSUB    dT1,dX0i,dX1i                 @// b-d
    220         SUB     step1,step1,#8                @// step1 shrinks by 16 bytes in total per pass
    221         @// Apply the 1/2 factor to all four sums/differences
    222         VMUL    dT2, dT2, half[0]
    223         VMUL    dT3, dT3, half[0]
    224 
    225         VMUL    dT0, dT0, half[0]
    226         VMUL    dT1, dT1, half[0]
    227 
    228         VZIP    dW1r,dW1i
    229         VZIP    dW0r,dW0i
    230 
    231         @// With the zipped dW registers: dX1 = (dW1r*dT2 - dW1i*dT3, dW1r*dT3 + dW1i*dT2); dX0 = (dW0r*dT2 + dW0i*dT3, dW0r*dT3 - dW0i*dT2)
    232         VMUL   dX1r,dW1r,dT2
    233         VMUL   dX1i,dW1r,dT3
    234         VMUL   dX0r,dW0r,dT2
    235         VMUL   dX0i,dW0r,dT3
    236 
    237         VMLS   dX1r,dW1i,dT3
    238         VMLA   dX1i,dW1i,dT2
    239 
    240         VMLA   dX0r,dW0i,dT3
    241         VMLS   dX0i,dW0i,dT2
    242 
    243         @// dY0r..dY1i reuse D4-D7; the twiddle values held there are not read again in this pass.
    244         VADD    dY1r,dT0,dX1i                 @// F(N/2 -1)
    245         VSUB    dY1i,dX1r,dT1
    246 
    247         VREV64  dY1r,dY1r
    248         VREV64  dY1i,dY1i
    249 
    250 
    251         VADD    dY0r,dT0,dX0i                 @// F(1)
    252         VSUB    dY0i,dT1,dX0r
    253 
    254 
    255         VST2    {dY0r,dY0i},[pOut1],step      @// re-interleave and store front pair; pOut1 += step
    256         VST2    {dY1r,dY1i},[pOut1]!          @// store mirrored pair; pOut1 += 16
    257         SUB     pOut1,pOut1,step              @// net effect: pOut1 advances 16 bytes per pass
    258         SUB     step,step,#32                 @// gap between front and mirrored pairs shrinks by 32 bytes
    259 
    260 
    261         BGT     evenOddButterflyLoop\name
    262 
    263 
    264         @// set both the ptrs to the last element
    265         SUB     pSrc,pSrc,#8
    266         SUB     pOut1,pOut1,#8
    267 
    268         @// Last element can be expanded as follows
    269         @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
    270         @// -ve)
    271         @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
    272         @// 1/2[2a+j0] - j (c-jd) [0+j2b]
    273         @// (a+bc, -bd)
    274         @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
    275 
    276 lastElement\name :
    277         VLD1    dX0r,[pSrc]                   @// dX0r = (a, b)
    278 
    279         VST1    dX0r[0],[pOut1]!              @// store a unchanged
    280         VNEG    dX0r,dX0r
    281         VST1    dX0r[1],[pOut1]               @// store -b
    282 
    283 
    284 
    285 decrementScale\name :
    286         @// Exit label: despite its name, nothing is decremented here; the macro body ends at this point.
    287         .endm
    288 @// Entry point: FFTSTAGE is expanded once with label suffix Inv; its first two arguments are not referenced by the macro body.
    289         M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe,r4
    290         @// NOTE(review): M_START is given only r4, yet the expansion also writes r5-r12 and D8-D11/D13; presumably callers of this _unsafe routine preserve them - confirm.
    291             FFTSTAGE "FALSE","TRUE",Inv
    292         M_END
    293 
    294         .end
    295