Home | History | Annotate | Download | only in neon
      1 @
      2 @  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @
      4 @  Use of this source code is governed by a BSD-style license
      5 @  that can be found in the LICENSE file in the root of the source
      6 @  tree. An additional intellectual property rights grant can be found
      7 @  in the file PATENTS.  All contributing project authors may
      8 @  be found in the AUTHORS file in the root of the source tree.
      9 @
     10 @ Some code in this file was originally from file
     11 @ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
     12 @ follows. It has been relicensed with permission from the copyright holders.
     13 @
     14 
     15 @
     16 @ OpenMAX DL: v1.0.2
     17 @ Last Modified Revision:   7485
     18 @ Last Modified Date:       Fri, 21 Sep 2007
     19 @
     20 @ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
     21 @
     22 
     23 @
     24 @ Description:
     25 @ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
     26 @ It computes Z(k) = Feven(k) + jW^(-k) FOdd(k) for k = 0, 1, 2, ..., N/2-1.
     27 @ Both the "scaled" (by 1/2) and the "unscaled" versions of the above
     28 @ formula are implemented.
     29 @
     30 
     31 #include "dl/api/arm/armCOMM_s.h"
     32 #include "dl/api/arm/omxtypes_s.h"
     33 
     34 @//Input Registers
        @ Names for r0-r3. FFTSTAGE below reads pSrc, pFFTSpec and scale;
        @ pDst is not referenced by FFTSTAGE (its output goes to the pBuf
        @ buffer loaded from the FFTSpec structure).
        @ Do not append comments to the define lines themselves: the text
        @ would become part of the preprocessor expansion.
     35 #define pSrc            r0
     36 #define pDst            r1
     37 #define pFFTSpec        r2
     38 #define scale           r3
     39 
     40 @ Output registers
        @ result shares r0 with pSrc.
     41 #define result          r0
     42 
     43 @//Local Scratch Registers
        @ Several names share one physical register (for example r4 is
        @ argScale, tmpOrder, pTwiddle and x0r; r2 is pFFTSpec, argDst,
        @ diffMinusOne and pOut1), so only one alias of a register can hold a
        @ live value at any time.
     44 #define argTwiddle      r1
     45 #define argDst          r2
     46 #define argScale        r4
     47 #define tmpOrder        r4
     48 #define pTwiddle        r4
     49 #define pOut            r5
     50 #define subFFTSize      r7
     51 #define subFFTNum       r6
     52 #define N               r6
     53 #define order           r14
     54 #define diff            r9
     55 @ Total number of radix stages to complete the FFT.
     56 #define count           r8
     57 #define x0r             r4
     58 #define x0i             r5
     59 #define diffMinusOne    r2
     60 #define round           r3
     61 #define pOut1           r2
     62 #define size            r7
     63 #define step            r8
     64 #define step1           r9
     65 #define step2           r10
     66 #define twStep          r10
     67 #define pTwiddleTmp     r11
     68 #define argTwiddle1     r12
     69 #define zero            r14
     70 
     71 @ Neon registers
        @ Many names alias the same D register (for example dW0r..dW1i and
        @ dY0r..dY1i are both D4..D7, and dW0Tmp/dW1Neg share D10/D11 with
        @ dT2/dT3), so an alias is only valid while the value it names is live.
        @ Qn overlays D(2n) and D(2n+1): qX1 = Q1 covers dX1r/dX1i (D2/D3) and
        @ qY1 = Q3 covers dY1r/dY1i (D6/D7). qT0..qT3 (Q6..Q9) are D12..D19
        @ and do not overlap dT0..dT3 (D8..D11).
     72 #define dX0             D0.S16
     73 #define dX0S32          D0.S32
     74 #define dShift          D1.S16
     75 #define dX1             D1.S16
     76 #define dX1S32          D1.S32
     77 #define dY0             D2.S16
     78 #define dY1             D3.S16
     79 #define dX0r            D0.S16
     80 #define dX0rS32         D0.S32
     81 #define dX0i            D1.S16
     82 #define dX1r            D2.S16
     83 #define dX1i            D3.S16
     84 #define qX1             Q1.S16
     85 #define dW0r            D4.S16
     86 #define dW0i            D5.S16
     87 #define dW1r            D6.S16
     88 #define dW1i            D7.S16
     89 #define dW0rS32         D4.S32
     90 #define dW0iS32         D5.S32
     91 #define dW1rS32         D6.S32
     92 #define dW1iS32         D7.S32
     93 #define dT0             D8.S16
     94 #define dT1             D9.S16
     95 #define dT2             D10.S16
     96 #define dT3             D11.S16
     97 #define qT0             Q6.S32
     98 #define qT1             Q7.S32
     99 #define qT2             Q8.S32
     100 #define qT3             Q9.S32
     101 #define dY0r            D4.S16
     102 #define dY0i            D5.S16
     103 #define dY1r            D6.S16
     104 #define dY1i            D7.S16
     105 #define qY1             Q3.S16
     106 #define dY2             D4.S16
     107 #define dY3             D5.S16
     108 #define dW0             D6.S16
     109 #define dW1             D7.S16
     110 #define dW0Tmp          D10.S16
     111 #define dW1Neg          D11.S16
    112 
     113         @ Byte offsets of the fields of the FFTSpec structure.
                 @ FFTSTAGE reads N, pTwiddle and pBuf; pBitRev is not referenced
                 @ in this file.
                 @ NOTE(review): these values must match the structure definition,
                 @ which is not visible here - confirm against its declaration.
     114         .set    ARMsFFTSpec_N, 0
     115         .set    ARMsFFTSpec_pBitRev, 4
     116         .set    ARMsFFTSpec_pTwiddle, 8
     117         .set    ARMsFFTSpec_pBuf, 12
    118 
     119         .macro FFTSTAGE scaled, inverse, name
                 @ Emits the body of the pre-twiddle radix-2 stage.
                 @   scaled  - when "TRUE" the final combine uses the halving forms
                 @             (VHADD/VHSUB/VSHR) and scale is decremented by one at
                 @             the end; otherwise VADD/VSUB are used.
                 @   inverse - not referenced anywhere in this macro body.
                 @   name    - suffix appended to every label so that each expansion
                 @             has its own set of labels.
                 @ Reads pSrc, pFFTSpec (fields N, pTwiddle, pBuf) and scale.
                 @ Output is stored through pOut1, which starts at pBuf + 2*N bytes.
                 @ Core registers written: r0, r2, r4-r12 (and r3 when scaled).
                 @ NEON registers written: D0-D11 and Q6-Q9.
     120 
     121         @ Read the FFT size N from the spec structure (no log is taken here)
     122         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
     123 
     124         @ Read other structure parameters
     125         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
     126         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
     127 
     128         MOV     size,N,ASR #1        @ size = N/2; N itself is left unchanged
     129         MOV     step,N,LSL #1        @ step = N/2 * 4 bytes
     130 
     131         @ Process different FFT sizes with different loops.
                 @ size <= 4 takes the lane-wise path at smallFFTSize; larger sizes
                 @ use the vector loop below.
     132         CMP    size,#4
     133         BLE    smallFFTSize\name
     134 
     135         @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
     136         @ Note: W^(k) is stored as negated value and also need to
     137         @ conjugate the values from the table.
     138 
     139         @ Z(0) : no need of twiddle multiply
     140         @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
     141 
     142         VLD1    dX0S32[0],[pSrc],step
     143         ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes
     144 
     145         VLD1    dX1S32[0],[pSrc]!
     146         SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
     147 
     148         MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
     149         SUB     step1,step1,#4       @ (N/4-1)*4 bytes
     150 
     151         VHADD    dY0,dX0,dX1         @ [b+d | a+c]
     152         VHSUB    dY1,dX0,dX1         @ [b-d | a-c]
     153         VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
     154 
                 @ The SUBS below sets the flags tested by BLT/BEQ further down;
                 @ none of the instructions in between updates the flags.
     155         .ifeqs  "\scaled", "TRUE"
     156             VHSUB   dX0,dY0,dY1
     157             SUBS    size,size,#2
     158             VHADD   dX1,dY0,dY1
     159         .else
     160             VSUB   dX0,dY0,dY1
     161             SUBS    size,size,#2
     162             VADD   dX1,dY0,dY1
     163         .endif
     164 
     165         SUB     pSrc,pSrc,step
     166         VST1    dX0[0],[pOut1]!
     167         ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
     168         VST1    dX1[1],[pOut1]!
     169         ADD     argTwiddle1,pTwiddle,twStep            @ W^1
     170 
                 @ size was greater than 4 before the SUBS above, so the result is
                 @ positive and neither branch is taken on this path. The same pair
                 @ of branches appears in the small-size path, where they can be taken.
     171         BLT     decrementScale\name
     172         BEQ     lastElement\name
     173 
     174         SUB     step,step,#20                          @ step = (N/2-5)*4 bytes: F(1) to F(N/2-4)
     175         SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
     176         SUB     step2, step1, #4                       @ (N/4-3)*4 bytes
     177 
     178         @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
     179         @ Note: W^k is stored as negative values in the table and also need to
     180         @ conjugate the values from the table.
     181         @ Process 8 elements per iteration: Z(1)..Z(4) and Z(N/2-4)..Z(N/2-1),
     182         @ since both groups require F(1)..F(4) and F(N/2-4)..F(N/2-1).
     183 
     184 evenOddButterflyLoop\name:
                 @ Each VLD2 de-interleaves four complex S16 values into a real and
                 @ an imaginary D register: front block first, then the back block.
     185         VLD2    {dX0r,dX0i},[pSrc],step
     186         VLD2    {dX1r,dX1i},[pSrc]!
     187         SUB     pSrc, pSrc, step                       @ pSrc = start of the next front block
     188 
     189         VLD1    dW0r,[argTwiddle1],step1
     190         VREV64  qX1,qX1                                @ reverse lane order within dX1r and dX1i
     191         VLD1    dW1r,[argTwiddle1]!
     192         VHSUB   dT2,dX0r,dX1r                          @ a-c
     193         SUB     argTwiddle1, argTwiddle1, step1
     194         SUB     step1,step1,#16                        @ the two reads are 16 bytes closer next time
     195 
     196         VLD1    dW0i,[pTwiddleTmp],step2
     197         VHADD   dT3,dX0i,dX1i                          @ b+d
     198         VLD1    dW1i,[pTwiddleTmp]!
     199         VHADD   dT0,dX0r,dX1r                          @ a+c
     200         VHSUB   dT1,dX0i,dX1i                          @ b-d
     201         SUB     pTwiddleTmp, pTwiddleTmp, step2
     202         SUB     step2,step2,#16
     203 
     204         SUBS    size,size,#8                           @ flags are consumed by the BGT at the loop end
     205 
     206         VZIP    dW1r,dW1i
     207         VTRN    dW0r,dW0i
     208         VZIP    dW1iS32, dW1rS32
     209 
                 @ Widening 16x16 -> 32 bit multiply-accumulate of the twiddles with
                 @ (a-c) and (b+d).
     210         VMULL   qT0,dW1i,dT2
     211         VMLSL   qT0,dW1r,dT3
     212         VMULL   qT1,dW1i,dT3
     213         VMLAL   qT1,dW1r,dT2
     214         VMULL   qT2,dW0r,dT2
     215         VMLAL   qT2,dW0i,dT3
     216         VMULL   qT3,dW0r,dT3
     217         VMLSL   qT3,dW0i,dT2
     218 
                 @ Rounding shift right by 15 and narrow back to 16 bits.
     219         VRSHRN  dX1r,qT0,#15
     220         VRSHRN  dX1i,qT1,#15
     221         VRSHRN  dX0r,qT2,#15
     222         VRSHRN  dX0i,qT3,#15
     223 
     224         .ifeqs  "\scaled", "TRUE"
     225             VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
     226             VHSUB    dY1i,dX1r,dT1
     227         .else
     228             VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
     229             VSUB    dY1i,dX1r,dT1
     230         .endif
     231 
     232         .ifeqs  "\scaled", "TRUE"
     233             VHADD    dY0r,dT0,dX0i                     @ F(1)
     234             VHSUB    dY0i,dT1,dX0r
     235         .else
     236             VADD    dY0r,dT0,dX0i                      @ F(1)
     237             VSUB    dY0i,dT1,dX0r
     238         .endif
     239 
     240         VREV64  qY1,qY1                                @ undo the lane reversal for the back block
     241 
     242         VST2    {dY0r,dY0i},[pOut1],step
     243         VST2    {dY1r,dY1i},[pOut1]
     244         ADD     pOut1,pOut1,#16
     245         SUB     pOut1, pOut1, step
     246         SUB     step,step,#32                          @ front and back blocks each moved 16 bytes inwards
     247 
     248         BGT     evenOddButterflyLoop\name
     249 
     250         SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
     251         SUB     pOut1,pOut1,#4
     252         B       lastElement\name
     253 
     254 smallFFTSize\name:
     255         @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
     256         @ Note: W^(k) is stored as negated value and also need to
     257         @ conjugate the values from the table.
     258 
     259         @ Z(0) : no need of twiddle multiply
     260         @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
     261 
     262         VLD1    dX0S32[0],[pSrc],step
     263         ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes
     264 
     265         VLD1    dX1S32[0],[pSrc]!
     266         SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
     267 
     268         MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
     269         SUB     step1,step1,#4       @ (N/4-1)*4 bytes
     270 
     271         VHADD    dY0,dX0,dX1         @ [b+d | a+c]
     272         VHSUB    dY1,dX0,dX1         @ [b-d | a-c]
     273         VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
     274 
                 @ The SUBS below sets the flags tested by BLT/BEQ further down.
     275         .ifeqs  "\scaled", "TRUE"
     276             VHSUB   dX0,dY0,dY1
     277             SUBS    size,size,#2
     278             VHADD   dX1,dY0,dY1
     279         .else
     280             VSUB   dX0,dY0,dY1
     281             SUBS    size,size,#2
     282             VADD   dX1,dY0,dY1
     283         .endif
     284 
     285         SUB     pSrc,pSrc,step
     286         VST1    dX0[0],[pOut1]!
     287         ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
     288         VST1    dX1[1],[pOut1]!
     289         ADD     argTwiddle1,pTwiddle,twStep            @ W^1
     290 
                 @ size below 2: only Z(0) exists, skip to the end.
                 @ size equal to 2: only the last element remains.
     291         BLT     decrementScale\name
     292         BEQ     lastElement\name
     293 
     294         @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
     295         @ Note: W^k is stored as negative values in the table and also need to
     296         @ conjugate the values from the table.
     297         @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
     298         @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
     299 
     300         SUB     step,step,#12                          @ step = (N/2-3)*4 bytes
     301 
     302 evenOddButterflyLoopSize4\name:
                 @ Despite the label name nothing branches back here: this block runs
                 @ once and falls through to lastElement. Data is moved one lane at a
                 @ time (two complex values per register pair).
     303         VLD1    dW0rS32[0],[argTwiddle1],step1
     304         VLD1    dW1rS32[0],[argTwiddle1]!
     305 
     306         VLD2    {dX0r[0],dX0i[0]},[pSrc]!
     307         VLD2    {dX0r[1],dX0i[1]},[pSrc],step
     308         SUB     pSrc,pSrc,#4
     309         SUB     argTwiddle1,argTwiddle1,step1
     310         VLD2    {dX1r[0],dX1i[0]},[pSrc]!
     311         VLD2    {dX1r[1],dX1i[1]},[pSrc]!
     312 
     313         SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
     314         VLD1    dW0iS32[0],[pTwiddleTmp],step1
     315         VLD1    dW1iS32[0],[pTwiddleTmp]!
     316         SUB     pSrc,pSrc,step
     317 
     318         SUB     pTwiddleTmp,pTwiddleTmp,step1
     319         VREV32  dX1r,dX1r
     320         VREV32  dX1i,dX1i
     321         SUBS    size,size,#4                           @ resulting flags are not tested afterwards
     322 
     323         VHSUB   dT2,dX0r,dX1r                          @ a-c
     324         VHADD   dT3,dX0i,dX1i                          @ b+d
     325         SUB     step1,step1,#4
     326         VHADD   dT0,dX0r,dX1r                          @ a+c
     327         VHSUB   dT1,dX0i,dX1i                          @ b-d
     328 
     329         VTRN    dW1r,dW1i
     330         VTRN    dW0r,dW0i
     331 
     332         VMULL   qT0,dW1r,dT2
     333         VMLSL   qT0,dW1i,dT3
     334         VMULL   qT1,dW1r,dT3
     335         VMLAL   qT1,dW1i,dT2
     336         VMULL   qT2,dW0r,dT2
     337         VMLAL   qT2,dW0i,dT3
     338         VMULL   qT3,dW0r,dT3
     339         VMLSL   qT3,dW0i,dT2
     340 
     341         VRSHRN  dX1r,qT0,#15
     342         VRSHRN  dX1i,qT1,#15
     343 
     344         .ifeqs  "\scaled", "TRUE"
     345             VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
     346             VHSUB    dY1i,dX1r,dT1
     347         .else
     348             VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
     349             VSUB    dY1i,dX1r,dT1
     350         .endif
     351 
     352         VREV32  dY1r,dY1r
     353         VREV32  dY1i,dY1i
     354 
     355         VRSHRN  dX0r,qT2,#15
     356         VRSHRN  dX0i,qT3,#15
     357 
     358         .ifeqs  "\scaled", "TRUE"
     359             VHADD    dY0r,dT0,dX0i                     @ F(1)
     360             VHSUB    dY0i,dT1,dX0r
     361         .else
     362             VADD    dY0r,dT0,dX0i                      @ F(1)
     363             VSUB    dY0i,dT1,dX0r
     364         .endif
     365 
     366         VST2    {dY0r[0],dY0i[0]},[pOut1]!
     367         VST2    {dY0r[1],dY0i[1]},[pOut1],step
     368         SUB     pOut1, #4
     369         VST2    {dY1r[0],dY1i[0]},[pOut1]!
     370         VST2    {dY1r[1],dY1i[1]},[pOut1]!
     371         SUB     pOut1,pOut1,step
     372         SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
     373         SUB     pOut1,pOut1,#4
     374 
     375         @ Last element can be expanded as follows
     376         @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
     377         @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
     378         @ 1/2[2a+j0] - j (c-jd) [0+j2b]
     379         @ (a+bc, -bd)
     380         @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
     381 
     382 lastElement\name:
     383         VLD1    dX0rS32[0],[pSrc]                      @ lanes 0 and 1 = (a, b)
     384 
     385         .ifeqs  "\scaled", "TRUE"
     386             VSHR    dX0r,dX0r,#1                       @ scaled variant: halve a and b
     387         .endif
     388 
     389         VST1    dX0r[0],[pOut1]!                       @ store a
     390         VNEG    dX0r,dX0r
     391         VST1    dX0r[1],[pOut1]                        @ store -b
     392 
     393 decrementScale\name:
                 @ Scaled variant only: subtract one from scale, presumably to account
                 @ for the factor of 1/2 applied in this stage - confirm with the caller.
     394         .ifeqs  "\scaled", "TRUE"
     395             SUB scale,scale,#1
     396         .endif
     397 
     398         .endm
    399 
     400         M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
                 @ Unscaled variant: FFTSTAGE is expanded with scaled = "FALSE", so the
                 @ final combine uses VADD/VSUB and scale is left untouched. Labels get
                 @ the suffix Inv.
                 @ NOTE(review): only r4 is passed to M_START (defined in armCOMM_s.h,
                 @ not visible here), while FFTSTAGE also writes r5-r12, D8-D11 and
                 @ Q6-Q7. Confirm that callers of this _unsafe entry point do not rely
                 @ on those registers being preserved.
     401         FFTSTAGE "FALSE","TRUE",Inv
     402         M_END
    403 
     404         M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
                 @ Scaled variant: FFTSTAGE is expanded with scaled = "TRUE", so the
                 @ final combine uses the halving forms VHADD/VHSUB and scale (r3) is
                 @ decremented by one before M_END. Labels get the suffix InvSfs.
                 @ NOTE(review): only r4 is passed to M_START (defined in armCOMM_s.h,
                 @ not visible here), while FFTSTAGE also writes r5-r12, D8-D11 and
                 @ Q6-Q7. Confirm that callers of this _unsafe entry point do not rely
                 @ on those registers being preserved.
     405         FFTSTAGE "TRUE","TRUE",InvSfs
     406         M_END
    407 
    408 
    409         .end
    410