Home | History | Annotate | Download | only in neon
      1 @//
      2 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 @//
      4 @//  Use of this source code is governed by a BSD-style license
      5 @//  that can be found in the LICENSE file in the root of the source
      6 @//  tree. An additional intellectual property rights grant can be found
      7 @//  in the file PATENTS.  All contributing project authors may
      8 @//  be found in the AUTHORS file in the root of the source tree.
      9 @//
     10 @//  This file was originally licensed as follows. It has been
     11 @//  relicensed with permission from the copyright holders.
     12 @//
     13 
     14 @//
     15 @// File Name:  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s
     16 @// OpenMAX DL: v1.0.2
     17 @// Last Modified Revision:   7485
     18 @// Last Modified Date:       Fri, 21 Sep 2007
     19 @//
     20 @// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
     21 @//
     22 @//
     23 @//
     24 @// Description:
     25 @// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
     26 @// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
     27 @// It implements both "scaled"(by 1/2) and "unsclaed" versions of the above formula
     28 @//
     29 
     30 
     31 @// Include standard headers
     32 
     33 #include "dl/api/arm/armCOMM_s.h"
     34 #include "dl/api/arm/omxtypes_s.h"
     35 
     36 
     37 @// Import symbols required from other files
     38 @// (For example tables)
     39 
     40 
     41 @// Set debugging level
     42 @//DEBUG_ON    SETL {TRUE}
     43 
     44 
     45 
     46 @// Guarding implementation by the processor name
     47 
     48 
     49 
     50       @// Guarding implementation by the processor name
     51 
     52 
     53 
     54 @//Input Registers
     55 @// pSrc and pFFTSpec are read by FFTSTAGE and scale is decremented by its scaled variant; pDst is not referenced in the macro body.
     56 #define pSrc            r0
     57 #define pDst            r1
     58 #define pFFTSpec        r2
     59 #define scale           r3
     60 
     61 
     62 @// Output registers
     63 #define result          r0
     64 
     65 @//Local Scratch Registers. Several names alias one register (r4 = argScale/tmpOrder/pTwiddle/x0r, r2 = pFFTSpec/argDst/diffMinusOne/pOut1, r8 = count/step, r9 = diff/step1, r14 = order/zero), so only one name per register can be live at a time.
     66 @// NOTE(review): keep trailing comments off #define lines - if the preprocessor removes only the // part, the @ stays in the macro body (as on the count line below) and would comment out the rest of any instruction using that name; confirm with the toolchain in use.
     67 #define argTwiddle      r1
     68 #define argDst          r2
     69 #define argScale        r4
     70 #define tmpOrder        r4
     71 #define pTwiddle        r4
     72 #define pOut            r5
     73 #define subFFTSize      r7
     74 #define subFFTNum       r6
     75 #define N               r6
     76 #define order           r14
     77 #define diff            r9
     78 #define count           r8                   @// Total num of radix stages required to complete the FFT
     79 #define x0r             r4
     80 #define x0i             r5
     81 #define diffMinusOne    r2
     82 #define round           r3
     83 @// Pointer, counter and stride names used by FFTSTAGE (zero is not referenced in the macro body).
     84 #define pOut1           r2
     85 #define size            r7
     86 #define step            r8
     87 #define step1           r9
     88 #define twStep          r10
     89 #define pTwiddleTmp     r11
     90 #define argTwiddle1     r12
     91 #define zero            r14
     92 
     93 @// Neon registers
     94 @// Every D alias is typed .S32 and every Q alias .S64. dX0r..dX1i share D0-D3 with dX0/dX1/dY0/dY1, and dY0r..dY1i share D4-D7 with dW0r..dW1i.
     95 #define dX0     D0.S32
     96 #define dShift  D1.S32
     97 #define dX1     D1.S32
     98 #define dY0     D2.S32
     99 #define dY1     D3.S32
    100 #define dX0r    D0.S32
    101 #define dX0i    D1.S32
    102 #define dX1r    D2.S32
    103 #define dX1i    D3.S32
    104 #define dW0r    D4.S32
    105 #define dW0i    D5.S32
    106 #define dW1r    D6.S32
    107 #define dW1i    D7.S32
    108 #define dT0     D8.S32
    109 #define dT1     D9.S32
    110 #define dT2     D10.S32
    111 #define dT3     D11.S32
    112 #define qT0     Q6.S64
    113 #define qT1     Q7.S64
    114 #define qT2     Q8.S64
    115 #define qT3     Q9.S64
    116 #define dY0r    D4.S32
    117 #define dY0i    D5.S32
    118 #define dY1r    D6.S32
    119 #define dY1i    D7.S32
    120 @// A further set of names for D4-D7 and D10-D11; none of them is referenced in the FFTSTAGE body.
    121 #define dY2     D4.S32
    122 #define dY3     D5.S32
    123 #define dW0     D6.S32
    124 #define dW1     D7.S32
    125 #define dW0Tmp  D10.S32
    126 #define dW1Neg  D11.S32
    127 
    128 
    129 @ Structure offsets for the FFTSpec
    130         .set    ARMsFFTSpec_N, 0                    @// FFT size; loaded into N by FFTSTAGE
    131         .set    ARMsFFTSpec_pBitRev, 4              @// not referenced by FFTSTAGE
    132         .set    ARMsFFTSpec_pTwiddle, 8             @// twiddle table pointer; loaded into pTwiddle by FFTSTAGE
    133         .set    ARMsFFTSpec_pBuf, 12                @// buffer pointer; loaded into pOut, the base of the area FFTSTAGE writes
    134 @// FFTSTAGE scaled, inverse, name - body shared by both entry points: computes the Z(k) values described below from the data at pSrc and stores them starting N/2*8 bytes into the pBuf buffer.
    135 @// scaled = TRUE selects the halving instructions and decrements scale at the end; name is appended to every label; inverse is not referenced in the body.
    136         .MACRO FFTSTAGE scaled, inverse, name
    137 
    138         @// Read the FFT size N from the spec structure (no log is taken in this macro)
    139         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
    140 
    141         @// Read other structure parameters
    142         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
    143         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
    144 
    145 
    146 
    147         MOV     size,N,ASR #1                    @// size = N/2; N itself is left unchanged
    148         MOV     step,N,LSL #2                    @// step = N/2 * 8 bytes
    149 
    150 
    151         @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
    152         @// Note: W^(k) is stored as negated value and also need to conjugate the values from the table
    153 
    154         @// Z(0) : no need of twiddle multiply
    155         @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
    156 
    157         VLD1    dX0,[pSrc],step                  @// dX0 = F(0) = [b | a]; pSrc += N/2*8 bytes
    158         ADD     pOut1,pOut,step                  @// pOut1 = pOut+ N/2*8 bytes (pOut1 is r2, so pFFTSpec is no longer available)
    159 
    160         VLD1    dX1,[pSrc]!                      @// dX1 = F(N/2) = [d | c]; pSrc += 8 bytes
    161         SUB     twStep,step,size,LSL #1          @// twStep = 3N/8 * 8 bytes pointing to W^1
    162 
    163         MOV     step1,size,LSL #2                @// step1 = N/4 * 8 = N/2*4 bytes
    164         SUB     step1,step1,#8                   @// (N/4-1)*8 bytes
    165         @// VHADD/VHSUB halve their result, so the sums and differences below already carry a factor of 1/2
    166         VHADD    dY0,dX0,dX1                     @// [b+d | a+c]
    167         VHSUB    dY1,dX0,dX1                     @// [b-d | a-c]
    168         VZIP    dY0,dY1                          @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
    169 
    170         .ifeqs  "\scaled", "TRUE"
    171             VHSUB   dX0,dY0,dY1                  @// scaled: halving subtract, lane 0 = (a+c)-(b+d)
    172             SUBS    size,size,#2                 @// size = N/2 - 2; the flags are tested by BLT/BEQ further down
    173             VHADD   dX1,dY0,dY1                  @// scaled: halving add, lane 1 = (a-c)+(b-d)
    174         .else
    175             VSUB   dX0,dY0,dY1                   @// unscaled: lane 0 = (a+c)-(b+d)
    176             SUBS    size,size,#2                 @// size = N/2 - 2; the flags are tested by BLT/BEQ further down
    177             VADD   dX1,dY0,dY1                   @// unscaled: lane 1 = (a-c)+(b-d)
    178         .endif
    179 
    180         SUB     pSrc,pSrc,step                   @// undo the N/2*8 jump: pSrc is now 8 bytes past its entry value, at F(1)
    181 
    182         VST1    dX0[0],[pOut1]!                  @// store lane 0 of dX0 as the real part of Z(0); pOut1 += 4
    183         ADD     pTwiddleTmp,pTwiddle,#8                @// W^2
    184         VST1    dX1[1],[pOut1]!                  @// store lane 1 of dX1 as the imaginary part of Z(0); pOut1 += 4
    185         ADD     argTwiddle1,pTwiddle,twStep            @// W^1
    186 
    187         @// Flags still come from SUBS size,size,#2 above: negative skips to the end, zero leaves only the final element to do
    188         BLT     decrementScale\name
    189         BEQ     lastElement\name
    190 
    191 
    192         @// Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
    193         @// Note: W^k is stored as negative values in the table and also need to conjugate the values from the table
    194         @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1) since both of them
    195         @// require F(1),F(2) and F(N/2-2),F(N/2-1)
    196 
    197 
    198         SUB     step,step,#24                    @// step = (N/2-3)*8 bytes, the gap from F(1) to F(N/2-2)
    199 evenOddButterflyLoop\name :
    200 
    201         @// dW0* hold the twiddles applied to the low pair (stored first), dW1* those applied to the mirrored high pair
    202         VLD1    dW0r,[argTwiddle1],step1         @// one 8-byte table entry; argTwiddle1 += step1
    203         VLD1    dW1r,[argTwiddle1]!              @// entry step1 bytes further on; argTwiddle1 += 8
    204 
    205         VLD2    {dX0r,dX0i},[pSrc],step          @// two low elements split into real/imaginary lanes; pSrc += step
    206         SUB     argTwiddle1,argTwiddle1,step1    @// undo the step1 jump: net advance of 8 bytes per iteration
    207         VLD2    {dX1r,dX1i},[pSrc]!              @// the two mirrored high elements; pSrc += 16
    208 
    209         SUB     step1,step1,#8                          @// (N/4-2)*8 bytes
    210         VLD1    dW0i,[pTwiddleTmp],step1         @// one 8-byte table entry; pTwiddleTmp += step1
    211         VLD1    dW1i,[pTwiddleTmp]!              @// entry step1 bytes further on; pTwiddleTmp += 8
    212         SUB     pSrc,pSrc,step                   @// back to the low end, 16 bytes past where this iteration started
    213 
    214         SUB     pTwiddleTmp,pTwiddleTmp,step1    @// undo the step1 jump: net advance of 8 bytes per iteration
    215         VREV64  dX1r,dX1r                        @// swap the two lanes so lane i of dX1r pairs with lane i of dX0r
    216         VREV64  dX1i,dX1i                        @// same lane swap for the imaginary parts
    217         SUBS    size,size,#4                     @// four elements handled; the flags are tested by BGT at the loop end
    218 
    219 
    220         VHSUB    dT2,dX0r,dX1r                            @// a-c
    221         VHADD    dT3,dX0i,dX1i                            @// b+d
    222         SUB     step1,step1,#8                   @// with the earlier SUB, step1 shrinks by 16 bytes per iteration
    223         VHADD    dT0,dX0r,dX1r                           @// a+c
    224         VHSUB    dT1,dX0i,dX1i                            @// b-d
    225         @// Regroup the two loaded entries: lane 0 of both goes to the ...r register, lane 1 of both to the ...i register
    226         VZIP    dW1r,dW1i
    227         VZIP    dW0r,dW0i
    228 
    229         @// High pair: (qT0 + j*qT1) = (dW1r + j*dW1i) * (dT2 + j*dT3), kept as 64-bit products
    230         VMULL   qT0,dW1r,dT2
    231         VMLSL   qT0,dW1i,dT3                     @// qT0 = dW1r*dT2 - dW1i*dT3
    232         VMULL   qT1,dW1r,dT3
    233         VMLAL   qT1,dW1i,dT2                     @// qT1 = dW1r*dT3 + dW1i*dT2
    234         @// Low pair: (qT2 + j*qT3) = (dW0r - j*dW0i) * (dT2 + j*dT3), i.e. the conjugated table value
    235         VMULL   qT2,dW0r,dT2
    236         VMLAL   qT2,dW0i,dT3                     @// qT2 = dW0r*dT2 + dW0i*dT3
    237         VMULL   qT3,dW0r,dT3
    238         VMLSL   qT3,dW0i,dT2                     @// qT3 = dW0r*dT3 - dW0i*dT2
    239 
    240         @// Round and narrow the 64-bit products back to 32 bits with a right shift of 31
    241         VRSHRN  dX1r,qT0,#31
    242         VRSHRN  dX1i,qT1,#31
    243 
    244         .ifeqs  "\scaled", "TRUE"
    245             VHADD    dY1r,dT0,dX1i                           @// F(N/2 -1)
    246             VHSUB    dY1i,dX1r,dT1                           @// imaginary part: dX1r - dT1, halved
    247         .else
    248             VADD    dY1r,dT0,dX1i                           @// F(N/2 -1)
    249             VSUB    dY1i,dX1r,dT1                           @// imaginary part: dX1r - dT1
    250 
    251         .endif
    252 
    253         @// Swap the lanes back so the high pair is stored in memory order
    254         VREV64  dY1r,dY1r
    255         VREV64  dY1i,dY1i
    256 
    257 
    258         VRSHRN  dX0r,qT2,#31
    259         VRSHRN  dX0i,qT3,#31
    260 
    261         .ifeqs  "\scaled", "TRUE"
    262             VHADD    dY0r,dT0,dX0i                           @// F(1)
    263             VHSUB    dY0i,dT1,dX0r                           @// imaginary part: dT1 - dX0r, halved
    264         .else
    265             VADD    dY0r,dT0,dX0i                           @// F(1)
    266             VSUB    dY0i,dT1,dX0r                           @// imaginary part: dT1 - dX0r
    267         .endif
    268 
    269 
    270         VST2    {dY0r,dY0i},[pOut1],step         @// interleave and store the low pair; pOut1 += step
    271         VST2    {dY1r,dY1i},[pOut1]!             @// interleave and store the high pair; pOut1 += 16
    272         SUB     pOut1,pOut1,step                 @// net advance of 16 bytes per iteration
    273         SUB     step,step,#32                            @// (N/2-4)*8 bytes
    274 
    275         @// Loop while size is still positive (flags from SUBS size,size,#4; nothing in between sets them)
    276         BGT     evenOddButterflyLoop\name
    277 
    278 
    279         SUB     pSrc,pSrc,#8                @// set both the ptrs to the last element
    280         SUB     pOut1,pOut1,#8
    281 
    282         @// Last element can be expanded as follows
    283         @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as -ve)
    284         @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
    285         @// 1/2[2a+j0] - j (c-jd) [0+j2b]
    286         @// (a+bc, -bd)
    287         @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
    288 
    289 lastElement\name :
    290         VLD1    dX0r,[pSrc]                      @// dX0r = [b | a]
    291 
    292         .ifeqs  "\scaled", "TRUE"
    293             VSHR    dX0r,dX0r,#1                 @// scaled variant: signed shift right by one
    294         .endif
    295 
    296         VST1    dX0r[0],[pOut1]!                 @// real part: a
    297         VNEG    dX0r,dX0r
    298         VST1    dX0r[1],[pOut1]                  @// imaginary part: -b
    299 
    300 
    301 
    302 decrementScale\name :
    303 
    304         .ifeqs  "\scaled", "TRUE"
    305             SUB scale,scale,#1                   @// scale (r3) is reduced by one in the scaled variant only
    306         .endif
    307 
    308         .endm
    309 @// Unscaled entry point: FFTSTAGE with scaled = FALSE uses VADD/VSUB, leaves scale untouched, and gets the label suffix Inv.
    310         M_START armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe,r4
    311 @// NOTE(review): M_START/M_END are defined outside this file (presumably in armCOMM_s.h). The body writes r4-r12, D0-D11 and Q6-Q9; confirm what the r4 argument preserves and that callers of this _unsafe routine expect the rest to be clobbered.
    312             FFTSTAGE "FALSE","TRUE",Inv
    313         M_END
    314 @// Scaled entry point: FFTSTAGE with scaled = TRUE uses the halving VHADD/VHSUB/VSHR forms, decrements scale (r3) by one, and gets the label suffix InvSfs.
    315         M_START armSP_FFTInv_CCSToR_S32_Sfs_preTwiddleRadix2_unsafe,r4
    316 @// NOTE(review): M_START/M_END are defined outside this file (presumably in armCOMM_s.h). The body writes r3-r12, D0-D11 and Q6-Q9; confirm what the r4 argument preserves and that callers of this _unsafe routine expect the rest to be clobbered.
    317             FFTSTAGE "TRUE","TRUE",InvSfs
    318         M_END
    319 
    320 
    321         .end
    322