Home | History | Annotate | Download | only in arm64
      1 //
      2 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 //
      4 //  Use of this source code is governed by a BSD-style license
      5 //  that can be found in the LICENSE file in the root of the source
      6 //  tree. An additional intellectual property rights grant can be found
      7 //  in the file PATENTS.  All contributing project authors may
      8 //  be found in the AUTHORS file in the root of the source tree.
      9 //
     10 //  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
     11 //  to support float instead of SC32.
     12 //
     13 
     14 //
     15 // Description:
     16 // Compute a radix-4 FFT stage for an N-point complex signal
     17 //
     18 //
     19 
     20 
     21 // Include standard headers
     22 
     23 #include "dl/api/arm/arm64COMM_s.h"
     24 #include "dl/api/arm/omxtypes_s.h"
     25 
     26 // Import symbols required from other files
     27 // (For example tables)
     28 
     29 
     30 
     31 
     32 // Set debugging level
     33 //DEBUG_ON    SETL {TRUE}
     34 
     35 
     36 // Guarding implementation by the processor name
     37 
     38 
     39 // Import symbols required from other files
     40 // (For example tables)
     41     //IMPORT  armAAC_constTable
     42 
     43 // Input registers: pSrc, pDst and pTwiddle are used as load/store base
     44 // addresses; pSubFFTNum and pSubFFTSize are dereferenced with ldr.
     45 #define pSrc        x0
     46 #define pDst        x1
     47 #define pTwiddle    x2
     48 #define pSubFFTNum  x3
     49 #define pSubFFTSize x4
     50 
     51 
     52 
     53 //Output Registers
     54 
     55 
     56 // Local scratch registers, listed in ascending register order.
     57 // No alias in this list names x7.
     58 #define subFFTNum    x5
     59 #define subFFTSize   x6
     60 #define outPointStep x8
     61 #define grpCount     x9
     62 #define dstStep      x10
     63 #define step16       x11
     64 #define step24       x12
     65 #define grpTwStep    x13
     66 #define stepTwiddle  x14
     67 #define twStep       x15
     68 
     69 
     70 // NEON register aliases.  Names that share a physical register are listed
     71 // together; the *8b names are the views used as vector mov operands.
     72 #define dButterfly1Real02   v0.2s
     73 #define dButterfly1Real028b v0.8b
     74 #define dXr0                v0.2s
     75 #define dXr08b              v0.8b
     76 #define dButterfly1Imag02   v1.2s
     77 #define dButterfly1Imag028b v1.8b
     78 #define dXi0                v1.2s
     79 #define dXi08b              v1.8b
     80 #define dButterfly1Real13   v2.2s
     81 #define dButterfly1Real138b v2.8b
     82 #define dXr1                v2.2s
     83 #define dButterfly1Imag13   v3.2s
     84 #define dButterfly1Imag138b v3.8b
     85 #define dXi1                v3.2s
     86 #define dButterfly2Real02   v4.2s
     87 #define dXr2                v4.2s
     88 #define dButterfly2Imag02   v5.2s
     89 #define dXi2                v5.2s
     90 #define dButterfly2Real13   v6.2s
     91 #define dXr3                v6.2s
     92 #define dButterfly2Imag13   v7.2s
     93 #define dXi3                v7.2s
     94 // W: twiddle values loaded from pTwiddle (v8-v13)
     95 #define dW1r                v8.2s
     96 #define dW1i                v9.2s
     97 #define dW2r                v10.2s
     98 #define dW2r8b              v10.8b
     99 #define dW2i                v11.2s
    100 #define dW3r                v12.2s
    101 #define dW3r8b              v12.8b
    102 #define dW3i                v13.2s
    103 // Z: dZ0 is a copy of dX0, dZ1..dZ3 are W*X products; reused for outputs
    104 #define dZr0                v14.2s
    105 #define dZr08b              v14.8b
    106 #define dZi0                v15.2s
    107 #define dZi08b              v15.8b
    108 #define dZr1                v26.2s
    109 #define dZi1                v27.2s
    110 #define dZr2                v28.2s
    111 #define dZi2                v29.2s
    112 #define dZr3                v30.2s
    113 #define dZi3                v31.2s
    114 // Y: sums and differences of Z pairs (v16-v23)
    115 #define dYr0                v16.2s
    116 #define dYi0                v17.2s
    117 #define dYr1                v18.2s
    118 #define dYi1                v19.2s
    119 #define dYr2                v20.2s
    120 #define dYi2                v21.2s
    121 #define dYr3                v22.2s
    122 #define dYi3                v23.2s
    123 // Temporary used by the zip1/zip2 and uzp1/uzp2 sequences
    124 #define dZip                v24.2s
    125 #define dZip8b              v24.8b
    126 
    127         .macro FFTSTAGE scaled, inverse , name
                // FFTSTAGE scaled, inverse, name
                //   scaled  - not referenced anywhere in this macro body.
                //   inverse - "TRUE" selects the conj(W)*X twiddle multiplies and
                //             the first output ordering below; any other value
                //             selects the W*X multiplies and the second ordering.
                //   name    - suffix appended to the local labels so that the
                //             macro can be expanded more than once per file.
                //
                // Each pass of the loop consumes 64 bytes from pSrc (two ld4
                // loads of 32 bytes) and issues four st2 stores through pDst.
                // The loads for pass n+1 are issued in the middle of pass n.
                //
                // The flags written by "SUBS grpCount" are read by the bne/beq
                // pair and by the closing BGT; none of the instructions between
                // them in this macro is a flag-setting form.
                //
                // NOTE(review): the meaning of each stored value as an FFT output
                // depends on the twiddle table contents, which are not visible
                // here; the comments below describe the arithmetic only.
    128 
    129         // Define stack arguments
    130 
    131         // Move args values into our work registers
    132         ldr     subFFTNum, [pSubFFTNum]
    133         ldr     subFFTSize, [pSubFFTSize]
    134 
    135         // pOut0+1 increments pOut0 by 8 bytes
    136         // pOut0+outPointStep == increment of 8*outPointStep bytes
                // outPointStep itself is kept in bytes: subFFTSize << 3.
    137         lsl     outPointStep,subFFTSize, #3
    138 
    139         // Compute grpCount right away (grpCount = subFFTSize << 2, below)
    140 
                // Initial twiddle loads.  The first two read at pTwiddle without
                // advancing it; the following three post-increment by +16, +8 and
                // grpTwStep (-8), so pTwiddle ends 16 bytes past its entry value.
    141         ld2    {dW1r,dW1i},[pTwiddle]             // [wi|wr]
    142         MOV     step16,#16
    143         LSL     grpCount,subFFTSize,#2
    144 
    145         ld1    {dW2r},[pTwiddle]                  // [wi|wr]
                // This overwrites the subFFTNum value loaded above.  Neither
                // subFFTNum nor subFFTSize is stored back inside this macro.
    146         MOV     subFFTNum,#1                      //after the last stage
    147 
    148         ld1    {dW3r},[pTwiddle],step16           // [wi|wr]
    149         MOV     stepTwiddle,#0
    150 
    151         ld1    {dW2i},[pTwiddle],#8               // [wi|wr]
    152         SUB     grpTwStep,stepTwiddle,#8          // grpTwStep = -8 to start with
    153 
    154         // update subFFTSize for the next stage
    155         MOV     subFFTSize,grpCount
    156         ld1    {dW3i},[pTwiddle],grpTwStep        // [wi|wr]
    157         lsl     dstStep,outPointStep, #1
    158 
    159         // AC.r AC.i BD.r BD.i
    160         ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
    161         ADD     dstStep,dstStep,outPointStep      // dstStep = 3*outPointStep
    162 
                // NOTE(review): rsb is not defined in this file; presumably a
                // reverse-subtract helper macro from arm64COMM_s.h - confirm.
    163         rsb     dstStep,dstStep,#16               // dstStep = - 3*outPointStep+16
    164         MOV     step24,#24
    165 
    166         // AC.r AC.i BD.r BD.i
    167         ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
    168 
    169 
    170         // Process two groups at a time
    171 
    172 radix4lsGrpLoop\name :
    173 
                // zip1/zip2 (and uzp1/uzp2 below) each write a single result
                // register, so every two-register shuffle named in the comments
                // is built in three steps through the dZip temporary.
    174         // VZIP    dW2r,dW2i
    175         zip1    dZip, dW2r, dW2i
    176         zip2    dW2i, dW2r, dW2i
    177         mov     dW2r8b, dZip8b
    178 
                // stepTwiddle grows by 16 on every pass.
    179         ADD     stepTwiddle,stepTwiddle,#16
    180 
    181         // VZIP    dW3r,dW3i
    182         zip1    dZip, dW3r,dW3i
    183         zip2    dW3i, dW3r, dW3i
    184         mov     dW3r8b, dZip8b
                // grpTwStep is built in three steps: +4 here, doubled by the lsl
                // and negated by the rsb further down.
    185         ADD     grpTwStep,stepTwiddle,#4
    186 
    187         // VUZP     dButterfly1Real13, dButterfly2Real13      // B.r D.r
    188         uzp1     dZip, dButterfly1Real13, dButterfly2Real13   // B.r D.r
    189         uzp2     dButterfly2Real13, dButterfly1Real13, dButterfly2Real13   // B.r D.r
    190         mov      dButterfly1Real138b, dZip8b
    191 
    192         SUB     twStep,stepTwiddle,#16                        // -16+stepTwiddle
    193 
    194         // VUZP     dButterfly1Imag13, dButterfly2Imag13      // B.i D.i
    195         uzp1     dZip, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
    196         uzp2     dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
    197         mov      dButterfly1Imag138b, dZip8b
    198         lsl     grpTwStep,grpTwStep,#1
    199 
    200         // VUZP     dButterfly1Real02, dButterfly2Real02      // A.r C.r
    201         uzp1     dZip, dButterfly1Real02, dButterfly2Real02   // A.r C.r
    202         uzp2     dButterfly2Real02, dButterfly1Real02, dButterfly2Real02   // A.r C.r
    203         mov      dButterfly1Real028b, dZip8b
    204         rsb     grpTwStep,grpTwStep,#0                        // -8-2*stepTwiddle
    205 
    206         // VUZP     dButterfly1Imag02, dButterfly2Imag02      // A.i C.i
    207         uzp1     dZip, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
    208         uzp2     dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
    209         mov      dButterfly1Imag028b, dZip8b
    210 
    211 
    212         // grpCount is multiplied by 4
                // (it was set to subFFTSize << 2 before the loop); 8 is removed
                // per pass.  The flags set here drive bne/beq/BGT below.
    213         SUBS    grpCount,grpCount,#8
    214 
                // Z1 = conj(W1) * X1 when inverse is "TRUE", W1 * X1 otherwise.
    215         .ifeqs  "\inverse", "TRUE"
    216             fmul   dZr1,dW1r,dXr1
    217             fmla   dZr1,dW1i,dXi1                       // real part
    218             fmul   dZi1,dW1r,dXi1
    219             fmls   dZi1,dW1i,dXr1                       // imag part
    220 
    221         .else
    222 
    223             fmul   dZr1,dW1r,dXr1
    224             fmls   dZr1,dW1i,dXi1                       // real part
    225             fmul   dZi1,dW1r,dXi1
    226             fmla   dZi1,dW1i,dXr1                       // imag part
    227 
    228         .endif
    229 
                // Twiddle reloads for the next pass start here.  The five
                // post-increments in one pass (stepTwiddle, 16, twStep, 24,
                // grpTwStep) sum to +16 bytes.
    230         ld2    {dW1r,dW1i},[pTwiddle],stepTwiddle       // [wi|wr]
    231 
                // Z2 = conj(W2) * X2 when inverse is "TRUE", W2 * X2 otherwise.
                // dW2r is reloaded as soon as its last use has been issued.
    232         .ifeqs  "\inverse", "TRUE"
    233             fmul   dZr2,dW2r,dXr2
    234             fmla   dZr2,dW2i,dXi2                       // real part
    235             fmul   dZi2,dW2r,dXi2
    236             ld1   {dW2r},[pTwiddle],step16              // [wi|wr]
    237             fmls   dZi2,dW2i,dXr2                       // imag part
    238 
    239         .else
    240 
    241             fmul   dZr2,dW2r,dXr2
    242             fmls   dZr2,dW2i,dXi2                       // real part
    243             fmul   dZi2,dW2r,dXi2
    244             ld1    {dW2r},[pTwiddle],step16             // [wi|wr]
    245             fmla   dZi2,dW2i,dXr2                       // imag part
    246 
    247         .endif
    248 
    249 
    250         ld1    {dW2i},[pTwiddle],twStep                 // [wi|wr]
    251 
    252         // move qX0 so as to load for the next iteration
    253         // MOV     qZ0,qX0
    254         mov     dZr08b, dXr08b
    255         mov     dZi08b, dXi08b
    256 
                // Z3 = conj(W3) * X3 when inverse is "TRUE", W3 * X3 otherwise.
    257         .ifeqs  "\inverse", "TRUE"
    258             fmul   dZr3,dW3r,dXr3
    259             fmla   dZr3,dW3i,dXi3                       // real part
    260             fmul   dZi3,dW3r,dXi3
    261             ld1    {dW3r},[pTwiddle],step24
    262             fmls   dZi3,dW3i,dXr3                       // imag part
    263 
    264         .else
    265 
    266             fmul   dZr3,dW3r,dXr3
    267             fmls   dZr3,dW3i,dXi3                       // real part
    268             fmul   dZi3,dW3r,dXi3
    269             ld1    {dW3r},[pTwiddle],step24
    270             fmla   dZi3,dW3i,dXr3                       // imag part
    271 
    272         .endif
    273 
    274         ld1    {dW3i},[pTwiddle],grpTwStep              // [wi|wr]
    275 
    276         // Don't do the load on the last iteration so we don't read past the end
    277         // of pSrc.
                // When the SUBS above reached zero, pSrc is instead advanced by
                // the 64 bytes that the two skipped ld4 loads would have added.
                // The add is the non-flag-setting form, so beq sees the same flags.
    278         bne     skipIncrement\name
    279         add     pSrc, pSrc, #64
    280 skipIncrement\name:
    281         beq     radix4lsSkipRead\name
    282         // AC.r AC.i BD.r BD.i
    283         ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
    284 
    285         // AC.r AC.i BD.r BD.i
    286         ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
    287 radix4lsSkipRead\name:
    288 
    289         // finish first stage of 4 point FFT
                // Each q-register operation named in a comment is carried out as
                // a real-part and an imaginary-part instruction on .2s registers.
    290 
    291         // fadd    qY0,qZ0,qZ2
    292         fadd    dYr0,dZr0,dZr2
    293         fadd    dYi0,dZi0,dZi2
    294         // fsub    qY2,qZ0,qZ2
    295         fsub    dYr2,dZr0,dZr2
    296         fsub    dYi2,dZi0,dZi2
    297         // fadd    qY1,qZ1,qZ3
    298         fadd    dYr1,dZr1,dZr3
    299         fadd    dYi1,dZi1,dZi3
    300         // fsub    qY3,qZ1,qZ3
    301         fsub    dYr3,dZr1,dZr3
    302         fsub    dYi3,dZi1,dZi3
    303 
    304 
    305         // finish second stage of 4 point FFT
                // Stores are interleaved with the remaining arithmetic.  The
                // first three advance pDst by outPointStep; the fourth advances
                // it by dstStep, a net +16 bytes over the four stores.
    306 
    307         .ifeqs  "\inverse", "TRUE"
    308 
                    // Store order: Y2-Y1, Y0-j*Y3, Y2+Y1, Y0+j*Y3
    309             // fsub    qZ0,qY2,qY1
    310             fsub    dZr0,dYr2,dYr1
    311             fsub    dZi0,dYi2,dYi1
    312             fadd    dZr3,dYr0,dYi3
    313             st2    {dZr0,dZi0},[pDst],outPointStep
    314             fsub    dZi3,dYi0,dYr3
    315 
    316             // fadd    qZ2,qY2,qY1
    317             fadd    dZr2,dYr2,dYr1
    318             fadd    dZi2,dYi2,dYi1
    319 
    320             st2    {dZr3,dZi3},[pDst],outPointStep
    321 
    322             fsub    dZr1,dYr0,dYi3
    323             st2    {dZr2,dZi2},[pDst],outPointStep
    324             fadd    dZi1,dYi0,dYr3
    325 
    326             // dstStep = -3*outPointStep + 16
    327             st2    {dZr1,dZi1},[pDst],dstStep
    328 
    329 
    330         .else
    331 
                    // Store order: Y2-Y1, Y0+j*Y3, Y2+Y1, Y0-j*Y3
    332             // fsub    qZ0,qY2,qY1
    333             fsub    dZr0,dYr2,dYr1
    334             fsub    dZi0,dYi2,dYi1
    335 
    336             fsub    dZr1,dYr0,dYi3
    337             st2    {dZr0,dZi0},[pDst],outPointStep
    338             fadd    dZi1,dYi0,dYr3
    339 
    340             // fadd    qZ2,qY2,qY1
    341             fadd    dZr2,dYr2,dYr1
    342             fadd    dZi2,dYi2,dYi1
    343 
    344             st2    {dZr1,dZi1},[pDst],outPointStep
    345 
    346             fadd    dZr3,dYr0,dYi3
    347             st2    {dZr2,dZi2},[pDst],outPointStep
    348             fsub    dZi3,dYi0,dYr3
    349 
    350             // dstStep = -3*outPointStep + 16
    351             st2    {dZr3,dZi3},[pDst],dstStep
    352 
    353 
    354         .endif
    355 
                // Signed test on the SUBS result: loop while grpCount is above 0.
    356         BGT     radix4lsGrpLoop\name
    357 
    358         .endm
    359 // Forward entry point: FFTSTAGE expanded with inverse="FALSE" (the W*X
    360 // twiddle multiplies) and the label suffix "fwd".
        // NOTE(review): ",,d15" presumably makes M_START/M_END preserve d8-d15,
        // which FFTSTAGE overwrites (v8-v15) - confirm in arm64COMM_s.h.
    361         M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
    362         FFTSTAGE "FALSE","FALSE",fwd
    363         M_END
    364 // Inverse entry point: FFTSTAGE expanded with inverse="TRUE" (the
    365 // conj(W)*X twiddle multiplies) and the label suffix "inv".
        // NOTE(review): ",,d15" presumably makes M_START/M_END preserve d8-d15,
        // which FFTSTAGE overwrites (v8-v15) - confirm in arm64COMM_s.h.
    366         M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
    367         FFTSTAGE "FALSE","TRUE",inv
    368         M_END
    369 
    370 
    371         .end
    372