//
      2 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3 //
      4 //  Use of this source code is governed by a BSD-style license
      5 //  that can be found in the LICENSE file in the root of the source
      6 //  tree. An additional intellectual property rights grant can be found
      7 //  in the file PATENTS.  All contributing project authors may
      8 //  be found in the AUTHORS file in the root of the source tree.
      9 //
     10 //  This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s
     11 //  to support float instead of SC32.
     12 //
     13 
//
// Description:
// Computes the first-stage radix-8 FFT butterflies for an N-point complex
// signal.
//
     19 
     20 
     21 // Include standard headers
     22 
     23 #include "dl/api/arm/arm64COMM_s.h"
     24 #include "dl/api/arm/omxtypes_s.h"
     25 
     26 // Import symbols required from other files
     27 // (For example tables)
     28 
     29 
     30 // Set debugging level
     31 //DEBUG_ON    SETL {TRUE}
     32 
     33 
     34 
     35 // Guarding implementation by the processor name
     36 
     37 
     38 
     39 
     40 // Guarding implementation by the processor name
     41 
     42 //Input Registers
     43 
     44 #define pSrc            x0
     45 #define pDst            x1
     46 #define pTwiddle        x2
     47 #define	pSubFFTNum	x3
     48 #define pSubFFTSize	x4
     49 
     50 
     51 //Output Registers
     52 
     53 
     54 //Local Scratch Registers
     55 
     56 #define subFFTNum       x5
     57 #define subFFTSize      x6
     58 #define grpSize         x7
     59 // Reuse grpSize as setCount
     60 #define setCount        x7
     61 #define pointStep       x8
     62 #define outPointStep    x8
     63 #define setStep         x9
     64 #define step1           x10
     65 #define step2           x11
     66 #define t0              w12
     67 
     68 
     69 // Neon Registers
     70 
     71 #define dXr0    v0.2s
     72 #define dXi0    v1.2s
     73 #define dXr1    v2.2s
     74 #define dXi1    v3.2s
     75 #define dXr2    v4.2s
     76 #define dXi2    v5.2s
     77 #define dXr3    v6.2s
     78 #define dXi3    v7.2s
     79 #define dXr4    v8.2s
     80 #define dXi4    v9.2s
     81 #define dXr5    v10.2s
     82 #define dXi5    v11.2s
     83 #define dXr6    v12.2s
     84 #define dXi6    v13.2s
     85 #define dXr7    v14.2s
     86 #define dXi7    v15.2s
     87 #define qX0     v0.4s
     88 #define qX1     v1.4s
     89 #define qX2     v2.4s
     90 #define qX3     v3.4s
     91 #define qX4     v4.4s
     92 #define qX5     v5.4s
     93 #define qX6     v6.4s
     94 #define qX7     v7.4s
     95 
     96 #define dUr0    v16.2s
     97 #define dUi0    v17.2s
     98 #define dUr2    v18.2s
     99 #define dUi2    v19.2s
    100 #define dUr4    v20.2s
    101 #define dUi4    v21.2s
    102 #define dUr6    v22.2s
    103 #define dUi6    v23.2s
    104 #define dUr1    v24.2s
    105 #define dUi1    v25.2s
    106 #define dUr3    v26.2s
    107 #define dUi3    v27.2s
    108 #define dUr5    v28.2s
    109 #define dUi5    v29.2s
    110 // reuse dXr7 and dXi7
    111 #define dUr7    v30.2s
    112 #define dUi7    v31.2s
    113 #define qU0     v8.4s
    114 #define qU1     v12.4s
    115 #define qU2     v9.4s
    116 #define qU3     v13.4s
    117 #define qU4     v10.4s
    118 #define qU5     v14.4s
    119 #define qU6     v11.4s
    120 #define qU7     v15.4s
    121 
    122 
    123 #define dVr0    v24.2s
    124 #define dVi0    v25.2s
    125 #define dVr2    v26.2s
    126 #define dVi2    v27.2s
    127 #define dVr4    v28.2s
    128 #define dVi4    v29.2s
    129 #define dVr6    v30.2s
    130 #define dVi6    v31.2s
    131 #define dVr1    v16.2s
    132 #define dVi1    v17.2s
    133 #define dVr3    v18.2s
    134 #define dVi3    v19.2s
    135 #define dVr5    v20.2s
    136 #define dVi5    v21.2s
    137 #define dVr7    v22.2s
    138 #define dVi7    v23.2s
    139 #define qV0     v12.4s
    140 #define qV1     v8.4s
    141 #define qV2     v13.4s
    142 #define qV3     v9.4s
    143 #define qV4     v14.4s
    144 #define qV5     v10.4s
    145 #define qV6     v15.4s
    146 #define qV7     v11.4s
    147 
    148 #define dYr0    v16.2s
    149 #define dYi0    v17.2s
    150 #define dYr2    v18.2s
    151 #define dYi2    v19.2s
    152 #define dYr4    v20.2s
    153 #define dYi4    v21.2s
    154 #define dYr6    v22.2s
    155 #define dYi6    v23.2s
    156 #define dYr1    v24.2s
    157 #define dYi1    v25.2s
    158 #define dYr3    v26.2s
    159 #define dYi3    v27.2s
    160 #define dYr5    v28.2s
    161 #define dYi5    v29.2s
    162 #define dYr7    v30.2s
    163 #define dYi7    v31.2s
    164 #define qY0     v8.4s
    165 #define qY1     v12.4s
    166 #define qY2     v9.4s
    167 #define qY3     v13.4s
    168 #define qY4     v10.4s
    169 #define qY5     v14.4s
    170 #define qY6     v11.4s
    171 #define qY7     v15.4s
    172 
    173 #define dT0     v14.2s
    174 #define dT0s    v14.s
    175 #define dT1     v15.2s
    176 
    177         .macro FFTSTAGE scaled, inverse, name
    178 
    179         // Define stack arguments
    180 
    181         // Move args values into our work registers
    182         ldr     subFFTNum, [pSubFFTNum]
    183         ldr     subFFTSize, [pSubFFTSize]
    184 
    185         // Update pSubFFTSize and pSubFFTNum regs
    186         // subFFTSize = 1 for the first stage
    187 
    188         movz    t0, 0x3f35, lsl #16               // High half word of sqrt(1/2).
    189         movk    t0, 0x04f3                        // Low half word of sqrt(1/2).
    190         MOV     subFFTSize,#8
    191 
    192         // Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
    193         LSR     grpSize,subFFTNum,#3
    194         MOV     subFFTNum,grpSize
    195 
    196 
    197         // pT0+1 increments pT0 by 8 bytes
    198         // pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
    199         // Note: outPointStep = pointStep for firststage
    200 
    201         lsl     pointStep,grpSize, #3
    202 
    203 
    204         // Calculate the step of input data for the next set
    205         //MOV     step1,pointStep,LSL #1             // step1 = 2*pointStep
    206         ld2     {dXr0,dXi0},[pSrc],pointStep         //  data[0]
    207         lsl     step1,grpSize, #4
    208         lsl     step2,pointStep, #3
    209 
    210         ld2     {dXr1,dXi1},[pSrc],pointStep         //  data[1]
    211         SUB     step2,step2,pointStep                // step2 = 7*pointStep
    212         // setStep = - 7*pointStep+16
    213         rsb     setStep,step2,#16
    214 
    215         ld2     {dXr2,dXi2},[pSrc],pointStep         //  data[2]
    216         ld2     {dXr3,dXi3},[pSrc],pointStep         //  data[3]
    217         ld2     {dXr4,dXi4},[pSrc],pointStep         //  data[4]
    218         ld2     {dXr5,dXi5},[pSrc],pointStep         //  data[5]
    219         ld2     {dXr6,dXi6},[pSrc],pointStep         //  data[6]
    220         //  data[7] & update pSrc for the next set
    221         //  setStep = -7*pointStep + 16
    222         ld2     {dXr7,dXi7},[pSrc],setStep
    223         // grp = 0 a special case since all the twiddle factors are 1
    224         // Loop on the sets
    225 
    226 radix8fsGrpZeroSetLoop\name :
    227 
    228         // Decrement setcount
    229         SUBS    setCount,setCount,#2
    230 
    231 
    232         // finish first stage of 8 point FFT
    233 
    234         // fadd    qU0,qX0,qX4
    235         // fadd    qU2,qX1,qX5
    236         // fadd    qU4,qX2,qX6
    237         // fadd    qU6,qX3,qX7
    238         fadd    dUr0,dXr0,dXr4
    239         fadd    dUr2,dXr1,dXr5
    240         fadd    dUr4,dXr2,dXr6
    241         fadd    dUr6,dXr3,dXr7
    242         fadd    dUi0,dXi0,dXi4
    243         fadd    dUi2,dXi1,dXi5
    244         fadd    dUi4,dXi2,dXi6
    245         fadd    dUi6,dXi3,dXi7
    246 
    247         // finish second stage of 8 point FFT
    248 
    249         // fadd    qV0,qU0,qU4
    250         // fsub    qV2,qU0,qU4
    251         // fadd    qV4,qU2,qU6
    252         // fsub    qV6,qU2,qU6
    253         fadd    dVr0,dUr0,dUr4
    254         fsub    dVr2,dUr0,dUr4
    255         fadd    dVr4,dUr2,dUr6
    256         fsub    dVr6,dUr2,dUr6
    257         fadd    dVi0,dUi0,dUi4
    258         fsub    dVi2,dUi0,dUi4
    259         fadd    dVi4,dUi2,dUi6
    260         fsub    dVi6,dUi2,dUi6
    261 
    262         // finish third stage of 8 point FFT
    263 
    264         // fadd    qY0,qV0,qV4
    265         // fsub    qY4,qV0,qV4
    266         fadd    dYr0,dVr0,dVr4
    267         fsub    dYr4,dVr0,dVr4
    268         fadd    dYi0,dVi0,dVi4
    269         fsub    dYi4,dVi0,dVi4
    270 
    271         st2     {dYr0,dYi0},[pDst],step1         // store y0
    272 
    273         .ifeqs  "\inverse", "TRUE"
    274 
    275             fsub    dYr2,dVr2,dVi6
    276             fadd    dYi2,dVi2,dVr6
    277 
    278             fadd    dYr6,dVr2,dVi6
    279             st2     {dYr2,dYi2},[pDst],step1     // store y2
    280             fsub    dYi6,dVi2,dVr6
    281 
    282             // fsub    qU1,qX0,qX4
    283             fsub    dUr1,dXr0,dXr4
    284             fsub    dUi1,dXi0,dXi4
    285 
    286             st2     {dYr4,dYi4},[pDst],step1     // store y4
    287 
    288             // fsub    qU3,qX1,qX5
    289             // fsub    qU5,qX2,qX6
    290             fsub    dUr3,dXr1,dXr5
    291             fsub    dUr5,dXr2,dXr6
    292             fsub    dUi3,dXi1,dXi5
    293             fsub    dUi5,dXi2,dXi6
    294 
    295             st2     {dYr6,dYi6},[pDst],step1     // store y6
    296 
    297         .else
    298 
    299             fadd    dYr6,dVr2,dVi6
    300             fsub    dYi6,dVi2,dVr6
    301 
    302             fsub    dYr2,dVr2,dVi6
    303             st2     {dYr6,dYi6},[pDst],step1     // store y2
    304             fadd    dYi2,dVi2,dVr6
    305 
    306 
    307             // fsub    qU1,qX0,qX4
    308             fsub    dUr1,dXr0,dXr4
    309             fsub    dUi1,dXi0,dXi4
    310 
    311             st2     {dYr4,dYi4},[pDst],step1     // store y4
    312 
    313             // fsub    qU3,qX1,qX5
    314             // fsub    qU5,qX2,qX6
    315             fsub    dUr3,dXr1,dXr5
    316             fsub    dUr5,dXr2,dXr6
    317             fsub    dUi3,dXi1,dXi5
    318             fsub    dUi5,dXi2,dXi6
    319 
    320             st2     {dYr2,dYi2},[pDst],step1     // store y6
    321 
    322 
    323         .endif
    324 
    325         // finish first stage of 8 point FFT
    326 
    327         // fsub    qU7,qX3,qX7
    328         fsub    dUr7,dXr3,dXr7
    329         fsub    dUi7,dXi3,dXi7
    330 
    331         mov     dT0s[0], t0
    332 
    333         // finish second stage of 8 point FFT
    334 
    335         fsub    dVr1,dUr1,dUi5
    336         //  data[0] for next iteration
    337         ld2     {dXr0,dXi0},[pSrc],pointStep
    338         fadd    dVi1,dUi1,dUr5
    339         fadd    dVr3,dUr1,dUi5
    340         ld2     {dXr1,dXi1},[pSrc],pointStep     //  data[1]
    341         fsub    dVi3,dUi1,dUr5
    342 
    343         fsub    dVr5,dUr3,dUi7
    344         ld2     {dXr2,dXi2},[pSrc],pointStep     //  data[2]
    345         fadd    dVi5,dUi3,dUr7
    346         fadd    dVr7,dUr3,dUi7
    347         ld2     {dXr3,dXi3},[pSrc],pointStep     //  data[3]
    348         fsub    dVi7,dUi3,dUr7
    349 
    350         // finish third stage of 8 point FFT
    351 
    352         .ifeqs  "\inverse", "TRUE"
    353 
    354             // calculate a*v5
    355             fmul    dT1,dVr5,dT0[0]              // use dVi0 for dT1
    356 
    357             ld2     {dXr4,dXi4},[pSrc],pointStep //  data[4]
    358             fmul    dVi5,dVi5,dT0[0]
    359 
    360             ld2     {dXr5,dXi5},[pSrc],pointStep //  data[5]
    361             fsub    dVr5,dT1,dVi5                // a * V5
    362             fadd    dVi5,dT1,dVi5
    363 
    364             ld2     {dXr6,dXi6},[pSrc],pointStep //  data[6]
    365 
    366             // calculate  b*v7
    367             fmul    dT1,dVr7,dT0[0]
    368             fmul    dVi7,dVi7,dT0[0]
    369 
    370             // fadd    qY1,qV1,qV5
    371             // fsub    qY5,qV1,qV5
    372             fadd    dYr1,dVr1,dVr5
    373             fsub    dYr5,dVr1,dVr5
    374             fadd    dYi1,dVi1,dVi5
    375             fsub    dYi5,dVi1,dVi5
    376 
    377             fadd    dVr7,dT1,dVi7                // b * V7
    378             fsub    dVi7,dVi7,dT1
    379             SUB     pDst, pDst, step2            // set pDst to y1
    380 
    381             // On the last iteration,  this will read past the end of pSrc,
    382             // so skip this read.
    383             BEQ     radix8SkipLastUpdateInv\name
    384             ld2     {dXr7,dXi7},[pSrc],setStep   //  data[7]
    385 radix8SkipLastUpdateInv\name:
    386 
    387             fsub    dYr3,dVr3,dVr7
    388             fsub    dYi3,dVi3,dVi7
    389             st2     {dYr1,dYi1},[pDst],step1     // store y1
    390             fadd    dYr7,dVr3,dVr7
    391             fadd    dYi7,dVi3,dVi7
    392 
    393 
    394             st2     {dYr3,dYi3},[pDst],step1     // store y3
    395             st2     {dYr5,dYi5},[pDst],step1     // store y5
    396             st2     {dYr7,dYi7},[pDst]           // store y7
    397             ADD pDst, pDst, #16
    398 
    399         .else
    400 
    401             // calculate  b*v7
    402             fmul    dT1,dVr7,dT0[0]
    403             ld2     {dXr4,dXi4},[pSrc],pointStep //  data[4]
    404             fmul    dVi7,dVi7,dT0[0]
    405 
    406             ld2     {dXr5,dXi5},[pSrc],pointStep //  data[5]
    407             fadd    dVr7,dT1,dVi7                     // b * V7
    408             fsub    dVi7,dVi7,dT1
    409 
    410             ld2     {dXr6,dXi6},[pSrc],pointStep //  data[6]
    411 
    412             // calculate a*v5
    413             fmul    dT1,dVr5,dT0[0]              // use dVi0 for dT1
    414             fmul    dVi5,dVi5,dT0[0]
    415 
    416             fadd    dYr7,dVr3,dVr7
    417             fadd    dYi7,dVi3,dVi7
    418             SUB     pDst, pDst, step2            // set pDst to y1
    419 
    420             fsub    dVr5,dT1,dVi5                // a * V5
    421             fadd    dVi5,dT1,dVi5
    422 
    423             // On the last iteration,  this will read past the end of pSrc,
    424             // so skip this read.
    425             BEQ     radix8SkipLastUpdateFwd\name
    426             ld2     {dXr7,dXi7},[pSrc],setStep   //  data[7]
    427 radix8SkipLastUpdateFwd\name:
    428 
    429             // fsub    qY5,qV1,qV5
    430             fsub    dYr5,dVr1,dVr5
    431             fsub    dYi5,dVi1,dVi5
    432 
    433             fsub    dYr3,dVr3,dVr7
    434             st2     {dYr7,dYi7},[pDst],step1     // store y1
    435             fsub    dYi3,dVi3,dVi7
    436 
    437             // fadd    qY1,qV1,qV5
    438             fadd    dYr1,dVr1,dVr5
    439             fadd    dYi1,dVi1,dVi5
    440 
    441             st2     {dYr5,dYi5},[pDst],step1     // store y3
    442             st2     {dYr3,dYi3},[pDst],step1     // store y5
    443             st2     {dYr1,dYi1},[pDst],#16       // store y7
    444 
    445         .endif
    446 
    447 
    448         // update pDst for the next set
    449         SUB     pDst, pDst, step2
    450         BGT     radix8fsGrpZeroSetLoop\name
    451 
    452         // Save subFFTNum and subFFTSize for next stage
    453         str     subFFTNum, [pSubFFTNum]
    454         str     subFFTSize, [pSubFFTSize]
    455 
    456         .endm
    457 
    458 
    459         // Allocate stack memory required by the function
    460 
    461 
    462         M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace,,d15
    463             FFTSTAGE "FALSE","FALSE",FWD
    464         M_END
    465 
    466 
    467         M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace,,d15
    468             FFTSTAGE "FALSE","TRUE",INV
    469         M_END
    470 
    471 
    472 
    473         .end
    474