//
//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
//
//  Use of this source code is governed by a BSD-style license
//  that can be found in the LICENSE file in the root of the source
//  tree. An additional intellectual property rights grant can be found
//  in the file PATENTS.  All contributing project authors may
//  be found in the AUTHORS file in the root of the source tree.
//
//  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
//  to support float instead of SC32.
//

// Description:
// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
// complex signal.  This handles the general stage, not the first or last
// stage.
//
// Target / syntax: AArch64 (A64 + Advanced SIMD), GNU assembler syntax,
// run through the C preprocessor (register names below are #define aliases).
//


// Include standard headers

#include "dl/api/arm/arm64COMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


// Import symbols required from other files
// (For example tables)



// Set debugging level
//DEBUG_ON    SETL {TRUE}



// Guarding implementation by the processor name


// Input Registers
// The two counters are passed by reference: they are loaded on entry to
// FFTSTAGE and the updated values are stored back on exit.

#define pSrc            x0      // source pointer (post-incremented by the loads)
#define pDst            x1      // destination pointer (post-incremented by the stores)
#define pTwiddle        x2      // twiddle pointer (advanced once per group)
#define pSubFFTNum      x3      // address of the subFFTNum counter (read and written)
#define pSubFFTSize     x4      // address of the subFFTSize counter (read and written)


// Output Registers


// Local Scratch Registers

#define subFFTNum       x5      // loaded from [pSubFFTNum], then halved (called grpSize below)
#define subFFTSize      x6      // loaded from [pSubFFTSize], then doubled
#define outPointStep    x8      // byte distance between the two outputs of one butterfly
#define pointStep       x9      // byte distance between the two inputs of one butterfly
#define pointStep32     w9      // 32-bit view of pointStep, used as an smull operand
#define grpCount        x10     // outer loop counter, decremented by 2 per group
#define grpCount32      w10     // 32-bit view of grpCount, used as an smull operand
#define setCount        x13     // inner loop counter, decremented by 2 per pass
#define step            x15     // 16 - pointStep: rewinds pSrc to the next input pair
#define dstStep         x11     // 16 - outPointStep: rewinds pDst to the next output pair

// Neon Registers
// All are 2 x float32 views.  v8-v11 are callee-saved under AAPCS64, which is
// presumably why the entry points pass d11 to M_START -- confirm against the
// M_START definition in arm64COMM_s.h.

#define dW      v0.2s           // one complex twiddle, used lane-wise as dW[0] / dW[1]
#define dX0     v2.2s           // point0 real parts (2 lanes)
#define dX1     v3.2s           // point0 imaginary parts
#define dX2     v4.2s           // point1 real parts
#define dX3     v5.2s           // point1 imaginary parts
#define dY0     v6.2s           // (point0 - T) real parts
#define dY1     v7.2s           // (point0 - T) imaginary parts
#define dY2     v8.2s           // (point0 + T) real parts
#define dY3     v9.2s           // (point0 + T) imaginary parts
#define qT0     v10.2s          // T real parts, T = point1 times twiddle
#define qT1     v11.2s          // T imaginary parts

        // FFTSTAGE scaled, inverse, name
        //
        // Emits the body of one general radix-2 stage.
        //   scaled  - not referenced anywhere in this macro body.
        //   inverse - "TRUE" selects the conjugated-twiddle multiply, any
        //             other value selects the plain complex multiply.
        //   name    - suffix appended to the loop labels so the macro can be
        //             expanded more than once in the same file.
        //
        // Reads:  pSrc, pDst, pTwiddle, [pSubFFTNum], [pSubFFTSize]
        // Writes: [pSubFFTNum] (halved), [pSubFFTSize] (doubled), output data
        //         through pDst.
        // Modifies: x0-x2, x5, x6, x8-x11, x13, x15, v0, v2-v11, flags.
        //
        // Both loops are bottom-tested with BGT after subtracting 2, so each
        // body runs at least once.  NOTE(review): this looks like it assumes
        // both counters are positive and even on entry -- confirm with callers.
        .macro FFTSTAGE scaled, inverse, name

        // Define stack arguments

        // Move args values into our work registers
        ldr     subFFTNum, [pSubFFTNum]
        ldr     subFFTSize, [pSubFFTSize]

        // Update grpCount and grpSize right away in order to reuse pGrpCount
        // and pGrpSize regs

        LSR     subFFTNum,subFFTNum,#1                      //grpSize
        LSL     grpCount,subFFTSize,#1


        // pT0+1 increments pT0 by 8 bytes
        // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
        lsl     pointStep, subFFTNum, #2                    // pointStep = 4*grpSize for now

        // update subFFTSize for the next stage
        MOV     subFFTSize,grpCount

        // pOut0+1 increments pOut0 by 8 bytes
        // pOut0+outPointStep == increment of 8*outPointStep bytes =
        //    4*size bytes
        // Signed 32x32->64 multiply of the low words:
        // outPointStep = grpCount * (4*grpSize) bytes.
        smull   outPointStep, grpCount32, pointStep32

        LSL     pointStep,pointStep,#1                      // pointStep = 8*grpSize bytes


        // rsb is not a base A64 mnemonic, presumably it is a macro supplied
        // by the included headers computing (immediate - register) -- confirm.
        rsb      step,pointStep,#16
        rsb      dstStep,outPointStep,#16

        // Loop on the groups
        // One twiddle is loaded per group, grpCount drops by 2 per pass.

radix2GrpLoop\name :
        lsr     setCount, pointStep, #3                     // setCount = pointStep / 8
        LD1     {dW},[pTwiddle],pointStep                   //[wi | wr], pTwiddle += pointStep


        // Loop on the sets
        // Each pass loads two complex values from each of the two input
        // points (LD2 de-interleaves real and imaginary parts) and stores
        // two complex values to each of the two output points.


radix2SetLoop\name :


        // point0: dX0-real part dX1-img part
        LD2    {dX0,dX1},[pSrc],pointStep
        // point1: dX2-real part dX3-img part
        // step = 16 - pointStep, so pSrc ends up 16 bytes past where this
        // pass started.
        LD2    {dX2,dX3},[pSrc],step

        // Flags set here are consumed by the BGT at the bottom of the set
        // loop, nothing in between writes the flags.
        SUBS   setCount,setCount,#2

        .ifeqs  "\inverse", "TRUE"
            // T = point1 * conj(W), with W = dW[0] + i*dW[1]
            fmul   qT0,dX2,dW[0]
            fmla   qT0,dX3,dW[1]                       // real part
            fmul   qT1,dX3,dW[0]
            fmls   qT1,dX2,dW[1]                       // imag part

        .else

            // T = point1 * W, with W = dW[0] + i*dW[1]
            fmul   qT0,dX2,dW[0]
            fmls   qT0,dX3,dW[1]                       // real part
            fmul   qT1,dX3,dW[0]
            fmla   qT1,dX2,dW[1]                       // imag part

        .endif

        // Butterfly: Y0/Y1 = point0 - T, Y2/Y3 = point0 + T
        fsub    dY0,dX0,qT0
        fsub    dY1,dX1,qT1
        fadd    dY2,dX0,qT0
        fadd    dY3,dX1,qT1

        // The difference goes to pDst, the sum outPointStep bytes later.
        st2    {dY0,dY1},[pDst],outPointStep
        // dstStep = -outPointStep + 16
        st2    {dY2,dY3},[pDst],dstStep

        BGT     radix2SetLoop\name

        SUBS    grpCount,grpCount,#2
        // Skip the point1 half of the group just processed (ADD leaves the
        // flags from SUBS intact for the BGT below).
        ADD     pSrc,pSrc,pointStep
        BGT     radix2GrpLoop\name


        // Publish the halved subFFTNum and doubled subFFTSize to the caller.
        str     subFFTNum, [pSubFFTNum]
        str     subFFTSize, [pSubFFTSize]
        .endm



        // Forward stage entry point: FFTSTAGE expanded with inverse = "FALSE".
        // M_START / M_END are not defined in this file, presumably they come
        // from arm64COMM_s.h and emit the prologue / epilogue -- confirm.
        M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace,,d11
        FFTSTAGE "FALSE","FALSE",FWD
        M_END



        // Inverse stage entry point: FFTSTAGE expanded with inverse = "TRUE".
        M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace,,d11
        FFTSTAGE "FALSE","TRUE",INV
        M_END


        .end