//
//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
//
//  Use of this source code is governed by a BSD-style license
//  that can be found in the LICENSE file in the root of the source
//  tree. An additional intellectual property rights grant can be found
//  in the file PATENTS.  All contributing project authors may
//  be found in the AUTHORS file in the root of the source tree.
//
//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
//  to support float instead of SC32.
//

//
// Description:
// Compute FFT for a real signal
//
// Syntax: GNU as (AArch64), passed through the C preprocessor.
//

// Include standard headers

#include "dl/api/arm/arm64COMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

// Set debugging level
//DEBUG_ON    SETL {TRUE}


// Input registers

#define pSrc            x0
#define pDst            x1
#define pTwiddle        x2
#define pOut            x3
#define subFFTNum       x4

// Local scratch registers
// (step1 reuses pTwiddle and pTwiddleTmp reuses pOut, so both argument
//  registers are overwritten by the body.)

#define argTwiddle      x5
#define argDst          x6
#define subFFTSize      x7
#define N               subFFTNum
#define step            x8
#define step1           pTwiddle
#define twStep          x9
#define zero            w10
#define pTwiddleTmp     pOut

// Neon registers
// (dY0r/dY0i/dY1r/dY1i share registers with dW0r/dW0i/dW1r/dW1i, and
//  half shares v0 with dX0.)

#define dX0             v0.2s
#define dX0s            v0.s
#define dX0r            v2.2s
#define dX0rs           v2.s
#define dX0i            v3.2s
#define dX0is           v3.s
#define dX1r            v4.2s
#define dX1i            v5.2s
#define dT0             v6.2s
#define dT1             v7.2s
#define dT2             v8.2s
#define dT3             v9.2s
#define qT0             v10.2s
#define qT1             v12.2s
#define dW0r            v14.2s
#define dW0r8b          v14.8b
#define dW0i            v15.2s
#define dW1r            v16.2s
#define dW1r8b          v16.8b
#define dW1i            v17.2s
#define dY0r            v14.2s
#define dY0i            v15.2s
#define dY1r            v16.2s
#define dY1i            v17.2s
#define qT2             v18.2s
#define qT3             v20.2s

#define half            v0.2s
#define dZip            v21.2s
#define dZip8b          v21.8b

//-----------------------------------------------------------------------
// ComplexToRealFixup
//
// Converts the complex values Z(k) read from pSrc into the values F(k)
// written to pDst, using the formulas quoted in the comments below.
// All data is accessed as interleaved 32-bit float (re, im) pairs.
//
// In:    x0 = pSrc       source Z(k); advanced while reading
//        x1 = pDst       destination F(k); only copied, never written
//        x2 = pTwiddle   twiddle table; register is reused as step1
//        x3 = pOut       incoming value is never read; reused as
//                        pTwiddleTmp
//        x4 = subFFTNum  N; halved by the first instruction
// Out:   F(0) at pDst, F(N/2) at pDst + (N/2)*8 bytes, remaining F(k)
//        in between.
// Writes: x0, x2-x10, v0, v2-v10, v12, v14-v18, v20, v21 and the NZCV
//        flags.
// NOTE(review): M_START is given d15; presumably it (with M_END)
//        preserves the callee-saved d registers up to d15 and emits the
//        prologue/return - confirm in arm64COMM_s.h.
//-----------------------------------------------------------------------

        // Write function header
        M_START ComplexToRealFixup,,d15

        asr     N, N, #1                        // continue with N/2
        mov     subFFTSize, subFFTNum           // subFFTSize = N/2
        mov     argDst, pDst
        mov     argTwiddle, pTwiddle

        // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
        //        1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
        //        1/2[2a+j0] - j [0+j2b]
        //      = (a+b, 0)

        // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
        //        1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
        //        1/2[2a+j0] + j [0+j2b]
        //      = (a-b, 0)

        // F(0) and F(N/2)
        // Lane 0 of dX0r/dX0i receives Z(0).r / Z(0).i; lane 1 of each is
        // cleared so the imaginary halves of both results come out as 0.
        ld2     {dX0rs,dX0is}[0], [pSrc], #8
        mov     zero, #0
        mov     dX0rs[1], zero
        lsl     step, subFFTSize, #3            // step = N/2 * 8 bytes
        mov     dX0i[1], zero
        // twStep = 3N/8 * 8 bytes pointing to W^1
        sub     twStep, step, subFFTSize, lsl #1

        fadd    dY0r, dX0r, dX0i                // F(0)   = ((Z0.r+Z0.i) , 0)
        lsl     step1, subFFTSize, #2           // step1  = N/2 * 4 bytes
        fsub    dY0i, dX0r, dX0i                // F(N/2) = ((Z0.r-Z0.i) , 0)
        // Flags set here are consumed by b.lt / b.eq below; nothing in
        // between modifies NZCV.
        subs    subFFTSize, subFFTSize, #2

        st1     {dY0r}, [argDst], step          // F(0), then jump to the F(N/2) slot
        add     pTwiddleTmp, argTwiddle, #8     // W^2
        st1     {dY0i}, [argDst], #8            // F(N/2)
        add     argTwiddle, argTwiddle, twStep  // W^1

        sub     argDst, argDst, step            // back to the F(1) slot

        b.lt    End                             // N/2 < 2: nothing else to do
        b.eq    lastElement                     // N/2 == 2: only one element left
        sub     step, step, #24
        sub     step1, step1, #8                // (N/4-1)*8 bytes

        // F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
        // Note: W^k is stored as negative values in the table
        // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
        // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)

        fmov    half, #0.5                      // 0.5 in both lanes

evenOddButterflyLoop:

        // Two 8-byte twiddle loads through argTwiddle, step1 bytes apart;
        // the pointer ends up 8 bytes further on than it started.
        ld1     {dW0r}, [argTwiddle], step1
        ld1     {dW1r}, [argTwiddle], #8

        // Two elements from the front and two from step bytes further on,
        // de-interleaved into .r/.i registers; pSrc advances by 16 net.
        ld2     {dX0r,dX0i}, [pSrc], step
        sub     argTwiddle, argTwiddle, step1
        ld2     {dX1r,dX1i}, [pSrc], #16

        // Same pattern through pTwiddleTmp, with a stride 8 bytes smaller.
        sub     step1, step1, #8                // (N/4-2)*8 bytes
        ld1     {dW0i}, [pTwiddleTmp], step1
        ld1     {dW1i}, [pTwiddleTmp], #8
        sub     pSrc, pSrc, step

        sub     pTwiddleTmp, pTwiddleTmp, step1
        rev64   dX1r, dX1r                      // swap the two lanes so the far-end
        rev64   dX1i, dX1i                      // pair lines up with the front pair
        // Loop condition; consumed by b.gt at the bottom of the loop.
        subs    subFFTSize, subFFTSize, #4

        fsub    dT2, dX0r, dX1r                 // a-c
        sub     step1, step1, #8                // stride for the next iteration
        fadd    dT0, dX0r, dX1r                 // a+c
        fsub    dT1, dX0i, dX1i                 // b-d
        fadd    dT3, dX0i, dX1i                 // b+d
        fmul    dT0, dT0, half[0]
        fmul    dT1, dT1, half[0]

        // Stand-in for the two-register in-place VZIP (VZIP dW1r,dW1i and
        // VZIP dW0r,dW0i): afterwards dWxr holds the first float of each of
        // the two loads and dWxi the second float of each. The result of
        // zip1 goes through dZip because dWxr is still needed as an input
        // to zip2; the copy uses the .8b view of the registers.
        zip1    dZip, dW1r, dW1i
        zip2    dW1i, dW1r, dW1i
        mov     dW1r8b, dZip8b
        zip1    dZip, dW0r, dW0i
        zip2    dW0i, dW0r, dW0i
        mov     dW0r8b, dZip8b

        fmul    qT0, dW1r, dT2
        fmul    qT1, dW1r, dT3
        fmul    qT2, dW0r, dT2
        fmul    qT3, dW0r, dT3

        fmla    qT0, dW1i, dT3                  // qT0 = W1r*(a-c) + W1i*(b+d)
        fmls    qT1, dW1i, dT2                  // qT1 = W1r*(b+d) - W1i*(a-c)

        fmls    qT2, dW0i, dT3                  // qT2 = W0r*(a-c) - W0i*(b+d)
        fmla    qT3, dW0i, dT2                  // qT3 = W0r*(b+d) + W0i*(a-c)

        fmul    dX1r, qT0, half[0]
        fmul    dX1i, qT1, half[0]

        // dY1r/dY1i overwrite dW1r/dW1i, which are no longer needed.
        fsub    dY1r, dT0, dX1i                 // F(N/2 -1)
        fadd    dY1i, dT1, dX1r
        fneg    dY1i, dY1i

        rev64   dY1r, dY1r                      // restore memory order of the
        rev64   dY1i, dY1i                      // far-end pair before storing

        fmul    dX0r, qT2, half[0]
        fmul    dX0i, qT3, half[0]

        // dY0r/dY0i overwrite dW0r/dW0i, which are no longer needed.
        fsub    dY0r, dT0, dX0i                 // F(1)
        fadd    dY0i, dT1, dX0r

        // Store mirrors the load pattern: argDst advances by 16 net.
        st2     {dY0r,dY0i}, [argDst], step
        st2     {dY1r,dY1i}, [argDst], #16
        sub     argDst, argDst, step
        sub     step, step, #32                 // (N/2-4)*8 bytes

        b.gt    evenOddButterflyLoop

        // set both the ptrs to the last element
        sub     pSrc, pSrc, #8
        sub     argDst, argDst, #8

        // Last element can be expanded as follows
        // 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
        // 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
        // 1/2[2a+j0] + j (c+jd) [0+j2b]
        // (a-bc, -bd)
        // Since (c,d) = (0,1) for the last element, result is just (a,-b)

lastElement:
        ld1     {dX0r}, [pSrc]                  // (a, b)

        st1     {dX0rs}[0], [argDst], #4        // a, stored before the negate
        fneg    dX0r, dX0r
        st1     {dX0rs}[1], [argDst], #4        // -b
End:

        // Write function tail
        M_END

        .end