//
//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
//
//  Use of this source code is governed by a BSD-style license
//  that can be found in the LICENSE file in the root of the source
//  tree. An additional intellectual property rights grant can be found
//  in the file PATENTS.  All contributing project authors may
//  be found in the AUTHORS file in the root of the source tree.
//
//  This is a modification of
//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
//  instead of SC32.
//

//
// Description:
// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
//
// Syntax / target:
//   GNU as, AArch64 Advanced SIMD, run through the C preprocessor
//   (register aliases below are cpp macros).
//
// Entry point: armSP_FFTInv_CCSToR_F32_preTwiddleRadix2
//   In:   x0 = pSrc       source pointer (advanced and rewound by the code)
//         x1 = pTwiddle   twiddle table base (only read)
//         x2 = pOut       output base (only read; results are written
//                         starting at pOut + N/2*8 bytes, see pOut1)
//         x3 = subFFTNum  N (only read)
//   Scratch general registers written: x0, x7-x13
//   Vector registers written:          v0-v11, v13, v19 (64-bit .2s/.8b views)
//   NOTE(review): element type is taken to be 32-bit float pairs
//   (re, im) from the .2s arrangements and ld2/st2 usage; the exact buffer
//   sizes expected from the caller are not visible in this file.
//


// Include standard headers

#include "dl/api/arm/arm64COMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


// Import symbols required from other files
// (For example tables)


// Set debugging level
//DEBUG_ON    SETL {TRUE}



// Guarding implementation by the processor name



//Input Registers

#define pSrc            x0
#define pTwiddle        x1
#define pOut            x2
#define subFFTNum       x3

// Output registers

//Local Scratch Registers
//
// The following aliases are defined but not referenced by any instruction
// in this file: argTwiddle, argDst, subFFTSize, N (comments only), dShift,
// qT0-qT3, dY2, dY3, dW0, dW1, dW0Tmp, dW1Neg.
// Several aliases deliberately share a physical register (for example
// dW0r/dY0r = v4, dX1r/dY0 = v2); the instruction order below relies on that.

#define argTwiddle      x5
#define argDst          x6
#define subFFTSize      x7
#define N               subFFTNum

#define pOut1           x13

#define size            x7
#define step            x8
#define step1           x9
#define twStep          x10
#define pTwiddleTmp     x11
#define argTwiddle1     x12

// Neon registers

#define dX0             v0.2s
#define dX0s            v0.s
#define dShift          v1.2s
#define dX1             v1.2s
#define dX1s            v1.s
#define dY0             v2.2s
#define dY08b           v2.8b
#define dY1             v3.2s
#define dX0r            v0.2s
#define dX0rs           v0.s
#define dX0i            v1.2s
#define dX1r            v2.2s
#define dX1i            v3.2s
#define dW0r            v4.2s
#define dW0r8b          v4.8b
#define dW0i            v5.2s
#define dW1r            v6.2s
#define dW1r8b          v6.8b
#define dW1i            v7.2s
#define dT0             v8.2s
#define dT1             v9.2s
#define dT2             v10.2s
#define dT3             v11.2s
#define qT0             v12.2s
#define qT1             v14.2s
#define qT2             v16.2s
#define qT3             v18.2s
#define dY0r            v4.2s
#define dY0i            v5.2s
#define dY1r            v6.2s
#define dY1i            v7.2s

#define dY2             v4.2s
#define dY3             v5.2s
#define dW0             v6.2s
#define dW1             v7.2s
#define dW0Tmp          v10.2s
#define dW1Neg          v11.2s

#define dZip            v19.2s
#define dZip8b          v19.8b
#define half            v13.2s

        // FFTSTAGE scaled, inverse, name
        //   name    : suffix appended to every local label so the macro can
        //             be instantiated more than once without label clashes.
        //   scaled, inverse : accepted for signature compatibility; neither
        //             is referenced anywhere in the macro body.
        .macro FFTSTAGE scaled, inverse, name

        fmov    half, 0.5

        asr     size, subFFTNum, #1       // preserve the contents of N = subFFTNum
        lsl     step, subFFTNum, #2       // step = N/2 * 8 bytes


        // Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
        // Note: W^(k) is stored as negated value and also need to
        // conjugate the values from the table

        // Z(0) : no need of twiddle multiply
        // Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }

        ld1     {dX0},[pSrc],step         // load F(0), then pSrc += step
        ADD     pOut1,pOut,step           // pOut1 = pOut+ N/2*8 bytes

        ld1     {dX1},[pSrc], #8          // load F(N/2), then pSrc += 8
        // twStep = 3N/8 * 8 bytes pointing to W^1
        SUB     twStep,step,size,LSL #1

        lsl     step1,size, #2            // step1 = N/4 * 8 = N/2*4 bytes
        SUB     step1,step1,#8            // (N/4-1)*8 bytes

        fadd    dY0,dX0,dX1               // [b+d | a+c]
        fsub    dY1,dX0,dX1               // [b-d | a-c]
        fmul    dY0, dY0, half[0]
        fmul    dY1, dY1, half[0]

        // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
        // VZIP  dY0,dY1
        // The zip1 result goes through dZip because zip2 still has to read
        // the unmodified dY0; dY0 is overwritten only afterwards.
        zip1    dZip,dY0,dY1
        zip2    dY1,dY0,dY1
        mov     dY08b, dZip8b

        fsub    dX0,dY0,dY1
        // size = N/2 - 2. The flags set here are consumed by BLT/BEQ below;
        // none of the instructions in between writes the condition flags.
        SUBS    size,size,#2
        fadd    dX1,dY0,dY1

        SUB     pSrc,pSrc,step            // rewind: pSrc = original pSrc + 8

        st1     {dX0s}[0],[pOut1], #4     // lane 0 of dX0, pOut1 += 4
        ADD     pTwiddleTmp,pTwiddle,#8   // W^2
        st1     {dX1s}[1],[pOut1], #4     // lane 1 of dX1, pOut1 += 4
        ADD     argTwiddle1,pTwiddle,twStep  // W^1


        BLT     decrementScale\name       // N/2 < 2 : nothing more to do
        BEQ     lastElement\name          // N/2 == 2: only the last element


        // Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
        // Note: W^k is stored as negative values in the table and also
        // need to conjugate the values from the table.
        //
        // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
        // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)


        SUB     step,step,#24
evenOddButterflyLoop\name :


        ld1     {dW0r},[argTwiddle1],step1
        ld1     {dW1r},[argTwiddle1], #8

        ld2     {dX0r,dX0i},[pSrc],step   // de-interleave re/im of 2 elements
        SUB     argTwiddle1,argTwiddle1,step1
        ld2     {dX1r,dX1i},[pSrc], #16

        SUB     step1,step1,#8            // (N/4-2)*8 bytes
        ld1     {dW0i},[pTwiddleTmp],step1
        ld1     {dW1i},[pTwiddleTmp], #8
        SUB     pSrc,pSrc,step

        SUB     pTwiddleTmp,pTwiddleTmp,step1
        rev64   dX1r,dX1r                 // swap the two 32-bit lanes
        rev64   dX1i,dX1i
        // Loop counter. The flags set here are consumed by BGT at the bottom
        // of the loop; nothing in between writes the condition flags.
        SUBS    size,size,#4


        fsub    dT2,dX0r,dX1r             // a-c
        fadd    dT3,dX0i,dX1i             // b+d
        fadd    dT0,dX0r,dX1r             // a+c
        fsub    dT1,dX0i,dX1i             // b-d
        SUB     step1,step1,#8

        fmul    dT2, dT2, half[0]
        fmul    dT3, dT3, half[0]

        fmul    dT0, dT0, half[0]
        fmul    dT1, dT1, half[0]

        // VZIP    dW1r,dW1i
        // VZIP    dW0r,dW0i
        // Same zip1 / zip2 / mov pattern as above: dZip holds the zip1
        // result until zip2 has consumed the original first operand.
        zip1    dZip, dW1r,dW1i
        zip2    dW1i,dW1r,dW1i
        mov     dW1r8b, dZip8b
        zip1    dZip,dW0r,dW0i
        zip2    dW0i,dW0r,dW0i
        mov     dW0r8b, dZip8b

        fmul    dX1r,dW1r,dT2
        fmul    dX1i,dW1r,dT3
        fmul    dX0r,dW0r,dT2
        fmul    dX0i,dW0r,dT3

        fmls    dX1r,dW1i,dT3
        fmla    dX1i,dW1i,dT2

        fmla    dX0r,dW0i,dT3
        fmls    dX0i,dW0i,dT2


        // dY1r/dY1i alias v6/v7 (dW1r/dW1i) and dY0r/dY0i alias v4/v5
        // (dW0r/dW0i); the twiddles are no longer needed from here on.
        fadd    dY1r,dT0,dX1i             // F(N/2 -1)
        fsub    dY1i,dX1r,dT1

        rev64   dY1r,dY1r
        rev64   dY1i,dY1i


        fadd    dY0r,dT0,dX0i             // F(1)
        fsub    dY0i,dT1,dX0r


        st2     {dY0r,dY0i},[pOut1],step  // interleave re/im, pOut1 += step
        st2     {dY1r,dY1i},[pOut1], #16
        SUB     pOut1,pOut1,step
        SUB     step,step,#32             // (N/2-4)*8 bytes


        BGT     evenOddButterflyLoop\name


        // set both the ptrs to the last element
        SUB     pSrc,pSrc,#8
        SUB     pOut1,pOut1,#8

        // Last element can be expanded as follows
        // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
        // -ve)
        // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
        // 1/2[2a+j0] - j (c-jd) [0+j2b]
        // (a+bc, -bd)
        // Since (c,d) = (0,1) for the last element, result is just (a,-b)

lastElement\name :
        ld1     {dX0r},[pSrc]

        st1     {dX0rs}[0],[pOut1], #4    // store a
        fneg    dX0r,dX0r
        st1     {dX0rs}[1],[pOut1]        // store -b



        // Common exit label; the macro emits no code after it.
decrementScale\name :

        .endm

        // NOTE(review): M_START/M_END come from arm64COMM_s.h, which is not
        // visible here. The third argument (d15) presumably selects the
        // callee-saved d registers to preserve; the code above writes
        // v8-v11 and v13 from that range. Confirm against the header.
        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
            FFTSTAGE "FALSE","TRUE",Inv
        M_END

        .end