//
//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
//
//  Use of this source code is governed by a BSD-style license
//  that can be found in the LICENSE file in the root of the source
//  tree. An additional intellectual property rights grant can be found
//  in the file PATENTS.  All contributing project authors may
//  be found in the AUTHORS file in the root of the source tree.
//
//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
//  to support float instead of SC32.
//

//
// Description:
// Compute a Radix 4 FFT stage for a N point complex signal
//
// Target: AArch64, GNU assembler syntax, run through the C preprocessor
// (register names below are preprocessor aliases).
//


// Include standard headers

#include "dl/api/arm/arm64COMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

// Import symbols required from other files
// (For example tables)




// Set debugging level
//DEBUG_ON    SETL {TRUE}


// Guarding implementation by the processor name


// Import symbols required from other files
// (For example tables)
    //IMPORT  armAAC_constTable

//Input Registers
//
// pSrc        - read with post-incremented ld4 loads
// pDst        - written with post-incremented st2 stores
// pTwiddle    - read with ld1/ld2 loads
// pSubFFTNum  - address from which subFFTNum is loaded
// pSubFFTSize - address from which subFFTSize is loaded

#define pSrc            x0
#define pDst            x1
#define pTwiddle        x2
#define pSubFFTNum      x3
#define pSubFFTSize     x4



//Output Registers


//Local Scratch Registers
//
// subFFTNum    - loaded from [pSubFFTNum], later overwritten with 1
// subFFTSize   - loaded from [pSubFFTSize]
// outPointStep - subFFTSize * 8 (byte stride between the four output points)
// grpCount     - loop counter, starts at subFFTSize * 4, decremented by 8
// dstStep      - 16 - 3 * outPointStep, applied after the fourth store
// grpTwStep    - negative rewind applied to pTwiddle after the last
//                twiddle load of a pass
// stepTwiddle  - grows by 16 on every loop pass
// twStep       - stepTwiddle - 16
// step16       - constant 16
// step24       - constant 24

#define subFFTNum       x5
#define subFFTSize      x6
#define outPointStep    x8
#define grpCount        x9
#define dstStep         x10
#define grpTwStep       x13
#define stepTwiddle     x14
#define twStep          x15
#define step16          x11
#define step24          x12


// Neon Registers
//
// dButterfly1* / dButterfly2* receive the two ld4 loads from pSrc.
// Names ending in 8b are byte views of the same vector register; they are
// used as operands of mov, which copies the whole 64-bit register.

#define dButterfly1Real02       v0.2s
#define dButterfly1Real028b     v0.8b
#define dButterfly1Imag02       v1.2s
#define dButterfly1Imag028b     v1.8b
#define dButterfly1Real13       v2.2s
#define dButterfly1Real138b     v2.8b
#define dButterfly1Imag13       v3.2s
#define dButterfly1Imag138b     v3.8b
#define dButterfly2Real02       v4.2s
#define dButterfly2Imag02       v5.2s
#define dButterfly2Real13       v6.2s
#define dButterfly2Imag13       v7.2s
// dXr0 names the same register as dButterfly1Real02 (v0).
#define dXr0                    v0.2s
// dX* name the same vector registers as the dButterfly* aliases (v0-v7):
// after the uzp step in the loop, dXk holds point k of two groups,
// one group per 32-bit lane.
#define dXi0                    v1.2s
#define dXr08b                  v0.8b
#define dXi08b                  v1.8b
#define dXr1                    v2.2s
#define dXi1                    v3.2s
#define dXr2                    v4.2s
#define dXi2                    v5.2s
#define dXr3                    v6.2s
#define dXi3                    v7.2s

// dY*: results of the first add/sub layer of the 4-point butterfly.
#define dYr0                    v16.2s
#define dYi0                    v17.2s
#define dYr1                    v18.2s
#define dYi1                    v19.2s
#define dYr2                    v20.2s
#define dYi2                    v21.2s
#define dYr3                    v22.2s
#define dYi3                    v23.2s

// dW*: twiddle factors (real and imaginary lanes) for points 1, 2 and 3.
#define dW1r                    v8.2s
#define dW1i                    v9.2s
#define dW2r                    v10.2s
#define dW2r8b                  v10.8b
#define dW2i                    v11.2s
#define dW3r                    v12.2s
#define dW3r8b                  v12.8b
#define dW3i                    v13.2s

// dZ*: twiddled inputs (dZ0 is a plain copy of dX0); the same registers
// are reused for the final butterfly outputs that get stored.
#define dZr0                    v14.2s
#define dZi0                    v15.2s
#define dZr08b                  v14.8b
#define dZi08b                  v15.8b
#define dZr1                    v26.2s
#define dZi1                    v27.2s
#define dZr2                    v28.2s
#define dZi2                    v29.2s
#define dZr3                    v30.2s
#define dZi3                    v31.2s

// dZip: temporary used to emulate the two-register VZIP/VUZP operations.
#define dZip                    v24.2s
#define dZip8b                  v24.8b

        // FFTSTAGE scaled, inverse, name
        //
        // Emits the body of one radix-4 stage that reads groups of four
        // consecutive complex floats from pSrc, multiplies points 1..3 by
        // twiddles read from pTwiddle, and stores the four butterfly outputs
        // outPointStep bytes apart through pDst.  Two groups are processed
        // per loop pass, one per vector lane.
        //
        //   scaled  - not referenced anywhere in the macro body.
        //   inverse - "TRUE" selects multiplication by the conjugated twiddle
        //             and swaps the order in which outputs 1 and 3 are stored.
        //   name    - suffix appended to the local labels so the macro can be
        //             expanded more than once in the same file.
        //
        // In:      pSrc, pDst, pTwiddle, pSubFFTNum, pSubFFTSize
        // Writes:  x5, x6, x8-x15, v0-v31 except v25, condition flags,
        //          and post-increments pSrc, pDst and pTwiddle.
        // The loop body always runs at least once and handles two groups per
        // pass; NOTE(review): this presumably requires an even group count of
        // at least two - confirm with the callers.
        .macro FFTSTAGE scaled, inverse , name

        // Define stack arguments

        // Move args values into our work registers
        // subFFTNum is overwritten with 1 further down before it is ever
        // read, and neither value is stored back inside this macro.
        ldr     subFFTNum, [pSubFFTNum]
        ldr     subFFTSize, [pSubFFTSize]

        // pOut0+1 increments pOut0 by 8 bytes
        // pOut0+outPointStep == increment of 8*outPointStep bytes
        lsl     outPointStep,subFFTSize, #3

        // Update grpCount and grpSize rightaway

        // Initial twiddle fetch.  W1 is loaded de-interleaved (16 bytes) from
        // pTwiddle; W2r and W3r both read the 8 bytes at pTwiddle, W2i reads
        // pTwiddle+16 and W3i reads pTwiddle+24.  The post-index steps
        // (+16, +8, -8) leave pTwiddle 16 bytes past its entry value.
        ld2     {dW1r,dW1i},[pTwiddle]                  // [wi|wr]
        MOV     step16,#16
        LSL     grpCount,subFFTSize,#2

        ld1     {dW2r},[pTwiddle]                       // [wi|wr]
        MOV     subFFTNum,#1                            //after the last stage

        ld1     {dW3r},[pTwiddle],step16                // [wi|wr]
        MOV     stepTwiddle,#0

        ld1     {dW2i},[pTwiddle],#8                    // [wi|wr]
        SUB     grpTwStep,stepTwiddle,#8                // grpTwStep = -8 to start with

        // update subFFTSize for the next stage
        // (the register is not read again and not stored inside this macro)
        MOV     subFFTSize,grpCount
        ld1     {dW3i},[pTwiddle],grpTwStep             // [wi|wr]
        lsl     dstStep,outPointStep, #1

        // AC.r AC.i BD.r BD.i
        // ld4 de-interleaves 8 floats (4 complex points): lane 0 of the four
        // registers gets point 0 (r,i) and point 1 (r,i), lane 1 gets
        // point 2 (r,i) and point 3 (r,i).
        ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
        ADD     dstStep,dstStep,outPointStep            // dstStep = 3*outPointStep

        // rsb is not an A64 instruction and is not defined in this file;
        // per the comments here it yields immediate minus source
        // (presumably a macro from arm64COMM_s.h - confirm).
        rsb     dstStep,dstStep,#16                     // dstStep = - 3*outPointStep+16
        MOV     step24,#24

        // AC.r AC.i BD.r BD.i
        // Same layout for the second group of four points.
        ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32


        // Process two groups at a time

radix4lsGrpLoop\name :

        // VZIP dW2r,dW2i
        // Turn the two 8-byte [r,i] loads into one register of real parts
        // and one of imaginary parts (lane 0 from dW2r, lane 1 from dW2i).
        zip1    dZip, dW2r, dW2i
        zip2    dW2i, dW2r, dW2i
        mov     dW2r8b, dZip8b

        ADD     stepTwiddle,stepTwiddle,#16

        // VZIP dW3r,dW3i
        zip1    dZip, dW3r,dW3i
        zip2    dW3i, dW3r, dW3i
        mov     dW3r8b, dZip8b
        ADD     grpTwStep,stepTwiddle,#4

        // Regroup the two ld4 results so that each dX register holds the
        // same point index for both groups (lane 0 = first group, lane 1 =
        // second group).

        // VUZP dButterfly1Real13, dButterfly2Real13        // B.r D.r
        uzp1    dZip, dButterfly1Real13, dButterfly2Real13  // B.r D.r
        uzp2    dButterfly2Real13, dButterfly1Real13, dButterfly2Real13  // B.r D.r
        mov     dButterfly1Real138b, dZip8b

        SUB     twStep,stepTwiddle,#16                      // -16+stepTwiddle

        // VUZP dButterfly1Imag13, dButterfly2Imag13        // B.i D.i
        uzp1    dZip, dButterfly1Imag13, dButterfly2Imag13  // B.i D.i
        uzp2    dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13  // B.i D.i
        mov     dButterfly1Imag138b, dZip8b
        lsl     grpTwStep,grpTwStep,#1

        // VUZP dButterfly1Real02, dButterfly2Real02        // A.r C.r
        uzp1    dZip, dButterfly1Real02, dButterfly2Real02  // A.r C.r
        uzp2    dButterfly2Real02, dButterfly1Real02, dButterfly2Real02  // A.r C.r
        mov     dButterfly1Real028b, dZip8b
        rsb     grpTwStep,grpTwStep,#0                      // -8-2*stepTwiddle

        // VUZP dButterfly1Imag02, dButterfly2Imag02        // A.i C.i
        uzp1    dZip, dButterfly1Imag02, dButterfly2Imag02  // A.i C.i
        uzp2    dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02  // A.i C.i
        mov     dButterfly1Imag028b, dZip8b


        // grpCount is multiplied by 4
        // It drops by 8 per pass because two groups are handled per pass.
        // The flags set here are consumed by the bne/beq pair below and by
        // the BGT that closes the loop; no instruction in between writes
        // the flags.
        SUBS    grpCount,grpCount,#8

        // Z1 = X1 * W1 (forward) or X1 * conj(W1) (inverse)
        .ifeqs  "\inverse", "TRUE"
            fmul    dZr1,dW1r,dXr1
            fmla    dZr1,dW1i,dXi1                      // real part
            fmul    dZi1,dW1r,dXi1
            fmls    dZi1,dW1i,dXr1                      // imag part

        .else

            fmul    dZr1,dW1r,dXr1
            fmls    dZr1,dW1i,dXi1                      // real part
            fmul    dZi1,dW1r,dXi1
            fmla    dZi1,dW1i,dXr1                      // imag part

        .endif

        // Twiddles for the next pass are fetched while the current ones are
        // still being consumed; each register is reloaded only after its
        // last use in this pass.
        ld2     {dW1r,dW1i},[pTwiddle],stepTwiddle      // [wi|wr]

        // Z2 = X2 * W2 (forward) or X2 * conj(W2) (inverse)
        .ifeqs  "\inverse", "TRUE"
            fmul    dZr2,dW2r,dXr2
            fmla    dZr2,dW2i,dXi2                      // real part
            fmul    dZi2,dW2r,dXi2
            ld1     {dW2r},[pTwiddle],step16            // [wi|wr]
            fmls    dZi2,dW2i,dXr2                      // imag part

        .else

            fmul    dZr2,dW2r,dXr2
            fmls    dZr2,dW2i,dXi2                      // real part
            fmul    dZi2,dW2r,dXi2
            ld1     {dW2r},[pTwiddle],step16            // [wi|wr]
            fmla    dZi2,dW2i,dXr2                      // imag part

        .endif


        ld1     {dW2i},[pTwiddle],twStep                // [wi|wr]

        // move qX0 so as to load for the next iteration
        // MOV     qZ0,qX0
        // Z0 = X0 (point 0 is not multiplied by a twiddle); copying it frees
        // v0/v1 for the ld4 below.
        mov     dZr08b, dXr08b
        mov     dZi08b, dXi08b

        // Z3 = X3 * W3 (forward) or X3 * conj(W3) (inverse)
        .ifeqs  "\inverse", "TRUE"
            fmul    dZr3,dW3r,dXr3
            fmla    dZr3,dW3i,dXi3                      // real part
            fmul    dZi3,dW3r,dXi3
            ld1     {dW3r},[pTwiddle],step24
            fmls    dZi3,dW3i,dXr3                      // imag part

        .else

            fmul    dZr3,dW3r,dXr3
            fmls    dZr3,dW3i,dXi3                      // real part
            fmul    dZi3,dW3r,dXi3
            ld1     {dW3r},[pTwiddle],step24
            fmla    dZi3,dW3i,dXr3                      // imag part

        .endif

        // grpTwStep is negative: it rewinds pTwiddle to the position the
        // next pass starts loading W1 from.
        ld1     {dW3i},[pTwiddle],grpTwStep             // [wi|wr]

        // Don't do the load on the last iteration so we don't read past the end
        // of pSrc.
        // On the last pass (Z set by the SUBS above) pSrc is still advanced
        // by 64 bytes, the amount the two skipped ld4 loads would have added.
        bne     skipIncrement\name
        add     pSrc, pSrc, #64
skipIncrement\name:
        beq     radix4lsSkipRead\name
        // AC.r AC.i BD.r BD.i
        ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32

        // AC.r AC.i BD.r BD.i
        ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
radix4lsSkipRead\name:

        // finish first stage of 4 point FFT
        //   Y0 = Z0 + Z2,  Y2 = Z0 - Z2,  Y1 = Z1 + Z3,  Y3 = Z1 - Z3

        // fadd    qY0,qZ0,qZ2
        fadd    dYr0,dZr0,dZr2
        fadd    dYi0,dZi0,dZi2
        // fsub    qY2,qZ0,qZ2
        fsub    dYr2,dZr0,dZr2
        fsub    dYi2,dZi0,dZi2
        // fadd    qY1,qZ1,qZ3
        fadd    dYr1,dZr1,dZr3
        fadd    dYi1,dZi1,dZi3
        // fsub    qY3,qZ1,qZ3
        fsub    dYr3,dZr1,dZr3
        fsub    dYi3,dZi1,dZi3


        // finish second stage of 4 point FFT
        //
        // Both branches compute the same four values and differ only in
        // which of Z1/Z3 is stored second and which fourth:
        //   Z0 = Y2 - Y1
        //   Z1 = Y0 + j*Y3   (real: Y0r - Y3i, imag: Y0i + Y3r)
        //   Z2 = Y2 + Y1
        //   Z3 = Y0 - j*Y3   (real: Y0r + Y3i, imag: Y0i - Y3r)
        // Each st2 writes 16 bytes (r,i for lane 0 then r,i for lane 1).
        //
        // NOTE(review): algebraically these outputs are a 4-point DFT of
        // (Z0, -Z1, -Z2, -Z3); presumably the twiddle table holds negated
        // factors - confirm against the code that generates the table.

        .ifeqs  "\inverse", "TRUE"

            // fsub    qZ0,qY2,qY1
            fsub    dZr0,dYr2,dYr1
            fsub    dZi0,dYi2,dYi1
            fadd    dZr3,dYr0,dYi3
            st2     {dZr0,dZi0},[pDst],outPointStep
            fsub    dZi3,dYi0,dYr3

            // fadd    qZ2,qY2,qY1
            fadd    dZr2,dYr2,dYr1
            fadd    dZi2,dYi2,dYi1

            st2     {dZr3,dZi3},[pDst],outPointStep

            fsub    dZr1,dYr0,dYi3
            st2     {dZr2,dZi2},[pDst],outPointStep
            fadd    dZi1,dYi0,dYr3

            // dstStep = -3*outPointStep + 16: undo the three outPointStep
            // advances and move on by the 16 bytes just written.
            st2     {dZr1,dZi1},[pDst],dstStep


        .else

            // fsub    qZ0,qY2,qY1
            fsub    dZr0,dYr2,dYr1
            fsub    dZi0,dYi2,dYi1

            fsub    dZr1,dYr0,dYi3
            st2     {dZr0,dZi0},[pDst],outPointStep
            fadd    dZi1,dYi0,dYr3

            // fadd    qZ2,qY2,qY1
            fadd    dZr2,dYr2,dYr1
            fadd    dZi2,dYi2,dYi1

            st2     {dZr1,dZi1},[pDst],outPointStep

            fadd    dZr3,dYr0,dYi3
            st2     {dZr2,dZi2},[pDst],outPointStep
            fsub    dZi3,dYi0,dYr3

            // dstStep = -3*outPointStep + 16: undo the three outPointStep
            // advances and move on by the 16 bytes just written.
            st2     {dZr3,dZi3},[pDst],dstStep


        .endif

        // Loop while grpCount (as left by the SUBS above) is still positive.
        BGT     radix4lsGrpLoop\name

        .endm


        // Forward entry point: expands FFTSTAGE with inverse = "FALSE" and
        // label suffix fwd.  M_START and M_END are not defined in this file
        // (presumably they come from arm64COMM_s.h and emit the function
        // prologue and epilogue); the d15 argument presumably asks them to
        // preserve NEON registers up to d15, which the macro overwrites as
        // v8-v15 - confirm in the header.
        M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
            FFTSTAGE "FALSE","FALSE",fwd
        M_END


        // Inverse entry point: same expansion with inverse = "TRUE" and
        // label suffix inv.
        M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
            FFTSTAGE "FALSE","TRUE",inv
        M_END


        .end