@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//
@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute a Radix 4 FFT stage for an N point complex signal
@//
@// Syntax: ARM (AArch32) NEON, GNU assembler, passed through the C
@// preprocessor (register names below are preprocessor macros).
@//
@// NOTE: never put a trailing "@//" comment on a #define line. The
@// preprocessor strips the "//..." part and leaves the "@" inside the macro
@// body, which would comment out the rest of every line that uses the macro.
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


@// Import symbols required from other files
@// (For example tables)




@// Set debugging level
@//DEBUG_ON    SETL {TRUE}



@// Guarding implementation by the processor name




@// Guarding implementation by the processor name


@// Import symbols required from other files
@// (For example tables)


@//Input Registers
@//
@// These are read on entry to FFTSTAGE and all five are updated by it:
@//   pSrc, pDst   - swapped and rewound at the end of the stage
@//   pTwiddle     - advanced/rewound while walking the twiddle table
@//   subFFTNum    - divided by 4
@//   subFFTSize   - multiplied by 4

#define pSrc            r0
#define pDst            r2
#define pTwiddle        r1
#define subFFTNum       r6
#define subFFTSize      r7



@//Output Registers


@//Local Scratch Registers
@//
@// t1 shares r3 with grpCount. t1 is only written after the group loop has
@// finished, when grpCount is no longer needed.

#define grpCount        r3
#define pointStep       r4
#define outPointStep    r5
#define stepTwiddle     r12
#define setCount        r14
#define srcStep         r8
#define setStep         r9
#define dstStep         r10
#define twStep          r11
#define t1              r3

@// Neon Registers
@//
@// dW1..dW3 each hold one twiddle factor; lane [0] is used as the real part
@// and lane [1] as the imaginary part ([wi | wr]).
@//
@// The q names overlay the d names (Qn = D2n:D2n+1), so one quad-register
@// operation works on the real and imaginary halves together:
@//   qX0 = {dXr0,dXi0}
@//   qYk = {dYrk,dYik}   k = 0..3
@//   qZk = {dZrk,dZik}   k = 0..3
@//
@// qT0..qT3 are not referenced by the code in this file; despite the "q"
@// prefix they name d registers.

#define dW1     D0.F32
#define dW2     D1.F32
#define dW3     D2.F32

#define dXr0    D4.F32
#define dXi0    D5.F32
#define dXr1    D6.F32
#define dXi1    D7.F32
#define dXr2    D8.F32
#define dXi2    D9.F32
#define dXr3    D10.F32
#define dXi3    D11.F32
#define dYr0    D12.F32
#define dYi0    D13.F32
#define dYr1    D14.F32
#define dYi1    D15.F32
#define dYr2    D16.F32
#define dYi2    D17.F32
#define dYr3    D18.F32
#define dYi3    D19.F32
#define qT0     d16.f32
#define qT1     d18.f32
#define qT2     d12.f32
#define qT3     d14.f32
#define dZr0    D20.F32
#define dZi0    D21.F32
#define dZr1    D22.F32
#define dZi1    D23.F32
#define dZr2    D24.F32
#define dZi2    D25.F32
#define dZr3    D26.F32
#define dZi3    D27.F32

#define qY0     Q6.F32
#define qY1     Q7.F32
#define qY2     Q8.F32
#define qY3     Q9.F32
#define qX0     Q2.F32
#define qZ0     Q10.F32
#define qZ1     Q11.F32
#define qZ2     Q12.F32
#define qZ3     Q13.F32

@//-----------------------------------------------------------------------
@// FFTSTAGE scaled, inverse, name
@//
@// Emits the body of one out-of-place radix-4 stage.
@//
@//   scaled   not referenced anywhere in the macro body
@//   inverse  "TRUE" selects the conjugate twiddle multiply and the
@//            alternate output ordering; anything else selects the
@//            forward variant
@//   name     suffix appended to the local labels so that the macro can be
@//            expanded more than once in the same file
@//
@// In:    pSrc, pDst, pTwiddle, subFFTNum, subFFTSize (see defines above)
@// Out:   pSrc/pDst swapped and rewound for the next stage,
@//        subFFTNum = subFFTNum/4, subFFTSize = subFFTSize*4
@// Clobb: r3-r5, r8-r12, r14, D0-D2, D4-D27, flags
@//
@// Structure: an outer loop over groups (one set of three twiddles per
@// group) and an inner loop over sets. Each inner iteration handles two
@// complex points per butterfly input. The inner loop is software
@// pipelined: the loads for the next iteration are interleaved with the
@// arithmetic of the current one, so instruction order matters.
@//-----------------------------------------------------------------------

        .macro FFTSTAGE scaled, inverse , name

        @// Define stack arguments


        @// Update grpCount and grpSize rightaway inorder to reuse
        @// pGrpCount and pGrpSize regs

        @// grpCount = 4*subFFTSize
        LSL     grpCount,subFFTSize,#2
        @// subFFTNum = subFFTNum/4
        LSR     subFFTNum,subFFTNum,#2
        @// subFFTSize = 4*(old subFFTSize)
        MOV     subFFTSize,grpCount

        @// All three twiddle registers are first loaded from the same,
        @// unmodified pTwiddle address (the first table entry).
        VLD1    dW1,[pTwiddle]                      @//[wi | wr]
        @// pT0+1 increments pT0 by 8 bytes
        @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
        MOV     pointStep,subFFTNum,LSL #1


        @// pOut0+1 increments pOut0 by 8 bytes
        @// pOut0+outPointStep == increment of 8*outPointStep bytes
        @//   = 2*size bytes

        MOV     stepTwiddle,#0
        VLD1    dW2,[pTwiddle]                      @//[wi | wr]
        @// SMULBB multiplies the signed low halfwords only, so
        @// outPointStep = grpCount*pointStep holds as long as both
        @// operands fit in 16 signed bits.
        SMULBB  outPointStep,grpCount,pointStep
        LSL     pointStep,pointStep,#2              @// 2*grpSize

        VLD1    dW3,[pTwiddle]                      @//[wi | wr]
        MOV     srcStep,pointStep,LSL #1            @// srcStep = 2*pointStep
        ADD     setStep,srcStep,pointStep           @// setStep = 3*pointStep

        RSB     setStep,setStep,#0                  @// setStep = - 3*pointStep
        SUB     srcStep,srcStep,#16                 @// srcStep = 2*pointStep-16

        MOV     dstStep,outPointStep,LSL #1
        ADD     dstStep,dstStep,outPointStep        @// dstStep = 3*outPointStep
        @// dstStep = - 3*outPointStep+16
        RSB     dstStep,dstStep,#16



radix4GrpLoop\name :

        @// Prime the pipeline: load the four butterfly inputs of the first
        @// set of this group. Each VLD2 de-interleaves 16 bytes (two
        @// complex values) into an r/i register pair.
        VLD2    {dXr0,dXi0},[pSrc],pointStep        @//  data[0]
        @// stepTwiddle grows by pointStep for every group
        ADD     stepTwiddle,stepTwiddle,pointStep
        VLD2    {dXr1,dXi1},[pSrc],pointStep        @//  data[1]
        @// set pTwiddle to the first point
        ADD     pTwiddle,pTwiddle,stepTwiddle
        VLD2    {dXr2,dXi2},[pSrc],pointStep        @//  data[2]
        @// twStep = 4*stepTwiddle (temporary, finished two lines below)
        MOV     twStep,stepTwiddle,LSL #2

        @//  data[3] & update pSrc for the next set
        @// (three +pointStep steps followed by -3*pointStep: pSrc is back
        @// at data[0] of this set)
        VLD2    {dXr3,dXi3},[pSrc],setStep
        SUB     twStep,stepTwiddle,twStep           @// twStep = -3*stepTwiddle

        @// setCount = pointStep/8; it is decremented by 2 per iteration
        MOV     setCount,pointStep,LSR #3
        @// set pSrc to data[0] of the next set
        ADD     pSrc,pSrc,#16
        @// increment to data[1] of the next set
        ADD     pSrc,pSrc,pointStep


        @// Loop on the sets

radix4SetLoop\name :



        @// Twiddle multiply: Zk = Xk * Wk (forward) or Xk * conj(Wk)
        @// (inverse) for k = 1..3. The VLD2s in between already fetch
        @// data[1] and data[2] of the next set; each X register is only
        @// reloaded after its last use in the current iteration.
        .ifeqs  "\inverse", "TRUE"
            VMUL    dZr1,dXr1,dW1[0]
            VMUL    dZi1,dXi1,dW1[0]
            VMUL    dZr2,dXr2,dW2[0]
            VMUL    dZi2,dXi2,dW2[0]
            VMUL    dZr3,dXr3,dW3[0]
            VMUL    dZi3,dXi3,dW3[0]

            VMLA    dZr1,dXi1,dW1[1]                @// real part
            VMLS    dZi1,dXr1,dW1[1]                @// imag part

            @//  data[1] for next iteration
            VLD2    {dXr1,dXi1},[pSrc],pointStep

            VMLA    dZr2,dXi2,dW2[1]                @// real part
            VMLS    dZi2,dXr2,dW2[1]                @// imag part

            @//  data[2] for next iteration
            VLD2    {dXr2,dXi2},[pSrc],pointStep

            VMLA    dZr3,dXi3,dW3[1]                @// real part
            VMLS    dZi3,dXr3,dW3[1]                @// imag part
        .else
            VMUL    dZr1,dXr1,dW1[0]
            VMUL    dZi1,dXi1,dW1[0]
            VMUL    dZr2,dXr2,dW2[0]
            VMUL    dZi2,dXi2,dW2[0]
            VMUL    dZr3,dXr3,dW3[0]
            VMUL    dZi3,dXi3,dW3[0]

            VMLS    dZr1,dXi1,dW1[1]                @// real part
            VMLA    dZi1,dXr1,dW1[1]                @// imag part

            @//  data[1] for next iteration
            VLD2    {dXr1,dXi1},[pSrc],pointStep

            VMLS    dZr2,dXi2,dW2[1]                @// real part
            VMLA    dZi2,dXr2,dW2[1]                @// imag part

            @//  data[2] for next iteration
            VLD2    {dXr2,dXi2},[pSrc],pointStep

            VMLS    dZr3,dXi3,dW3[1]                @// real part
            VMLA    dZi3,dXr3,dW3[1]                @// imag part
        .endif

        @//  data[3] & update pSrc to data[0]
        @// But don't read on the very last iteration because that reads past
        @// the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
        cmp     grpCount, #4
        cmpeq   setCount, #2            @// Test setCount if grpCount = 4
        @// These are executed only if both grpCount = 4 and setCount = 2
        @// (pSrc is still moved by setStep so that the pointer arithmetic
        @// matches the post-increment of the skipped load; the stale
        @// dXr3/dXi3 contents are never consumed because both loops exit)
        addeq   pSrc, pSrc, setStep
        beq     radix4SkipRead\name
        VLD2    {dXr3,dXi3},[pSrc],setStep
radix4SkipRead\name:
        @// Flags set here are consumed by the BGT at the bottom of the set
        @// loop; nothing in between (NEON ops, ADD without S) changes them.
        SUBS    setCount,setCount,#2

        @// finish first stage of 4 point FFT
        @// Y0 = X0 + Z2, Y2 = X0 - Z2
        VADD    qY0,qX0,qZ2
        VSUB    qY2,qX0,qZ2

        @//  data[0] for next iteration
        @// ("!" writes back pSrc advanced by the 16 bytes transferred;
        @// :128 asserts a 16-byte aligned address)
        VLD2    {dXr0,dXi0},[pSrc :128]!
        @// Y1 = Z1 + Z3, Y3 = Z1 - Z3
        VADD    qY1,qZ1,qZ3
        VSUB    qY3,qZ1,qZ3

        @// finish second stage of 4 point FFT

        @// Z0 = Y2 - Y1
        VSUB    qZ0,qY2,qY1


        @// Both variants compute the same four results,
        @//   Z0 = Y2 - Y1
        @//   Z1 = (Yr0 - Yi3, Yi0 + Yr3)
        @//   Z2 = Y2 + Y1
        @//   Z3 = (Yr0 + Yi3, Yi0 - Yr3)
        @// and differ only in store order: Z0,Z3,Z2,Z1 for the inverse
        @// and Z0,Z1,Z2,Z3 for the forward transform. The first three
        @// stores step pDst by outPointStep; the fourth steps by dstStep
        @// (16 - 3*outPointStep), leaving pDst 16 bytes past where this
        @// iteration started.
        .ifeqs  "\inverse", "TRUE"

            VADD    dZr3,dYr0,dYi3
            VST2    {dZr0,dZi0},[pDst :128],outPointStep
            VSUB    dZi3,dYi0,dYr3

            VADD    qZ2,qY2,qY1
            VST2    {dZr3,dZi3},[pDst :128],outPointStep

            VSUB    dZr1,dYr0,dYi3
            VST2    {dZr2,dZi2},[pDst :128],outPointStep
            VADD    dZi1,dYi0,dYr3

            VST2    {dZr1,dZi1},[pDst :128],dstStep


        .else

            VSUB    dZr1,dYr0,dYi3
            VST2    {dZr0,dZi0},[pDst :128],outPointStep
            VADD    dZi1,dYi0,dYr3

            VADD    qZ2,qY2,qY1
            VST2    {dZr1,dZi1},[pDst :128],outPointStep

            VADD    dZr3,dYr0,dYi3
            VST2    {dZr2,dZi2},[pDst :128],outPointStep
            VSUB    dZi3,dYi0,dYr3

            VST2    {dZr3,dZi3},[pDst :128],dstStep


        .endif

        @// increment to data[1] of the next set
        ADD     pSrc,pSrc,pointStep
        @// loop while setCount > 0 (flags from the SUBS above)
        BGT     radix4SetLoop\name


        @// Fetch the three twiddles for the next group. The loads
        @// post-increment by stepTwiddle, stepTwiddle and twStep
        @// (-3*stepTwiddle), so pTwiddle ends stepTwiddle bytes below the
        @// address dW1 was read from.
        VLD1    dW1,[pTwiddle :64],stepTwiddle      @//[wi | wr]
        @// subtract 4 since grpCount multiplied by 4
        SUBS    grpCount,grpCount,#4
        VLD1    dW2,[pTwiddle :64],stepTwiddle      @//[wi | wr]
        @// increment pSrc for the next grp
        ADD     pSrc,pSrc,srcStep
        VLD1    dW3,[pTwiddle :64],twStep           @//[wi | wr]
        @// loop while grpCount > 0 (flags from the SUBS above; the ADD and
        @// VLD1 in between do not change them)
        BGT     radix4GrpLoop\name


        @// Reset and Swap pSrc and pDst for the next stage
        @// (t1 reuses r3; grpCount is dead at this point)
        MOV     t1,pDst
        @// pDst -= 2*size; pSrc -= 8*size bytes
        @// i.e. new pDst = old pSrc - 4*outPointStep,
        @//      new pSrc = old pDst - outPointStep
        SUB     pDst,pSrc,outPointStep,LSL #2
        SUB     pSrc,t1,outPointStep


        .endm


        @// Forward transform entry point: FFTSTAGE with inverse = "FALSE".
        @// M_START/M_END are not defined in this file (presumably they come
        @// from armCOMM_s.h and emit the function prologue/epilogue, with
        @// "r4" describing the registers to save - TODO confirm).
        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
        FFTSTAGE "FALSE","FALSE",FWD
        M_END


        @// Inverse transform entry point: FFTSTAGE with inverse = "TRUE".
        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
        FFTSTAGE "FALSE","TRUE",INV
        M_END


        .end