@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This file was originally licensed as follows. It has been
@//  relicensed with permission from the copyright holders.
@//

@//
@// File Name:  armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
@// OpenMAX DL: v1.0.2
@// Last Modified Revision:   7767
@// Last Modified Date:       Thu, 27 Sep 2007
@//
@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@//
@//
@//
@// Description:
@// Compute a Radix 4 FFT stage for a N point complex signal
@//
@// Syntax / target: GNU as (ARM, unified NEON mnemonics with typed register
@// aliases), passed through the C preprocessor for the #include / #define
@// lines below.
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


@// Set debugging level
@//DEBUG_ON    SETL {TRUE}


@//---------------------------------------------------------------------
@// Register aliases
@//---------------------------------------------------------------------

@// Input registers (read on entry to FFTSTAGE, updated on exit)

#define pSrc            r0
#define pDst            r2
#define pTwiddle        r1
#define subFFTNum       r6
#define subFFTSize      r7


@// Output registers
@// (none beyond the updated input registers)


@// Local scratch registers
@// NOTE: t1 and grpCount are both r3.  t1 is only used after the group
@// loop has finished, when grpCount is no longer needed.

#define grpCount        r3
#define pointStep       r4
#define outPointStep    r5
#define stepTwiddle     r12
#define setCount        r14
#define srcStep         r8
#define setStep         r9
#define dstStep         r10
#define twStep          r11
#define t1              r3


@// NEON registers

@// Twiddle factors, one [wi | wr] pair per D register
#define dW1     D0.S32
#define dW2     D1.S32
#define dW3     D2.S32

@// Inputs: de-interleaved real / imaginary lanes of data[0..3]
#define dXr0    D4.S32
#define dXi0    D5.S32
#define dXr1    D6.S32
#define dXi1    D7.S32
#define dXr2    D8.S32
#define dXi2    D9.S32
#define dXr3    D10.S32
#define dXi3    D11.S32

@// First butterfly stage results
#define dYr0    D12.S32
#define dYi0    D13.S32
#define dYr1    D14.S32
#define dYi1    D15.S32
#define dYr2    D16.S32
#define dYi2    D17.S32
#define dYr3    D18.S32
#define dYi3    D19.S32

@// 64-bit product accumulators.  These share Q6..Q9 with qY0..qY3 below:
@// every qT value is narrowed into a dZ register before the matching qY
@// register is written.
#define qT0     Q8.S64
#define qT1     Q9.S64
#define qT2     Q6.S64
#define qT3     Q7.S64

@// Twiddled inputs (dZ*1..3) and final outputs (dZ*0..3)
#define dZr0    D20.S32
#define dZi0    D21.S32
#define dZr1    D22.S32
#define dZi1    D23.S32
#define dZr2    D24.S32
#define dZi2    D25.S32
#define dZr3    D26.S32
#define dZi3    D27.S32

@// Q-register views of the D-register pairs above
#define qY0     Q6.S32
#define qY1     Q7.S32
#define qY2     Q8.S32
#define qY3     Q9.S32
#define qX0     Q2.S32
#define qZ0     Q10.S32
#define qZ1     Q11.S32
#define qZ2     Q12.S32
#define qZ3     Q13.S32


@//---------------------------------------------------------------------
@// FFTSTAGE scaled, inverse, name
@//
@// Emits the body of one radix-4 stage.
@//
@//   scaled   "TRUE"  -> butterflies use halving add/sub (VHADD/VHSUB)
@//            other   -> butterflies use plain add/sub   (VADD/VSUB)
@//   inverse  "TRUE"  -> twiddle multiply uses the conjugated form
@//                       (real = xr*wr + xi*wi, imag = xi*wr - xr*wi) and
@//                       the +/- j outputs are stored in swapped positions
@//            other   -> real = xr*wr - xi*wi, imag = xi*wr + xr*wi
@//   name     suffix appended to the loop labels so that each expansion
@//            of the macro gets unique labels
@//
@// In:    pSrc, pDst, pTwiddle, subFFTNum, subFFTSize
@// Out:   subFFTSize = 4 * subFFTSize (on entry)
@//        subFFTNum  = subFFTNum (on entry) >> 2
@//        pSrc / pDst reset and swapped for the next stage
@// Clobb: r3-r5, r8-r12, r14, D0-D2, D4-D27, flags
@//
@// The instruction order below is hand-interleaved (loads and stores are
@// placed between arithmetic); do not reorder without re-checking the
@// register reuse described next to the aliases.
@//---------------------------------------------------------------------

        .MACRO FFTSTAGE scaled, inverse , name

        @// Update grpCount and grpSize right away in order to reuse
        @// the pGrpCount and pGrpSize registers

        LSL     grpCount,subFFTSize,#2
        LSR     subFFTNum,subFFTNum,#2
        MOV     subFFTSize,grpCount

        VLD1    dW1,[pTwiddle]                        @//[wi | wr]
        @// pT0+1 increments pT0 by 8 bytes
        @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
        MOV     pointStep,subFFTNum,LSL #1


        @// pOut0+1 increments pOut0 by 8 bytes
        @// pOut0+outPointStep == increment of 8*outPointStep bytes = 2*size bytes

        MOV     stepTwiddle,#0
        VLD1    dW2,[pTwiddle]                        @//[wi | wr]
        @// SMULBB multiplies only the low halfwords of its two operands
        SMULBB  outPointStep,grpCount,pointStep
        LSL     pointStep,pointStep,#2                @// 2*grpSize

        VLD1    dW3,[pTwiddle]                        @//[wi | wr]
        MOV     srcStep,pointStep,LSL #1              @// srcStep = 2*pointStep
        ADD     setStep,srcStep,pointStep             @// setStep = 3*pointStep
        @//RSB     setStep,setStep,#16                @// setStep = - 3*pointStep+16
        RSB     setStep,setStep,#0                    @// setStep = - 3*pointStep
        SUB     srcStep,srcStep,#16                   @// srcStep = 2*pointStep-16

        MOV     dstStep,outPointStep,LSL #1
        ADD     dstStep,dstStep,outPointStep          @// dstStep = 3*outPointStep
        RSB     dstStep,dstStep,#16                   @// dstStep = - 3*outPointStep+16



        @// Loop on the groups.  dW1..dW3 are constant within a group and
        @// are reloaded at the bottom of this loop.

grpLoop\name :

        VLD2    {dXr0,dXi0},[pSrc],pointStep          @// data[0]
        ADD     stepTwiddle,stepTwiddle,pointStep
        VLD2    {dXr1,dXi1},[pSrc],pointStep          @// data[1]
        ADD     pTwiddle,pTwiddle,stepTwiddle         @// set pTwiddle to the first point
        VLD2    {dXr2,dXi2},[pSrc],pointStep          @// data[2]
        MOV     twStep,stepTwiddle,LSL #2

        VLD2    {dXr3,dXi3},[pSrc],setStep            @// data[3] & update pSrc for the next set
        SUB     twStep,stepTwiddle,twStep             @// twStep = -3*stepTwiddle

        MOV     setCount,pointStep,LSR #3
        ADD     pSrc,pSrc,#16                         @// set pSrc to data[0] of the next set
        ADD     pSrc,pSrc,pointStep                   @// increment to data[1] of the next set


        @// Loop on the sets.  Each pass consumes the data loaded by the
        @// previous pass (or by the group prologue) and loads the data for
        @// the next one.

setLoop\name :

        @// The flags set here are consumed by the BGT at the bottom of the
        @// loop; no instruction in between may update the flags.
        SUBS    setCount,setCount,#2                  @// decrement the loop counter

        @// data[1] * W1 -> qT0 (real), qT1 (imag)
        .ifeqs  "\inverse", "TRUE"
            VMULL   qT0,dXr1,dW1[0]
            VMLAL   qT0,dXi1,dW1[1]                   @// real part
            VMULL   qT1,dXi1,dW1[0]
            VMLSL   qT1,dXr1,dW1[1]                   @// imag part

        .else
            VMULL   qT0,dXr1,dW1[0]
            VMLSL   qT0,dXi1,dW1[1]                   @// real part
            VMULL   qT1,dXi1,dW1[0]
            VMLAL   qT1,dXr1,dW1[1]                   @// imag part

        .endif

        VLD2    {dXr1,dXi1},[pSrc],pointStep          @// data[1] for next iteration

        @// data[2] * W2 -> qT2 (real), qT3 (imag)
        .ifeqs  "\inverse", "TRUE"
            VMULL   qT2,dXr2,dW2[0]
            VMLAL   qT2,dXi2,dW2[1]                   @// real part
            VMULL   qT3,dXi2,dW2[0]
            VMLSL   qT3,dXr2,dW2[1]                   @// imag part

        .else
            VMULL   qT2,dXr2,dW2[0]
            VMLSL   qT2,dXi2,dW2[1]                   @// real part
            VMULL   qT3,dXi2,dW2[0]
            VMLAL   qT3,dXr2,dW2[1]                   @// imag part

        .endif

        @// Narrow the 64-bit products to 32 bits with a rounding shift by 31;
        @// this frees qT0/qT1 for the data[3] product below.
        VRSHRN  dZr1,qT0,#31
        VRSHRN  dZi1,qT1,#31
        VLD2    {dXr2,dXi2},[pSrc],pointStep          @// data[2] for next iteration


        @// data[3] * W3 -> qT0 (real), qT1 (imag)
        .ifeqs  "\inverse", "TRUE"
            VMULL   qT0,dXr3,dW3[0]
            VMLAL   qT0,dXi3,dW3[1]                   @// real part
            VMULL   qT1,dXi3,dW3[0]
            VMLSL   qT1,dXr3,dW3[1]                   @// imag part

        .else
            VMULL   qT0,dXr3,dW3[0]
            VMLSL   qT0,dXi3,dW3[1]                   @// real part
            VMULL   qT1,dXi3,dW3[0]
            VMLAL   qT1,dXr3,dW3[1]                   @// imag part

        .endif

        VRSHRN  dZr2,qT2,#31
        VRSHRN  dZi2,qT3,#31


        VRSHRN  dZr3,qT0,#31
        VRSHRN  dZi3,qT1,#31
        VLD2    {dXr3,dXi3},[pSrc],setStep            @// data[3] & update pSrc to data[0]

        .ifeqs "\scaled", "TRUE"

            @// finish first stage of 4 point FFT
            VHADD   qY0,qX0,qZ2
            VHSUB   qY2,qX0,qZ2

            @// Alignment qualifier made consistent with the unscaled
            @// branch below, which loads from the same address with :128.
            VLD2    {dXr0,dXi0},[pSrc :128]!          @// data[0] for next iteration
            VHADD   qY1,qZ1,qZ3
            VHSUB   qY3,qZ1,qZ3

            @// finish second stage of 4 point FFT

            VHSUB   qZ0,qY2,qY1


            .ifeqs  "\inverse", "TRUE"

                VHADD   dZr3,dYr0,dYi3
                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                VHSUB   dZi3,dYi0,dYr3

                VHADD   qZ2,qY2,qY1
                VST2    {dZr3,dZi3},[pDst :128],outPointStep

                VHSUB   dZr1,dYr0,dYi3
                VST2    {dZr2,dZi2},[pDst :128],outPointStep
                VHADD   dZi1,dYi0,dYr3

                VST2    {dZr1,dZi1},[pDst :128],dstStep


            .else

                VHSUB   dZr1,dYr0,dYi3
                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                VHADD   dZi1,dYi0,dYr3

                VHADD   qZ2,qY2,qY1
                VST2    {dZr1,dZi1},[pDst :128],outPointStep

                VHADD   dZr3,dYr0,dYi3
                VST2    {dZr2,dZi2},[pDst :128],outPointStep
                VHSUB   dZi3,dYi0,dYr3

                VST2    {dZr3,dZi3},[pDst :128],dstStep


            .endif


        .else

            @// finish first stage of 4 point FFT
            VADD    qY0,qX0,qZ2
            VSUB    qY2,qX0,qZ2

            VLD2    {dXr0,dXi0},[pSrc :128]!          @// data[0] for next iteration
            VADD    qY1,qZ1,qZ3
            VSUB    qY3,qZ1,qZ3

            @// finish second stage of 4 point FFT

            VSUB    qZ0,qY2,qY1


            .ifeqs  "\inverse", "TRUE"

                VADD    dZr3,dYr0,dYi3
                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                VSUB    dZi3,dYi0,dYr3

                VADD    qZ2,qY2,qY1
                VST2    {dZr3,dZi3},[pDst :128],outPointStep

                VSUB    dZr1,dYr0,dYi3
                VST2    {dZr2,dZi2},[pDst :128],outPointStep
                VADD    dZi1,dYi0,dYr3

                VST2    {dZr1,dZi1},[pDst :128],dstStep


            .else

                VSUB    dZr1,dYr0,dYi3
                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                VADD    dZi1,dYi0,dYr3

                VADD    qZ2,qY2,qY1
                VST2    {dZr1,dZi1},[pDst :128],outPointStep

                VADD    dZr3,dYr0,dYi3
                VST2    {dZr2,dZi2},[pDst :128],outPointStep
                VSUB    dZi3,dYi0,dYr3

                VST2    {dZr3,dZi3},[pDst :128],dstStep


            .endif

        .endif

        ADD     pSrc,pSrc,pointStep                   @// increment to data[1] of the next set
        BGT     setLoop\name


        @// Reload the three twiddles for the next group.  The SUBS flags
        @// are consumed by the BGT below; the ADD and VLD1 in between do
        @// not update the flags.
        VLD1    dW1,[pTwiddle :64],stepTwiddle        @//[wi | wr]
        SUBS    grpCount,grpCount,#4                  @// subtract 4 since grpCount multiplied by 4
        VLD1    dW2,[pTwiddle :64],stepTwiddle        @//[wi | wr]
        ADD     pSrc,pSrc,srcStep                     @// increment pSrc for the next grp
        VLD1    dW3,[pTwiddle :64],twStep             @//[wi | wr]
        BGT     grpLoop\name


        @// Reset and swap pSrc and pDst for the next stage.
        @// t1 is r3 (same register as grpCount, which is dead here).
        MOV     t1,pDst
        SUB     pDst,pSrc,outPointStep,LSL #2         @// pDst = pSrc - 4*outPointStep
        SUB     pSrc,t1,outPointStep                  @// pSrc = old pDst - outPointStep


        .endm


@//---------------------------------------------------------------------
@// Entry points: one expansion of FFTSTAGE per (scaled, inverse) pair.
@// M_START / M_END are provided by armCOMM_s.h.
@//---------------------------------------------------------------------

        @// Forward transform, no scaling
        M_START armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe,r4
        FFTSTAGE "FALSE","FALSE",FWD
        M_END


        @// Inverse transform, no scaling
        M_START armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe,r4
        FFTSTAGE "FALSE","TRUE",INV
        M_END


        @// Forward transform, scaled (halving butterflies)
        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe,r4
        FFTSTAGE "TRUE","FALSE",FWDSFS
        M_END


        @// Inverse transform, scaled (halving butterflies)
        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe,r4
        FFTSTAGE "TRUE","TRUE",INVSFS
        M_END


        .end