1 /* 2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 * 10 */ 11 12 #include <stdbool.h> 13 #include <stdint.h> 14 15 #include "dl/api/omxtypes.h" 16 #include "dl/sp/api/omxSP.h" 17 #include "dl/sp/api/x86SP.h" 18 #include "dl/sp/src/x86/x86SP_SSE_Math.h" 19 20 extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace( 21 const OMX_F32 *src, 22 OMX_F32 *buf1, 23 OMX_F32 *buf2, 24 const OMX_F32 *twiddle, 25 OMX_INT n, 26 bool forward_fft); 27 28 extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse( 29 const OMX_F32 *src, 30 OMX_F32 *buf1, 31 OMX_F32 *buf2, 32 const OMX_F32 *twiddle, 33 OMX_INT n, 34 bool forward_fft); 35 36 /** 37 * A two-for-one algorithm is used here to do the real fft: 38 * 39 * Input x[n], (n = 0, ..., N - 1) 40 * Output X[k] = DFT(N, k){x} 41 * a[n] = x[2n], (n = 0, ..., N/2 - 1) 42 * b[n] = x[2n + 1], (n = 0, ..., N/2 - 1) 43 * z[n] = a[n] + j * b[n] 44 * Z[k] = DFT(N/2, k){z} 45 * Z' is the complex conjugate of Z 46 * A[k] = (Z[k] + Z'[N/2 - k]) / 2 47 * B[k] = -j * (Z[k] - Z'[N/2 - k]) / 2 48 * X[k] = A[k] + B[k] * W[k], (W = exp(-j*2*PI*k/N); k = 0, ..., N/2 - 1) 49 * X[k] = A[k] - B[k], (k = N/2) 50 * X' is complex conjugate of X 51 * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1) 52 */ 53 54 /** 55 * This function is the last permutation of two-for-one FFT algorithm. 56 * We move the division by 2 to the last step in the implementation, so: 57 * A[k] = (Z[k] + Z'[N/2 - k]) 58 * B[k] = -j * (Z[k] - Z'[N/2 - k]) 59 * X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1) 60 * X[k] = (A[k] - B[k]), (k = N/2) 61 * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1) 62 */ 63 static void RevbinPermuteFwd( 64 const OMX_F32 *in, 65 OMX_F32 *out, 66 const OMX_F32 *twiddle, 67 OMX_INT n) { 68 OMX_INT i; 69 OMX_INT j; 70 OMX_INT n_by_2 = n >> 1; 71 OMX_INT n_by_4 = n >> 2; 72 73 OMX_FC32 big_a; 74 OMX_FC32 big_b; 75 OMX_FC32 temp; 76 const OMX_F32 *tw; 77 78 for (i = 1, j = n_by_2 - 1; i < n_by_4; i++, j--) { 79 // A[k] = (Z[k] + Z'[N/2 - k]) 80 big_a.Re = in[i] + in[j]; 81 big_a.Im = in[j + n_by_2] - in[i + n_by_2]; 82 83 // B[k] = -j * (Z[k] - Z'[N/2 - k]) 84 big_b.Re = in[j] - in[i]; 85 big_b.Im = in[j + n_by_2] + in[i + n_by_2]; 86 87 // W[k] 88 tw = twiddle + i; 89 90 // temp = B[k] * W[k] 91 temp.Re = big_b.Re * tw[0] + big_b.Im * tw[n]; 92 temp.Im = big_b.Re * tw[n] - big_b.Im * tw[0]; 93 94 // Convert split format to interleaved format. 95 // X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1) 96 out[i << 1] = 0.5f * (big_a.Re - temp.Im); 97 out[(i << 1) + 1] = 0.5f * (temp.Re - big_a.Im); 98 // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1) 99 out[j << 1] = 0.5f * (big_a.Re + temp.Im); 100 out[(j << 1) + 1] = 0.5f * (temp.Re + big_a.Im); 101 } 102 103 // X[k] = A[k] - B[k] (k = N/2) 104 out[n_by_2] = in[n_by_4]; 105 out[n_by_2 + 1] = -in[n_by_4 + n_by_2]; 106 107 out[0] = in[0] + in[n_by_2]; 108 out[1] = 0; 109 out[n] = in[0] - in[n_by_2]; 110 out[n + 1] = 0; 111 } 112 113 // Sse version of RevbinPermuteFwd function. 114 static void RevbinPermuteFwdSse( 115 const OMX_F32 *in, 116 OMX_F32 *out, 117 const OMX_F32 *twiddle, 118 OMX_INT n) { 119 OMX_INT i; 120 OMX_INT j; 121 OMX_INT n_by_2 = n >> 1; 122 OMX_INT n_by_4 = n >> 2; 123 124 VC v_i; 125 VC v_j; 126 VC v_big_a; 127 VC v_big_b; 128 VC v_temp; 129 VC v_x0; 130 VC v_x1; 131 VC v_tw; 132 133 __m128 factor = _mm_set1_ps(0.5f); 134 135 for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) { 136 VC_LOAD_SPLIT(&v_i, (in + i), n_by_2); 137 138 VC_LOADU_SPLIT(&v_j, (in + j), n_by_2); 139 VC_REVERSE(&v_j); 140 141 // A[k] = (Z[k] + Z'[N/2 - k]) 142 VC_ADD_SUB(&v_big_a, &v_j, &v_i); 143 144 // B[k] = -j * (Z[k] - Z'[N/2 - k]) 145 VC_SUB_ADD(&v_big_b, &v_j, &v_i); 146 147 // W[k] 148 VC_LOAD_SPLIT(&v_tw, (twiddle + i), n); 149 150 // temp = B[k] * W[k] 151 VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw); 152 153 VC_SUB_X(&v_x0, &v_big_a, &v_temp); 154 VC_ADD_X(&v_x1, &v_big_a, &v_temp); 155 156 VC_MUL_F(&v_x0, &v_x0, factor); 157 VC_MUL_F(&v_x1, &v_x1, factor); 158 159 // X[k] = A[k] + B[k] * W[k] (k = 0, ..., N/2 - 1) 160 VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0); 161 162 // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1) 163 VC_REVERSE(&v_x1); 164 VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1); 165 } 166 167 out[n_by_2] = in[n_by_4]; 168 out[n_by_2 + 1] = -in[n_by_4 + n_by_2]; 169 170 out[0] = in[0] + in[n_by_2]; 171 out[1] = 0; 172 out[n] = in[0] - in[n_by_2]; 173 out[n + 1] = 0; 174 } 175 176 OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst, 177 const OMXFFTSpec_R_F32 *pFFTSpec) { 178 OMX_INT n; 179 OMX_INT n_by_2; 180 OMX_INT n_by_4; 181 const OMX_F32 *twiddle; 182 OMX_F32 *buf; 183 184 const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec; 185 186 // Input must be 32 byte aligned 187 if (!pSrc || !pDst || (const uintptr_t)pSrc & 31 || (uintptr_t)pDst & 31) 188 return OMX_Sts_BadArgErr; 189 190 n = pFFTStruct->N; 191 192 // This is to handle the case of order == 1. 193 if (n == 2) { 194 pDst[0] = (pSrc[0] + pSrc[1]); 195 pDst[1] = 0.0f; 196 pDst[2] = (pSrc[0] - pSrc[1]); 197 pDst[3] = 0.0f; 198 return OMX_Sts_NoErr; 199 } 200 201 n_by_2 = n >> 1; 202 n_by_4 = n >> 2; 203 buf = pFFTStruct->pBuf1; 204 twiddle = pFFTStruct->pTwiddle; 205 206 if(n_by_2 >= 16) { 207 buf = x86SP_F32_radix4_kernel_OutOfPlace_sse( 208 pSrc, 209 pFFTStruct->pBuf2, 210 buf, 211 twiddle, 212 n_by_2, 213 1); 214 } else { 215 buf = x86SP_F32_radix2_kernel_OutOfPlace( 216 pSrc, 217 pFFTStruct->pBuf2, 218 buf, 219 twiddle, 220 n_by_2, 221 1); 222 } 223 224 if(n >= 8) 225 RevbinPermuteFwdSse(buf, pDst, twiddle, n); 226 else 227 RevbinPermuteFwd(buf, pDst, twiddle, n); 228 229 return OMX_Sts_NoErr; 230 } 231