Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  *
     10  */
     11 
     12 #include "dl/api/omxtypes.h"
     13 #include "dl/sp/src/x86/x86SP_SSE_Math.h"
     14 
     15 void x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(
     16     const OMX_F32 *in,
     17     OMX_F32 *out,
     18     const OMX_F32 *twiddle,
     19     OMX_INT n) {
     20   OMX_INT n_by_2 = n >> 1;
     21   OMX_INT n_by_4 = n >> 2;
     22   OMX_INT n_mul_2 = n << 1;
     23   OMX_INT i;
     24 
     25   OMX_F32 *out0 = out;
     26 
     27   for (i = 0; i < n_by_2; i += 8) {
     28     const OMX_F32 *tw1 = twiddle + i;
     29     const OMX_F32 *tw2 = tw1 + i;
     30     const OMX_F32 *tw3 = tw2 + i;
     31     const OMX_F32 *in0 = in + (i << 1);
     32     const OMX_F32 *in1 = in0 + 4;
     33     const OMX_F32 *in2 = in1 + 4;
     34     const OMX_F32 *in3 = in2 + 4;
     35     OMX_F32 *out1 = out0 + n_by_4;
     36     OMX_F32 *out2 = out1 + n_by_4;
     37     OMX_F32 *out3 = out2 + n_by_4;
     38 
     39     VC v_tw1;
     40     VC v_tw2;
     41     VC v_tw3;
     42     VC v_t0;
     43     VC v_t1;
     44     VC v_t2;
     45     VC v_t3;
     46     VC v_t4;
     47     VC v_t5;
     48     VC v_t6;
     49     VC v_t7;
     50 
     51     v_tw1.real = _mm_set_ps(tw1[6], tw1[4], tw1[2], tw1[0]);
     52     v_tw1.imag = _mm_set_ps(
     53         tw1[6 + n_mul_2],
     54         tw1[4 + n_mul_2],
     55         tw1[2 + n_mul_2],
     56         tw1[n_mul_2]);
     57     v_tw2.real = _mm_set_ps(tw2[12], tw2[8], tw2[4], tw2[0]);
     58     v_tw2.imag = _mm_set_ps(
     59         tw2[12 + n_mul_2],
     60         tw2[8 + n_mul_2],
     61         tw2[4 + n_mul_2],
     62         tw2[n_mul_2]);
     63     v_tw3.real = _mm_set_ps(tw3[18], tw3[12], tw3[6], tw3[0]);
     64     v_tw3.imag = _mm_set_ps(
     65         tw3[18 + n_mul_2],
     66         tw3[12 + n_mul_2],
     67         tw3[6 + n_mul_2],
     68         tw3[n_mul_2]);
     69 
     70     VC_LOAD_MATRIX_TRANSPOSE(&v_t0, &v_t1, &v_t2, &v_t3, in0, in1, in2, in3, n);
     71 
     72     RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
     73                          &v_tw1, &v_tw2, &v_tw3,
     74                          &v_t0, &v_t1, &v_t2, &v_t3);
     75 
     76     RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
     77                                &v_t4, &v_t5, &v_t6, &v_t7, n);
     78 
     79     out0 += 4;
     80   }
     81 }
     82