/*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 *
 */

#include "dl/api/omxtypes.h"
#include "dl/sp/src/x86/x86SP_SSE_Math.h"

// This function handles the case set_count == 2, where the set loop cannot
// be unrolled by 4 to fill an SSE vector (4 float elements).
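// Data layout, as read from the loads below (not a separate spec): the
// complex data is stored split, with the imaginary parts placed n floats
// after the real parts (the `+ n` offsets), while the twiddle table keeps
// its imaginary parts 2 * n floats after the reals (the `+ n_mul_2`
// offsets).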
static void InternalUnroll2Inv(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n) {
  OMX_INT i;
  OMX_INT n_by_2 = n >> 1;
  OMX_INT n_by_4 = n >> 2;
  OMX_INT n_mul_2 = n << 1;
  OMX_F32 *out0 = out;

  for (i = 0; i < n_by_2; i += 8) {
    const OMX_F32 *tw1  = twiddle + i;
    const OMX_F32 *tw2  = tw1 + i;
    const OMX_F32 *tw3  = tw2 + i;
    const OMX_F32 *tw1e = tw1 + 4;
    const OMX_F32 *tw2e = tw2 + 8;
    const OMX_F32 *tw3e = tw3 + 12;

    VC v_tw1;
    VC v_tw2;
    VC v_tw3;
    VC v_t0;
    VC v_t1;
    VC v_t2;
    VC v_t3;
    VC v_t4;
    VC v_t5;
    VC v_t6;
    VC v_t7;

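    // Each scalar twiddle is duplicated into two adjacent lanes
    // ({tw[0], tw[0], twe[0], twe[0]}), so one 4-wide vector carries the
    // twiddles for two groups of two sets each.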
    v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1),
                                _mm_load_ss(tw1e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2),
                                _mm_load_ss(tw1e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2),
                                _mm_load_ss(tw2e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2),
                                _mm_load_ss(tw2e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3),
                                _mm_load_ss(tw3e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2),
                                _mm_load_ss(tw3e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));

    __m128 xmm0;
    __m128 xmm1;
    __m128 xmm2;
    __m128 xmm3;
    __m128 xmm4;
    __m128 xmm5;
    __m128 xmm6;
    __m128 xmm7;

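    // Load 16 real and 16 imaginary input values and regroup them so that
    // each v_tN holds the two-sample pair feeding butterfly leg N of the
    // two groups handled in this iteration.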
    const OMX_F32 *in0 = in + (i << 1);
    xmm0 = _mm_load_ps(in0);
    xmm1 = _mm_load_ps(in0 + 4);
    xmm2 = _mm_load_ps(in0 + 8);
    xmm3 = _mm_load_ps(in0 + 12);
    v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0));
    v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2));
    v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0));
    v_t3.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(3, 2, 3, 2));

    xmm4 = _mm_load_ps(in0 + n);
    xmm5 = _mm_load_ps(in0 + n + 4);
    xmm6 = _mm_load_ps(in0 + n + 8);
    xmm7 = _mm_load_ps(in0 + n + 12);
    v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
    v_t1.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
    v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
    v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));

    OMX_F32 *out1 = out0 + n_by_4;
    OMX_F32 *out2 = out1 + n_by_4;
    OMX_F32 *out3 = out2 + n_by_4;

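    // Apply the twiddle factors and the radix-4 inverse butterfly, then
    // scatter the four results to the output quarters, n_by_4 floats apart.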
    RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
                         &v_tw1, &v_tw2, &v_tw3,
                         &v_t0, &v_t1, &v_t2, &v_t3);

    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                               &v_t4, &v_t5, &v_t6, &v_t7, n);

    out0 += 4;
  }
}

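// Middle-stage ("ms", per the function name) radix-4 kernel of the inverse
// complex FFT.  As used below, sub_num fixes the twiddle stride (step) and
// the number of sets per group (set_count), while sub_size is the number of
// groups processed per call.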
void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n,
    OMX_INT sub_size,
    OMX_INT sub_num) {
  OMX_INT set;
  OMX_INT grp;
  OMX_INT step = sub_num >> 1;
  OMX_INT set_count = sub_num >> 2;
  OMX_INT n_by_4 = n >> 2;
  OMX_INT n_mul_2 = n << 1;

  OMX_F32 *out0 = out;

  if (set_count == 2) {
    InternalUnroll2Inv(in, out, twiddle, n);
    return;
  }

  // grp == 0: the first group uses unit twiddle factors, so the butterfly
  // needs no complex multiply.
  for (set = 0; set < set_count; set += 4) {
    const OMX_F32 *in0 = in + set;
    const OMX_F32 *in1 = in0 + set_count;
    const OMX_F32 *in2 = in1 + set_count;
    const OMX_F32 *in3 = in2 + set_count;

    VC v_t0;
    VC v_t1;
    VC v_t2;
    VC v_t3;
    VC v_t4;
    VC v_t5;
    VC v_t6;
    VC v_t7;

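    // Load four consecutive sets for each of the four butterfly legs; the
    // legs sit set_count floats apart in the input, and VC_LOAD_SPLIT picks
    // up the matching imaginary parts n floats after the reals.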
    VC_LOAD_SPLIT(&v_t0, in0, n);
    VC_LOAD_SPLIT(&v_t1, in1, n);
    VC_LOAD_SPLIT(&v_t2, in2, n);
    VC_LOAD_SPLIT(&v_t3, in3, n);

    OMX_F32 *out1 = out0 + n_by_4;
    OMX_F32 *out2 = out1 + n_by_4;
    OMX_F32 *out3 = out2 + n_by_4;

    RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
                        &v_t0, &v_t1, &v_t2, &v_t3);

    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                               &v_t4, &v_t5, &v_t6, &v_t7, n);

    out0 += 4;
  }

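  // Remaining groups: each group has its own twiddle triple, broadcast into
  // all four lanes because the four sets handled per inner iteration share
  // the same group twiddles.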
  for (grp = 1; grp < sub_size; ++grp) {
    const OMX_F32 *tw1 = twiddle + grp * step;
    const OMX_F32 *tw2 = tw1 + grp * step;
    const OMX_F32 *tw3 = tw2 + grp * step;

    VC v_tw1;
    VC v_tw2;
    VC v_tw3;

    v_tw1.real = _mm_load1_ps(tw1);
    v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2);
    v_tw2.real = _mm_load1_ps(tw2);
    v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2);
    v_tw3.real = _mm_load1_ps(tw3);
    v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2);

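    // Four sets per iteration; in0 steps by grp * sub_num to reach this
    // group's block of the input.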
    for (set = 0; set < set_count; set += 4) {
      const OMX_F32 *in0 = in + set + grp * sub_num;
      const OMX_F32 *in1 = in0 + set_count;
      const OMX_F32 *in2 = in1 + set_count;
      const OMX_F32 *in3 = in2 + set_count;

      VC v_t0;
      VC v_t1;
      VC v_t2;
      VC v_t3;
      VC v_t4;
      VC v_t5;
      VC v_t6;
      VC v_t7;

      VC_LOAD_SPLIT(&v_t0, in0, n);
      VC_LOAD_SPLIT(&v_t1, in1, n);
      VC_LOAD_SPLIT(&v_t2, in2, n);
      VC_LOAD_SPLIT(&v_t3, in3, n);

      OMX_F32 *out1 = out0 + n_by_4;
      OMX_F32 *out2 = out1 + n_by_4;
      OMX_F32 *out3 = out2 + n_by_4;

      RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
                           &v_tw1, &v_tw2, &v_tw3,
                           &v_t0, &v_t1, &v_t2, &v_t3);

      RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                                 &v_t4, &v_t5, &v_t6, &v_t7, n);

      out0 += 4;
    }
  }
}