Home | History | Annotate | Download | only in libspeex
      1 /* Copyright (C) 2002 Jean-Marc Valin */
      2 /**
      3    @file filters_sse.h
      4    @brief Various analysis/synthesis filters (SSE version)
      5 */
      6 /*
      7    Redistribution and use in source and binary forms, with or without
      8    modification, are permitted provided that the following conditions
      9    are met:
     10 
     11    - Redistributions of source code must retain the above copyright
     12    notice, this list of conditions and the following disclaimer.
     13 
     14    - Redistributions in binary form must reproduce the above copyright
     15    notice, this list of conditions and the following disclaimer in the
     16    documentation and/or other materials provided with the distribution.
     17 
     18    - Neither the name of the Xiph.org Foundation nor the names of its
     19    contributors may be used to endorse or promote products derived from
     20    this software without specific prior written permission.
     21 
     22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     23    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     25    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
     26    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     27    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     28    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     29    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     30    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     31    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     32    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     33 */
     34 
     35 #include <xmmintrin.h>
     36 
     37 void filter_mem16_10(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
     38 {
     39    __m128 num[3], den[3], mem[3];
     40 
     41    int i;
     42 
     43    /* Copy numerator, denominator and memory to aligned xmm */
     44    for (i=0;i<2;i++)
     45    {
     46       mem[i] = _mm_loadu_ps(_mem+4*i);
     47       num[i] = _mm_loadu_ps(_num+4*i);
     48       den[i] = _mm_loadu_ps(_den+4*i);
     49    }
     50    mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
     51    num[2] = _mm_setr_ps(_num[8], _num[9], 0, 0);
     52    den[2] = _mm_setr_ps(_den[8], _den[9], 0, 0);
     53 
     54    for (i=0;i<N;i++)
     55    {
     56       __m128 xx;
     57       __m128 yy;
     58       /* Compute next filter result */
     59       xx = _mm_load_ps1(x+i);
     60       yy = _mm_add_ss(xx, mem[0]);
     61       _mm_store_ss(y+i, yy);
     62       yy = _mm_shuffle_ps(yy, yy, 0);
     63 
     64       /* Update memory */
     65       mem[0] = _mm_move_ss(mem[0], mem[1]);
     66       mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);
     67 
     68       mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
     69       mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));
     70 
     71       mem[1] = _mm_move_ss(mem[1], mem[2]);
     72       mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);
     73 
     74       mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
     75       mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
     76 
     77       mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);
     78 
     79       mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
     80       mem[2] = _mm_sub_ps(mem[2], _mm_mul_ps(yy, den[2]));
     81    }
     82    /* Put memory back in its place */
     83    _mm_storeu_ps(_mem, mem[0]);
     84    _mm_storeu_ps(_mem+4, mem[1]);
     85    _mm_store_ss(_mem+8, mem[2]);
     86    mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
     87    _mm_store_ss(_mem+9, mem[2]);
     88 }
     89 
     90 void filter_mem16_8(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
     91 {
     92    __m128 num[2], den[2], mem[2];
     93 
     94    int i;
     95 
     96    /* Copy numerator, denominator and memory to aligned xmm */
     97    for (i=0;i<2;i++)
     98    {
     99       mem[i] = _mm_loadu_ps(_mem+4*i);
    100       num[i] = _mm_loadu_ps(_num+4*i);
    101       den[i] = _mm_loadu_ps(_den+4*i);
    102    }
    103 
    104    for (i=0;i<N;i++)
    105    {
    106       __m128 xx;
    107       __m128 yy;
    108       /* Compute next filter result */
    109       xx = _mm_load_ps1(x+i);
    110       yy = _mm_add_ss(xx, mem[0]);
    111       _mm_store_ss(y+i, yy);
    112       yy = _mm_shuffle_ps(yy, yy, 0);
    113 
    114       /* Update memory */
    115       mem[0] = _mm_move_ss(mem[0], mem[1]);
    116       mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);
    117 
    118       mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
    119       mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));
    120 
    121       mem[1] = _mm_sub_ss(mem[1], mem[1]);
    122       mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);
    123 
    124       mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
    125       mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
    126    }
    127    /* Put memory back in its place */
    128    _mm_storeu_ps(_mem, mem[0]);
    129    _mm_storeu_ps(_mem+4, mem[1]);
    130 }
    131 
    132 
    133 #define OVERRIDE_FILTER_MEM16
    134 void filter_mem16(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem, char *stack)
    135 {
    136    if(ord==10)
    137       filter_mem16_10(x, _num, _den, y, N, ord, _mem);
    138    else if (ord==8)
    139       filter_mem16_8(x, _num, _den, y, N, ord, _mem);
    140 }
    141 
    142 
    143 
    144 void iir_mem16_10(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
    145 {
    146    __m128 den[3], mem[3];
    147 
    148    int i;
    149 
    150    /* Copy numerator, denominator and memory to aligned xmm */
    151    for (i=0;i<2;i++)
    152    {
    153       mem[i] = _mm_loadu_ps(_mem+4*i);
    154       den[i] = _mm_loadu_ps(_den+4*i);
    155    }
    156    mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
    157    den[2] = _mm_setr_ps(_den[8], _den[9], 0, 0);
    158 
    159    for (i=0;i<N;i++)
    160    {
    161       __m128 xx;
    162       __m128 yy;
    163       /* Compute next filter result */
    164       xx = _mm_load_ps1(x+i);
    165       yy = _mm_add_ss(xx, mem[0]);
    166       _mm_store_ss(y+i, yy);
    167       yy = _mm_shuffle_ps(yy, yy, 0);
    168 
    169       /* Update memory */
    170       mem[0] = _mm_move_ss(mem[0], mem[1]);
    171       mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);
    172 
    173       mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));
    174 
    175       mem[1] = _mm_move_ss(mem[1], mem[2]);
    176       mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);
    177 
    178       mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
    179 
    180       mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);
    181 
    182       mem[2] = _mm_sub_ps(mem[2], _mm_mul_ps(yy, den[2]));
    183    }
    184    /* Put memory back in its place */
    185    _mm_storeu_ps(_mem, mem[0]);
    186    _mm_storeu_ps(_mem+4, mem[1]);
    187    _mm_store_ss(_mem+8, mem[2]);
    188    mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
    189    _mm_store_ss(_mem+9, mem[2]);
    190 }
    191 
    192 
    193 void iir_mem16_8(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
    194 {
    195    __m128 den[2], mem[2];
    196 
    197    int i;
    198 
    199    /* Copy numerator, denominator and memory to aligned xmm */
    200    for (i=0;i<2;i++)
    201    {
    202       mem[i] = _mm_loadu_ps(_mem+4*i);
    203       den[i] = _mm_loadu_ps(_den+4*i);
    204    }
    205 
    206    for (i=0;i<N;i++)
    207    {
    208       __m128 xx;
    209       __m128 yy;
    210       /* Compute next filter result */
    211       xx = _mm_load_ps1(x+i);
    212       yy = _mm_add_ss(xx, mem[0]);
    213       _mm_store_ss(y+i, yy);
    214       yy = _mm_shuffle_ps(yy, yy, 0);
    215 
    216       /* Update memory */
    217       mem[0] = _mm_move_ss(mem[0], mem[1]);
    218       mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);
    219 
    220       mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));
    221 
    222       mem[1] = _mm_sub_ss(mem[1], mem[1]);
    223       mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);
    224 
    225       mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
    226    }
    227    /* Put memory back in its place */
    228    _mm_storeu_ps(_mem, mem[0]);
    229    _mm_storeu_ps(_mem+4, mem[1]);
    230 }
    231 
    232 #define OVERRIDE_IIR_MEM16
    233 void iir_mem16(const float *x, const float *_den, float *y, int N, int ord, float *_mem, char *stack)
    234 {
    235    if(ord==10)
    236       iir_mem16_10(x, _den, y, N, ord, _mem);
    237    else if (ord==8)
    238       iir_mem16_8(x, _den, y, N, ord, _mem);
    239 }
    240 
    241 
    242 void fir_mem16_10(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
    243 {
    244    __m128 num[3], mem[3];
    245 
    246    int i;
    247 
    248    /* Copy numerator, denominator and memory to aligned xmm */
    249    for (i=0;i<2;i++)
    250    {
    251       mem[i] = _mm_loadu_ps(_mem+4*i);
    252       num[i] = _mm_loadu_ps(_num+4*i);
    253    }
    254    mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
    255    num[2] = _mm_setr_ps(_num[8], _num[9], 0, 0);
    256 
    257    for (i=0;i<N;i++)
    258    {
    259       __m128 xx;
    260       __m128 yy;
    261       /* Compute next filter result */
    262       xx = _mm_load_ps1(x+i);
    263       yy = _mm_add_ss(xx, mem[0]);
    264       _mm_store_ss(y+i, yy);
    265       yy = _mm_shuffle_ps(yy, yy, 0);
    266 
    267       /* Update memory */
    268       mem[0] = _mm_move_ss(mem[0], mem[1]);
    269       mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);
    270 
    271       mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
    272 
    273       mem[1] = _mm_move_ss(mem[1], mem[2]);
    274       mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);
    275 
    276       mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
    277 
    278       mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);
    279 
    280       mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
    281    }
    282    /* Put memory back in its place */
    283    _mm_storeu_ps(_mem, mem[0]);
    284    _mm_storeu_ps(_mem+4, mem[1]);
    285    _mm_store_ss(_mem+8, mem[2]);
    286    mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
    287    _mm_store_ss(_mem+9, mem[2]);
    288 }
    289 
    290 void fir_mem16_8(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
    291 {
    292    __m128 num[2], mem[2];
    293 
    294    int i;
    295 
    296    /* Copy numerator, denominator and memory to aligned xmm */
    297    for (i=0;i<2;i++)
    298    {
    299       mem[i] = _mm_loadu_ps(_mem+4*i);
    300       num[i] = _mm_loadu_ps(_num+4*i);
    301    }
    302 
    303    for (i=0;i<N;i++)
    304    {
    305       __m128 xx;
    306       __m128 yy;
    307       /* Compute next filter result */
    308       xx = _mm_load_ps1(x+i);
    309       yy = _mm_add_ss(xx, mem[0]);
    310       _mm_store_ss(y+i, yy);
    311       yy = _mm_shuffle_ps(yy, yy, 0);
    312 
    313       /* Update memory */
    314       mem[0] = _mm_move_ss(mem[0], mem[1]);
    315       mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);
    316 
    317       mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
    318 
    319       mem[1] = _mm_sub_ss(mem[1], mem[1]);
    320       mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);
    321 
    322       mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
    323    }
    324    /* Put memory back in its place */
    325    _mm_storeu_ps(_mem, mem[0]);
    326    _mm_storeu_ps(_mem+4, mem[1]);
    327 }
    328 
    329 #define OVERRIDE_FIR_MEM16
    330 void fir_mem16(const float *x, const float *_num, float *y, int N, int ord, float *_mem, char *stack)
    331 {
    332    if(ord==10)
    333       fir_mem16_10(x, _num, y, N, ord, _mem);
    334    else if (ord==8)
    335       fir_mem16_8(x, _num, y, N, ord, _mem);
    336 }
    337