/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>

#include "macros.h"
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"

#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
#include <smmintrin.h>
#include "x86cpu.h"
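
/* Fixed-point inner product of two 16-bit vectors, accumulated in 32 bits:
 * returns the sum over i of x[i]*y[i] for 0 <= i < N. The main loop consumes
 * 16 samples per iteration with PMADDWD; 8-, 4- and 1-sample tails make any
 * N acceptable.
 */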
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_int  i, dataSize16;
    opus_int32 sum;
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
    __m128i inVec1_3210, inVec2_3210;

    sum = 0;
    dataSize16 = N & ~15;   /* Round N down to a multiple of 16. */

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    /* Main loop: 16 samples per iteration. Each PMADDWD multiplies eight
       16-bit pairs and adds adjacent products, giving four 32-bit sums. */
    for (i=0;i<dataSize16;i+=16) {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }

    acc1 = _mm_add_epi32(acc1, acc2);

    /* Tail: one more block of 8 samples, if available. */
    if (N - i >= 8)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

    /* Tail: 4 samples, sign-extended to 32 bits before the multiply. */
    if (N - i >= 4)
    {
        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);

        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);

        acc1 = _mm_add_epi32(acc1, inVec1_3210);
        i += 4;
    }

    /* Fold the four 32-bit lanes of acc1 down into lane 0. */
    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));

    sum += _mm_cvtsi128_si32(acc1);

    /* Remaining 0..3 samples in scalar code. */
    for (;i<N;i++)
    {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}
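
/* A minimal sketch (disabled) of how celt_inner_prod_sse4_1 could be checked
   against a scalar reference. In the real build this function is reached
   through the RTCD dispatch tables rather than called directly; the main()
   harness below is an illustrative assumption, not part of Opus. */
#if 0
#include <stdio.h>

int main(void)
{
    opus_val16 x[21], y[21];
    opus_val32 ref, simd;
    int i;

    /* N = 21 exercises the 16-, 4- and 1-sample code paths. */
    ref = 0;
    for (i = 0; i < 21; i++)
    {
        x[i] = (opus_val16)(i - 10);
        y[i] = (opus_val16)(2*i + 1);
        ref += (opus_val32)x[i]*(opus_val32)y[i];   /* scalar reference */
    }
    simd = celt_inner_prod_sse4_1(x, y, 21);
    printf("ref=%d simd=%d\n", (int)ref, (int)simd);
    return simd != ref;
}
#endif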
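
/* Accumulates into sum[k], for lags k = 0..3, the correlation of x against
 * y shifted by k: sum[k] += sum over i of x[i]*y[i+k], 0 <= i < len.
 * Requires len >= 3 and reads up to y[len + 2], so the caller must supply
 * at least len + 3 samples of y.
 */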
void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
{
    int j;

    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    __m128i vecY0, vecY1, vecY2, vecY3;
    __m128i sum0, sum1, sum2, sum3, vecSum;
    __m128i initSum;

    celt_assert(len >= 3);

    sum0 = _mm_setzero_si128();
    sum1 = _mm_setzero_si128();
    sum2 = _mm_setzero_si128();
    sum3 = _mm_setzero_si128();

    /* Main loop: 8 samples of x per iteration, multiply-accumulated against
       four copies of y shifted by lags 0..3, one accumulator per lag. */
    for (j=0;j<(len-7);j+=8)
    {
        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));

        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    }

    /* Fold each accumulator's four 32-bit lanes down into lane 0. */
    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64(sum0, sum0));
    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16(sum0, 0x0E));

    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16(sum1, 0x0E));

    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64(sum2, sum2));
    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16(sum2, 0x0E));

    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64(sum3, sum3));
    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16(sum3, 0x0E));

    /* Pack the four lag sums into one vector: [sum0 sum1 sum2 sum3]. */
    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
          _mm_unpacklo_epi32(sum2, sum3));

    /* Tail: 4 samples at a time. Broadcast each x[j+m] and multiply it by
       y[j+m .. j+m+3] in 32-bit arithmetic. */
    for (;j<(len-3);j+=4)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX3 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);
        sum3 = _mm_mullo_epi32(vecX3, vecY3);

        sum0 = _mm_add_epi32(sum0, sum1);
        sum2 = _mm_add_epi32(sum2, sum3);
        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }

    /* Last 0..3 samples, one x value at a time. */
    for (;j<len;j++)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        vecSum = _mm_add_epi32(vecSum, sum0);
    }

    /* Add the new correlations to the caller's running sums. */
    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
    initSum = _mm_add_epi32(initSum, vecSum);
    _mm_storeu_si128((__m128i *)sum, initSum);
}
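
/* Likewise disabled: a sketch comparing xcorr_kernel_sse4_1 with a direct
   scalar evaluation of the four lag sums it computes. The main() harness is
   an illustrative assumption, not part of Opus. Note that y must hold
   len + 3 samples; the extra entries feed the lag-1..3 correlations. */
#if 0
#include <stdio.h>

int main(void)
{
    opus_val16 x[13], y[16];   /* len = 13, so y needs 13 + 3 entries */
    opus_val32 sum[4] = {0, 0, 0, 0};
    opus_val32 ref[4] = {0, 0, 0, 0};
    int i, k;

    for (i = 0; i < 16; i++)
        y[i] = (opus_val16)(i + 1);
    for (i = 0; i < 13; i++)
        x[i] = (opus_val16)(i - 6);

    /* ref[k] = sum over i of x[i]*y[i+k], for lags k = 0..3. */
    for (k = 0; k < 4; k++)
        for (i = 0; i < 13; i++)
            ref[k] += (opus_val32)x[i]*(opus_val32)y[i + k];

    /* len = 13 exercises the 8-, 4- and 1-sample code paths. */
    xcorr_kernel_sse4_1(x, y, sum, 13);
    for (k = 0; k < 4; k++)
        printf("lag %d: ref=%d simd=%d\n", k, (int)ref[k], (int)sum[k]);
    return 0;
}
#endif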
#endif /* OPUS_X86_MAY_HAVE_SSE4_1 && FIXED_POINT */