/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"

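// DC-only 4x4 forward transform: sums the 16 input samples and writes the
// doubled sum to output[0]; no other coefficients are produced.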
void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(
      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(
      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

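  // Add the row pairs, then sign-extend the eight 16-bit partial sums to
  // 32 bits by interleaving with zero and arithmetic-shifting right by 16.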
  tmp = _mm_add_epi16(in0, in1);
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

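  // Horizontal reduction: fold the 32-bit partial sums toward lane 0.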
  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

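  // Complete the reduction and double the block sum to form the DC output.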
  in1 = _mm_add_epi32(tmp, in0);
  in0 = _mm_slli_epi32(in1, 1);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
}

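// DC-only 8x8 forward transform: output[0] receives the sum of the 64 input
// samples; no further scaling is applied.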
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i u0, u1, sum;

  u0 = _mm_add_epi16(in0, in1);
  u1 = _mm_add_epi16(in2, in3);

  in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));

  sum = _mm_add_epi16(u0, u1);

  in0 = _mm_add_epi16(in0, in1);
  in2 = _mm_add_epi16(in2, in3);
  sum = _mm_add_epi16(sum, in0);

  u0 = _mm_setzero_si128();
  sum = _mm_add_epi16(sum, in2);

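  // Sign-extend the eight 16-bit column sums to 32 bits and reduce them
  // horizontally to a single total in lane 0.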
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}

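// DC-only 16x16 forward transform: accumulates the 256 input samples eight
// rows per loop iteration and writes (sum >> 1) to output[0].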
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 2; ++i) {
    in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));

    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
    in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
    input += 8 * stride;
  }

  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 1);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}

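// DC-only 32x32 forward transform: accumulates the 1024 input samples four
// rows per loop iteration and writes (sum >> 3) to output[0].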
void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; ++i) {
    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 3);
  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}

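// The full 2-D forward transforms are generated by expanding the shared
// implementation templates below, parameterized by the FDCT*_2D,
// FDCT32x32_HIGH_PRECISION, and DCT_HIGH_BIT_DEPTH macros.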
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vpx_fdct4x4_sse2
#define FDCT8x8_2D vpx_fdct8x8_sse2
#define FDCT16x16_2D vpx_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vpx_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH

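// High bit depth builds expand the same templates a second time with
// DCT_HIGH_BIT_DEPTH set to 1 to produce the vpx_highbd_* transforms.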
#if CONFIG_VP9_HIGHBITDEPTH
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif  // CONFIG_VP9_HIGHBITDEPTH