/* File: vp9/common/x86 inverse hybrid transform (SSE2). */
      1 /*
      2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
#include <assert.h>

#include "./vp9_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
     15 
     16 void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
     17                             int tx_type) {
     18   __m128i in[2];
     19   const __m128i eight = _mm_set1_epi16(8);
     20 
     21   in[0] = load_input_data8(input);
     22   in[1] = load_input_data8(input + 8);
     23 
     24   switch (tx_type) {
     25     case 0:  // DCT_DCT
     26       idct4_sse2(in);
     27       idct4_sse2(in);
     28       break;
     29     case 1:  // ADST_DCT
     30       idct4_sse2(in);
     31       iadst4_sse2(in);
     32       break;
     33     case 2:  // DCT_ADST
     34       iadst4_sse2(in);
     35       idct4_sse2(in);
     36       break;
     37     case 3:  // ADST_ADST
     38       iadst4_sse2(in);
     39       iadst4_sse2(in);
     40       break;
     41     default: assert(0); break;
     42   }
     43 
     44   // Final round and shift
     45   in[0] = _mm_add_epi16(in[0], eight);
     46   in[1] = _mm_add_epi16(in[1], eight);
     47 
     48   in[0] = _mm_srai_epi16(in[0], 4);
     49   in[1] = _mm_srai_epi16(in[1], 4);
     50 
     51   recon_and_store4x4_sse2(in, dest, stride);
     52 }
     53 
     54 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
     55                             int tx_type) {
     56   __m128i in[8];
     57   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
     58 
     59   // load input data
     60   in[0] = load_input_data8(input);
     61   in[1] = load_input_data8(input + 8 * 1);
     62   in[2] = load_input_data8(input + 8 * 2);
     63   in[3] = load_input_data8(input + 8 * 3);
     64   in[4] = load_input_data8(input + 8 * 4);
     65   in[5] = load_input_data8(input + 8 * 5);
     66   in[6] = load_input_data8(input + 8 * 6);
     67   in[7] = load_input_data8(input + 8 * 7);
     68 
     69   switch (tx_type) {
     70     case 0:  // DCT_DCT
     71       idct8_sse2(in);
     72       idct8_sse2(in);
     73       break;
     74     case 1:  // ADST_DCT
     75       idct8_sse2(in);
     76       iadst8_sse2(in);
     77       break;
     78     case 2:  // DCT_ADST
     79       iadst8_sse2(in);
     80       idct8_sse2(in);
     81       break;
     82     case 3:  // ADST_ADST
     83       iadst8_sse2(in);
     84       iadst8_sse2(in);
     85       break;
     86     default: assert(0); break;
     87   }
     88 
     89   // Final rounding and shift
     90   in[0] = _mm_adds_epi16(in[0], final_rounding);
     91   in[1] = _mm_adds_epi16(in[1], final_rounding);
     92   in[2] = _mm_adds_epi16(in[2], final_rounding);
     93   in[3] = _mm_adds_epi16(in[3], final_rounding);
     94   in[4] = _mm_adds_epi16(in[4], final_rounding);
     95   in[5] = _mm_adds_epi16(in[5], final_rounding);
     96   in[6] = _mm_adds_epi16(in[6], final_rounding);
     97   in[7] = _mm_adds_epi16(in[7], final_rounding);
     98 
     99   in[0] = _mm_srai_epi16(in[0], 5);
    100   in[1] = _mm_srai_epi16(in[1], 5);
    101   in[2] = _mm_srai_epi16(in[2], 5);
    102   in[3] = _mm_srai_epi16(in[3], 5);
    103   in[4] = _mm_srai_epi16(in[4], 5);
    104   in[5] = _mm_srai_epi16(in[5], 5);
    105   in[6] = _mm_srai_epi16(in[6], 5);
    106   in[7] = _mm_srai_epi16(in[7], 5);
    107 
    108   recon_and_store(dest + 0 * stride, in[0]);
    109   recon_and_store(dest + 1 * stride, in[1]);
    110   recon_and_store(dest + 2 * stride, in[2]);
    111   recon_and_store(dest + 3 * stride, in[3]);
    112   recon_and_store(dest + 4 * stride, in[4]);
    113   recon_and_store(dest + 5 * stride, in[5]);
    114   recon_and_store(dest + 6 * stride, in[6]);
    115   recon_and_store(dest + 7 * stride, in[7]);
    116 }
    117 
    118 static INLINE void load_buffer_8x16(const tran_low_t *const input,
    119                                     __m128i *const in) {
    120   in[0] = load_input_data8(input + 0 * 16);
    121   in[1] = load_input_data8(input + 1 * 16);
    122   in[2] = load_input_data8(input + 2 * 16);
    123   in[3] = load_input_data8(input + 3 * 16);
    124   in[4] = load_input_data8(input + 4 * 16);
    125   in[5] = load_input_data8(input + 5 * 16);
    126   in[6] = load_input_data8(input + 6 * 16);
    127   in[7] = load_input_data8(input + 7 * 16);
    128 
    129   in[8] = load_input_data8(input + 8 * 16);
    130   in[9] = load_input_data8(input + 9 * 16);
    131   in[10] = load_input_data8(input + 10 * 16);
    132   in[11] = load_input_data8(input + 11 * 16);
    133   in[12] = load_input_data8(input + 12 * 16);
    134   in[13] = load_input_data8(input + 13 * 16);
    135   in[14] = load_input_data8(input + 14 * 16);
    136   in[15] = load_input_data8(input + 15 * 16);
    137 }
    138 
    139 static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in,
    140                                      const int stride) {
    141   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
    142   // Final rounding and shift
    143   in[0] = _mm_adds_epi16(in[0], final_rounding);
    144   in[1] = _mm_adds_epi16(in[1], final_rounding);
    145   in[2] = _mm_adds_epi16(in[2], final_rounding);
    146   in[3] = _mm_adds_epi16(in[3], final_rounding);
    147   in[4] = _mm_adds_epi16(in[4], final_rounding);
    148   in[5] = _mm_adds_epi16(in[5], final_rounding);
    149   in[6] = _mm_adds_epi16(in[6], final_rounding);
    150   in[7] = _mm_adds_epi16(in[7], final_rounding);
    151   in[8] = _mm_adds_epi16(in[8], final_rounding);
    152   in[9] = _mm_adds_epi16(in[9], final_rounding);
    153   in[10] = _mm_adds_epi16(in[10], final_rounding);
    154   in[11] = _mm_adds_epi16(in[11], final_rounding);
    155   in[12] = _mm_adds_epi16(in[12], final_rounding);
    156   in[13] = _mm_adds_epi16(in[13], final_rounding);
    157   in[14] = _mm_adds_epi16(in[14], final_rounding);
    158   in[15] = _mm_adds_epi16(in[15], final_rounding);
    159 
    160   in[0] = _mm_srai_epi16(in[0], 6);
    161   in[1] = _mm_srai_epi16(in[1], 6);
    162   in[2] = _mm_srai_epi16(in[2], 6);
    163   in[3] = _mm_srai_epi16(in[3], 6);
    164   in[4] = _mm_srai_epi16(in[4], 6);
    165   in[5] = _mm_srai_epi16(in[5], 6);
    166   in[6] = _mm_srai_epi16(in[6], 6);
    167   in[7] = _mm_srai_epi16(in[7], 6);
    168   in[8] = _mm_srai_epi16(in[8], 6);
    169   in[9] = _mm_srai_epi16(in[9], 6);
    170   in[10] = _mm_srai_epi16(in[10], 6);
    171   in[11] = _mm_srai_epi16(in[11], 6);
    172   in[12] = _mm_srai_epi16(in[12], 6);
    173   in[13] = _mm_srai_epi16(in[13], 6);
    174   in[14] = _mm_srai_epi16(in[14], 6);
    175   in[15] = _mm_srai_epi16(in[15], 6);
    176 
    177   recon_and_store(dest + 0 * stride, in[0]);
    178   recon_and_store(dest + 1 * stride, in[1]);
    179   recon_and_store(dest + 2 * stride, in[2]);
    180   recon_and_store(dest + 3 * stride, in[3]);
    181   recon_and_store(dest + 4 * stride, in[4]);
    182   recon_and_store(dest + 5 * stride, in[5]);
    183   recon_and_store(dest + 6 * stride, in[6]);
    184   recon_and_store(dest + 7 * stride, in[7]);
    185   recon_and_store(dest + 8 * stride, in[8]);
    186   recon_and_store(dest + 9 * stride, in[9]);
    187   recon_and_store(dest + 10 * stride, in[10]);
    188   recon_and_store(dest + 11 * stride, in[11]);
    189   recon_and_store(dest + 12 * stride, in[12]);
    190   recon_and_store(dest + 13 * stride, in[13]);
    191   recon_and_store(dest + 14 * stride, in[14]);
    192   recon_and_store(dest + 15 * stride, in[15]);
    193 }
    194 
    195 void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
    196                                int stride, int tx_type) {
    197   __m128i in0[16], in1[16];
    198 
    199   load_buffer_8x16(input, in0);
    200   input += 8;
    201   load_buffer_8x16(input, in1);
    202 
    203   switch (tx_type) {
    204     case 0:  // DCT_DCT
    205       idct16_sse2(in0, in1);
    206       idct16_sse2(in0, in1);
    207       break;
    208     case 1:  // ADST_DCT
    209       idct16_sse2(in0, in1);
    210       iadst16_sse2(in0, in1);
    211       break;
    212     case 2:  // DCT_ADST
    213       iadst16_sse2(in0, in1);
    214       idct16_sse2(in0, in1);
    215       break;
    216     case 3:  // ADST_ADST
    217       iadst16_sse2(in0, in1);
    218       iadst16_sse2(in0, in1);
    219       break;
    220     default: assert(0); break;
    221   }
    222 
    223   write_buffer_8x16(dest, in0, stride);
    224   dest += 8;
    225   write_buffer_8x16(dest, in1, stride);
    226 }
    227