/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>

#include "./vp8_rtcd.h"
#include "./vpx_config.h"
#include "vp8/common/filter.h"
#include "vpx_dsp/x86/mem_sse2.h"
#include "vpx_ports/mem.h"

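/* Two-pass bilinear prediction: each block is first filtered horizontally
 * into a 16-bit intermediate buffer that is one row taller than the block,
 * then filtered vertically and packed back to 8 bits. Per
 * vp8/common/filter.h the two taps of vp8_bilinear_filters[offset] sum to
 * 128, so each output pixel is
 *   out = (a * filter[0] + b * filter[1] + 64) >> VP8_FILTER_SHIFT
 * where the rounding constant 64 is 1 << (VP8_FILTER_SHIFT - 1) and
 * VP8_FILTER_SHIFT is 7. A pass whose offset is 0 degenerates to a copy.
 */

/* First pass for the 16x16 block: horizontally filter 17 rows of 16 pixels
 * from src into dst (16 uint16_t values per row). The 17th row gives the
 * vertical pass the context it needs for its last output row.
 */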
static INLINE void horizontal_16x16(uint8_t *src, const int stride,
                                    uint16_t *dst, const int xoffset) {
  int h;
  const __m128i zero = _mm_setzero_si128();

  if (xoffset == 0) {
    for (h = 0; h < 17; ++h) {
      const __m128i a = _mm_loadu_si128((__m128i *)src);
      const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
      const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
      _mm_store_si128((__m128i *)dst, a_lo);
      _mm_store_si128((__m128i *)(dst + 8), a_hi);
      src += stride;
      dst += 16;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);

    for (h = 0; h < 17; ++h) {
      const __m128i a = _mm_loadu_si128((__m128i *)src);
      const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
      const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
      const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0);
      const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0);

      const __m128i b = _mm_loadu_si128((__m128i *)(src + 1));
      const __m128i b_lo = _mm_unpacklo_epi8(b, zero);
      const __m128i b_hi = _mm_unpackhi_epi8(b, zero);
      const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1);
      const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1);

      const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered);
      const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered);

      const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
      const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);

      const __m128i shifted_lo =
          _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
      const __m128i shifted_hi =
          _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);

      _mm_store_si128((__m128i *)dst, shifted_lo);
      _mm_store_si128((__m128i *)(dst + 8), shifted_hi);
      src += stride;
      dst += 16;
    }
  }
}

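/* Second pass for the 16x16 block: vertically filter the 17 intermediate
 * rows and pack the 16 output rows back to 8-bit pixels in dst. row_0 is
 * carried over between iterations so each intermediate row is loaded only
 * once.
 */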
static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride,
                                  const int yoffset) {
  int h;

  if (yoffset == 0) {
    for (h = 0; h < 16; ++h) {
      const __m128i row_lo = _mm_load_si128((__m128i *)src);
      const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8));
      const __m128i packed = _mm_packus_epi16(row_lo, row_hi);
      _mm_store_si128((__m128i *)dst, packed);
      src += 16;
      dst += stride;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);

    __m128i row_0_lo = _mm_load_si128((__m128i *)src);
    __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8));
    src += 16;
    for (h = 0; h < 16; ++h) {
      const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0);
      const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0);

      const __m128i row_1_lo = _mm_load_si128((__m128i *)src);
      const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8));
      const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1);
      const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1);

      const __m128i sum_lo =
          _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered);
      const __m128i sum_hi =
          _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered);

      const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
      const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);

      const __m128i shifted_lo =
          _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
      const __m128i shifted_hi =
          _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);

      const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi);
      _mm_store_si128((__m128i *)dst, packed);
      row_0_lo = row_1_lo;
      row_0_hi = row_1_hi;
      src += 16;
      dst += stride;
    }
  }
}

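/* 16x16 bilinear predictor: horizontal pass into a 16x17 aligned 16-bit
 * scratch buffer, then vertical pass into dst_ptr. The assert documents
 * that the caller handles the zero-offset (plain copy) case separately.
 */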
void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                    int xoffset, int yoffset, uint8_t *dst_ptr,
                                    int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[16 * 17]);

  assert((xoffset | yoffset) != 0);

  horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset);

  vertical_16x16(FData, dst_ptr, dst_pitch, yoffset);
}

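/* First pass for 8-wide blocks: horizontally filter `height` rows of 8
 * pixels into dst (8 uint16_t values per row). `height` is one more than
 * the block height so the vertical pass has its extra row of context.
 */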
static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst,
                                  const int xoffset, const int height) {
  int h;
  const __m128i zero = _mm_setzero_si128();

  if (xoffset == 0) {
    for (h = 0; h < height; ++h) {
      const __m128i a = _mm_loadl_epi64((__m128i *)src);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      _mm_store_si128((__m128i *)dst, a_u16);
      src += stride;
      dst += 8;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);

    // Filter horizontally. Rather than load the whole array and transpose,
    // load 16 values (overreading) and shift to set up the second value. Do
    // an "extra" final line so the vertical pass has the necessary context.
    for (h = 0; h < height; ++h) {
      const __m128i a = _mm_loadu_si128((__m128i *)src);
      const __m128i b = _mm_srli_si128(a, 1);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
      const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
      const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
      const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      _mm_store_si128((__m128i *)dst, shifted);
      src += stride;
      dst += 8;
    }
  }
}

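/* Second pass for 8-wide blocks: vertically filter `height` output rows of
 * 16-bit intermediates and pack them back to 8-bit pixels in dst.
 */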
static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride,
                                const int yoffset, const int height) {
  int h;

  if (yoffset == 0) {
    for (h = 0; h < height; ++h) {
      const __m128i row = _mm_load_si128((__m128i *)src);
      const __m128i packed = _mm_packus_epi16(row, row);
      _mm_storel_epi64((__m128i *)dst, packed);
      src += 8;
      dst += stride;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);

    __m128i row_0 = _mm_load_si128((__m128i *)src);
    src += 8;
    for (h = 0; h < height; ++h) {
      const __m128i row_1 = _mm_load_si128((__m128i *)src);
      const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
      const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
      const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      const __m128i packed = _mm_packus_epi16(shifted, shifted);
      _mm_storel_epi64((__m128i *)dst, packed);
      row_0 = row_1;
      src += 8;
      dst += stride;
    }
  }
}

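/* 8x8 and 8x4 bilinear predictors: the horizontal pass writes one extra row
 * (9 or 5) so the vertical pass can produce 8 or 4 output rows.
 */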
void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                  int xoffset, int yoffset, uint8_t *dst_ptr,
                                  int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[8 * 9]);

  assert((xoffset | yoffset) != 0);

  horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9);

  vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8);
}

void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                  int xoffset, int yoffset, uint8_t *dst_ptr,
                                  int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[8 * 5]);

  assert((xoffset | yoffset) != 0);

  horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5);

  vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4);
}

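/* First pass for the 4x4 block: horizontally filter 5 rows of 4 pixels into
 * dst (4 uint16_t values per row).
 */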
static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst,
                                  const int xoffset) {
  int h;
  const __m128i zero = _mm_setzero_si128();

  if (xoffset == 0) {
    for (h = 0; h < 5; ++h) {
      const __m128i a = load_unaligned_u32(src);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      _mm_storel_epi64((__m128i *)dst, a_u16);
      src += stride;
      dst += 4;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);

    for (h = 0; h < 5; ++h) {
      const __m128i a = load_unaligned_u32(src);
      const __m128i b = load_unaligned_u32(src + 1);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
      const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
      const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
      const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      _mm_storel_epi64((__m128i *)dst, shifted);
      src += stride;
      dst += 4;
    }
  }
}

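/* Second pass for the 4x4 block: vertically filter the 16-bit intermediates
 * two rows at a time and store 4 bytes per output row.
 */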
static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride,
                                const int yoffset) {
  int h;

  if (yoffset == 0) {
    for (h = 0; h < 4; h += 2) {
      const __m128i row = _mm_load_si128((__m128i *)src);
      __m128i packed = _mm_packus_epi16(row, row);
      store_unaligned_u32(dst, packed);
      dst += stride;
      packed = _mm_srli_si128(packed, 4);
      store_unaligned_u32(dst, packed);
      dst += stride;
      src += 8;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);

    for (h = 0; h < 4; h += 2) {
      const __m128i row_0 = _mm_load_si128((__m128i *)src);
      const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4));
      const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
      const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
      const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      __m128i packed = _mm_packus_epi16(shifted, shifted);
      store_unaligned_u32(dst, packed);
      packed = _mm_srli_si128(packed, 4);
      dst += stride;
      store_unaligned_u32(dst, packed);
      dst += stride;
      src += 8;
    }
  }
}

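/* 4x4 bilinear predictor: horizontal pass into a 4x5 aligned 16-bit scratch
 * buffer, then vertical pass into dst_ptr.
 */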
void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                  int xoffset, int yoffset, uint8_t *dst_ptr,
                                  int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]);

  assert((xoffset | yoffset) != 0);

  horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset);

  vertical_4x4(FData, dst_ptr, dst_pitch, yoffset);
}
    337