      1 /*
      2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  */
     11 
     12 #include <assert.h>
     13 #include <smmintrin.h>
     14 
     15 #include "config/av1_rtcd.h"
     16 
     17 #include "av1/common/convolve.h"
     18 #include "av1/common/resize.h"
     19 #include "aom_dsp/x86/synonyms.h"
     20 
     21 // Note: If the crop width is not a multiple of 4, then, unlike the C version,
     22 // this function will overwrite some of the padding on the right hand side of
     23 // the frame. This padding appears to be trashed anyway, so this should not
     24 // affect the running of the decoder.
     25 void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
     26                                   uint8_t *dst, int dst_stride, int w, int h,
     27                                   const int16_t *x_filters, int x0_qn,
     28                                   int x_step_qn) {
     29   assert(UPSCALE_NORMATIVE_TAPS == 8);
     30 
     31   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
     32 
     33   const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
     34   const __m128i zero = _mm_setzero_si128();
     35 
     36   const uint8_t *src_y;
     37   uint8_t *dst_y;
     38   int x_qn = x0_qn;
     39   for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
     40     const int x_filter_idx0 =
     41         ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
     42     const int x_filter_idx1 =
     43         ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
     44     const int x_filter_idx2 =
     45         ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
     46     const int x_filter_idx3 =
     47         ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
     48 
     49     assert(x_filter_idx0 <= RS_SUBPEL_MASK);
     50     assert(x_filter_idx1 <= RS_SUBPEL_MASK);
     51     assert(x_filter_idx2 <= RS_SUBPEL_MASK);
     52     assert(x_filter_idx3 <= RS_SUBPEL_MASK);
     53 
     54     const int16_t *const x_filter0 =
     55         &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
     56     const int16_t *const x_filter1 =
     57         &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
     58     const int16_t *const x_filter2 =
     59         &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
     60     const int16_t *const x_filter3 =
     61         &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
     62 
     63     const __m128i fil0_16 = xx_loadu_128(x_filter0);
     64     const __m128i fil1_16 = xx_loadu_128(x_filter1);
     65     const __m128i fil2_16 = xx_loadu_128(x_filter2);
     66     const __m128i fil3_16 = xx_loadu_128(x_filter3);
     67 
     68     src_y = src;
     69     dst_y = dst;
     70     for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
     71       const uint8_t *const src_x0 =
     72           &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
     73       const uint8_t *const src_x1 =
     74           &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
     75       const uint8_t *const src_x2 =
     76           &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
     77       const uint8_t *const src_x3 =
     78           &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
     79 
     80       // Load up the source data. This is 8-bit input data, so each load
     81       // gets 8 pixels.
     82       const __m128i src0_8 = xx_loadl_64(src_x0);
     83       const __m128i src1_8 = xx_loadl_64(src_x1);
     84       const __m128i src2_8 = xx_loadl_64(src_x2);
     85       const __m128i src3_8 = xx_loadl_64(src_x3);
     86 
     87       // Now zero-extend up to 16-bit precision, i.e.
     88       // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
     89       const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
     90       const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
     91       const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
     92       const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
     93 
     94       // Multiply by filter coefficients (results in a 32-bit value),
     95       // and add adjacent pairs, i.e.
     96       // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
     97       // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
     98       const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
     99       const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
    100       const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
    101       const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
    102 
    103       // Reduce horizontally and add, i.e.
    104       // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
    105       const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
    106       const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
    107 
    108       const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
    109 
    110       // Divide down by (1 << FILTER_BITS), rounding to nearest.
    111       const __m128i shifted_32 =
    112           _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
    113 
    114       // Pack 32-bit values into 16-bit values, i.e.
    115       // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
    116       const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
    117 
    118       // Pack 16-bit values into 8-bit values, i.e.
    119       // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
    120       // -> [ 0 0 0 0 0 0 DC BA ]
    121       const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
    122 
    123       // Write to the output
    124       xx_storel_32(&dst_y[x], shifted_8);
    125     }
    126   }
    127 }
    128 
    129 // Note: If the crop width is not a multiple of 4, then, unlike the C version,
    130 // this function will overwrite some of the padding on the right hand side of
    131 // the frame. This padding appears to be trashed anyway, so this should not
    132 // affect the running of the decoder.
    133 void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
    134                                          uint16_t *dst, int dst_stride, int w,
    135                                          int h, const int16_t *x_filters,
    136                                          int x0_qn, int x_step_qn, int bd) {
    137   assert(UPSCALE_NORMATIVE_TAPS == 8);
    138   assert(bd == 8 || bd == 10 || bd == 12);
    139 
    140   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
    141 
    142   const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
    143   const __m128i zero = _mm_setzero_si128();
    144   const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);
    145 
    146   const uint16_t *src_y;
    147   uint16_t *dst_y;
    148   int x_qn = x0_qn;
    149   for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
    150     const int x_filter_idx0 =
    151         ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    152     const int x_filter_idx1 =
    153         ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    154     const int x_filter_idx2 =
    155         ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    156     const int x_filter_idx3 =
    157         ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    158 
    159     assert(x_filter_idx0 <= RS_SUBPEL_MASK);
    160     assert(x_filter_idx1 <= RS_SUBPEL_MASK);
    161     assert(x_filter_idx2 <= RS_SUBPEL_MASK);
    162     assert(x_filter_idx3 <= RS_SUBPEL_MASK);
    163 
    164     const int16_t *const x_filter0 =
    165         &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
    166     const int16_t *const x_filter1 =
    167         &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
    168     const int16_t *const x_filter2 =
    169         &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
    170     const int16_t *const x_filter3 =
    171         &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
    172 
    173     const __m128i fil0_16 = xx_loadu_128(x_filter0);
    174     const __m128i fil1_16 = xx_loadu_128(x_filter1);
    175     const __m128i fil2_16 = xx_loadu_128(x_filter2);
    176     const __m128i fil3_16 = xx_loadu_128(x_filter3);
    177 
    178     src_y = src;
    179     dst_y = dst;
    180     for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
    181       const uint16_t *const src_x0 =
    182           &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
    183       const uint16_t *const src_x1 =
    184           &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
    185       const uint16_t *const src_x2 =
    186           &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
    187       const uint16_t *const src_x3 =
    188           &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
    189 
    190       // Load up the source data. This is 16-bit input data, so each load
    191       // gets 8 pixels.
    192       const __m128i src0_16 = xx_loadu_128(src_x0);
    193       const __m128i src1_16 = xx_loadu_128(src_x1);
    194       const __m128i src2_16 = xx_loadu_128(src_x2);
    195       const __m128i src3_16 = xx_loadu_128(src_x3);
    196 
    197       // Multiply by filter coefficients (results in a 32-bit value),
    198       // and add adjacent pairs, i.e.
    199       // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
    200       // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
    201       const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
    202       const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
    203       const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
    204       const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
    205 
    206       // Reduce horizontally and add, i.e.
    207       // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
    208       const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
    209       const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
    210 
    211       const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
    212 
    213       // Divide down by (1 << FILTER_BITS), rounding to nearest.
    214       const __m128i shifted_32 =
    215           _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
    216 
    217       // Pack 32-bit values into 16-bit values, i.e.
    218       // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
    219       const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
    220 
    221       // Clip the values at (1 << bd) - 1
    222       const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
    223 
    224       // Write to the output
    225       xx_storel_64(&dst_y[x], clipped_16);
    226     }
    227   }
    228 }
    229