/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/convolve.h"
#include "av1/common/resize.h"
#include "aom_dsp/x86/synonyms.h"

// Note: If the crop width is not a multiple of 4, then, unlike the C version,
// this function will overwrite some of the padding on the right-hand side of
// the frame. This padding appears to be trashed anyway, so this should not
// affect the running of the decoder.
void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const int16_t *x_filters, int x0_qn,
                                  int x_step_qn) {
  assert(UPSCALE_NORMATIVE_TAPS == 8);

  // Offset src so that, for each output position, the 8-tap filter window
  // starts (UPSCALE_NORMATIVE_TAPS / 2 - 1) samples to the left of the
  // nominal source position.
  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;

  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  const __m128i zero = _mm_setzero_si128();

  const uint8_t *src_y;
  uint8_t *dst_y;
  int x_qn = x0_qn;
  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
    // x_qn is the source position in Q(RS_SCALE_SUBPEL_BITS) fixed point;
    // its fractional part, reduced to RS_SUBPEL_BITS of precision, selects
    // one of the subpel filters.
    const int x_filter_idx0 =
        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx1 =
        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx2 =
        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx3 =
        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;

    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
    assert(x_filter_idx3 <= RS_SUBPEL_MASK);

    const int16_t *const x_filter0 =
        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter1 =
        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter2 =
        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter3 =
        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];

    const __m128i fil0_16 = xx_loadu_128(x_filter0);
    const __m128i fil1_16 = xx_loadu_128(x_filter1);
    const __m128i fil2_16 = xx_loadu_128(x_filter2);
    const __m128i fil3_16 = xx_loadu_128(x_filter3);
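
    // The filter taps depend only on the four column positions handled by
    // this x-loop iteration, not on the row, so they are loaded once here
    // and reused for every row of the y loop below.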
    src_y = src;
    dst_y = dst;
    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
      const uint8_t *const src_x0 =
          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint8_t *const src_x1 =
          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint8_t *const src_x2 =
          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint8_t *const src_x3 =
          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];

      // Load up the source data. This is 8-bit input data, so each load
      // gets 8 pixels.
      const __m128i src0_8 = xx_loadl_64(src_x0);
      const __m128i src1_8 = xx_loadl_64(src_x1);
      const __m128i src2_8 = xx_loadl_64(src_x2);
      const __m128i src3_8 = xx_loadl_64(src_x3);

      // Now zero-extend up to 16-bit precision, i.e.
      // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
      const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
      const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
      const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
      const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);

      // Multiply by filter coefficients (results in a 32-bit value)
      // and add adjacent pairs, i.e.
      // ([ s7 s6 s5 s4 s3 s2 s1 s0 ], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);

      // Reduce horizontally and add, i.e.
      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);

      // After this second horizontal add, each 32-bit lane holds the full
      // 8-tap sum for one of the four output pixels.
      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);

      // Divide down by (1 << FILTER_BITS), rounding to nearest.
      const __m128i shifted_32 =
          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);

      // Pack 32-bit values into 16-bit values, i.e.
      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);

      // Pack 16-bit values into 8-bit values, i.e.
      // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
      // -> [ 0 0 0 0 0 0 DC BA ]
      // The unsigned saturation in these packs also clips the output to the
      // valid 8-bit pixel range [0, 255].
      const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);

      // Write the four 8-bit output pixels.
      xx_storel_32(&dst_y[x], shifted_8);
    }
  }
}
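
// For reference: each output pixel above corresponds, roughly, to the scalar
// computation performed by the C version,
//   dst[y * dst_stride + x] =
//       clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
// where sum is the 8-tap dot product. With FILTER_BITS == 7, for example, a
// raw sum of 8256 rounds to (8256 + 64) >> 7 = 65.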

// Note: If the crop width is not a multiple of 4, then, unlike the C version,
// this function will overwrite some of the padding on the right-hand side of
// the frame. This padding appears to be trashed anyway, so this should not
// affect the running of the decoder.
void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
                                         uint16_t *dst, int dst_stride, int w,
                                         int h, const int16_t *x_filters,
                                         int x0_qn, int x_step_qn, int bd) {
  assert(UPSCALE_NORMATIVE_TAPS == 8);
  assert(bd == 8 || bd == 10 || bd == 12);

  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;

  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  const __m128i zero = _mm_setzero_si128();
  const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);

  const uint16_t *src_y;
  uint16_t *dst_y;
  int x_qn = x0_qn;
  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
    const int x_filter_idx0 =
        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx1 =
        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx2 =
        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
    const int x_filter_idx3 =
        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;

    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
    assert(x_filter_idx3 <= RS_SUBPEL_MASK);

    const int16_t *const x_filter0 =
        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter1 =
        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter2 =
        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
    const int16_t *const x_filter3 =
        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];

    const __m128i fil0_16 = xx_loadu_128(x_filter0);
    const __m128i fil1_16 = xx_loadu_128(x_filter1);
    const __m128i fil2_16 = xx_loadu_128(x_filter2);
    const __m128i fil3_16 = xx_loadu_128(x_filter3);

    src_y = src;
    dst_y = dst;
    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
      const uint16_t *const src_x0 =
          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint16_t *const src_x1 =
          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint16_t *const src_x2 =
          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
      const uint16_t *const src_x3 =
          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];

      // Load up the source data. This is 16-bit input data, so each load
      // gets 8 pixels.
      const __m128i src0_16 = xx_loadu_128(src_x0);
      const __m128i src1_16 = xx_loadu_128(src_x1);
      const __m128i src2_16 = xx_loadu_128(src_x2);
      const __m128i src3_16 = xx_loadu_128(src_x3);

      // Multiply by filter coefficients (results in a 32-bit value)
      // and add adjacent pairs, i.e.
      // ([ s7 s6 s5 s4 s3 s2 s1 s0 ], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
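
      // Note: overflow is not a concern here. Even for 12-bit input,
      // |sample * tap| <= 4095 * 2^15, so each pairwise sum produced by
      // _mm_madd_epi16 stays well below 2^31.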

      // Reduce horizontally and add, i.e.
      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);

      // After this second horizontal add, each 32-bit lane holds the full
      // 8-tap sum for one of the four output pixels.
      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);

      // Divide down by (1 << FILTER_BITS), rounding to nearest.
      const __m128i shifted_32 =
          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);

      // Pack 32-bit values into 16-bit values, i.e.
      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);

      // Clip the values at (1 << bd) - 1.
      const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);

      // Write the four 16-bit output pixels.
      xx_storel_64(&dst_y[x], clipped_16);
    }
  }
}
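
// For reference: the high bit-depth path corresponds, roughly, to the scalar
// computation
//   dst[y * dst_stride + x] =
//       clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
// where, e.g., bd == 10 gives a clip maximum of (1 << 10) - 1 == 1023.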