/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <xmmintrin.h>

#include "./vp8_rtcd.h"
#include "./vpx_config.h"
#include "vp8/common/filter.h"
#include "vpx_dsp/x86/mem_sse2.h"
#include "vpx_ports/mem.h"

static INLINE void horizontal_16x16(uint8_t *src, const int stride,
                                    uint16_t *dst, const int xoffset) {
  int h;
  const __m128i zero = _mm_setzero_si128();

  if (xoffset == 0) {
    for (h = 0; h < 17; ++h) {
      const __m128i a = _mm_loadu_si128((__m128i *)src);
      const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
      const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
      _mm_store_si128((__m128i *)dst, a_lo);
      _mm_store_si128((__m128i *)(dst + 8), a_hi);
      src += stride;
      dst += 16;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);

    for (h = 0; h < 17; ++h) {
      const __m128i a = _mm_loadu_si128((__m128i *)src);
      const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
      const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
      const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0);
      const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0);

      const __m128i b = _mm_loadu_si128((__m128i *)(src + 1));
      const __m128i b_lo = _mm_unpacklo_epi8(b, zero);
      const __m128i b_hi = _mm_unpackhi_epi8(b, zero);
      const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1);
      const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1);

      const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered);
      const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered);

      const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
      const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);

      const __m128i shifted_lo =
          _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
      const __m128i shifted_hi =
          _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);

      _mm_store_si128((__m128i *)dst, shifted_lo);
      _mm_store_si128((__m128i *)(dst + 8), shifted_hi);
      src += stride;
      dst += 16;
    }
  }
}

static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride,
                                  const int yoffset) {
  int h;

  if (yoffset == 0) {
    for (h = 0; h < 16; ++h) {
      const __m128i row_lo = _mm_load_si128((__m128i *)src);
      const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8));
      const __m128i packed = _mm_packus_epi16(row_lo, row_hi);
      _mm_store_si128((__m128i *)dst, packed);
      src += 16;
      dst += stride;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);

    __m128i row_0_lo = _mm_load_si128((__m128i *)src);
    __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8));
    src += 16;
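    // Blend each pair of adjacent rows of the 16-bit intermediate buffer:
    // row h is weighted by vfilter_0 and row h + 1 by vfilter_1. The newly
    // loaded row is carried over in row_0_lo/row_0_hi so each row is read
    // only once.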
    for (h = 0; h < 16; ++h) {
      const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0);
      const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0);

      const __m128i row_1_lo = _mm_load_si128((__m128i *)src);
      const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8));
      const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1);
      const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1);

      const __m128i sum_lo =
          _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered);
      const __m128i sum_hi =
          _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered);

      const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
      const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);

      const __m128i shifted_lo =
          _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
      const __m128i shifted_hi =
          _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);

      const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi);
      _mm_store_si128((__m128i *)dst, packed);
      row_0_lo = row_1_lo;
      row_0_hi = row_1_hi;
      src += 16;
      dst += stride;
    }
  }
}

void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                    int xoffset, int yoffset, uint8_t *dst_ptr,
                                    int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[16 * 17]);

  assert((xoffset | yoffset) != 0);

  horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset);

  vertical_16x16(FData, dst_ptr, dst_pitch, yoffset);
}

static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst,
                                  const int xoffset, const int height) {
  int h;
  const __m128i zero = _mm_setzero_si128();

  if (xoffset == 0) {
    for (h = 0; h < height; ++h) {
      const __m128i a = _mm_loadl_epi64((__m128i *)src);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      _mm_store_si128((__m128i *)dst, a_u16);
      src += stride;
      dst += 8;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);

    // Filter horizontally. Rather than load the whole array and transpose,
    // load 16 values (overreading) and shift to set up the second value. Do
    // an "extra" row (9 rows for 8x8, 5 for 8x4) so the vertical pass has
    // the necessary context.
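    // Each output of this pass is
    //   (src[x] * hfilter_0 + src[x + 1] * hfilter_1 + rounding) >>
    //       VP8_FILTER_SHIFT
    // with rounding = 1 << (VP8_FILTER_SHIFT - 1), and it is kept at 16 bits
    // so the vertical pass can filter it again before packing back to 8 bits.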
    for (h = 0; h < height; ++h) {
      const __m128i a = _mm_loadu_si128((__m128i *)src);
      const __m128i b = _mm_srli_si128(a, 1);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
      const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
      const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
      const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      _mm_store_si128((__m128i *)dst, shifted);
      src += stride;
      dst += 8;
    }
  }
}

static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride,
                                const int yoffset, const int height) {
  int h;

  if (yoffset == 0) {
    for (h = 0; h < height; ++h) {
      const __m128i row = _mm_load_si128((__m128i *)src);
      const __m128i packed = _mm_packus_epi16(row, row);
      _mm_storel_epi64((__m128i *)dst, packed);
      src += 8;
      dst += stride;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);

    __m128i row_0 = _mm_load_si128((__m128i *)src);
    src += 8;
    for (h = 0; h < height; ++h) {
      const __m128i row_1 = _mm_load_si128((__m128i *)src);
      const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
      const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
      const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      const __m128i packed = _mm_packus_epi16(shifted, shifted);
      _mm_storel_epi64((__m128i *)dst, packed);
      row_0 = row_1;
      src += 8;
      dst += stride;
    }
  }
}

void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                  int xoffset, int yoffset, uint8_t *dst_ptr,
                                  int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[8 * 9]);

  assert((xoffset | yoffset) != 0);

  horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9);

  vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8);
}

void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                  int xoffset, int yoffset, uint8_t *dst_ptr,
                                  int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[8 * 5]);

  assert((xoffset | yoffset) != 0);

  horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5);

  vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4);
}

static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst,
                                  const int xoffset) {
  int h;
  const __m128i zero = _mm_setzero_si128();

  if (xoffset == 0) {
    for (h = 0; h < 5; ++h) {
      const __m128i a = load_unaligned_u32(src);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      _mm_storel_epi64((__m128i *)dst, a_u16);
      src += stride;
      dst += 4;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);

    for (h = 0; h < 5; ++h) {
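      // Two 4-byte loads offset by one pixel: 'a' supplies src[x] and 'b'
      // supplies src[x + 1] for the two bilinear taps.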
      const __m128i a = load_unaligned_u32(src);
      const __m128i b = load_unaligned_u32(src + 1);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
      const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
      const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
      const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      _mm_storel_epi64((__m128i *)dst, shifted);
      src += stride;
      dst += 4;
    }
  }
}

static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride,
                                const int yoffset) {
  int h;

  if (yoffset == 0) {
    for (h = 0; h < 4; h += 2) {
      const __m128i row = _mm_load_si128((__m128i *)src);
      __m128i packed = _mm_packus_epi16(row, row);
      store_unaligned_u32(dst, packed);
      dst += stride;
      packed = _mm_srli_si128(packed, 4);
      store_unaligned_u32(dst, packed);
      dst += stride;
      src += 8;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);

    for (h = 0; h < 4; h += 2) {
      const __m128i row_0 = _mm_load_si128((__m128i *)src);
      const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4));
      const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
      const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
      const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      __m128i packed = _mm_packus_epi16(shifted, shifted);
      storeu_uint32(dst, _mm_cvtsi128_si32(packed));
      packed = _mm_srli_si128(packed, 4);
      dst += stride;
      storeu_uint32(dst, _mm_cvtsi128_si32(packed));
      dst += stride;
      src += 8;
    }
  }
}

void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                  int xoffset, int yoffset, uint8_t *dst_ptr,
                                  int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]);

  assert((xoffset | yoffset) != 0);

  horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset);

  vertical_4x4(FData, dst_ptr, dst_pitch, yoffset);
}