/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
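// Bilinear filter table for the sub-pixel variance kernels below.  Each
// 32-byte entry (two rows of the initializer) repeats one coefficient pair
// (w0, w1) with w0 + w1 == 16, so an entry can be loaded straight into a
// 256-bit register and fed to _mm256_maddubs_epi16.  The entry for a given
// sub-pixel offset is selected by (offset << 5), i.e. offset * 32 bytes.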
/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
};
/* clang-format on */

void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i, src_2strides, ref_2strides;
  __m256i zero_reg = _mm256_set1_epi16(0);
  __m256i sum_ref_src = _mm256_set1_epi16(0);
  __m256i madd_ref_src = _mm256_set1_epi16(0);

  // Process two rows per iteration in one 256-bit register, halving the
  // number of loop iterations compared to the SSE2 code.
  src_2strides = source_stride << 1;
  ref_2strides = recon_stride << 1;
  for (i = 0; i < 8; i++) {
    src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
    src = _mm256_inserti128_si256(
        src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);

    ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
    ref = _mm256_inserti128_si256(
        ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);

    // expand each byte to 16 bits
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // madd low (src - ref)
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

    // add high to low
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

    // madd high (src - ref)
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // add high to low
    madd_ref_src =
        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));

    src_ptr += src_2strides;
    ref_ptr += ref_2strides;
  }

  {
    __m128i sum_res, madd_res;
    __m128i expand_sum_low, expand_sum_high, expand_sum;
    __m128i expand_madd_low, expand_madd_high, expand_madd;
    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // extract the low lane and add it to the high lane
    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
                            _mm256_extractf128_si256(sum_ref_src, 1));

    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
                             _mm256_extractf128_si256(madd_ref_src, 1));

    // widen each 16-bit sum to 32 bits: interleave zeros into the low halves
    expand_sum_low =
        _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
    expand_sum_high =
        _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);

    // then arithmetic-shift right by 16 to sign-extend
    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);

    // expand each 32 bits of the madd result to 64 bits
    expand_madd_low =
        _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
    expand_madd_high =
        _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));

    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low =
        _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
    ex_expand_sum_high =
        _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));

    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes
    madd_res = _mm_srli_si128(expand_madd, 8);
    sum_res = _mm_srli_si128(ex_expand_sum, 8);

    madd_res = _mm_add_epi32(madd_res, expand_madd);
    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);

    *((int *)SSE) = _mm_cvtsi128_si32(madd_res);

    *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
  }
}
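// A caller can combine the SSE/Sum outputs of the helpers above and below
// into the usual vpx-style variance for an N-pixel block:
//   variance = *SSE - ((int64_t)*Sum * *Sum) / N
// (N = 256 for the 16x16 helper, N = 1024 for the 32x32 helper).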
void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i;
  __m256i zero_reg = _mm256_set1_epi16(0);
  __m256i sum_ref_src = _mm256_set1_epi16(0);
  __m256i madd_ref_src = _mm256_set1_epi16(0);

  // process 32 pixels (one full row) per iteration
  for (i = 0; i < 16; i++) {
    src = _mm256_loadu_si256((__m256i const *)(src_ptr));

    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));

    // expand each byte to 16 bits
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // madd low (src - ref)
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

    // add high to low
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

    // madd high (src - ref)
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // add high to low
    madd_ref_src =
        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));

    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }

  {
    __m256i expand_sum_low, expand_sum_high, expand_sum;
    __m256i expand_madd_low, expand_madd_high, expand_madd;
    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // widen each 16-bit sum to 32 bits: interleave zeros into the low halves
    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);

    // then arithmetic-shift right by 16 to sign-extend
    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);

    // expand each 32 bits of the madd result to 64 bits
    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);

    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);

    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes
    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);

    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);

    // extract the low lane and the high lane and add the results
    *((int *)SSE) =
        _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
        _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));

    *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
                    _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
  }
}
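// The macros below form the shared inner loop of the sub-pixel variance
// kernels that follow.  FILTER_SRC applies a bilinear filter with rounding,
// (w0 * a + w1 * b + 8) >> 4.  CALC_SUM_SSE_INSIDE_LOOP accumulates the
// 16-bit difference sums in sum_reg and the 32-bit squared differences in
// sse_reg.  CALC_SUM_AND_SSE performs the final horizontal reduction of both
// accumulators into the scalar outputs `sum` and `*sse`.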
#define FILTER_SRC(filter)                               \
  /* filter the source */                                \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
                                                         \
  /* add 8 to source */                                  \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
                                                         \
  /* divide source by 16 */                              \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

#define MERGE_WITH_SRC(src_reg, reg)               \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST                                    \
  /* load source and destination */                     \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average the current source with the source size_stride bytes ahead */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP                          \
  /* expand each byte to 2 bytes */                       \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
  /* source - dest */                                     \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
  /* calculate sum */                                     \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */                                     \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final horizontal reduction of sum and sse
#define CALC_SUM_AND_SSE                                                   \
  /* sign masks of the 16-bit sums, used to widen them to 32 bits */       \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
                                                                           \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
                                                                           \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
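// Sub-pixel variance over a 32-pixel-wide column of `height` rows.  Both
// kernels below dispatch on the two sub-pixel offsets in the same way: an
// offset of 0 means no filtering in that direction, an offset of 8 is the
// half-pel case handled with _mm256_avg_epu8, and any other offset applies
// the bilinear filter selected from bilinear_filters_avx2.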
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and the source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the filtered 16-bit values back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the filtered 16-bit values back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
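// Same kernel as above, but for the averaging prediction path: the
// (optionally filtered) source is additionally averaged with the second
// predictor `sec` before the sum and SSE are accumulated.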
unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and the source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the filtered 16-bit values back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the filtered 16-bit values back to 8 bits in each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
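// Illustrative use (a sketch, not part of this file): a 64-pixel-wide
// sub-pixel variance can be built from the 32-wide kernel above by running it
// on each 32-column half and combining the partial results, e.g. for a
// hypothetical 64x64 block:
//
//   unsigned int sse1, sse2;
//   const int se1 = vpx_sub_pixel_variance32xh_avx2(
//       src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
//   const int se2 = vpx_sub_pixel_variance32xh_avx2(
//       src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, 64,
//       &sse2);
//   const int se = se1 + se2;
//   const unsigned int sse_total = sse1 + sse2;
//   // vpx-style variance: SSE minus Sum^2 / (64 * 64)
//   const unsigned int var =
//       sse_total - (unsigned int)(((int64_t)se * se) >> 12);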