/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/lpf_common_sse2.h"

static INLINE __m256i dc_sum_64(const uint8_t *ref) {
  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
  const __m256i zero = _mm256_setzero_si256();
  __m256i y0 = _mm256_sad_epu8(x0, zero);
  __m256i y1 = _mm256_sad_epu8(x1, zero);
  y0 = _mm256_add_epi64(y0, y1);
  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
  y0 = _mm256_add_epi64(u0, y0);
  u0 = _mm256_unpackhi_epi64(y0, y0);
  return _mm256_add_epi16(y0, u0);
}

static INLINE __m256i dc_sum_32(const uint8_t *ref) {
  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
  const __m256i zero = _mm256_setzero_si256();
  __m256i y = _mm256_sad_epu8(x, zero);
  __m256i u = _mm256_permute2x128_si256(y, y, 1);
  y = _mm256_add_epi64(u, y);
  u = _mm256_unpackhi_epi64(y, y);
  return _mm256_add_epi16(y, u);
}

static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
                                  ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm256_storeu_si256((__m256i *)dst, *r);
    dst += stride;
  }
}

static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
                                    int height, uint8_t *dst,
                                    ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm256_storeu_si256((__m256i *)dst, *r0);
    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
    dst += stride;
  }
}

static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
                                  ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm256_storeu_si256((__m256i *)dst, *r);
    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
    dst += stride;
  }
}
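// A scalar sketch of what the helpers above compute, for reference only (not
// part of the build): _mm256_sad_epu8 against zero yields per-64-bit-group
// byte sums, which the permute/unpackhi additions then reduce to one total
// held in the low word of each 128-bit lane.
//
//   static INLINE int dc_sum_32_scalar(const uint8_t *ref) {
//     int sum = 0;
//     for (int i = 0; i < 32; ++i) sum += ref[i];
//     return sum;  // matches the low 16 bits of dc_sum_32()'s result
//   }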
static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;

  r0 = _mm_unpacklo_epi16(x[0], x[1]);
  r1 = _mm_unpacklo_epi16(x[2], x[3]);
  r2 = _mm_unpacklo_epi16(x[4], x[5]);
  r3 = _mm_unpacklo_epi16(x[6], x[7]);

  r4 = _mm_unpacklo_epi16(x[8], x[9]);
  r5 = _mm_unpacklo_epi16(x[10], x[11]);
  r6 = _mm_unpacklo_epi16(x[12], x[13]);
  r7 = _mm_unpacklo_epi16(x[14], x[15]);

  r8 = _mm_unpacklo_epi32(r0, r1);
  r9 = _mm_unpackhi_epi32(r0, r1);
  r10 = _mm_unpacklo_epi32(r2, r3);
  r11 = _mm_unpackhi_epi32(r2, r3);

  r12 = _mm_unpacklo_epi32(r4, r5);
  r13 = _mm_unpackhi_epi32(r4, r5);
  r14 = _mm_unpacklo_epi32(r6, r7);
  r15 = _mm_unpackhi_epi32(r6, r7);

  r0 = _mm_unpacklo_epi64(r8, r9);
  r1 = _mm_unpackhi_epi64(r8, r9);
  r2 = _mm_unpacklo_epi64(r10, r11);
  r3 = _mm_unpackhi_epi64(r10, r11);

  r4 = _mm_unpacklo_epi64(r12, r13);
  r5 = _mm_unpackhi_epi64(r12, r13);
  r6 = _mm_unpacklo_epi64(r14, r15);
  r7 = _mm_unpackhi_epi64(r14, r15);

  d[0] = _mm_unpacklo_epi64(r0, r2);
  d[1] = _mm_unpacklo_epi64(r4, r6);
  d[2] = _mm_unpacklo_epi64(r1, r3);
  d[3] = _mm_unpacklo_epi64(r5, r7);

  d[4] = _mm_unpackhi_epi64(r0, r2);
  d[5] = _mm_unpackhi_epi64(r4, r6);
  d[6] = _mm_unpackhi_epi64(r1, r3);
  d[7] = _mm_unpackhi_epi64(r5, r7);
}

static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, ww0, ww1;

  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
  w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
  w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73

  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71

  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71

  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73

  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
}

static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, ww0, ww1;

  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73

  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71

  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71

  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73

  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73

  w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77

  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75

  d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
  d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75

  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77

  d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
  d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
}
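// The transposes above (and the 16x16 variant below) all use the same unpack
// ladder: interleave row pairs at 16-bit, then 32-bit, then 64-bit width. A
// 4x4 illustration with rows a b c d of 16-bit elements:
//
//   _mm_unpacklo_epi16(a, b) -> a0 b0 a1 b1 \  _mm_unpacklo_epi32 of the two
//   _mm_unpacklo_epi16(c, d) -> c0 d0 c1 d1 /  -> a0 b0 c0 d0 a1 b1 c1 d1
//
// so each 64-bit half is now one output column. The AVX2 unpacks operate on
// the two 128-bit lanes independently, which is why the 16x16 variant ends
// with insert/extract fixups to swap the cross-lane halves.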
static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, ww0, ww1;
  __m256i dd[16];
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);
  ww1 = _mm256_unpacklo_epi32(w2, w3);

  dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[1] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);
  ww1 = _mm256_unpackhi_epi32(w2, w3);

  dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[3] = _mm256_unpackhi_epi64(ww0, ww1);

  w0 = _mm256_unpackhi_epi16(x[0], x[1]);
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);
  ww1 = _mm256_unpacklo_epi32(w2, w3);

  dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[5] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);
  ww1 = _mm256_unpackhi_epi32(w2, w3);

  dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[7] = _mm256_unpackhi_epi64(ww0, ww1);

  w0 = _mm256_unpacklo_epi16(x[8], x[9]);
  w1 = _mm256_unpacklo_epi16(x[10], x[11]);
  w2 = _mm256_unpacklo_epi16(x[12], x[13]);
  w3 = _mm256_unpacklo_epi16(x[14], x[15]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);
  ww1 = _mm256_unpacklo_epi32(w2, w3);

  dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[9] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);
  ww1 = _mm256_unpackhi_epi32(w2, w3);

  dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[11] = _mm256_unpackhi_epi64(ww0, ww1);

  w0 = _mm256_unpackhi_epi16(x[8], x[9]);
  w1 = _mm256_unpackhi_epi16(x[10], x[11]);
  w2 = _mm256_unpackhi_epi16(x[12], x[13]);
  w3 = _mm256_unpackhi_epi16(x[14], x[15]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);
  ww1 = _mm256_unpacklo_epi32(w2, w3);

  dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[13] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);
  ww1 = _mm256_unpackhi_epi32(w2, w3);

  dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[15] = _mm256_unpackhi_epi64(ww0, ww1);

  for (int i = 0; i < 8; i++) {
    d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
    d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
                                       _mm256_extracti128_si256(dd[i], 1), 0);
  }
}

void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_32(above);
  __m256i sum_left = dc_sum_32(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
  sum_left = _mm256_srai_epi16(sum_left, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_32(above);
  (void)left;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_32(left);
  (void)above;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 32, dst, stride);
}
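// The arithmetic above is the usual rounded average: 32x32 sums 64 edge
// pixels and takes (sum + 32) >> 6, while the top/left-only variants sum 32
// pixels and take (sum + 16) >> 5. _mm256_shuffle_epi8 with an all-zero
// selector then replicates byte 0 of each 128-bit lane; that works here
// because both lanes of `sum` hold the same 16-bit total and the DC value
// fits in one byte.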
void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  (void)left;
  row_store_32xh(&row, 32, dst, stride);
}

// There are 32 rows together. This function does lines
// 0,1,2,3 and 16,17,18,19. The next call would do
// 4,5,6,7 and 20,21,22,23. So four calls
// finish all 32 rows.
static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
                                        ptrdiff_t stride) {
  __m256i t[4];
  __m256i m = _mm256_setzero_si256();
  const __m256i inc = _mm256_set1_epi8(4);
  int i;

  for (i = 0; i < 4; i++) {
    t[i] = _mm256_shuffle_epi8(*row, m);
    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
    _mm256_storeu_si256((__m256i *)dst, r0);
    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
    dst += stride;
    m = _mm256_add_epi8(m, inc);
  }
}

void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);

  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);

  __m256i v = _mm256_unpacklo_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
  dst += stride << 2;

  v = _mm256_unpackhi_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
  dst += stride << 2;

  u = _mm256_unpackhi_epi8(left_col, left_col);

  v = _mm256_unpacklo_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
  dst += stride << 2;

  v = _mm256_unpackhi_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
}
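// A tracing note for the horizontal path above: each pair of unpack stages
// replicates every left-edge byte four times, so `v` carries eight distinct
// pixels (four per 128-bit lane). The selector `m` (0..3 within each lane,
// bumped by 4 per iteration) broadcasts one pixel across its lane, and the
// two permute2x128 calls expand that into full 32-byte rows for the upper and
// lower block halves, which sit 16 rows apart -- hence the
// dst + (stride << 4) store.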
// -----------------------------------------------------------------------------
// Rectangle

// TODO(luoyi) The following two functions are shared with intrapred_sse2.c.
// Use a header file, intrapred_common_x86.h
static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
  __m128i x = _mm_load_si128((__m128i const *)ref);
  const __m128i zero = _mm_setzero_si128();
  x = _mm_sad_epu8(x, zero);
  const __m128i high = _mm_unpackhi_epi64(x, x);
  return _mm_add_epi16(x, high);
}

static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
  __m128i x0 = _mm_load_si128((__m128i const *)ref);
  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
  const __m128i zero = _mm_setzero_si128();
  x0 = _mm_sad_epu8(x0, zero);
  x1 = _mm_sad_epu8(x1, zero);
  x0 = _mm_add_epi16(x0, x1);
  const __m128i high = _mm_unpackhi_epi64(x0, x0);
  return _mm_add_epi16(x0, high);
}

void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m128i top_sum = dc_sum_32_sse2(above);
  __m128i left_sum = dc_sum_16_sse2(left);
  left_sum = _mm_add_epi16(top_sum, left_sum);
  uint16_t sum = _mm_cvtsi128_si32(left_sum);
  sum += 24;
  sum /= 48;
  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_32(above);
  __m256i sum_left = dc_sum_64(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 48;
  sum /= 96;
  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_64(above);
  __m256i sum_left = dc_sum_64(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 64;
  sum /= 128;
  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_64(above);
  __m256i sum_left = dc_sum_32(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 48;
  sum /= 96;
  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
  row_store_64xh(&row, 32, dst, stride);
}

void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_64(above);
  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 40;
  sum /= 80;
  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
  row_store_64xh(&row, 16, dst, stride);
}
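// For the rectangular DC blocks above, width + height is not a power of two,
// so the rounded average is finished in scalar code: 32x16 has 48 edge pixels
// and computes (sum + 24) / 48, 64x16 has 80 and computes (sum + 40) / 80,
// and so on. The bias is always half the divisor, giving round-to-nearest.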
void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_32(above);
  (void)left;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_32(above);
  (void)left;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_64(above);
  (void)left;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_64(above);
  (void)left;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 32, dst, stride);
}

void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_64(above);
  (void)left;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 16, dst, stride);
}

void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i sum = dc_sum_16_sse2(left);
  (void)above;

  const __m128i eight = _mm_set1_epi16(8);
  sum = _mm_add_epi16(sum, eight);
  sum = _mm_srai_epi16(sum, 4);
  const __m128i zero = _mm_setzero_si128();
  const __m128i r = _mm_shuffle_epi8(sum, zero);
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_64(left);
  (void)above;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_64(left);
  (void)above;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 64, dst, stride);
}
void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_32(left);
  (void)above;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 32, dst, stride);
}

void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i sum = dc_sum_16_sse2(left);
  (void)above;

  const __m128i eight = _mm_set1_epi16(8);
  sum = _mm_add_epi16(sum, eight);
  sum = _mm_srai_epi16(sum, 4);
  const __m128i zero = _mm_setzero_si128();
  const __m128i r = _mm_shuffle_epi8(sum, zero);
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
  row_store_64xh(&row, 16, dst, stride);
}

void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
  row_store_64xh(&row, 32, dst, stride);
}

void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
  row_store_64xh(&row, 16, dst, stride);
}

void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  (void)left;
  row_store_32xh(&row, 16, dst, stride);
}

void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  (void)left;
  row_store_32xh(&row, 64, dst, stride);
}

void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  (void)left;
  row_store_32x2xh(&row0, &row1, 64, dst, stride);
}

void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  (void)left;
  row_store_32x2xh(&row0, &row1, 32, dst, stride);
}
void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  (void)left;
  row_store_32x2xh(&row0, &row1, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 16 16-bit pixels in one row (__m256i)
static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
                                 const __m256i *topleft) {
  const __m256i base =
      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);

  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));

  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);

  pl = _mm256_andnot_si256(mask1, *left);

  ptl = _mm256_and_si256(mask2, *topleft);
  pt = _mm256_andnot_si256(mask2, *top);
  pt = _mm256_or_si256(pt, ptl);
  pt = _mm256_and_si256(mask1, pt);

  return _mm256_or_si256(pt, pl);
}
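// A scalar sketch of the branchless selection above (reference only, not
// compiled): Paeth prediction picks whichever neighbor is closest to
// base = left + top - topleft.
//
//   static INLINE int paeth_scalar(int l, int t, int tl) {
//     const int base = l + t - tl;
//     const int pl = abs(base - l), pt = abs(base - t), ptl = abs(base - tl);
//     if (pl <= pt && pl <= ptl) return l;
//     return (pt <= ptl) ? t : tl;
//   }
//
// mask1 is true where left loses to top or top-left; mask2 then arbitrates
// between top and top-left. The callers below walk the left column with
// `rep`, a shuffle selector starting at 0x8000 per 16-bit element: the 0x80
// byte produces zero and the low byte selects left[i], so
// _mm256_shuffle_epi8(l, rep) yields the zero-extended pixel for row i
// broadcast across the row, and adding 1 per row advances to the next pixel.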
// Return 16 8-bit pixels in one row (__m128i)
static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
                                      const __m256i *topleft) {
  const __m256i p0 = paeth_pred(left, top, topleft);
  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i p = _mm256_packus_epi16(p0, p1);
  return _mm256_castsi256_si128(p);
}

static INLINE __m256i get_top_vector(const uint8_t *above) {
  const __m128i x = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
}

void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i x = _mm_loadl_epi64((const __m128i *)left);
  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16(0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

static INLINE __m256i get_left_vector(const uint8_t *left) {
  const __m128i x = _mm_load_si128((const __m128i *)left);
  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
}

void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i l = get_left_vector(left);
  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16(0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m256i l = get_left_vector(left);
  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16(0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }

  l = get_left_vector(left + 16);
  rep = _mm256_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  for (int j = 0; j < 4; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16(0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}

// Return 32 8-bit pixels in one row (__m256i)
static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
                                      const __m256i *top1,
                                      const __m256i *topleft) {
  __m256i p0 = paeth_pred(left, top0, topleft);
  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i x0 = _mm256_packus_epi16(p0, p1);

  p0 = paeth_pred(left, top1, topleft);
  p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i x1 = _mm256_packus_epi16(p0, p1);

  return _mm256_permute2x128_si256(x0, x1, 0x20);
}

void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i l = get_left_vector(left);
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16(0x8000);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);

    _mm256_storeu_si256((__m256i *)dst, r);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}
void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m256i l = get_left_vector(left);
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16(0x8000);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }

  l = get_left_vector(left + 16);
  rep = _mm256_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);

      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);

      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i t2 = get_top_vector(above + 32);
  const __m256i t3 = get_top_vector(above + 48);
  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);

      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);

      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}
void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i t2 = get_top_vector(above + 32);
  const __m256i t3 = get_top_vector(above + 48);
  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);

      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);

      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i t2 = get_top_vector(above + 32);
  const __m256i t3 = get_top_vector(above + 48);
  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  const __m256i l = get_left_vector(left);
  __m256i rep = _mm256_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6)
#define PERM2x128(c0, c1) c0 + (c1 << 4)
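// All the zone-1 (z1) helpers below evaluate the same 6-bit subpel blend.
// With s = (x & 0x3f) >> 1 in [0, 32), each output pixel is
//   (above[base] * 32 + 16 + (above[base + 1] - above[base]) * s) >> 5
// which equals the rounded blend
//   (above[base] * (32 - s) + above[base + 1] * s + 16) >> 5.
// A scalar sketch of one row, with illustrative names (W, s, out):
//
//   for (int c = 0; c < W; ++c) {
//     const int b = base + c;
//     out[c] = (b < max_base_x)
//                  ? (above[b] * 32 + 16 + (above[b + 1] - above[b]) * s) >> 5
//                  : above[max_base_x];
//   }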
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((N + 4) - 1) << upsample_above;
  int x;
  // assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a32, a16;
  __m256i diff;
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
  max_base_x128 = _mm_set1_epi32(max_base_x);

  x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i res1;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }

    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));

    if (upsample_above) {
      a0 = _mm256_permutevar8x32_epi32(
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
      base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
              _mm256_set1_epi32(0x3f)),
          1);
    } else {
      base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    }

    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi32(diff, shift);
    res = _mm256_add_epi32(a32, b);
    res = _mm256_srli_epi32(res, 5);

    res1 = _mm256_castsi256_si128(res);
    res1 = _mm_packus_epi32(res1, res1);

    mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
    mask128 = _mm_packs_epi32(mask128, mask128);  // narrow to 16 bit
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
    x += dx;
  }
}

static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
                                             ptrdiff_t stride,
                                             const uint16_t *above,
                                             int upsample_above, int dx) {
  __m128i dstvec[16];

  highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
                                            dx);
  for (int i = 0; i < N; i++) {
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
  }
}
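// When the edge is upsampled (upsample_above == 1), even and odd positions of
// `above` hold interleaved half-sample pairs. The function above therefore
// deinterleaves with _mm256_permutevar8x32_epi32 so a0 keeps the even samples
// and a1 the odd ones, steps `base` by 2 per column, and forms the fractional
// shift from x << upsample_above so it still has 6 fraction bits.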
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((8 + N) - 1) << upsample_above;

  int x;
  // assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a0_1, a1_1, a32, a16;
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
  max_base_x256 = _mm256_set1_epi32(max_base_x);

  x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, res1, shift;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
      }
      return;
    }

    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));

    if (upsample_above) {
      a0 = _mm256_permutevar8x32_epi32(
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));

      a0_1 =
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
      a0_1 = _mm256_permutevar8x32_epi32(
          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));

      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
      base_inc256 =
          _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
                            base + 10, base + 12, base + 14);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
              _mm256_set1_epi32(0x3f)),
          1);
    } else {
      base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
                                      base + 4, base + 5, base + 6, base + 7);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    }

    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi32(diff, shift);
    res = _mm256_add_epi32(a32, b);
    res = _mm256_srli_epi32(res, 5);

    res1 = _mm256_packus_epi32(
        res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));

    mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
    mask256 = _mm256_packs_epi32(
        mask256,
        _mm256_castsi128_si256(
            _mm256_extracti128_si256(mask256, 1)));  // narrow to 16 bit
    res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
    dst[r] = _mm256_castsi256_si128(res1);
    x += dx;
  }
}

static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
                                             ptrdiff_t stride,
                                             const uint16_t *above,
                                             int upsample_above, int dx) {
  __m128i dstvec[32];

  highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
                                            dx);
  for (int i = 0; i < N; i++) {
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above,
    int dx) {
  int x;
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((16 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a0_1, a1, a1_1, a32, a16;
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
  max_base_x256 = _mm256_set1_epi16(max_base_x);

  x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res[2], res1;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        dstvec[i] = a_mbase_x;  // save 16 values
      }
      return;
    }
    __m256i shift = _mm256_srli_epi32(
        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);

    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));

    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
    b = _mm256_mullo_epi32(diff, shift);

    res[0] = _mm256_add_epi32(a32, b);
    res[0] = _mm256_srli_epi32(res[0], 5);
    res[0] = _mm256_packus_epi32(
        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));

    int mdif = max_base_x - base;
    if (mdif > 8) {
      a0_1 =
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
      a1_1 =
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));

      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
      b = _mm256_mullo_epi32(diff, shift);

      res[1] = _mm256_add_epi32(a32, b);
      res[1] = _mm256_srli_epi32(res[1], 5);
      res[1] = _mm256_packus_epi32(
          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
    } else {
      res[1] = a_mbase_x;
    }
    res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
                                   1);  // 16 16-bit values

    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
                                    base + 4, base + 5, base + 6, base + 7,
                                    base + 8, base + 9, base + 10, base + 11,
                                    base + 12, base + 13, base + 14, base + 15);
    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
    x += dx;
  }
}
static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
                                              ptrdiff_t stride,
                                              const uint16_t *above,
                                              int upsample_above, int dx) {
  __m256i dstvec[64];
  highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
                                             dx);
  for (int i = 0; i < N; i++) {
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
  }
}
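// The 32-wide internal helper below returns each row in two 16-pixel halves:
// dstvec[r] holds columns 0-15 of row r and dstvec[r + N] holds columns
// 16-31. That is why its caller stores dstvec[i] and dstvec[i + N] side by
// side, and why the early-out fills both slots with the replicated
// above[max_base_x] pixel.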
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
    int N, __m256i *dstvec, const uint16_t *above, int upsample_above,
    int dx) {
  int x;
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((32 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a0_1, a1, a1_1, a32, a16;
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
  max_base_x256 = _mm256_set1_epi16(max_base_x);

  x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res[2], res1;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        dstvec[i] = a_mbase_x;  // save 32 values
        dstvec[i + N] = a_mbase_x;
      }
      return;
    }

    __m256i shift = _mm256_srli_epi32(
        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);

    for (int j = 0; j < 32; j += 16) {
      int mdif = max_base_x - (base + j);
      if (mdif <= 0) {
        res1 = a_mbase_x;
      } else {
        a0 = _mm256_cvtepu16_epi32(
            _mm_loadu_si128((__m128i *)(above + base + j)));
        a1 = _mm256_cvtepu16_epi32(
            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));

        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
        b = _mm256_mullo_epi32(diff, shift);

        res[0] = _mm256_add_epi32(a32, b);
        res[0] = _mm256_srli_epi32(res[0], 5);
        res[0] = _mm256_packus_epi32(
            res[0],
            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
        if (mdif > 8) {
          a0_1 = _mm256_cvtepu16_epi32(
              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
          a1_1 = _mm256_cvtepu16_epi32(
              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));

          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
          b = _mm256_mullo_epi32(diff, shift);

          res[1] = _mm256_add_epi32(a32, b);
          res[1] = _mm256_srli_epi32(res[1], 5);
          res[1] = _mm256_packus_epi32(
              res[1],
              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
        } else {
          res[1] = a_mbase_x;
        }
        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
                                       1);  // 16 16-bit values
        base_inc256 = _mm256_setr_epi16(
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
            base + j + 13, base + j + 14, base + j + 15);

        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
      }
      if (!j)
        dstvec[r] = res1;
      else
        dstvec[r + N] = res1;
    }
    x += dx;
  }
}

static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
                                              ptrdiff_t stride,
                                              const uint16_t *above,
                                              int upsample_above, int dx) {
  __m256i dstvec[128];

  highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
                                             dx);
  for (int i = 0; i < N; i++) {
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
    _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
  }
}
static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
                                              ptrdiff_t stride,
                                              const uint16_t *above,
                                              int upsample_above, int dx) {
  int x;

  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((64 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a0_1, a1, a1_1, a32, a16;
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
  max_base_x256 = _mm256_set1_epi16(max_base_x);

  x = dx;
  for (int r = 0; r < N; r++, dst += stride) {
    __m256i b, res[2], res1;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
        dst += stride;
      }
      return;
    }

    __m256i shift = _mm256_srli_epi32(
        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);

    __m128i a0_128, a0_1_128, a1_128, a1_1_128;
    for (int j = 0; j < 64; j += 16) {
      int mdif = max_base_x - (base + j);
      if (mdif <= 0) {
        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
      } else {
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
        a0 = _mm256_cvtepu16_epi32(a0_128);
        a1 = _mm256_cvtepu16_epi32(a1_128);

        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
        b = _mm256_mullo_epi32(diff, shift);

        res[0] = _mm256_add_epi32(a32, b);
        res[0] = _mm256_srli_epi32(res[0], 5);
        res[0] = _mm256_packus_epi32(
            res[0],
            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
        if (mdif > 8) {
          a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
          a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
          a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
          a1_1 = _mm256_cvtepu16_epi32(a1_1_128);

          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
          b = _mm256_mullo_epi32(diff, shift);

          res[1] = _mm256_add_epi32(a32, b);
          res[1] = _mm256_srli_epi32(res[1], 5);
          res[1] = _mm256_packus_epi32(
              res[1],
              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
        } else {
          res[1] = a_mbase_x;
        }
        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
                                       1);  // 16 16-bit values
        base_inc256 = _mm256_setr_epi16(
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
            base + j + 13, base + j + 14, base + j + 15);

        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
        _mm256_storeu_si256((__m256i *)(dst + j), res1);
      }
    }
    x += dx;
  }
}

// Directional prediction, zone 1: 0 < angle < 90
void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_above,
                                      int dx, int dy, int bd) {
  (void)left;
  (void)dy;
  (void)bd;

  switch (bw) {
    case 4:
      highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
                                       dx);
      break;
    case 8:
      highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
                                       dx);
      break;
    case 16:
      highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
                                        dx);
      break;
    case 32:
      highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
                                        dx);
      break;
    case 64:
      highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above,
                                        dx);
      break;
    default: break;
  }
  return;
}
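// Usage note on the dispatcher above (summarizing the code, not new
// behavior): bw picks one of five fixed-width kernels and bh is passed
// through as N, so every block shape shares the same row loop; dx is the
// per-row horizontal step with 6 fraction bits (1/64-pel units), while dy and
// bd are unused in zone 1. The 8x8 transpose below moves a tile of 16-bit
// pixels between buffers with independent pitches, using the same unpack
// ladder as the transposes earlier in the file.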
static void highbd_transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc,
                                    uint16_t *dst, ptrdiff_t pitchDst) {
  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r0_Lo, r1_Lo, r2_Lo, r3_Lo, r4_Lo,
      r5_Lo, r6_Lo;
  r0 = _mm_load_si128(
      (__m128i *)(src + 0 * pitchSrc));  // 07,06,05,04,03,02,01,00
  r1 = _mm_load_si128(
      (__m128i *)(src + 1 * pitchSrc));  // 17,16,15,14,13,12,11,10
  r2 = _mm_load_si128(
      (__m128i *)(src + 2 * pitchSrc));  // 27,26,25,24,23,22,21,20
  r3 = _mm_load_si128(
      (__m128i *)(src + 3 * pitchSrc));  // 37,36,35,34,33,32,31,30
  r4 = _mm_load_si128(
      (__m128i *)(src + 4 * pitchSrc));  // 47,46,45,44,43,42,41,40
  r5 = _mm_load_si128(
      (__m128i *)(src + 5 * pitchSrc));  // 57,56,55,54,53,52,51,50
  r6 = _mm_load_si128(
      (__m128i *)(src + 6 * pitchSrc));  // 67,66,65,64,63,62,61,60
  r7 = _mm_load_si128(
      (__m128i *)(src + 7 * pitchSrc));  // 77,76,75,74,73,72,71,70

  r0_Lo = _mm_unpacklo_epi16(r0, r1);
  r2_Lo = _mm_unpacklo_epi16(r2, r3);
  r4_Lo = _mm_unpacklo_epi16(r4, r5);
  r6_Lo = _mm_unpacklo_epi16(r6, r7);

  r1_Lo = r0_Lo;
  r0_Lo = _mm_unpacklo_epi32(r0_Lo, r2_Lo);
  r1_Lo = _mm_unpackhi_epi32(r1_Lo, r2_Lo);
  r5_Lo = r4_Lo;
  r4_Lo = _mm_unpacklo_epi32(r4_Lo, r6_Lo);
  r5_Lo = _mm_unpackhi_epi32(r5_Lo, r6_Lo);
  r2_Lo = r0_Lo;
  r0_Lo = _mm_unpacklo_epi64(r0_Lo, r4_Lo);
  r2_Lo = _mm_unpackhi_epi64(r2_Lo, r4_Lo);
  r3_Lo = r1_Lo;
  r1_Lo = _mm_unpacklo_epi64(r1_Lo, r5_Lo);
  r3_Lo = _mm_unpackhi_epi64(r3_Lo, r5_Lo);

  _mm_storeu_si128((__m128i *)(dst + 0 * pitchDst), r0_Lo);
  _mm_storeu_si128((__m128i *)(dst + 1 * pitchDst), r2_Lo);
  _mm_storeu_si128((__m128i *)(dst + 2 * pitchDst), r1_Lo);
  _mm_storeu_si128((__m128i *)(dst + 3 * pitchDst), r3_Lo);

  r0 = _mm_unpackhi_epi16(r0, r1);
  r2 = _mm_unpackhi_epi16(r2, r3);
  r4 = _mm_unpackhi_epi16(r4, r5);
  r6 = _mm_unpackhi_epi16(r6, r7);

  r1 = r0;
  r0 = _mm_unpacklo_epi32(r0, r2);
  r1 = _mm_unpackhi_epi32(r1, r2);
  r5 = r4;
  r4 = _mm_unpacklo_epi32(r4, r6);
  r5 = _mm_unpackhi_epi32(r5, r6);
  r2 = r0;
  r0 = _mm_unpacklo_epi64(r0, r4);
  r2 = _mm_unpackhi_epi64(r2, r4);
  r3 = r1;
  r1 = _mm_unpacklo_epi64(r1, r5);
  r3 = _mm_unpackhi_epi64(r3, r5);

  _mm_storeu_si128((__m128i *)(dst + 4 * pitchDst), r0);
  _mm_storeu_si128((__m128i *)(dst + 5 * pitchDst), r2);
  _mm_storeu_si128((__m128i *)(dst + 6 * pitchDst), r1);
  _mm_storeu_si128((__m128i *)(dst + 7 * pitchDst), r3);
}

static uint8_t HighbdLoadMaskx[8][16] = {
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
};

static uint8_t HighbdEvenOddMaskx4[8][16] = {
  { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14,
    15 },  // 0=0,1, 1=2,3, 2=4,5, 3=6,7, 4=8,9, 5=10,11, 6=12,13, 7=14,15,
           // >7=0,1
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
  { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 0, 1, 0, 1 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 0, 1 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 0, 1 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15 }
};

static uint16_t HighbdEvenOddMaskx8_2[8][16] = {
  { 0, 2, 4, 6, 8, 10, 12, 14 },      { 2, 2, 4, 6, 8, 10, 12, 14 },
  { 4, 4, 4, 6, 8, 10, 12, 14 },      { 6, 6, 6, 6, 8, 10, 12, 14 },
  { 8, 8, 8, 8, 8, 10, 12, 14 },      { 10, 10, 10, 10, 10, 10, 12, 14 },
  { 12, 12, 12, 12, 12, 12, 12, 14 }, { 14, 14, 14, 14, 14, 14, 14, 14 },
};
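// The tables above and below drive the zone-2 (z2) helpers that follow.
// HighbdLoadMaskx[s] is a byte shuffle that shifts a row of 16-bit pixels
// right by s positions, duplicating pixel 0 into the vacated slots; the
// even/odd masks do the equivalent for 2x-upsampled edges, where even and odd
// samples must be separated first. HighbdBaseMask[n] selects the first n
// 16-bit lanes, and is used to blend the left-derived pixels (resy) over the
// above-derived ones (resx) at the start of a row.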
0xffff, 0, 0, 0, 0 }, 1652 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 1653 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 }, 1654 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 1655 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 }, 1656 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 1657 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 }, 1658 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 1659 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff } 1660 }; 1661 1662 static void highbd_dr_prediction_z2_Nx4_avx2( 1663 int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, 1664 const uint16_t *left, int upsample_above, int upsample_left, int dx, 1665 int dy) { 1666 const int min_base_x = -(1 << upsample_above); 1667 const int min_base_y = -(1 << upsample_left); 1668 const int frac_bits_x = 6 - upsample_above; 1669 const int frac_bits_y = 6 - upsample_left; 1670 1671 // a assert(dx > 0); 1672 // pre-filter above pixels 1673 // store in temp buffers: 1674 // above[x] * 32 + 16 1675 // above[x+1] - above[x] 1676 // final pixels will be caluculated as: 1677 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1678 __m256i a0_x, a1_x, a32, a16; 1679 __m256i diff; 1680 __m128i c3f, min_base_y128; 1681 1682 a16 = _mm256_set1_epi32(16); 1683 c3f = _mm_set1_epi32(0x3f); 1684 min_base_y128 = _mm_set1_epi32(min_base_y); 1685 1686 for (int r = 0; r < N; r++) { 1687 __m256i b, res, shift; 1688 __m128i resx, resy, resxy; 1689 __m128i a0_x128, a1_x128; 1690 int y = r + 1; 1691 int base_x = (-y * dx) >> frac_bits_x; 1692 int base_shift = 0; 1693 if (base_x < (min_base_x - 1)) { 1694 base_shift = (min_base_x - base_x - 1) >> upsample_above; 1695 } 1696 int base_min_diff = 1697 (min_base_x - base_x + upsample_above) >> upsample_above; 1698 if (base_min_diff > 4) { 1699 base_min_diff = 4; 1700 } else { 1701 if (base_min_diff < 0) base_min_diff = 0; 1702 } 1703 1704 if (base_shift > 3) { 1705 a0_x = _mm256_setzero_si256(); 1706 a1_x = _mm256_setzero_si256(); 1707 shift = _mm256_setzero_si256(); 1708 } else { 1709 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); 1710 if (upsample_above) { 1711 a0_x128 = _mm_shuffle_epi8(a0_x128, 1712 *(__m128i *)HighbdEvenOddMaskx4[base_shift]); 1713 a1_x128 = _mm_srli_si128(a0_x128, 8); 1714 1715 shift = _mm256_castsi128_si256(_mm_srli_epi32( 1716 _mm_and_si128( 1717 _mm_slli_epi32( 1718 _mm_setr_epi32(-y * dx, (1 << 6) - y * dx, 1719 (2 << 6) - y * dx, (3 << 6) - y * dx), 1720 upsample_above), 1721 c3f), 1722 1)); 1723 } else { 1724 a0_x128 = 1725 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); 1726 a1_x128 = _mm_srli_si128(a0_x128, 2); 1727 1728 shift = _mm256_castsi128_si256(_mm_srli_epi32( 1729 _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx, 1730 (2 << 6) - y * dx, (3 << 6) - y * dx), 1731 c3f), 1732 1)); 1733 } 1734 a0_x = _mm256_cvtepu16_epi32(a0_x128); 1735 a1_x = _mm256_cvtepu16_epi32(a1_x128); 1736 } 1737 // y calc 1738 __m128i a0_y, a1_y, shifty; 1739 if (base_x < min_base_x) { 1740 __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; 1741 DECLARE_ALIGNED(32, int, base_y_c[4]); 1742 r6 = _mm_set1_epi32(r << 6); 1743 dy128 = _mm_set1_epi32(dy); 1744 c1234 = _mm_setr_epi32(1, 2, 3, 4); 1745 y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128)); 1746 base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y); 1747 mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128); 1748 base_y_c128 = 
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);

      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]]);
      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);

      if (upsample_left) {
        shifty = _mm_srli_epi32(
            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
      }
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
      shift = _mm256_inserti128_si256(shift, shifty, 1);
    }

    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16

    b = _mm256_mullo_epi32(diff, shift);
    res = _mm256_add_epi32(a32, b);
    res = _mm256_srli_epi32(res, 5);

    resx = _mm256_castsi256_si128(res);
    resx = _mm_packus_epi32(resx, resx);

    resy = _mm256_extracti128_si256(res, 1);
    resy = _mm_packus_epi32(resy, resy);

    resxy =
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
    _mm_storel_epi64((__m128i *)(dst), resxy);
    dst += stride;
  }
}

static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
    int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
  __m256i diff;
  __m128i a0_x128, a1_x128;

  a16 = _mm256_set1_epi32(16);
  c3f = _mm256_set1_epi32(0x3f);
  min_base_y256 = _mm256_set1_epi32(min_base_y);

  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i resx, resy, resxy;
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 8) {
      base_min_diff = 8;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 7) {
      resx = _mm_setzero_si128();
    } else {
      if (upsample_above) {
        a0_x128 = _mm_setr_epi16(
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]);
        a1_x128 = _mm_setr_epi16(
            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]],
            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]],
above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]], 1845 above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]], 1846 above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]], 1847 above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]], 1848 above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]], 1849 above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]); 1850 shift = _mm256_srli_epi32( 1851 _mm256_and_si256( 1852 _mm256_slli_epi32( 1853 _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, 1854 (2 << 6) - y * dx, (3 << 6) - y * dx, 1855 (4 << 6) - y * dx, (5 << 6) - y * dx, 1856 (6 << 6) - y * dx, (7 << 6) - y * dx), 1857 upsample_above), 1858 c3f), 1859 1); 1860 } else { 1861 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); 1862 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); 1863 a0_x128 = 1864 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); 1865 a1_x128 = 1866 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); 1867 1868 shift = _mm256_srli_epi32( 1869 _mm256_and_si256( 1870 _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, 1871 (3 << 6) - y * dx, (4 << 6) - y * dx, 1872 (5 << 6) - y * dx, (6 << 6) - y * dx, 1873 (7 << 6) - y * dx), 1874 c3f), 1875 1); 1876 } 1877 1878 a0_x = _mm256_cvtepu16_epi32(a0_x128); 1879 a1_x = _mm256_cvtepu16_epi32(a1_x128); 1880 1881 diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] 1882 a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 1883 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 1884 1885 b = _mm256_mullo_epi32(diff, shift); 1886 res = _mm256_add_epi32(a32, b); 1887 res = _mm256_srli_epi32(res, 5); 1888 1889 resx = _mm256_castsi256_si128(_mm256_packus_epi32( 1890 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); 1891 } 1892 // y calc 1893 if (base_x < min_base_x) { 1894 DECLARE_ALIGNED(32, int, base_y_c[8]); 1895 __m256i r6, c256, dy256, y_c256, base_y_c256, mask256; 1896 r6 = _mm256_set1_epi32(r << 6); 1897 dy256 = _mm256_set1_epi32(dy); 1898 c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); 1899 y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); 1900 base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); 1901 mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); 1902 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); 1903 _mm256_store_si256((__m256i *)base_y_c, base_y_c256); 1904 1905 a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( 1906 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], 1907 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], 1908 left[base_y_c[6]], left[base_y_c[7]])); 1909 a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( 1910 left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], 1911 left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], 1912 left[base_y_c[6] + 1], left[base_y_c[7] + 1])); 1913 1914 if (upsample_left) { 1915 shift = _mm256_srli_epi32( 1916 _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f), 1917 1); 1918 } else { 1919 shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); 1920 } 1921 diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] 1922 a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 1923 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 1924 1925 b = _mm256_mullo_epi32(diff, shift); 1926 res = _mm256_add_epi32(a32, b); 1927 res = _mm256_srli_epi32(res, 5); 1928 1929 resy = _mm256_castsi256_si128(_mm256_packus_epi32( 1930 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); 1931 } else { 1932 resy 
= resx;
    }
    resxy =
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
    _mm_storeu_si128((__m128i *)(dst), resxy);
    dst += stride;
  }
}

static void highbd_dr_prediction_z2_Nx8_avx2(
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
    int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i c3f, min_base_y128;
  __m256i a0_x, a1_x, diff, a32, a16;
  __m128i a0_x128, a1_x128;

  a16 = _mm256_set1_epi16(16);
  c3f = _mm_set1_epi16(0x3f);
  min_base_y128 = _mm_set1_epi16(min_base_y);

  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i resx, resy, resxy;
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 8) {
      base_min_diff = 8;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 7) {
      a0_x = _mm256_setzero_si256();
      a1_x = _mm256_setzero_si256();
      shift = _mm256_setzero_si256();
    } else {
      if (upsample_above) {
        a0_x128 = _mm_setr_epi16(
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]],
            above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]);
        a1_x128 = _mm_setr_epi16(
            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]],
            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]],
            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]],
            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]],
            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]],
            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]],
            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]],
            above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]);
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(
                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
                    upsample_above),
                c3f),
            1));
      } else {
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
        a1_x128 =
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);

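        // Per-lane fractional position: ((c << 6) - y * dx) & 0x3f, halved
        // below to give the 5-bit weight used by the two-tap interpolation.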
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
                          c3f),
            1));
      }
      a0_x = _mm256_castsi128_si256(a0_x128);
      a1_x = _mm256_castsi128_si256(a1_x128);
    }

    // y calc
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
      r6 = _mm_set1_epi16(r << 6);
      dy128 = _mm_set1_epi16(dy);
      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);

      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);
      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);

      if (upsample_left) {
        shifty = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
      }
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
      shift = _mm256_inserti128_si256(shift, shifty, 1);
    }

    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

    b = _mm256_mullo_epi16(diff, shift);
    res = _mm256_add_epi16(a32, b);
    res = _mm256_srli_epi16(res, 5);

    resx = _mm256_castsi256_si128(res);
    resy = _mm256_extracti128_si256(res, 1);

    resxy =
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
    _mm_storeu_si128((__m128i *)(dst), resxy);
    dst += stride;
  }
}

static void highbd_dr_prediction_32bit_z2_HxW_avx2(
    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
    int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;
  const int min_base_y = -1;
  (void)upsample_above;
  (void)upsample_left;
  const int frac_bits_x = 6;
  const int frac_bits_y = 6;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16;
  __m256i diff, min_base_y256, c3f;
  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;

  a16 = _mm256_set1_epi32(16);
  min_base_y256 = _mm256_set1_epi16(min_base_y);
  c3f = _mm256_set1_epi32(0x3f);

  for (int r = 0; r < H; r++) {
    __m256i b, res, shift;
    __m256i resx[2], resy[2];
    __m256i resxy;
    for (int j = 0; j < W; j += 16) {
      int y = r + 1;
      int base_x = (-y * dx) >> frac_bits_x;
      int base_shift = 0;
      if ((base_x + j) <
(min_base_x - 1)) { 2123 base_shift = (min_base_x - (base_x + j) - 1); 2124 } 2125 int base_min_diff = (min_base_x - base_x - j); 2126 if (base_min_diff > 16) { 2127 base_min_diff = 16; 2128 } else { 2129 if (base_min_diff < 0) base_min_diff = 0; 2130 } 2131 2132 if (base_shift > 7) { 2133 resx[0] = _mm256_setzero_si256(); 2134 } else { 2135 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); 2136 a1_x128 = 2137 _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); 2138 a0_x128 = 2139 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); 2140 a1_x128 = 2141 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); 2142 2143 a0_x = _mm256_cvtepu16_epi32(a0_x128); 2144 a1_x = _mm256_cvtepu16_epi32(a1_x128); 2145 2146 shift = _mm256_srli_epi32( 2147 _mm256_and_si256( 2148 _mm256_setr_epi32( 2149 ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx, 2150 ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx, 2151 ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx, 2152 ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx), 2153 c3f), 2154 1); 2155 2156 diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] 2157 a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 2158 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 2159 2160 b = _mm256_mullo_epi32(diff, shift); 2161 res = _mm256_add_epi32(a32, b); 2162 res = _mm256_srli_epi32(res, 5); 2163 2164 resx[0] = _mm256_packus_epi32( 2165 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); 2166 } 2167 int base_shift8 = 0; 2168 if ((base_x + j + 8) < (min_base_x - 1)) { 2169 base_shift8 = (min_base_x - (base_x + j + 8) - 1); 2170 } 2171 if (base_shift8 > 7) { 2172 resx[1] = _mm256_setzero_si256(); 2173 } else { 2174 a0_1_x128 = 2175 _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8 + j)); 2176 a1_1_x128 = 2177 _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9 + j)); 2178 a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, 2179 *(__m128i *)HighbdLoadMaskx[base_shift8]); 2180 a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, 2181 *(__m128i *)HighbdLoadMaskx[base_shift8]); 2182 2183 a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128); 2184 a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128); 2185 2186 shift = _mm256_srli_epi32( 2187 _mm256_and_si256( 2188 _mm256_setr_epi32( 2189 ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx, 2190 ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx, 2191 ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx, 2192 ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx), 2193 c3f), 2194 1); 2195 2196 diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x] 2197 a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32 2198 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 2199 b = _mm256_mullo_epi32(diff, shift); 2200 2201 resx[1] = _mm256_add_epi32(a32, b); 2202 resx[1] = _mm256_srli_epi32(resx[1], 5); 2203 resx[1] = _mm256_packus_epi32( 2204 resx[1], 2205 _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1))); 2206 } 2207 resx[0] = 2208 _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]), 2209 1); // 16 16bit values 2210 2211 // y calc 2212 if ((base_x < min_base_x)) { 2213 DECLARE_ALIGNED(32, int, base_y_c[16]); 2214 __m256i r6, c256, dy256, y_c256, y_c_1_256, base_y_c256, mask256; 2215 r6 = _mm256_set1_epi32(r << 6); 2216 dy256 = _mm256_set1_epi32(dy); 2217 c256 = _mm256_setr_epi32(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j, 2218 7 + j, 8 + j); 2219 y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); 2220 base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); 2221 mask256 
            = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
        c256 = _mm256_setr_epi32(9 + j, 10 + j, 11 + j, 12 + j, 13 + j, 14 + j,
                                 15 + j, 16 + j);
        y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
        base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
        _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);

        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
            left[base_y_c[6]], left[base_y_c[7]]));
        a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
            left[base_y_c[6] + 1], left[base_y_c[7] + 1]));

        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);

        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16

        b = _mm256_mullo_epi32(diff, shift);
        res = _mm256_add_epi32(a32, b);
        res = _mm256_srli_epi32(res, 5);

        resy[0] = _mm256_packus_epi32(
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));

        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
            left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
            left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
            left[base_y_c[14]], left[base_y_c[15]]));
        a1_y = _mm256_cvtepu16_epi32(
            _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
                           left[base_y_c[10] + 1], left[base_y_c[11] + 1],
                           left[base_y_c[12] + 1], left[base_y_c[13] + 1],
                           left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);

        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16

        b = _mm256_mullo_epi32(diff, shift);
        res = _mm256_add_epi32(a32, b);
        res = _mm256_srli_epi32(res, 5);

        resy[1] = _mm256_packus_epi32(
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));

        resy[0] =
            _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
                                    1);  // 16 16bit values
      } else {
        resy[0] = resx[0];
      }
      resxy = _mm256_blendv_epi8(resx[0], resy[0],
                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
    }  // for j
    dst += stride;
  }
}

static void highbd_dr_prediction_z2_HxW_avx2(
    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
    int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;
  const int min_base_y = -1;
  (void)upsample_above;
  (void)upsample_left;
  const int frac_bits_x = 6;
  const int frac_bits_y = 6;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
2308 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 2309 __m256i a0_x, a1_x, a32, a16, c3f; 2310 __m256i diff, min_base_y256; 2311 2312 a16 = _mm256_set1_epi16(16); 2313 min_base_y256 = _mm256_set1_epi16(min_base_y); 2314 c3f = _mm256_set1_epi16(0x3f); 2315 2316 for (int r = 0; r < H; r++) { 2317 __m256i b, res, shift; 2318 __m256i resx, resy; 2319 __m256i resxy; 2320 __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, shiftx; 2321 2322 for (int j = 0; j < W; j += 16) { 2323 int y = r + 1; 2324 int base_x = (-y * dx) >> frac_bits_x; 2325 int base_shift = 0; 2326 if ((base_x + j) < (min_base_x - 1)) { 2327 base_shift = (min_base_x - (base_x + j) - 1); 2328 } 2329 int base_min_diff = (min_base_x - base_x - j); 2330 if (base_min_diff > 16) { 2331 base_min_diff = 16; 2332 } else { 2333 if (base_min_diff < 0) base_min_diff = 0; 2334 } 2335 2336 if (base_shift > 7) { 2337 a0_x = _mm256_setzero_si256(); 2338 a1_x = _mm256_setzero_si256(); 2339 shift = _mm256_setzero_si256(); 2340 } else { 2341 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); 2342 a1_x128 = 2343 _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); 2344 a0_x128 = 2345 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); 2346 a1_x128 = 2347 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); 2348 2349 a0_x = _mm256_castsi128_si256(a0_x128); 2350 a1_x = _mm256_castsi128_si256(a1_x128); 2351 2352 shift = _mm256_castsi128_si256(_mm_srli_epi16( 2353 _mm_and_si128(_mm_setr_epi16( 2354 ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx, 2355 ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx, 2356 ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx, 2357 ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx), 2358 _mm256_castsi256_si128(c3f)), 2359 1)); 2360 } 2361 2362 base_shift = 0; 2363 if ((base_x + j + 8) < (min_base_x - 1)) { 2364 base_shift = (min_base_x - (base_x + j + 8) - 1); 2365 } 2366 if (base_shift <= 7) { 2367 a0_1_x128 = 2368 _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j)); 2369 a1_1_x128 = 2370 _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j)); 2371 a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, 2372 *(__m128i *)HighbdLoadMaskx[base_shift]); 2373 a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, 2374 *(__m128i *)HighbdLoadMaskx[base_shift]); 2375 2376 shiftx = _mm_srli_epi16( 2377 _mm_and_si128( 2378 _mm_setr_epi16( 2379 ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx, 2380 ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx, 2381 ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx, 2382 ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx), 2383 _mm256_castsi256_si128(c3f)), 2384 1); 2385 2386 a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1); 2387 a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1); 2388 shift = _mm256_inserti128_si256(shift, shiftx, 1); 2389 } 2390 2391 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] 2392 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 2393 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 2394 2395 b = _mm256_mullo_epi16(diff, shift); 2396 res = _mm256_add_epi16(a32, b); 2397 resx = _mm256_srli_epi16(res, 5); // 16 16-bit values 2398 2399 // y calc 2400 __m256i a0_y, a1_y, shifty; 2401 if ((base_x < min_base_x)) { 2402 DECLARE_ALIGNED(32, int16_t, base_y_c[16]); 2403 __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16; 2404 r6 = _mm256_set1_epi16(r << 6); 2405 dy256 = _mm256_set1_epi16(dy); 2406 c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j, 2407 7 + j, 8 + j, 9 + j, 
10 + j, 11 + j, 12 + j, 2408 13 + j, 14 + j, 15 + j, 16 + j); 2409 mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), 2410 _mm256_srli_epi16(min_base_y256, 1)); 2411 y_c256 = _mm256_sub_epi16(r6, mul16); 2412 base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); 2413 mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); 2414 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); 2415 _mm256_store_si256((__m256i *)base_y_c, base_y_c256); 2416 2417 a0_y = _mm256_setr_epi16( 2418 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], 2419 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], 2420 left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], 2421 left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], 2422 left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], 2423 left[base_y_c[15]]); 2424 a1_y = _mm256_setr_epi16( 2425 left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], 2426 left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], 2427 left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1], 2428 left[base_y_c[9] + 1], left[base_y_c[10] + 1], 2429 left[base_y_c[11] + 1], left[base_y_c[12] + 1], 2430 left[base_y_c[13] + 1], left[base_y_c[14] + 1], 2431 left[base_y_c[15] + 1]); 2432 2433 shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); 2434 2435 diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] 2436 a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 2437 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 2438 2439 b = _mm256_mullo_epi16(diff, shifty); 2440 res = _mm256_add_epi16(a32, b); 2441 resy = _mm256_srli_epi16(res, 5); 2442 } else { 2443 resy = _mm256_setzero_si256(); 2444 } 2445 2446 resxy = _mm256_blendv_epi8(resx, resy, 2447 *(__m256i *)HighbdBaseMask[base_min_diff]); 2448 _mm256_storeu_si256((__m256i *)(dst + j), resxy); 2449 } // for j 2450 dst += stride; 2451 } 2452 } 2453 2454 // Directional prediction, zone 2: 90 < angle < 180 2455 void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw, 2456 int bh, const uint16_t *above, 2457 const uint16_t *left, int upsample_above, 2458 int upsample_left, int dx, int dy, 2459 int bd) { 2460 (void)bd; 2461 assert(dx > 0); 2462 assert(dy > 0); 2463 switch (bw) { 2464 case 4: 2465 highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, 2466 upsample_above, upsample_left, dx, dy); 2467 break; 2468 case 8: 2469 if (bd < 12) { 2470 highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, 2471 upsample_above, upsample_left, dx, dy); 2472 } else { 2473 highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left, 2474 upsample_above, upsample_left, 2475 dx, dy); 2476 } 2477 break; 2478 default: 2479 if (bd < 12) { 2480 highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, 2481 upsample_above, upsample_left, dx, dy); 2482 } else { 2483 highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left, 2484 upsample_above, upsample_left, 2485 dx, dy); 2486 } 2487 break; 2488 } 2489 } 2490 2491 static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc, 2492 uint16_t *dst, ptrdiff_t pitchDst, int width, 2493 int height) { 2494 for (int j = 0; j < height; j += 8) 2495 for (int i = 0; i < width; i += 8) 2496 highbd_transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc, 2497 dst + j * pitchDst + i, pitchDst); 2498 } 2499 2500 static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride, 2501 const uint16_t *left, 2502 int upsample_left, int dy) { 2503 __m128i dstvec[4], d[4]; 2504 
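  // z3 strategy: run the z1 predictor along the left edge, then transpose
  // the 4x4 result into the destination.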
2505 highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy); 2506 highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], 2507 &dstvec[3], &d[0], &d[1], &d[2], &d[3]); 2508 _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); 2509 _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); 2510 _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); 2511 _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); 2512 return; 2513 } 2514 2515 static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride, 2516 const uint16_t *left, 2517 int upsample_left, int dy) { 2518 __m128i dstvec[8], d[8]; 2519 2520 highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy); 2521 highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], 2522 &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], 2523 &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], 2524 &d[7]); 2525 for (int i = 0; i < 8; i++) { 2526 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); 2527 } 2528 } 2529 2530 static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride, 2531 const uint16_t *left, 2532 int upsample_left, int dy) { 2533 __m128i dstvec[4], d[8]; 2534 2535 highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy); 2536 highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], 2537 &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], 2538 &d[7]); 2539 for (int i = 0; i < 8; i++) { 2540 _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); 2541 } 2542 } 2543 2544 static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride, 2545 const uint16_t *left, 2546 int upsample_left, int dy) { 2547 __m128i dstvec[8], d[4]; 2548 2549 highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy); 2550 highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], 2551 &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], 2552 &d[0], &d[1], &d[2], &d[3]); 2553 _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); 2554 _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]); 2555 _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]); 2556 _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]); 2557 } 2558 2559 static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride, 2560 const uint16_t *left, 2561 int upsample_left, int dy) { 2562 __m256i dstvec[8], d[8]; 2563 2564 highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, 2565 dy); 2566 highbd_transpose8x16_16x8_avx2(dstvec, d); 2567 for (int i = 0; i < 8; i++) { 2568 _mm_storeu_si128((__m128i *)(dst + i * stride), 2569 _mm256_castsi256_si128(d[i])); 2570 } 2571 for (int i = 8; i < 16; i++) { 2572 _mm_storeu_si128((__m128i *)(dst + i * stride), 2573 _mm256_extracti128_si256(d[i - 8], 1)); 2574 } 2575 } 2576 2577 static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride, 2578 const uint16_t *left, 2579 int upsample_left, int dy) { 2580 __m128i dstvec[16], d[16]; 2581 2582 highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, 2583 dy); 2584 for (int i = 0; i < 16; i += 8) { 2585 highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], 2586 &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], 2587 &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], 2588 &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], 2589 &d[5 + i], &d[6 + i], &d[7 + i]); 2590 } 2591 for (int i = 0; i < 8; i++) { 2592 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); 2593 
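    // d[0..7] hold columns 0-7 of each row; d[8..15], stored next, hold
    // columns 8-15.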
_mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); 2594 } 2595 } 2596 2597 static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride, 2598 const uint16_t *left, 2599 int upsample_left, int dy) { 2600 __m256i dstvec[4], d[4], d1; 2601 2602 highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, 2603 dy); 2604 highbd_transpose4x16_avx2(dstvec, d); 2605 for (int i = 0; i < 4; i++) { 2606 _mm_storel_epi64((__m128i *)(dst + i * stride), 2607 _mm256_castsi256_si128(d[i])); 2608 d1 = _mm256_bsrli_epi128(d[i], 8); 2609 _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride), 2610 _mm256_castsi256_si128(d1)); 2611 _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), 2612 _mm256_extracti128_si256(d[i], 1)); 2613 _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride), 2614 _mm256_extracti128_si256(d1, 1)); 2615 } 2616 } 2617 2618 static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride, 2619 const uint16_t *left, 2620 int upsample_left, int dy) { 2621 __m128i dstvec[16], d[8]; 2622 2623 highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, 2624 dy); 2625 highbd_transpose16x4_8x8_sse2(dstvec, d); 2626 2627 _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); 2628 _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]); 2629 _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]); 2630 _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]); 2631 _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]); 2632 _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]); 2633 _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]); 2634 _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]); 2635 } 2636 2637 static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride, 2638 const uint16_t *left, 2639 int upsample_left, int dy) { 2640 __m256i dstvec[16], d[16]; 2641 2642 highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, 2643 dy); 2644 for (int i = 0; i < 16; i += 8) { 2645 highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); 2646 } 2647 2648 for (int i = 0; i < 8; i++) { 2649 _mm_storeu_si128((__m128i *)(dst + i * stride), 2650 _mm256_castsi256_si128(d[i])); 2651 } 2652 for (int i = 0; i < 8; i++) { 2653 _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), 2654 _mm256_extracti128_si256(d[i], 1)); 2655 } 2656 for (int i = 8; i < 16; i++) { 2657 _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), 2658 _mm256_castsi256_si128(d[i])); 2659 } 2660 for (int i = 8; i < 16; i++) { 2661 _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride), 2662 _mm256_extracti128_si256(d[i], 1)); 2663 } 2664 } 2665 2666 static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride, 2667 const uint16_t *left, 2668 int upsample_left, int dy) { 2669 __m128i dstvec[32], d[32]; 2670 2671 highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, 2672 dy); 2673 for (int i = 0; i < 32; i += 8) { 2674 highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], 2675 &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], 2676 &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], 2677 &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], 2678 &d[5 + i], &d[6 + i], &d[7 + i]); 2679 } 2680 for (int i = 0; i < 8; i++) { 2681 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); 2682 _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); 2683 _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]); 2684 _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]); 2685 } 2686 } 
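// A minimal scalar sketch (not used by the codec; the function name is local
// to this example) of the two-tap interpolation that the z1/z3 kernels in
// this file vectorize: positions past max_base_x repeat the last reference
// sample, everything else blends two neighbouring samples with a 5-bit
// weight taken from the 6-bit fractional position.
static INLINE uint16_t highbd_dr_z1_pixel_sketch(const uint16_t *above,
                                                 int max_base_x, int x,
                                                 int frac_bits) {
  const int base = x >> frac_bits;  // integer part of the sample position
  if (base >= max_base_x) return above[max_base_x];
  // 6-bit fraction, pre-halved to 5 bits exactly like the SIMD 'shift'.
  const int shift = ((x << (6 - frac_bits)) & 0x3f) >> 1;
  return (uint16_t)((above[base] * 32 + 16 +
                     (above[base + 1] - above[base]) * shift) >>
                    5);
}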
2687 2688 static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride, 2689 const uint16_t *left, 2690 int upsample_left, int dy) { 2691 __m256i dstvec[16], d[16]; 2692 2693 highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, 2694 dy); 2695 highbd_transpose16x16_avx2(dstvec, d); 2696 2697 for (int i = 0; i < 16; i++) { 2698 _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]); 2699 } 2700 } 2701 2702 static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride, 2703 const uint16_t *left, 2704 int upsample_left, int dy) { 2705 __m256i dstvec[64], d[16]; 2706 2707 highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, 2708 dy); 2709 2710 highbd_transpose16x16_avx2(dstvec, d); 2711 for (int j = 0; j < 16; j++) { 2712 _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]); 2713 } 2714 highbd_transpose16x16_avx2(dstvec + 16, d); 2715 for (int j = 0; j < 16; j++) { 2716 _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]); 2717 } 2718 highbd_transpose16x16_avx2(dstvec + 32, d); 2719 for (int j = 0; j < 16; j++) { 2720 _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]); 2721 } 2722 highbd_transpose16x16_avx2(dstvec + 48, d); 2723 for (int j = 0; j < 16; j++) { 2724 _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]); 2725 } 2726 } 2727 2728 static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride, 2729 const uint16_t *left, 2730 int upsample_left, int dy) { 2731 DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]); 2732 highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); 2733 highbd_transpose(dstT, 64, dst, stride, 64, 64); 2734 } 2735 2736 static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride, 2737 const uint16_t *left, 2738 int upsample_left, int dy) { 2739 __m256i dstvec[32], d[32]; 2740 2741 highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, 2742 dy); 2743 for (int i = 0; i < 32; i += 8) { 2744 highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); 2745 } 2746 // store 2747 for (int j = 0; j < 32; j += 16) { 2748 for (int i = 0; i < 8; i++) { 2749 _mm_storeu_si128((__m128i *)(dst + (i + j) * stride), 2750 _mm256_castsi256_si128(d[(i + j)])); 2751 } 2752 for (int i = 0; i < 8; i++) { 2753 _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8), 2754 _mm256_castsi256_si128(d[(i + j) + 8])); 2755 } 2756 for (int i = 8; i < 16; i++) { 2757 _mm256_storeu_si256( 2758 (__m256i *)(dst + (i + j) * stride), 2759 _mm256_inserti128_si256( 2760 d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0)); 2761 } 2762 } 2763 } 2764 2765 static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride, 2766 const uint16_t *left, 2767 int upsample_left, int dy) { 2768 __m256i dstvec[32], d[16]; 2769 2770 highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, 2771 dy); 2772 for (int i = 0; i < 32; i += 16) { 2773 highbd_transpose16x16_avx2((dstvec + i), d); 2774 for (int j = 0; j < 16; j++) { 2775 _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); 2776 } 2777 } 2778 } 2779 2780 static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride, 2781 const uint16_t *left, 2782 int upsample_left, int dy) { 2783 uint16_t dstT[64 * 32]; 2784 highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); 2785 highbd_transpose(dstT, 64, dst, stride, 32, 64); 2786 } 2787 2788 static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, 
ptrdiff_t stride, 2789 const uint16_t *left, 2790 int upsample_left, int dy) { 2791 DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]); 2792 highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy); 2793 highbd_transpose(dstT, 32, dst, stride, 64, 32); 2794 return; 2795 } 2796 2797 static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride, 2798 const uint16_t *left, 2799 int upsample_left, int dy) { 2800 DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]); 2801 highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); 2802 highbd_transpose(dstT, 64, dst, stride, 16, 64); 2803 } 2804 2805 static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride, 2806 const uint16_t *left, 2807 int upsample_left, int dy) { 2808 __m256i dstvec[64], d[16]; 2809 2810 highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, 2811 dy); 2812 for (int i = 0; i < 64; i += 16) { 2813 highbd_transpose16x16_avx2((dstvec + i), d); 2814 for (int j = 0; j < 16; j++) { 2815 _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); 2816 } 2817 } 2818 } 2819 2820 void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw, 2821 int bh, const uint16_t *above, 2822 const uint16_t *left, int upsample_left, 2823 int dx, int dy, int bd) { 2824 (void)above; 2825 (void)dx; 2826 (void)bd; 2827 assert(dx == 1); 2828 assert(dy > 0); 2829 if (bw == bh) { 2830 switch (bw) { 2831 case 4: 2832 highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy); 2833 break; 2834 case 8: 2835 highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy); 2836 break; 2837 case 16: 2838 highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, 2839 dy); 2840 break; 2841 case 32: 2842 highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, 2843 dy); 2844 break; 2845 case 64: 2846 highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, 2847 dy); 2848 break; 2849 } 2850 } else { 2851 if (bw < bh) { 2852 if (bw + bw == bh) { 2853 switch (bw) { 2854 case 4: 2855 highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, 2856 dy); 2857 break; 2858 case 8: 2859 highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, 2860 dy); 2861 break; 2862 case 16: 2863 highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, 2864 dy); 2865 break; 2866 case 32: 2867 highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, 2868 dy); 2869 break; 2870 } 2871 } else { 2872 switch (bw) { 2873 case 4: 2874 highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, 2875 dy); 2876 break; 2877 case 8: 2878 highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, 2879 dy); 2880 break; 2881 case 16: 2882 highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, 2883 dy); 2884 break; 2885 } 2886 } 2887 } else { 2888 if (bh + bh == bw) { 2889 switch (bh) { 2890 case 4: 2891 highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, 2892 dy); 2893 break; 2894 case 8: 2895 highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, 2896 dy); 2897 break; 2898 case 16: 2899 highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, 2900 dy); 2901 break; 2902 case 32: 2903 highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, 2904 dy); 2905 break; 2906 } 2907 } else { 2908 switch (bh) { 2909 case 4: 2910 highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, 2911 dy); 2912 break; 2913 case 8: 2914 
highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, 2915 dy); 2916 break; 2917 case 16: 2918 highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, 2919 dy); 2920 break; 2921 } 2922 } 2923 } 2924 } 2925 return; 2926 } 2927 2928 // Low bit depth functions 2929 static uint8_t BaseMask[33][32] = { 2930 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2931 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2932 { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2933 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2934 { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2935 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2936 { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2937 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2938 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2939 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2940 { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2941 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2942 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2943 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2944 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2945 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2946 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 2947 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2948 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 2949 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2950 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 2951 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2952 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2953 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2954 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2955 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2956 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2957 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2958 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2959 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2960 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2961 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2962 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2963 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 2964 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2965 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2966 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 2967 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2968 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2969 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 2970 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2971 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2972 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 2973 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2974 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2975 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 2976 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2977 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2978 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 2979 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2980 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2981 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 2982 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2983 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2984 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 2985 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 2986 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
    0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
};

static AOM_FORCE_INLINE void dr_prediction_z1_4xN_internal_avx2(
    int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((N + 4) - 1) << upsample_above;
  int x;
  // assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a32, a16;
  __m256i diff, c3f;
  __m128i a_mbase_x;

  a16 = _mm256_set1_epi16(16);
  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
  c3f = _mm256_set1_epi16(0x3f);

  x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i res1, a0_128, a1_128;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      for (int i = r; i < N; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }
    if (base_max_diff > 4) base_max_diff = 4;
    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
    a1_128 = _mm_srli_si128(a0_128, 1);

    if (upsample_above) {
      a0_128 = _mm_shuffle_epi8(
          a0_128,
          _mm_setr_epi8(0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15));
      a1_128 = _mm_srli_si128(a0_128, 4);

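      // Upsampled edges interleave even/odd samples; the shuffle above
      // de-interleaves them so a0/a1 become the two interpolation taps, and
      // the fractional step below is scaled by the same upsample factor.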
      shift = _mm256_srli_epi16(
          _mm256_and_si256(
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
          1);
    } else {
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
    }
    a0 = _mm256_cvtepu8_epi16(a0_128);
    a1 = _mm256_cvtepu8_epi16(a1_128);

    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi16(diff, shift);
    res = _mm256_add_epi16(a32, b);
    res = _mm256_srli_epi16(res, 5);

    res1 = _mm256_castsi256_si128(res);
    res1 = _mm_packus_epi16(res1, res1);

    dst[r] =
        _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
    x += dx;
  }
}

static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  __m128i dstvec[16];

  dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
  }
}

static AOM_FORCE_INLINE void dr_prediction_z1_8xN_internal_avx2(
    int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((8 + N) - 1) << upsample_above;

  int x;
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a0_1, a1_1, a32, a16, diff, c3f;
  __m128i a_mbase_x;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm_set1_epi8(above[max_base_x]);
  c3f = _mm256_set1_epi32(0x3f);

  x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, res1, shift;
    __m128i res128;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      for (int i = r; i < N; ++i) {
        dst[i] = a_mbase_x;  // save 16 values, 8 to be used further
      }
      return;
    }
    if (base_max_diff > 8) base_max_diff = 8;

    a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));

    if (upsample_above) {
      a0 = _mm256_permutevar8x32_epi32(
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));

      a0_1 =
          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
      a0_1 = _mm256_permutevar8x32_epi32(
          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));

      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);

      shift = _mm256_srli_epi32(
          _mm256_and_si256(
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), c3f),
          1);
    } else {
      shift = _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
    }

    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi32(diff, shift);
    res = _mm256_add_epi32(a32, b);
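    // The +16 folded into a32 makes the shift below a round-to-nearest
    // rather than a truncation.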
    res = _mm256_srli_epi32(res, 5);

    res1 = _mm256_packus_epi32(
        res, _mm256_castsi128_si256(
                 _mm256_extracti128_si256(res, 1)));  // goto 16 bit

    res128 = _mm_packus_epi16(_mm256_castsi256_si128(res1),
                              _mm256_castsi256_si128(res1));  // goto 8 bit

    res128 =
        _mm_blendv_epi8(a_mbase_x, res128, *(__m128i *)BaseMask[base_max_diff]);
    dst[r] = res128;
    x += dx;
  }
}

static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  __m128i dstvec[32];

  dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

static AOM_FORCE_INLINE void dr_prediction_z1_16xN_internal_avx2(
    int N, __m128i *dstvec, const uint8_t *above, int upsample_above, int dx) {
  int x;
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((16 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a0_1, a1, a1_1, diff, a32, a16, c3f;
  __m128i a_mbase_x;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm_set1_epi8((uint8_t)above[max_base_x]);
  c3f = _mm256_set1_epi32(0x3f);

  x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res[2];
    __m128i res128[2];
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base);
    if (base_max_diff <= 0) {
      for (int i = r; i < N; ++i) {
        dstvec[i] = a_mbase_x;  // save 16 values
      }
      return;
    }
    __m256i shift =
        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);

    a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));

    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
    b = _mm256_mullo_epi32(diff, shift);

    res[0] = _mm256_add_epi32(a32, b);
    res[0] = _mm256_srli_epi32(res[0], 5);
    res[0] = _mm256_packus_epi32(
        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
    res128[0] = _mm_packus_epi16(_mm256_castsi256_si128(res[0]),
                                 _mm256_castsi256_si128(res[0]));  // goto 8 bit

    if (base_max_diff > 8) {
      if (base_max_diff > 16) base_max_diff = 16;
      a0_1 =
          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
      a1_1 =
          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));

      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
      b = _mm256_mullo_epi32(diff, shift);

      res[1] = _mm256_add_epi32(a32, b);
      res[1] = _mm256_srli_epi32(res[1], 5);
      res[1] = _mm256_packus_epi32(
          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
      res128[1] =
          _mm_packus_epi16(_mm256_castsi256_si128(res[1]),
                           _mm256_castsi256_si128(res[1]));  // goto 8 bit

    } else {
      res128[1] = a_mbase_x;
    }
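    // Merge the two 8-pixel halves into one 16x8-bit vector; res128[1] is
    // the repeated edge pixel when the row runs past max_base_x.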
static AOM_FORCE_INLINE void dr_prediction_z1_16xN_internal_avx2(
    int N, __m128i *dstvec, const uint8_t *above, int upsample_above, int dx) {
  int x;
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((16 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a0_1, a1, a1_1, diff, a32, a16, c3f;
  __m128i a_mbase_x;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm_set1_epi8((uint8_t)above[max_base_x]);
  c3f = _mm256_set1_epi32(0x3f);

  x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res[2];
    __m128i res128[2];
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base);
    if (base_max_diff <= 0) {
      for (int i = r; i < N; ++i) {
        dstvec[i] = a_mbase_x;  // save 16 values
      }
      return;
    }
    __m256i shift =
        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);

    a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));

    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
    b = _mm256_mullo_epi32(diff, shift);

    res[0] = _mm256_add_epi32(a32, b);
    res[0] = _mm256_srli_epi32(res[0], 5);
    res[0] = _mm256_packus_epi32(
        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
    res128[0] =
        _mm_packus_epi16(_mm256_castsi256_si128(res[0]),
                         _mm256_castsi256_si128(res[0]));  // goto 8 bit

    if (base_max_diff > 8) {
      if (base_max_diff > 16) base_max_diff = 16;
      a0_1 =
          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
      a1_1 =
          _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));

      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
      b = _mm256_mullo_epi32(diff, shift);

      res[1] = _mm256_add_epi32(a32, b);
      res[1] = _mm256_srli_epi32(res[1], 5);
      res[1] = _mm256_packus_epi32(
          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
      res128[1] =
          _mm_packus_epi16(_mm256_castsi256_si128(res[1]),
                           _mm256_castsi256_si128(res[1]));  // goto 8 bit
    } else {
      res128[1] = a_mbase_x;
    }
    res128[0] = _mm_unpacklo_epi64(res128[0], res128[1]);  // 16 8bit values

    dstvec[r] = _mm_blendv_epi8(a_mbase_x, res128[0],
                                *(__m128i *)BaseMask[base_max_diff]);
    x += dx;
  }
}

static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       int upsample_above, int dx) {
  __m128i dstvec[64];

  dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

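// BaseMask[k] (defined earlier in this file) has 0xff in its first k bytes
// and 0x00 elsewhere, so the _mm_blendv_epi8 calls above keep the first k
// computed pixels and replace the remainder with a_mbase_x, the replicated
// last reference sample. This implements the clamp at max_base_x.
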
static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
    int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
  int x;
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((32 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a0_1, a1, a1_1, a32, a16;
  __m256i a_mbase_x, diff, c3f;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
  c3f = _mm256_set1_epi32(0x3f);

  x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res[2], res16[2];

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base);
    if (base_max_diff <= 0) {
      for (int i = r; i < N; ++i) {
        dstvec[i] = a_mbase_x;  // save 32 values
      }
      return;
    }
    if (base_max_diff > 32) base_max_diff = 32;
    __m256i shift =
        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);

    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
      int mdiff = base_max_diff - j;
      if (mdiff <= 0) {
        res16[jj] = a_mbase_x;
      } else {
        a0 = _mm256_cvtepu8_epi32(
            _mm_loadu_si128((__m128i *)(above + base + j)));
        a1 = _mm256_cvtepu8_epi32(
            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));

        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
        b = _mm256_mullo_epi32(diff, shift);

        res[0] = _mm256_add_epi32(a32, b);
        res[0] = _mm256_srli_epi32(res[0], 5);
        res[0] = _mm256_packus_epi32(
            res[0],
            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));

        // goto 8 bit
        res[0] = _mm256_packus_epi16(res[0], res[0]);

        if (mdiff > 8) {
          a0_1 = _mm256_cvtepu8_epi32(
              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
          a1_1 = _mm256_cvtepu8_epi32(
              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));

          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
          b = _mm256_mullo_epi32(diff, shift);

          res[1] = _mm256_add_epi32(a32, b);
          res[1] = _mm256_srli_epi32(res[1], 5);
          res[1] = _mm256_packus_epi32(
              res[1],
              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
          // goto 8 bit
          res[1] = _mm256_packus_epi16(res[1], res[1]);
        } else {
          res[1] = a_mbase_x;
        }
        res16[jj] = _mm256_unpacklo_epi64(res[0], res[1]);  // 16 8bit values
      }
    }
    res16[1] =
        _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
                                1);  // 32 8bit values

    dstvec[r] = _mm256_blendv_epi8(
        a_mbase_x, res16[1],
        *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
    x += dx;
  }
}

static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       int upsample_above, int dx) {
  __m256i dstvec[64];
  dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
  }
}

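// Note: unlike the narrower z1 kernels, the 64-wide kernel below writes each
// row directly to dst and clamps against max_base_x with a computed per-lane
// mask (base_inc128 vs. max_base_x128) rather than the BaseMask table, since
// the clamp boundary can fall anywhere within a 64-pixel row.
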
static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       int upsample_above, int dx) {
  int x;
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((64 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a0_1, a1, a1_1, a32, a16;
  __m256i a_mbase_x, diff, c3f;
  __m128i max_base_x128, base_inc128, mask128;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
  max_base_x128 = _mm_set1_epi8(max_base_x);
  c3f = _mm256_set1_epi32(0x3f);

  x = dx;
  for (int r = 0; r < N; r++, dst += stride) {
    __m256i b, res[2];
    __m128i res1;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
        dst += stride;
      }
      return;
    }

    __m256i shift =
        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);

    __m128i a0_128, a0_1_128, a1_128, a1_1_128;
    for (int j = 0; j < 64; j += 16) {
      int mdif = max_base_x - (base + j);
      if (mdif <= 0) {
        _mm_storeu_si128((__m128i *)(dst + j),
                         _mm256_castsi256_si128(a_mbase_x));
      } else {
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
        a0 = _mm256_cvtepu8_epi32(a0_128);
        a1 = _mm256_cvtepu8_epi32(a1_128);

        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
        b = _mm256_mullo_epi32(diff, shift);

        res[0] = _mm256_add_epi32(a32, b);
        res[0] = _mm256_srli_epi32(res[0], 5);
        res[0] = _mm256_packus_epi32(
            res[0],
            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
        // goto 8 bit
        res[0] = _mm256_packus_epi16(res[0], res[0]);

        if (mdif > 8) {
          a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
          a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
          a0_1 = _mm256_cvtepu8_epi32(a0_1_128);
          a1_1 = _mm256_cvtepu8_epi32(a1_1_128);

          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
          b = _mm256_mullo_epi32(diff, shift);

          res[1] = _mm256_add_epi32(a32, b);
          res[1] = _mm256_srli_epi32(res[1], 5);
          res[1] = _mm256_packus_epi32(
              res[1],
              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
          res[1] = _mm256_packus_epi16(res[1], res[1]);
        } else {
          res[1] = a_mbase_x;
        }
        res1 = _mm_unpacklo_epi64(
            _mm256_castsi256_si128(res[0]),
            _mm256_castsi256_si128(res[1]));  // 16 8bit values

        base_inc128 = _mm_setr_epi8(
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
            base + j + 13, base + j + 14, base + j + 15);

        mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
                                 _mm_setzero_si128());
        res1 =
            _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), res1, mask128);
        _mm_storeu_si128((__m128i *)(dst + j), res1);
      }
    }
    x += dx;
  }
}

// Directional prediction, zone 1: 0 < angle < 90
void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int dx, int dy) {
  (void)left;
  (void)dy;
  switch (bw) {
    case 4:
      dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
      break;
    case 8:
      dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
      break;
    case 16:
      dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
      break;
    case 32:
      dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
      break;
    case 64:
      dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
      break;
    default: break;
  }
  return;
}

static uint8_t LoadMaskx[8][16] = {
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
  { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
  { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
  { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
  { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
  { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
};

static uint8_t EvenOddMaskx4[8][16] = {
  { 0, 2, 4, 6, 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0, 1, 3, 5, 7, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0 },
  { 0, 0, 2, 4, 6, 8, 3, 5, 7, 9, 0, 0, 0, 0, 0, 0 },
  { 0, 0, 0, 3, 5, 7, 9, 4, 6, 8, 10, 0, 0, 0, 0, 0 },
  { 0, 0, 0, 0, 4, 6, 8, 10, 5, 7, 9, 11, 0, 0, 0, 0 },
  { 0, 0, 0, 0, 0, 5, 7, 9, 11, 6, 8, 10, 12, 0, 0, 0 },
  { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 7, 9, 11, 13, 0, 0 },
  { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 8, 10, 12, 14, 0 }
};

static uint8_t EvenOddMaskx[8][16] = {
  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 0, 0, 0, 0 },
  { 0, 1, 3, 5, 7, 9, 11, 13, 15, 2, 4, 6, 8, 0, 0, 0 },
  { 0, 0, 2, 4, 6, 8, 10, 12, 14, 3, 5, 7, 9, 0, 0, 0 },
  { 0, 0, 0, 3, 5, 7, 9, 11, 13, 15, 4, 6, 8, 10, 0, 0 },
  { 0, 0, 0, 0, 4, 6, 8, 10, 12, 14, 5, 7, 9, 11, 0, 0 },
  { 0, 0, 0, 0, 0, 5, 7, 9, 11, 13, 15, 6, 8, 10, 12, 0 },
  { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 14, 7, 9, 11, 13, 0 },
  { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 15, 8, 10, 12, 14 }
};

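// Illustration of the tables above (not library code): _mm_shuffle_epi8 with
// LoadMaskx[base_shift] pads the leading lanes with the first loaded byte,
// e.g. for base_shift == 2 a load v becomes
//   { v[0], v[0], v[0], v[1], v[2], ..., v[13] },
// so lanes left of min_base_x still read a valid pixel before the x/y blend
// picks the final value. The EvenOddMaskx tables perform the same
// realignment while also grouping the even- and odd-indexed samples of an
// upsampled edge.
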
static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left, int upsample_above,
                                      int upsample_left, int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0_x, a1_x, a32, a16, diff;
  __m128i c3f, min_base_y128;

  a16 = _mm256_set1_epi32(16);
  c3f = _mm_set1_epi32(0x3f);
  min_base_y128 = _mm_set1_epi32(min_base_y);

  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i resx, resy, resxy;
    __m128i a0_x128, a1_x128;
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 4) {
      base_min_diff = 4;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 3) {
      a0_x = _mm256_setzero_si256();
      a1_x = _mm256_setzero_si256();
      shift = _mm256_setzero_si256();
    } else {
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      if (upsample_above) {
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx4[base_shift]);
        a1_x128 = _mm_srli_si128(a0_x128, 4);

        shift = _mm256_castsi128_si256(_mm_srli_epi32(
            _mm_and_si128(
                _mm_slli_epi32(
                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
                    upsample_above),
                c3f),
            1));
      } else {
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
        a1_x128 = _mm_srli_si128(a0_x128, 1);

        shift = _mm256_castsi128_si256(_mm_srli_epi32(
            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
                          c3f),
            1));
      }
      a0_x = _mm256_cvtepu8_epi32(a0_x128);
      a1_x = _mm256_cvtepu8_epi32(a1_x128);
    }
    // y calc
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int, base_y_c[4]);
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
      r6 = _mm_set1_epi32(r << 6);
      dy128 = _mm_set1_epi32(dy);
      c1234 = _mm_setr_epi32(1, 2, 3, 4);
      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);

      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]]);
      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);

      if (upsample_left) {
        shifty = _mm_srli_epi32(
            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
      }
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
      shift = _mm256_inserti128_si256(shift, shifty, 1);
    }

    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16

    b = _mm256_mullo_epi32(diff, shift);
    res = _mm256_add_epi32(a32, b);
    res = _mm256_srli_epi32(res, 5);

    resx = _mm256_castsi256_si128(res);
    resx = _mm_packus_epi32(resx, resx);
    resx = _mm_packus_epi16(resx, resx);

    resy = _mm256_extracti128_si256(res, 1);
    resy = _mm_packus_epi32(resy, resy);
    resy = _mm_packus_epi16(resy, resy);

    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
    *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy);
    dst += stride;
  }
}

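// Sketch of the per-row selection the z2 kernels implement (scalar,
// illustrative only; W and interp are hypothetical names, and upsampling is
// omitted). A pixel comes from the above row when its projected position
// stays at or right of min_base_x, otherwise from the left column; the
// kernels compute both candidates in the two 128-bit lanes and choose per
// pixel with BaseMask[base_min_diff]:
//
//   for (int c = 0; c < W; ++c) {
//     const int x = (c << 6) - (r + 1) * dx;  // projected position
//     if ((x >> 6) >= min_base_x) {
//       dst[c] = interp(above, x);            // lower lane / resx
//     } else {
//       const int y = (r << 6) - (c + 1) * dy;
//       dst[c] = interp(left, y);             // upper lane / resy
//     }
//   }
//
// where interp(p, v) = (p[v >> 6] * 32 + 16 +
//                       (p[(v >> 6) + 1] - p[v >> 6]) * ((v & 0x3f) >> 1))
//                      >> 5.
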
static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left, int upsample_above,
                                      int upsample_left, int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i diff, a32, a16;
  __m256i a0_x, a1_x;
  __m128i a0_x128, a1_x128, min_base_y128, c3f;

  a16 = _mm256_set1_epi16(16);
  c3f = _mm_set1_epi16(0x3f);
  min_base_y128 = _mm_set1_epi16(min_base_y);

  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i resx, resy, resxy;

    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 8) {
      base_min_diff = 8;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 7) {
      a0_x = _mm256_setzero_si256();
      a1_x = _mm256_setzero_si256();
      shift = _mm256_setzero_si256();
    } else {
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
      if (upsample_above) {
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
        a1_x128 =
            _mm_shuffle_epi8(a1_x128, *(__m128i *)EvenOddMaskx[base_shift]);

        shift = _mm256_castsi128_si256(_mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(
                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
                    upsample_above),
                c3f),
            1));
      } else {
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);

        shift = _mm256_castsi128_si256(_mm_srli_epi16(
            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
                          c3f),
            1));
      }
      a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
      a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
    }

    // y calc
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
      r6 = _mm_set1_epi16(r << 6);
      dy128 = _mm_set1_epi16(dy);
      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);

      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);
      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);

      if (upsample_left) {
        shifty = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
      }

      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
      shift = _mm256_inserti128_si256(shift, shifty, 1);
    }

    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

    b = _mm256_mullo_epi16(diff, shift);
    res = _mm256_add_epi16(a32, b);
    res = _mm256_srli_epi16(res, 5);

    resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
                            _mm256_castsi256_si128(res));
    resy = _mm256_extracti128_si256(res, 1);
    resy = _mm_packus_epi16(resy, resy);

    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
    _mm_storel_epi64((__m128i *)(dst), resxy);
    dst += stride;
  }
}

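// The HxW kernel below handles all remaining z2 sizes by walking the block in
// 16-wide column strips; within each strip the two 8-pixel halves get their
// own base_shift and their own LoadMaskx realignment before being merged into
// a single 256-bit register.
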
static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
                                      ptrdiff_t stride, const uint8_t *above,
                                      const uint8_t *left, int upsample_above,
                                      int upsample_left, int dx, int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;
  const int min_base_y = -1;
  (void)upsample_above;
  (void)upsample_left;
  const int frac_bits_x = 6;
  const int frac_bits_y = 6;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16;
  __m256i diff, min_base_y256, c3f, shifty;
  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, a0_1_x, a1_1_x, shiftx;

  a16 = _mm256_set1_epi16(16);
  min_base_y256 = _mm256_set1_epi16(min_base_y);
  c3f = _mm256_set1_epi16(0x3f);

  for (int r = 0; r < H; r++) {
    __m256i b, res, shift;
    __m128i resx, resy;
    __m128i resxy;
    for (int j = 0; j < W; j += 16) {
      int y = r + 1;
      int base_x = (-y * dx) >> frac_bits_x;

      int base_shift = 0;
      if ((base_x + j) < (min_base_x - 1)) {
        base_shift = (min_base_x - (base_x + j) - 1);
      }
      int base_min_diff = (min_base_x - base_x - j);
      if (base_min_diff > 16) {
        base_min_diff = 16;
      } else {
        if (base_min_diff < 0) base_min_diff = 0;
      }
      if (base_shift > 7) {
        a0_x = _mm256_setzero_si256();
        a1_x = _mm256_setzero_si256();
        shift = _mm256_setzero_si256();
      } else {
        a0_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
        a1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);

        a0_x = _mm256_cvtepu8_epi16(a0_x128);
        a1_x = _mm256_cvtepu8_epi16(a1_x128);

        shift = _mm256_castsi128_si256(_mm_srli_epi16(
            _mm_and_si128(_mm_setr_epi16(
                              ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
                              ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
                              ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
                              ((6 + j) << 6) - y * dx,
                              ((7 + j) << 6) - y * dx),
                          _mm256_castsi256_si128(c3f)),
            1));
      }

      base_shift = 0;
      if ((base_x + j + 8) < (min_base_x - 1)) {
        base_shift = (min_base_x - (base_x + j + 8) - 1);
      }
      if (base_shift <= 7) {
        a0_1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
        a1_1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
        a0_1_x128 =
            _mm_shuffle_epi8(a0_1_x128, *(__m128i *)LoadMaskx[base_shift]);
        a1_1_x128 =
            _mm_shuffle_epi8(a1_1_x128, *(__m128i *)LoadMaskx[base_shift]);

        a0_1_x = _mm_cvtepu8_epi16(a0_1_x128);
        a1_1_x = _mm_cvtepu8_epi16(a1_1_x128);

        shiftx = _mm_srli_epi16(
            _mm_and_si128(
                _mm_setr_epi16(
                    ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
                    ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
                    ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
                    ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
                _mm256_castsi256_si128(c3f)),
            1);

        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x, 1);
        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x, 1);
        shift = _mm256_inserti128_si256(shift, shiftx, 1);
      }

      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

      b = _mm256_mullo_epi16(diff, shift);
      res = _mm256_add_epi16(a32, b);
      res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
      resx = _mm256_castsi256_si128(_mm256_packus_epi16(
          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));

      // y calc
      if ((base_x < min_base_x)) {
        DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
        __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16;
        r6 = _mm256_set1_epi16(r << 6);
        dy256 = _mm256_set1_epi16(dy);
        c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
                                 7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j,
                                 13 + j, 14 + j, 15 + j, 16 + j);
        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
                                 _mm256_srli_epi16(min_base_y256, 1));
        y_c256 = _mm256_sub_epi16(r6, mul16);

        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);

        a0_y = _mm256_setr_epi16(
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
            left[base_y_c[15]]);
        a1_y = _mm256_setr_epi16(
            left[base_y_c[0] + 1], left[base_y_c[1] + 1],
            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
            left[base_y_c[6] + 1], left[base_y_c[7] + 1],
            left[base_y_c[8] + 1], left[base_y_c[9] + 1],
            left[base_y_c[10] + 1], left[base_y_c[11] + 1],
            left[base_y_c[12] + 1], left[base_y_c[13] + 1],
            left[base_y_c[14] + 1], left[base_y_c[15] + 1]);

        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);

        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm256_mullo_epi16(diff, shifty);
        res = _mm256_add_epi16(a32, b);
        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
        resy = _mm256_castsi256_si128(_mm256_packus_epi16(
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
      } else {
        resy = _mm_setzero_si128();
      }
      resxy =
          _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
      _mm_storeu_si128((__m128i *)(dst + j), resxy);
    }  // for j
    dst += stride;
  }
}

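// A subtlety in dr_prediction_z2_HxW_avx2 above: min_base_y256 is all ones
// (set1_epi16(-1)), so _mm256_srli_epi16(min_base_y256, 1) yields 0x7fff per
// lane, and the _mm256_min_epu16 clamp keeps c * dy from wrapping around in
// 16 bits before the signed subtraction that forms y_c256.
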
// Directional prediction, zone 2: 90 < angle < 180
void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int upsample_left, int dx,
                               int dy) {
  assert(dx > 0);
  assert(dy > 0);
  switch (bw) {
    case 4:
      dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
                                upsample_left, dx, dy);
      break;
    case 8:
      dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
                                upsample_left, dx, dy);
      break;
    default:
      dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
                                upsample_above, upsample_left, dx, dy);
      break;
  }
  return;
}

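// The z3 kernels below reuse the z1 machinery: a zone-3 block, predicted
// downward from the left column, is the transpose of a zone-1 block predicted
// from an "above" row equal to that column. Conceptually (sketch only, not
// library code):
//
//   dr_prediction_z1(tmp, ..., /*above=*/left, ..., dy);  // rows from `left`
//   transpose(tmp, ..., dst, ...);                        // rows -> columns
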
// z3 functions
static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
  __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
  w0 = _mm_unpacklo_epi8(x[0], x[1]);
  w1 = _mm_unpacklo_epi8(x[2], x[3]);
  w2 = _mm_unpackhi_epi8(x[0], x[1]);
  w3 = _mm_unpackhi_epi8(x[2], x[3]);

  ww0 = _mm_unpacklo_epi16(w0, w1);
  ww1 = _mm_unpacklo_epi16(w2, w3);
  ww2 = _mm_unpackhi_epi16(w0, w1);
  ww3 = _mm_unpackhi_epi16(w2, w3);

  w0 = _mm_unpacklo_epi32(ww0, ww1);
  w2 = _mm_unpacklo_epi32(ww2, ww3);
  w1 = _mm_unpackhi_epi32(ww0, ww1);
  w3 = _mm_unpackhi_epi32(ww2, ww3);

  d[0] = _mm_unpacklo_epi64(w0, w2);
  d[1] = _mm_unpackhi_epi64(w0, w2);
  d[2] = _mm_unpacklo_epi64(w1, w3);
  d[3] = _mm_unpackhi_epi64(w1, w3);

  d[4] = _mm_srli_si128(d[0], 8);
  d[5] = _mm_srli_si128(d[1], 8);
  d[6] = _mm_srli_si128(d[2], 8);
  d[7] = _mm_srli_si128(d[3], 8);

  d[8] = _mm_srli_si128(d[0], 4);
  d[9] = _mm_srli_si128(d[1], 4);
  d[10] = _mm_srli_si128(d[2], 4);
  d[11] = _mm_srli_si128(d[3], 4);

  d[12] = _mm_srli_si128(d[0], 12);
  d[13] = _mm_srli_si128(d[1], 12);
  d[14] = _mm_srli_si128(d[2], 12);
  d[15] = _mm_srli_si128(d[3], 12);
}

static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  __m256i w10, w11, w12, w13, w14, w15;

  w0 = _mm256_unpacklo_epi8(x[0], x[1]);
  w1 = _mm256_unpacklo_epi8(x[2], x[3]);
  w2 = _mm256_unpacklo_epi8(x[4], x[5]);
  w3 = _mm256_unpacklo_epi8(x[6], x[7]);

  w8 = _mm256_unpacklo_epi8(x[8], x[9]);
  w9 = _mm256_unpacklo_epi8(x[10], x[11]);
  w10 = _mm256_unpacklo_epi8(x[12], x[13]);
  w11 = _mm256_unpacklo_epi8(x[14], x[15]);

  w4 = _mm256_unpacklo_epi16(w0, w1);
  w5 = _mm256_unpacklo_epi16(w2, w3);
  w12 = _mm256_unpacklo_epi16(w8, w9);
  w13 = _mm256_unpacklo_epi16(w10, w11);

  w6 = _mm256_unpacklo_epi32(w4, w5);
  w7 = _mm256_unpackhi_epi32(w4, w5);
  w14 = _mm256_unpacklo_epi32(w12, w13);
  w15 = _mm256_unpackhi_epi32(w12, w13);

  // Store first 4-line result
  d[0] = _mm256_unpacklo_epi64(w6, w14);
  d[1] = _mm256_unpackhi_epi64(w6, w14);
  d[2] = _mm256_unpacklo_epi64(w7, w15);
  d[3] = _mm256_unpackhi_epi64(w7, w15);

  w4 = _mm256_unpackhi_epi16(w0, w1);
  w5 = _mm256_unpackhi_epi16(w2, w3);
  w12 = _mm256_unpackhi_epi16(w8, w9);
  w13 = _mm256_unpackhi_epi16(w10, w11);

  w6 = _mm256_unpacklo_epi32(w4, w5);
  w7 = _mm256_unpackhi_epi32(w4, w5);
  w14 = _mm256_unpacklo_epi32(w12, w13);
  w15 = _mm256_unpackhi_epi32(w12, w13);

  // Store second 4-line result
  d[4] = _mm256_unpacklo_epi64(w6, w14);
  d[5] = _mm256_unpackhi_epi64(w6, w14);
  d[6] = _mm256_unpacklo_epi64(w7, w15);
  d[7] = _mm256_unpackhi_epi64(w7, w15);

  // upper half
  w0 = _mm256_unpackhi_epi8(x[0], x[1]);
  w1 = _mm256_unpackhi_epi8(x[2], x[3]);
  w2 = _mm256_unpackhi_epi8(x[4], x[5]);
  w3 = _mm256_unpackhi_epi8(x[6], x[7]);

  w8 = _mm256_unpackhi_epi8(x[8], x[9]);
  w9 = _mm256_unpackhi_epi8(x[10], x[11]);
  w10 = _mm256_unpackhi_epi8(x[12], x[13]);
  w11 = _mm256_unpackhi_epi8(x[14], x[15]);

  w4 = _mm256_unpacklo_epi16(w0, w1);
  w5 = _mm256_unpacklo_epi16(w2, w3);
  w12 = _mm256_unpacklo_epi16(w8, w9);
  w13 = _mm256_unpacklo_epi16(w10, w11);

  w6 = _mm256_unpacklo_epi32(w4, w5);
  w7 = _mm256_unpackhi_epi32(w4, w5);
  w14 = _mm256_unpacklo_epi32(w12, w13);
  w15 = _mm256_unpackhi_epi32(w12, w13);

  // Store first 4-line result
  d[8] = _mm256_unpacklo_epi64(w6, w14);
  d[9] = _mm256_unpackhi_epi64(w6, w14);
  d[10] = _mm256_unpacklo_epi64(w7, w15);
  d[11] = _mm256_unpackhi_epi64(w7, w15);

  w4 = _mm256_unpackhi_epi16(w0, w1);
  w5 = _mm256_unpackhi_epi16(w2, w3);
  w12 = _mm256_unpackhi_epi16(w8, w9);
  w13 = _mm256_unpackhi_epi16(w10, w11);

  w6 = _mm256_unpacklo_epi32(w4, w5);
  w7 = _mm256_unpackhi_epi32(w4, w5);
  w14 = _mm256_unpacklo_epi32(w12, w13);
  w15 = _mm256_unpackhi_epi32(w12, w13);

  // Store second 4-line result
  d[12] = _mm256_unpacklo_epi64(w6, w14);
  d[13] = _mm256_unpackhi_epi64(w6, w14);
  d[14] = _mm256_unpacklo_epi64(w7, w15);
  d[15] = _mm256_unpackhi_epi64(w7, w15);
}

static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  __m128i w10, w11, w12, w13, w14, w15;

  w0 = _mm_unpacklo_epi8(x[0], x[1]);
  w1 = _mm_unpacklo_epi8(x[2], x[3]);
  w2 = _mm_unpacklo_epi8(x[4], x[5]);
  w3 = _mm_unpacklo_epi8(x[6], x[7]);

  w8 = _mm_unpacklo_epi8(x[8], x[9]);
  w9 = _mm_unpacklo_epi8(x[10], x[11]);
  w10 = _mm_unpacklo_epi8(x[12], x[13]);
  w11 = _mm_unpacklo_epi8(x[14], x[15]);

  w4 = _mm_unpacklo_epi16(w0, w1);
  w5 = _mm_unpacklo_epi16(w2, w3);
  w12 = _mm_unpacklo_epi16(w8, w9);
  w13 = _mm_unpacklo_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store first 4-line result
  d[0] = _mm_unpacklo_epi64(w6, w14);
  d[1] = _mm_unpackhi_epi64(w6, w14);
  d[2] = _mm_unpacklo_epi64(w7, w15);
  d[3] = _mm_unpackhi_epi64(w7, w15);

  w4 = _mm_unpackhi_epi16(w0, w1);
  w5 = _mm_unpackhi_epi16(w2, w3);
  w12 = _mm_unpackhi_epi16(w8, w9);
  w13 = _mm_unpackhi_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store second 4-line result
  d[4] = _mm_unpacklo_epi64(w6, w14);
  d[5] = _mm_unpackhi_epi64(w6, w14);
  d[6] = _mm_unpacklo_epi64(w7, w15);
  d[7] = _mm_unpackhi_epi64(w7, w15);

  // upper half
  w0 = _mm_unpackhi_epi8(x[0], x[1]);
  w1 = _mm_unpackhi_epi8(x[2], x[3]);
  w2 = _mm_unpackhi_epi8(x[4], x[5]);
  w3 = _mm_unpackhi_epi8(x[6], x[7]);

  w8 = _mm_unpackhi_epi8(x[8], x[9]);
  w9 = _mm_unpackhi_epi8(x[10], x[11]);
  w10 = _mm_unpackhi_epi8(x[12], x[13]);
  w11 = _mm_unpackhi_epi8(x[14], x[15]);

  w4 = _mm_unpacklo_epi16(w0, w1);
  w5 = _mm_unpacklo_epi16(w2, w3);
  w12 = _mm_unpacklo_epi16(w8, w9);
  w13 = _mm_unpacklo_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store first 4-line result
  d[8] = _mm_unpacklo_epi64(w6, w14);
  d[9] = _mm_unpackhi_epi64(w6, w14);
  d[10] = _mm_unpacklo_epi64(w7, w15);
  d[11] = _mm_unpackhi_epi64(w7, w15);

  w4 = _mm_unpackhi_epi16(w0, w1);
  w5 = _mm_unpackhi_epi16(w2, w3);
  w12 = _mm_unpackhi_epi16(w8, w9);
  w13 = _mm_unpackhi_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store second 4-line result
  d[12] = _mm_unpacklo_epi64(w6, w14);
  d[13] = _mm_unpackhi_epi64(w6, w14);
  d[14] = _mm_unpacklo_epi64(w7, w15);
  d[15] = _mm_unpackhi_epi64(w7, w15);
}

static void transpose_TX_8X8(const uint8_t *src, ptrdiff_t pitchSrc,
                             uint8_t *dst, ptrdiff_t pitchDst) {
  __m128i r0, r1, r2, r3, r4, r5, r6, r7;
  __m128i d0d1, d2d3, d4d5, d6d7;
  r0 = _mm_loadl_epi64((__m128i *)(src + 0 * pitchSrc));
  r1 = _mm_loadl_epi64((__m128i *)(src + 1 * pitchSrc));
  r2 = _mm_loadl_epi64((__m128i *)(src + 2 * pitchSrc));
  r3 = _mm_loadl_epi64((__m128i *)(src + 3 * pitchSrc));
  r4 = _mm_loadl_epi64((__m128i *)(src + 4 * pitchSrc));
  r5 = _mm_loadl_epi64((__m128i *)(src + 5 * pitchSrc));
  r6 = _mm_loadl_epi64((__m128i *)(src + 6 * pitchSrc));
  r7 = _mm_loadl_epi64((__m128i *)(src + 7 * pitchSrc));

  transpose8x8_sse2(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &d0d1, &d2d3,
                    &d4d5, &d6d7);

  _mm_storel_epi64((__m128i *)(dst + 0 * pitchDst), d0d1);
  _mm_storel_epi64((__m128i *)(dst + 1 * pitchDst), _mm_srli_si128(d0d1, 8));
  _mm_storel_epi64((__m128i *)(dst + 2 * pitchDst), d2d3);
  _mm_storel_epi64((__m128i *)(dst + 3 * pitchDst), _mm_srli_si128(d2d3, 8));
  _mm_storel_epi64((__m128i *)(dst + 4 * pitchDst), d4d5);
  _mm_storel_epi64((__m128i *)(dst + 5 * pitchDst), _mm_srli_si128(d4d5, 8));
  _mm_storel_epi64((__m128i *)(dst + 6 * pitchDst), d6d7);
  _mm_storel_epi64((__m128i *)(dst + 7 * pitchDst), _mm_srli_si128(d6d7, 8));
}

static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
                      ptrdiff_t pitchDst, int width, int height) {
  for (int j = 0; j < height; j += 8)
    for (int i = 0; i < width; i += 8)
      transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc,
                       dst + j * pitchDst + i, pitchDst);
}

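// Illustration (not library code): transpose() above walks the source in 8x8
// tiles; the tile whose top-left sample is src[i * pitchSrc + j] is
// transposed in registers by transpose_TX_8X8() and lands with its top-left
// sample at dst[j * pitchDst + i].
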
static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[4], d[4];

  dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                            &d[0], &d[1], &d[2], &d[3]);

  *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
  *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
  *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
  *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
  return;
}

static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy);
  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                    &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
                    &d[1], &d[2], &d[3]);

  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
}

static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[4], d[8];

  dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
  for (int i = 0; i < 8; i++) {
    *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[8], d[4];

  dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy);
  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
                        &d[1], &d[2], &d[3]);
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
}

static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, dy);
  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
  for (int i = 0; i < 8; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
                     _mm_srli_si128(d[i], 8));
  }
}

static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, dy);
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10],
      &dstvec[11], &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0],
      &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[4], d[16];

  dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, dy);
  transpose4x16_sse2(dstvec, d);
  for (int i = 0; i < 16; i++) {
    *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[16], d[8];

  dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, dy);
  for (int i = 4; i < 8; i++) {
    d[i] = _mm_setzero_si128();
  }
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10],
      &dstvec[11], &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0],
      &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 4; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m256i dstvec[16], d[16];

  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
  for (int i = 8; i < 16; i++) {
    dstvec[i] = _mm256_setzero_si256();
  }
  transpose16x32_avx2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride),
                     _mm256_castsi256_si128(d[i]));
  }
  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
                     _mm256_extracti128_si256(d[i], 1));
  }
}

static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, dy);

  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10],
      &dstvec[11], &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0],
      &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
  transpose16x8_8x16_sse2(
      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
      &d[6 + 8], &d[7 + 8]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
  }
}

static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left,
                                        int upsample_left, int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, dy);
  transpose16x16_sse2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left,
                                        int upsample_left, int dy) {
  __m256i dstvec[32], d[32];

  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
  transpose16x32_avx2(dstvec, d);
  transpose16x32_avx2(dstvec + 16, d + 16);
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride),
                     _mm256_castsi256_si128(d[j]));
    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
                     _mm256_castsi256_si128(d[j + 16]));
  }
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
                     _mm256_extracti128_si256(d[j], 1));
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
                     _mm256_extracti128_si256(d[j + 16], 1));
  }
}

static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left,
                                        int upsample_left, int dy) {
  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 64, 64);
}

static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left,
                                        int upsample_left, int dy) {
  __m256i dstvec[16], d[16];

  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
  transpose16x32_avx2(dstvec, d);
  // store
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride),
                     _mm256_castsi256_si128(d[j]));
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
                     _mm256_extracti128_si256(d[j], 1));
  }
}

static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left,
                                        int upsample_left, int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 32; i += 16) {
    transpose16x16_sse2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}

static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left,
                                        int upsample_left, int dy) {
  uint8_t dstT[64 * 32];
  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 32, 64);
}

static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left,
                                        int upsample_left, int dy) {
  uint8_t dstT[32 * 64];
  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
  transpose(dstT, 32, dst, stride, 64, 32);
  return;
}

static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left,
                                        int upsample_left, int dy) {
  uint8_t dstT[64 * 16];
  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 16, 64);
}

static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left,
                                        int upsample_left, int dy) {
  __m128i dstvec[64], d[16];

  dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 64; i += 16) {
    transpose16x16_sse2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}

// Directional prediction, zone 3: 180 < angle < 270
void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_left, int dx, int dy) {
  (void)above;
  (void)dx;
  assert(dx == 1);
  assert(dy > 0);

  if (bw == bh) {
    switch (bw) {
      case 4:
        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 8:
        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 16:
        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 32:
        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 64:
        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
        break;
    }
  } else {
    if (bw < bh) {
      if (bw + bw == bh) {
        switch (bw) {
          case 4:
            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      } else {
        switch (bw) {
          case 4:
            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      }
    } else {
      if (bh + bh == bw) {
        switch (bh) {
          case 4:
            dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      } else {
        switch (bh) {
          case 4:
            dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      }
    }
  }
}