/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <tmmintrin.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

// -----------------------------------------------------------------------------
/*
; ------------------------------------------
; input: x, y, z
; output: result
;
; trick from Pascal:
; (x + 2y + z + 2) >> 2 can be calculated as:
; result = avg(x, z)
; result -= xor(x, z) & 1
; result = avg(result, y)
; ------------------------------------------
*/
static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
                                 const __m128i *z) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a = _mm_avg_epu16(*x, *z);
  const __m128i b =
      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
  return _mm_avg_epu16(b, *y);
}
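// For reference, a scalar model of what avg3_epu16() computes in each 16-bit
// lane (an illustrative sketch, not part of the library):
//
//   static uint16_t avg3(uint16_t x, uint16_t y, uint16_t z) {
//     return (uint16_t)((x + 2 * y + z + 2) >> 2);
//   }
//
// _mm_avg_epu16() rounds up, i.e. computes (x + z + 1) >> 1, so the
// xor/and/subs sequence converts it to the truncating (x + z) >> 1 before
// the second rounding average against y.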
void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
  (void)left;
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, avg3);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
  dst[3] = above[7];  // aka H
}

static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
                               __m128i *row, const __m128i *ar) {
  *row = _mm_alignr_epi8(*ar, *row, 2);
  _mm_store_si128((__m128i *)*dst, *row);
  *dst += stride;
}

void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3);
  dst += stride;
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
}

static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
                                __m128i *row_0, __m128i *row_1,
                                const __m128i *ar) {
  *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
  *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
  _mm_store_si128((__m128i *)*dst, *row_0);
  _mm_store_si128((__m128i *)(*dst + 8), *row_1);
  *dst += stride;
}

void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  dst += stride;
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
}

void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  int i;
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
  dst += stride;
  for (i = 1; i < 32; ++i) {
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
    dst += stride;
  }
}
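// pshufb control mask that rotates the eight 16-bit lanes of a vector right
// by one lane: result lane i = source lane (i + 1) & 7. Illustrative use
// (v0..v7 denote the lanes of v):
//
//   v = _mm_shuffle_epi8(v, rotrw);  // v0 v1 ... v7  ->  v1 v2 ... v7 v0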
DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = {
  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1
};

static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
  *a = _mm_shuffle_epi8(*a, *rotrw);
  return *a;
}

void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i IXABCDEF =
      _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
  __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
  __m128i rowa = avg2;
  __m128i rowb = avg3;
  int i;
  (void)bd;
  for (i = 0; i < 8; i += 2) {
    _mm_store_si128((__m128i *)dst, rowa);
    dst += stride;
    _mm_store_si128((__m128i *)dst, rowb);
    dst += stride;
    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
    rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
  }
}

void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
  const __m128i L1_ = _mm_srli_si128(L1, 2);
  __m128i rowa_0 = avg2_0;
  __m128i rowa_1 = avg2_1;
  __m128i rowb_0 = avg3_0;
  __m128i rowb_1 = avg3_1;
  __m128i avg3_left[2];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
  for (i = 0; i < 2; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; j += 2) {
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      dst += stride;
      _mm_store_si128((__m128i *)dst, rowb_0);
      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
      dst += stride;
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
    }
  }
}
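// In the d117 predictors, rowa holds the 2-tap averages of the top edge
// (even output rows) and rowb the 3-tap averages (odd output rows). Every
// two rows both vectors shift one pixel to the right, and the vacated
// leftmost lane is refilled from the 3-tap averages of the left column,
// consumed one lane at a time via rotr_epu16(). The 32x32 variant below is
// the same recurrence widened to four vectors per output row.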
void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
  const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
  const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
  const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
  const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
  const __m128i L3_ = _mm_srli_si128(L3, 2);
  __m128i rowa_0 = avg2_0;
  __m128i rowa_1 = avg2_1;
  __m128i rowa_2 = avg2_2;
  __m128i rowa_3 = avg2_3;
  __m128i rowb_0 = avg3_0;
  __m128i rowb_1 = avg3_1;
  __m128i rowb_2 = avg3_2;
  __m128i rowb_3 = avg3_3;
  __m128i avg3_left[4];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
  avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
  avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
  for (i = 0; i < 4; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; j += 2) {
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
      dst += stride;
      _mm_store_si128((__m128i *)dst, rowb_0);
      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
      _mm_store_si128((__m128i *)(dst + 16), rowb_2);
      _mm_store_si128((__m128i *)(dst + 24), rowb_3);
      dst += stride;
      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
      rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
    }
  }
}
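// The d135 predictors reuse the same machinery for the down-right diagonal:
// the seed row is the 3-tap average of the top edge (including the top-left
// pixel X), and every output row is the previous row shifted one pixel to
// the right, with the new leftmost pixel drawn from the 3-tap averages of
// the left column.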
void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i AXIJKLMN =
      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
  const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
  __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
  __m128i rowa = avg3;
  int i;
  (void)bd;
  for (i = 0; i < 8; ++i) {
    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
    _mm_store_si128((__m128i *)dst, rowa);
    dst += stride;
  }
}

void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i B0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
  const __m128i C1 = _mm_srli_si128(B1, 2);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
  __m128i rowa_0 = avg3_0;
  __m128i rowa_1 = avg3_1;
  __m128i avg3_left[2];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
  for (i = 0; i < 2; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; ++j) {
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      dst += stride;
    }
  }
}
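// Note the operand roles in the 16x16/32x32 d135 variants: the unaligned A
// loads start at above[-1] (so lane 0 is the top-left pixel X) while the
// aligned B loads start at above[0]; C is B advanced by one more pixel.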
void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i B0 = _mm_load_si128((const __m128i *)above);
  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
  const __m128i C3 = _mm_srli_si128(B3, 2);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
  __m128i rowa_0 = avg3_0;
  __m128i rowa_1 = avg3_1;
  __m128i rowa_2 = avg3_2;
  __m128i rowa_3 = avg3_3;
  __m128i avg3_left[4];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
  for (i = 0; i < 4; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; ++j) {
      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
      dst += stride;
    }
  }
}
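// d153 advances two pixels per row rather than one: the two leftmost output
// columns of each row come from interleaved 2-tap/3-tap averages of the left
// column (the avg2/avg3 pairs produced by unpacklo/unpackhi below), and the
// rest of the row is the previous row shifted right by two pixels.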
void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
  const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
  const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i AXIJKLMN =
      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
  const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
  const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
  const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
  const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
  const __m128i row0 =
      _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
  const __m128i row1 =
      _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
  const __m128i row2 =
      _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
  const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
  const __m128i row4 =
      _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
  const __m128i row5 =
      _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
  const __m128i row6 =
      _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
  const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
  (void)bd;
  _mm_store_si128((__m128i *)dst, row0);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row1);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row2);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row3);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row4);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row5);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row6);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row7);
}

void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_srli_si128(A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_srli_si128(A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i avg2_avg3_left[2][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);

  for (j = 0; j < 2; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
    }
  }
}
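// Each __m128i in avg2_avg3_left[][] holds four (2-tap, 3-tap) pairs and
// therefore feeds four output rows; entry [j][i] covers rows 8*j + 4*i
// through 8*j + 4*i + 3. The 32x32 version below extends the same table to
// four groups of left pixels and ripples four row vectors instead of two.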
void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_srli_si128(A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_srli_si128(A3, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
  const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
  const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
  const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i row_2 = avg3_2;
  __m128i row_3 = avg3_3;
  __m128i avg2_avg3_left[4][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
  avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
  avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
  avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);

  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
    }
  }
}
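// The d207 predictors use only the left column. Even output columns hold
// 2-tap averages of adjacent left pixels and odd columns 3-tap averages
// (hence the unpacklo/unpackhi interleaves below); each row is the previous
// row shifted left by two pixels, advancing two pixels down the left column.
// Once the averages are exhausted, the replicated last left pixel
// (HHHHHHHH / LR) fills the remainder.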
static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                  const __m128i *a, const __m128i *b) {
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  *dst += stride;
}

void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
  const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3);
  const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3);
  (void)above;
  (void)bd;
  d207_store_4x8(&dst, stride, &out_a, &out_b);
  d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH);
}

static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride,
                                   const __m128i *a, const __m128i *b,
                                   const __m128i *c) {
  _mm_store_si128((__m128i *)*dst, *a);
  _mm_store_si128((__m128i *)(*dst + 8), *b);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
  *dst += stride;
}
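// d207_store_4x16() emits four 16-wide rows from a sliding three-vector
// window (a, b, c), advancing the window by two pixels per row.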
void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  (void)above;
  (void)bd;
  d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
  d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
  d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
  d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
}

static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride,
                                   const __m128i *a, const __m128i *b,
                                   const __m128i *c, const __m128i *d,
                                   const __m128i *e) {
  _mm_store_si128((__m128i *)*dst, *a);
  _mm_store_si128((__m128i *)(*dst + 8), *b);
  _mm_store_si128((__m128i *)(*dst + 16), *c);
  _mm_store_si128((__m128i *)(*dst + 24), *d);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12));
  *dst += stride;
}
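// In the 32x32 case the trailing d207_store_4x32() calls pass &LR for the
// vectors that have run past the end of the interleaved averages, so the
// bottom-right region degenerates to the replicated last left pixel.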
void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(LR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(LR, A3, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2);
  const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2);
  const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3);
  const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3);
  (void)above;
  (void)bd;
  d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e);
  d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f);
  d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g);
  d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h);
  d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR);
  d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR);
  d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR);
  d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR);
}

static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                 __m128i *a, __m128i *b, const __m128i *ar) {
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, *b);
  *dst += stride;
  *a = _mm_alignr_epi8(*ar, *a, 2);
  *b = _mm_alignr_epi8(*ar, *b, 2);
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, *b);
  *dst += stride;
  *a = _mm_alignr_epi8(*ar, *a, 2);
  *b = _mm_alignr_epi8(*ar, *b, 2);
}

void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
  (void)left;
  (void)bd;
  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
}
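// A scalar model of what the d63 predictors compute (an illustrative
// sketch; at the right edge the last 'above' sample is replicated):
//
//   even rows:  dst[r][c] = avg2(above[c + r/2], above[c + r/2 + 1])
//   odd rows:   dst[r][c] = avg3(above[c + r/2], above[c + r/2 + 1],
//                                above[c + r/2 + 2])
//
// The vector code keeps one avg2 row and one avg3 row live and shifts both
// left by one pixel every two rows, feeding AR (the replicated last above
// pixel) into the vacated right lane.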
void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  int i;
  (void)left;
  (void)bd;
  for (i = 0; i < 14; i += 2) {
    _mm_store_si128((__m128i *)dst, avg2_0);
    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
    dst += stride;
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    dst += stride;
    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
    avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2);
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2);
  }
  _mm_store_si128((__m128i *)dst, avg2_0);
  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
  dst += stride;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
}

void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  int i;
  (void)left;
  (void)bd;
  for (i = 0; i < 30; i += 2) {
    _mm_store_si128((__m128i *)dst, avg2_0);
    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
    _mm_store_si128((__m128i *)(dst + 16), avg2_2);
    _mm_store_si128((__m128i *)(dst + 24), avg2_3);
    dst += stride;
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
    dst += stride;
    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
    avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2);
    avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2);
    avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2);
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
  }
  _mm_store_si128((__m128i *)dst, avg2_0);
  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
  _mm_store_si128((__m128i *)(dst + 16), avg2_2);
  _mm_store_si128((__m128i *)(dst + 24), avg2_3);
  dst += stride;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
}
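// These functions are intended to be reached through the vpx_dsp RTCD
// dispatch table (see vpx_dsp_rtcd.h above) rather than called directly.
// A minimal direct-call sketch, assuming buffers laid out the way the
// aligned loads/stores above require (16-byte-aligned pointers and a row
// stride that is a multiple of 8 uint16_t; d117/d135/d153 additionally
// read above[-1]):
//
//   DECLARE_ALIGNED(16, uint16_t, above[32]);  // top edge pixels
//   DECLARE_ALIGNED(16, uint16_t, left[32]);   // left edge pixels
//   DECLARE_ALIGNED(16, uint16_t, pred[32 * 32]);
//   vpx_highbd_d63_predictor_32x32_ssse3(pred, 32, above, left, /*bd=*/10);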