/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"

void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, above);
  int i;
  (void)left;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(d, 0, dst);
  }
}

void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, above);
  const uint8x16_t d1 = vec_vsx_ld(16, above);
  int i;
  (void)left;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(d0, 0, dst);
    vec_vsx_st(d1, 16, dst);
  }
}

static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };

void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  (void)above;

  vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
}

void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  (void)above;

  vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst);
}

void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

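  // v8..v15 below replicate left-edge pixels 8..15 across whole vectors and
  // provide the lower eight rows of the 16x16 block.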
  const uint8x16_t v8 = vec_splat(d, 8);
  const uint8x16_t v9 = vec_splat(d, 9);
  const uint8x16_t v10 = vec_splat(d, 10);
  const uint8x16_t v11 = vec_splat(d, 11);

  const uint8x16_t v12 = vec_splat(d, 12);
  const uint8x16_t v13 = vec_splat(d, 13);
  const uint8x16_t v14 = vec_splat(d, 14);
  const uint8x16_t v15 = vec_splat(d, 15);

  (void)above;

  vec_vsx_st(v0, 0, dst);
  dst += stride;
  vec_vsx_st(v1, 0, dst);
  dst += stride;
  vec_vsx_st(v2, 0, dst);
  dst += stride;
  vec_vsx_st(v3, 0, dst);
  dst += stride;
  vec_vsx_st(v4, 0, dst);
  dst += stride;
  vec_vsx_st(v5, 0, dst);
  dst += stride;
  vec_vsx_st(v6, 0, dst);
  dst += stride;
  vec_vsx_st(v7, 0, dst);
  dst += stride;
  vec_vsx_st(v8, 0, dst);
  dst += stride;
  vec_vsx_st(v9, 0, dst);
  dst += stride;
  vec_vsx_st(v10, 0, dst);
  dst += stride;
  vec_vsx_st(v11, 0, dst);
  dst += stride;
  vec_vsx_st(v12, 0, dst);
  dst += stride;
  vec_vsx_st(v13, 0, dst);
  dst += stride;
  vec_vsx_st(v14, 0, dst);
  dst += stride;
  vec_vsx_st(v15, 0, dst);
}

#define H_PREDICTOR_32(v) \
  vec_vsx_st(v, 0, dst);  \
  vec_vsx_st(v, 16, dst); \
  dst += stride

void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, left);
  const uint8x16_t d1 = vec_vsx_ld(16, left);

  const uint8x16_t v0_0 = vec_splat(d0, 0);
  const uint8x16_t v1_0 = vec_splat(d0, 1);
  const uint8x16_t v2_0 = vec_splat(d0, 2);
  const uint8x16_t v3_0 = vec_splat(d0, 3);
  const uint8x16_t v4_0 = vec_splat(d0, 4);
  const uint8x16_t v5_0 = vec_splat(d0, 5);
  const uint8x16_t v6_0 = vec_splat(d0, 6);
  const uint8x16_t v7_0 = vec_splat(d0, 7);
  const uint8x16_t v8_0 = vec_splat(d0, 8);
  const uint8x16_t v9_0 = vec_splat(d0, 9);
  const uint8x16_t v10_0 = vec_splat(d0, 10);
  const uint8x16_t v11_0 = vec_splat(d0, 11);
  const uint8x16_t v12_0 = vec_splat(d0, 12);
  const uint8x16_t v13_0 = vec_splat(d0, 13);
  const uint8x16_t v14_0 = vec_splat(d0, 14);
  const uint8x16_t v15_0 = vec_splat(d0, 15);

  const uint8x16_t v0_1 = vec_splat(d1, 0);
  const uint8x16_t v1_1 = vec_splat(d1, 1);
  const uint8x16_t v2_1 = vec_splat(d1, 2);
  const uint8x16_t v3_1 = vec_splat(d1, 3);
  const uint8x16_t v4_1 = vec_splat(d1, 4);
  const uint8x16_t v5_1 = vec_splat(d1, 5);
  const uint8x16_t v6_1 = vec_splat(d1, 6);
  const uint8x16_t v7_1 = vec_splat(d1, 7);
  const uint8x16_t v8_1 = vec_splat(d1, 8);
  const uint8x16_t v9_1 = vec_splat(d1, 9);
  const uint8x16_t v10_1 = vec_splat(d1, 10);
  const uint8x16_t v11_1 = vec_splat(d1, 11);
  const uint8x16_t v12_1 = vec_splat(d1, 12);
  const uint8x16_t v13_1 = vec_splat(d1, 13);
  const uint8x16_t v14_1 = vec_splat(d1, 14);
  const uint8x16_t v15_1 = vec_splat(d1, 15);

  (void)above;

  H_PREDICTOR_32(v0_0);
  H_PREDICTOR_32(v1_0);
  H_PREDICTOR_32(v2_0);
  H_PREDICTOR_32(v3_0);

  H_PREDICTOR_32(v4_0);
  H_PREDICTOR_32(v5_0);
  H_PREDICTOR_32(v6_0);
  H_PREDICTOR_32(v7_0);

  H_PREDICTOR_32(v8_0);
  H_PREDICTOR_32(v9_0);
  H_PREDICTOR_32(v10_0);
  H_PREDICTOR_32(v11_0);

  H_PREDICTOR_32(v12_0);
  H_PREDICTOR_32(v13_0);
  H_PREDICTOR_32(v14_0);
  H_PREDICTOR_32(v15_0);

  H_PREDICTOR_32(v0_1);
  H_PREDICTOR_32(v1_1);
  H_PREDICTOR_32(v2_1);
  H_PREDICTOR_32(v3_1);

  H_PREDICTOR_32(v4_1);
  H_PREDICTOR_32(v5_1);
  H_PREDICTOR_32(v6_1);
  H_PREDICTOR_32(v7_1);

  H_PREDICTOR_32(v8_1);
  H_PREDICTOR_32(v9_1);
  H_PREDICTOR_32(v10_1);
  H_PREDICTOR_32(v11_1);

  H_PREDICTOR_32(v12_1);
  H_PREDICTOR_32(v13_1);
  H_PREDICTOR_32(v14_1);
  H_PREDICTOR_32(v15_1);
}

void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;
  uint8x16_t d;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
}

void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
}

static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
                              int16x8_t ah, int16x8_t al, int16x8_t tl) {
  int16x8_t vh, vl, ls;

  ls = vec_splat(l, 0);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 1);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 2);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 3);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 4);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 5);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 6);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 7);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
}

void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l = vec_vsx_ld(0, left);
  const int16x8_t lh = unpack_to_s16_h(l);
  const int16x8_t ll = unpack_to_s16_l(l);
  const uint8x16_t a = vec_vsx_ld(0, above);
  const int16x8_t ah = unpack_to_s16_h(a);
  const int16x8_t al = unpack_to_s16_l(a);

  tm_predictor_16x8(dst, stride, lh, ah, al, tl);

  dst += stride * 8;

  tm_predictor_16x8(dst, stride, ll, ah, al, tl);
}

static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
                                     const int16x8_t a0h, const int16x8_t a0l,
                                     const int16x8_t a1h, const int16x8_t a1l,
                                     const int16x8_t tl) {
  int16x8_t vh, vl;

  vh = vec_sub(vec_add(ls, a0h), tl);
  vl = vec_sub(vec_add(ls, a0l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  vh = vec_sub(vec_add(ls, a1h), tl);
  vl = vec_sub(vec_add(ls, a1l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 16, dst);
}

static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
                              const int16x8_t l, const uint8x16_t a0,
                              const uint8x16_t a1, const int16x8_t tl) {
  const int16x8_t a0h = unpack_to_s16_h(a0);
  const int16x8_t a0l = unpack_to_s16_l(a0);
  const int16x8_t a1h = unpack_to_s16_h(a1);
  const int16x8_t a1l = unpack_to_s16_l(a1);

  tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
}

void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
}

static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
                                         const uint8x16_t val) {
  int i;

  for (i = 0; i < 8; i++, dst += stride) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
  }
}

static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
  }
}

void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_16x16(dst, stride, v128);
}

static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
    vec_vsx_st(val, 16, dst);
  }
}

void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_32x32(dst, stride, v128);
}

static uint8x16_t avg16(const uint8_t *values) {
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_16x16(dst, stride, avg16(left));
}

void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_16x16(dst, stride, avg16(above));
}

static uint8x16_t avg32(const uint8_t *values) {
  const uint8x16_t v0 = vec_vsx_ld(0, values);
  const uint8x16_t v1 = vec_vsx_ld(16, values);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_32x32(dst, stride, avg32(left));
}

void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
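  // The "top" DC variant fills the block with the rounded average of the 32
  // pixels above; the left column is unused.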
  (void)left;

  dc_fill_predictor_32x32(dst, stride, avg32(above));
}

static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
}

void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
}

static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
  const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
  const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
}

static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
                       const uint8x16_t c) {
  const uint8x16_t ac =
      vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));

  return vec_avg(ac, b);
}

// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
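// Permuting two vectors with sl1 selects bytes 1..16 of their concatenation,
// i.e. shifts the pair left by one byte.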
static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
                                0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };

void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 7);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(a, 15);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row, 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 15);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0 = avg3(a0, b0, c0);
  uint8x16_t row1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 32; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 16, dst);
    dst += stride;
    row0 = vec_perm(row0, row1, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 9);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a, b);
  uint8x16_t row1 = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 4; i++) {
    const uint8x16_t d0 = vec_vsx_ld(0, dst);
    const uint8x16_t d1 = vec_vsx_ld(0, dst + stride);
    vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst);
    vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 0);
  const uint8x16_t b = vec_perm(a0, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a0, b);
  uint8x16_t row1 = avg3(a0, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t a2 = vec_vsx_ld(32, above);
  const uint8x16_t above_right = vec_splat(a2, 0);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0_0 = vec_avg(a0, b0);
  uint8x16_t row0_1 = vec_avg(a1, b1);
  uint8x16_t row1_0 = avg3(a0, b0, c0);
  uint8x16_t row1_1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row0_0, 0, dst);
    vec_vsx_st(row0_1, 16, dst);
    vec_vsx_st(row1_0, 0, dst + stride);
    vec_vsx_st(row1_1, 16, dst + stride);
    dst += stride * 2;
    row0_0 = vec_perm(row0_0, row0_1, sl1);
    row0_1 = vec_perm(row0_1, above_right, sl1);
    row1_0 = vec_perm(row1_0, row1_1, sl1);
    row1_1 = vec_perm(row1_1, above_right, sl1);
  }
}