/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"

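// Vertical prediction: every row of the block is a copy of the pixels
// immediately above it, so the vector(s) loaded from `above` are simply
// stored down the block.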
void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, above);
  int i;
  (void)left;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(d, 0, dst);
  }
}

void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, above);
  const uint8x16_t d1 = vec_vsx_ld(16, above);
  int i;
  (void)left;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(d0, 0, dst);
    vec_vsx_st(d1, 16, dst);
  }
}

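// Horizontal prediction: row i of the block is filled with left[i]. For the
// 4x4 block only the first 4 bytes of each 16-byte store may change, so
// vec_sel with mask4 keeps the remaining 12 bytes equal to what is already
// in dst.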
static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };

void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  (void)above;

  vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
}

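// For 8-wide rows, xxpermdi combines the eight predicted bytes with the
// other half of the vector reloaded from dst, so the full 16-byte store
// only changes the 8-pixel row.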
void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  (void)above;

  vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst);
}

void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  const uint8x16_t v8 = vec_splat(d, 8);
  const uint8x16_t v9 = vec_splat(d, 9);
  const uint8x16_t v10 = vec_splat(d, 10);
  const uint8x16_t v11 = vec_splat(d, 11);

  const uint8x16_t v12 = vec_splat(d, 12);
  const uint8x16_t v13 = vec_splat(d, 13);
  const uint8x16_t v14 = vec_splat(d, 14);
  const uint8x16_t v15 = vec_splat(d, 15);

  (void)above;

  vec_vsx_st(v0, 0, dst);
  dst += stride;
  vec_vsx_st(v1, 0, dst);
  dst += stride;
  vec_vsx_st(v2, 0, dst);
  dst += stride;
  vec_vsx_st(v3, 0, dst);
  dst += stride;
  vec_vsx_st(v4, 0, dst);
  dst += stride;
  vec_vsx_st(v5, 0, dst);
  dst += stride;
  vec_vsx_st(v6, 0, dst);
  dst += stride;
  vec_vsx_st(v7, 0, dst);
  dst += stride;
  vec_vsx_st(v8, 0, dst);
  dst += stride;
  vec_vsx_st(v9, 0, dst);
  dst += stride;
  vec_vsx_st(v10, 0, dst);
  dst += stride;
  vec_vsx_st(v11, 0, dst);
  dst += stride;
  vec_vsx_st(v12, 0, dst);
  dst += stride;
  vec_vsx_st(v13, 0, dst);
  dst += stride;
  vec_vsx_st(v14, 0, dst);
  dst += stride;
  vec_vsx_st(v15, 0, dst);
}

#define H_PREDICTOR_32(v) \
  vec_vsx_st(v, 0, dst);  \
  vec_vsx_st(v, 16, dst); \
  dst += stride

void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, left);
  const uint8x16_t d1 = vec_vsx_ld(16, left);

  const uint8x16_t v0_0 = vec_splat(d0, 0);
  const uint8x16_t v1_0 = vec_splat(d0, 1);
  const uint8x16_t v2_0 = vec_splat(d0, 2);
  const uint8x16_t v3_0 = vec_splat(d0, 3);
  const uint8x16_t v4_0 = vec_splat(d0, 4);
  const uint8x16_t v5_0 = vec_splat(d0, 5);
  const uint8x16_t v6_0 = vec_splat(d0, 6);
  const uint8x16_t v7_0 = vec_splat(d0, 7);
  const uint8x16_t v8_0 = vec_splat(d0, 8);
  const uint8x16_t v9_0 = vec_splat(d0, 9);
  const uint8x16_t v10_0 = vec_splat(d0, 10);
  const uint8x16_t v11_0 = vec_splat(d0, 11);
  const uint8x16_t v12_0 = vec_splat(d0, 12);
  const uint8x16_t v13_0 = vec_splat(d0, 13);
  const uint8x16_t v14_0 = vec_splat(d0, 14);
  const uint8x16_t v15_0 = vec_splat(d0, 15);

  const uint8x16_t v0_1 = vec_splat(d1, 0);
  const uint8x16_t v1_1 = vec_splat(d1, 1);
  const uint8x16_t v2_1 = vec_splat(d1, 2);
  const uint8x16_t v3_1 = vec_splat(d1, 3);
  const uint8x16_t v4_1 = vec_splat(d1, 4);
  const uint8x16_t v5_1 = vec_splat(d1, 5);
  const uint8x16_t v6_1 = vec_splat(d1, 6);
  const uint8x16_t v7_1 = vec_splat(d1, 7);
  const uint8x16_t v8_1 = vec_splat(d1, 8);
  const uint8x16_t v9_1 = vec_splat(d1, 9);
  const uint8x16_t v10_1 = vec_splat(d1, 10);
  const uint8x16_t v11_1 = vec_splat(d1, 11);
  const uint8x16_t v12_1 = vec_splat(d1, 12);
  const uint8x16_t v13_1 = vec_splat(d1, 13);
  const uint8x16_t v14_1 = vec_splat(d1, 14);
  const uint8x16_t v15_1 = vec_splat(d1, 15);

  (void)above;

  H_PREDICTOR_32(v0_0);
  H_PREDICTOR_32(v1_0);
  H_PREDICTOR_32(v2_0);
  H_PREDICTOR_32(v3_0);

  H_PREDICTOR_32(v4_0);
  H_PREDICTOR_32(v5_0);
  H_PREDICTOR_32(v6_0);
  H_PREDICTOR_32(v7_0);

  H_PREDICTOR_32(v8_0);
  H_PREDICTOR_32(v9_0);
  H_PREDICTOR_32(v10_0);
  H_PREDICTOR_32(v11_0);

  H_PREDICTOR_32(v12_0);
  H_PREDICTOR_32(v13_0);
  H_PREDICTOR_32(v14_0);
  H_PREDICTOR_32(v15_0);

  H_PREDICTOR_32(v0_1);
  H_PREDICTOR_32(v1_1);
  H_PREDICTOR_32(v2_1);
  H_PREDICTOR_32(v3_1);

  H_PREDICTOR_32(v4_1);
  H_PREDICTOR_32(v5_1);
  H_PREDICTOR_32(v6_1);
  H_PREDICTOR_32(v7_1);

  H_PREDICTOR_32(v8_1);
  H_PREDICTOR_32(v9_1);
  H_PREDICTOR_32(v10_1);
  H_PREDICTOR_32(v11_1);

  H_PREDICTOR_32(v12_1);
  H_PREDICTOR_32(v13_1);
  H_PREDICTOR_32(v14_1);
  H_PREDICTOR_32(v15_1);
}

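// TrueMotion (TM) prediction: each pixel is predicted as
// left[row] + above[col] - above[-1], computed in 16 bits and clamped to
// [0, 255] by the saturating pack (vec_packsu). The 4x4 variant again uses
// mask4 so only the first 4 bytes of each row are modified.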
void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;
  uint8x16_t d;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
}

void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
}

static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
                              int16x8_t ah, int16x8_t al, int16x8_t tl) {
  int16x8_t vh, vl, ls;

  ls = vec_splat(l, 0);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 1);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 2);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 3);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 4);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 5);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 6);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 7);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
}

void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l = vec_vsx_ld(0, left);
  const int16x8_t lh = unpack_to_s16_h(l);
  const int16x8_t ll = unpack_to_s16_l(l);
  const uint8x16_t a = vec_vsx_ld(0, above);
  const int16x8_t ah = unpack_to_s16_h(a);
  const int16x8_t al = unpack_to_s16_l(a);

  tm_predictor_16x8(dst, stride, lh, ah, al, tl);

  dst += stride * 8;

  tm_predictor_16x8(dst, stride, ll, ah, al, tl);
}

static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
                                     const int16x8_t a0h, const int16x8_t a0l,
                                     const int16x8_t a1h, const int16x8_t a1l,
                                     const int16x8_t tl) {
  int16x8_t vh, vl;

  vh = vec_sub(vec_add(ls, a0h), tl);
  vl = vec_sub(vec_add(ls, a0l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  vh = vec_sub(vec_add(ls, a1h), tl);
  vl = vec_sub(vec_add(ls, a1l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 16, dst);
}

static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
                              const int16x8_t l, const uint8x16_t a0,
                              const uint8x16_t a1, const int16x8_t tl) {
  const int16x8_t a0h = unpack_to_s16_h(a0);
  const int16x8_t a0l = unpack_to_s16_l(a0);
  const int16x8_t a1h = unpack_to_s16_h(a1);
  const int16x8_t a1l = unpack_to_s16_l(a1);

  tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
}

void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
}

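// DC prediction: the whole block is filled with a single value. The fill
// helpers below splat that value across each row; the 8x8 variant reloads
// dst so only 8 pixels per row are overwritten by the 16-byte store.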
static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
                                         const uint8x16_t val) {
  int i;

  for (i = 0; i < 8; i++, dst += stride) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
  }
}

static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
  }
}

void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_16x16(dst, stride, v128);
}

static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
    vec_vsx_st(val, 16, dst);
  }
}

void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_32x32(dst, stride, v128);
}

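// avg16() returns the rounded average of 16 bytes, (sum + 8) >> 4, splatted
// across all 16 lanes so it can be stored directly by the fill helpers.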
static uint8x16_t avg16(const uint8_t *values) {
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_16x16(dst, stride, avg16(left));
}

void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_16x16(dst, stride, avg16(above));
}

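// avg32() is the 32-byte version: (sum + 16) >> 5, again splatted to every
// lane.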
static uint8x16_t avg32(const uint8_t *values) {
  const uint8x16_t v0 = vec_vsx_ld(0, values);
  const uint8x16_t v1 = vec_vsx_ld(16, values);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_32x32(dst, stride, avg32(left));
}

void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_32x32(dst, stride, avg32(above));
}

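// dc_avg8/dc_avg16/dc_avg32 average the above row and the left column
// together (8 + 8, 16 + 16 and 32 + 32 pixels), adding half the pixel count
// before the shift so the result is rounded. dc_avg8 drops the upper half of
// the partial sums via xxpermdi so only the first 8 bytes of each input
// contribute.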
static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
}

void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
}

static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
  const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
  const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
}

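// avg3() is the 3-tap rounding filter (a + 2 * b + c + 2) >> 2 used by the
// directional predictors, computed without widening to 16 bits:
// (a & c) + ((a ^ c) >> 1) is (a + c) >> 1 without overflow, and vec_avg
// then adds b with rounding.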
static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
                       const uint8x16_t c) {
  const uint8x16_t ac =
      vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));

  return vec_avg(ac, b);
}

// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
// vec_perm with sl1 selects bytes 1..16 of the two concatenated inputs,
// i.e. the pair of vectors shifted left by one byte.
static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
                                0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };

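// D45 prediction builds one row from the (1, 2, 1)-filtered above row
// (padded with the above-right pixel) and shifts it left by one pixel for
// each following row.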
void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 7);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(a, 15);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row, 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 15);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0 = avg3(a0, b0, c0);
  uint8x16_t row1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 32; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 16, dst);
    dst += stride;
    row0 = vec_perm(row0, row1, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

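// D63 prediction interleaves two filtered rows: even rows use the 2-tap
// average of adjacent above pixels and odd rows the 3-tap average; the
// source window moves left by one pixel every two rows.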
void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 9);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a, b);
  uint8x16_t row1 = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 4; i++) {
    const uint8x16_t d0 = vec_vsx_ld(0, dst);
    const uint8x16_t d1 = vec_vsx_ld(0, dst + stride);
    vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst);
    vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 0);
  const uint8x16_t b = vec_perm(a0, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a0, b);
  uint8x16_t row1 = avg3(a0, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t a2 = vec_vsx_ld(32, above);
  const uint8x16_t above_right = vec_splat(a2, 0);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0_0 = vec_avg(a0, b0);
  uint8x16_t row0_1 = vec_avg(a1, b1);
  uint8x16_t row1_0 = avg3(a0, b0, c0);
  uint8x16_t row1_1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row0_0, 0, dst);
    vec_vsx_st(row0_1, 16, dst);
    vec_vsx_st(row1_0, 0, dst + stride);
    vec_vsx_st(row1_1, 16, dst + stride);
    dst += stride * 2;
    row0_0 = vec_perm(row0_0, row0_1, sl1);
    row0_1 = vec_perm(row0_1, above_right, sl1);
    row1_0 = vec_perm(row1_0, row1_1, sl1);
    row1_1 = vec_perm(row1_1, above_right, sl1);
  }
}