/* vpx_dsp/ppc: VSX-optimized convolution for the WebM project. */
      1 /*
      2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 #include <assert.h>
     11 #include <string.h>
     12 #include "./vpx_dsp_rtcd.h"
     13 #include "vpx_dsp/vpx_filter.h"
     14 #include "vpx_dsp/ppc/types_vsx.h"
     15 
     16 // TODO(lu_zero): unroll
     17 static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride,
     18                             uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
     19   int i;
     20 
     21   for (i = h; i--;) {
     22     vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
     23     src += src_stride;
     24     dst += dst_stride;
     25   }
     26 }
     27 
     28 static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride,
     29                             uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
     30   int i;
     31 
     32   for (i = h; i--;) {
     33     vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
     34     vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
     35     src += src_stride;
     36     dst += dst_stride;
     37   }
     38 }
     39 
     40 static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
     41                             uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
     42   int i;
     43 
     44   for (i = h; i--;) {
     45     vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
     46     vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
     47     vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
     48     vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
     49     src += src_stride;
     50     dst += dst_stride;
     51   }
     52 }
     53 
     54 void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
     55                            uint8_t *dst, ptrdiff_t dst_stride,
     56                            const int16_t *filter_x, int32_t filter_x_stride,
     57                            const int16_t *filter_y, int32_t filter_y_stride,
     58                            int32_t w, int32_t h) {
     59   (void)filter_x;
     60   (void)filter_y;
     61   (void)filter_x_stride;
     62   (void)filter_y_stride;
     63 
     64   switch (w) {
     65     case 16: {
     66       copy_w16(src, src_stride, dst, dst_stride, h);
     67       break;
     68     }
     69     case 32: {
     70       copy_w32(src, src_stride, dst, dst_stride, h);
     71       break;
     72     }
     73     case 64: {
     74       copy_w64(src, src_stride, dst, dst_stride, h);
     75       break;
     76     }
     77     default: {
     78       int i;
     79       for (i = h; i--;) {
     80         memcpy(dst, src, w);
     81         src += src_stride;
     82         dst += dst_stride;
     83       }
     84       break;
     85     }
     86   }
     87 }
     88 
     89 static inline void avg_w16(const uint8_t *src, ptrdiff_t src_stride,
     90                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
     91   int i;
     92 
     93   for (i = h; i--;) {
     94     const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
     95     vec_vsx_st(v, 0, dst);
     96     src += src_stride;
     97     dst += dst_stride;
     98   }
     99 }
    100 
    101 static inline void avg_w32(const uint8_t *src, ptrdiff_t src_stride,
    102                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
    103   int i;
    104 
    105   for (i = h; i--;) {
    106     const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    107     const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    108     vec_vsx_st(v0, 0, dst);
    109     vec_vsx_st(v1, 16, dst);
    110     src += src_stride;
    111     dst += dst_stride;
    112   }
    113 }
    114 
    115 static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
    116                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
    117   int i;
    118 
    119   for (i = h; i--;) {
    120     const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    121     const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    122     const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst));
    123     const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst));
    124     vec_vsx_st(v0, 0, dst);
    125     vec_vsx_st(v1, 16, dst);
    126     vec_vsx_st(v2, 32, dst);
    127     vec_vsx_st(v3, 48, dst);
    128     src += src_stride;
    129     dst += dst_stride;
    130   }
    131 }
    132 
    133 void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
    134                           uint8_t *dst, ptrdiff_t dst_stride,
    135                           const int16_t *filter_x, int32_t filter_x_stride,
    136                           const int16_t *filter_y, int32_t filter_y_stride,
    137                           int32_t w, int32_t h) {
    138   (void)filter_x;
    139   (void)filter_y;
    140   (void)filter_x_stride;
    141   (void)filter_y_stride;
    142 
    143   switch (w) {
    144     case 16: {
    145       avg_w16(src, src_stride, dst, dst_stride, h);
    146       break;
    147     }
    148     case 32: {
    149       avg_w32(src, src_stride, dst, dst_stride, h);
    150       break;
    151     }
    152     case 64: {
    153       avg_w64(src, src_stride, dst, dst_stride, h);
    154       break;
    155     }
    156     default: {
    157       vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x,
    158                          filter_x_stride, filter_y, filter_y_stride, w, h);
    159       break;
    160     }
    161   }
    162 }
    163 
// Apply the 8-tap filter f to the 8 samples in s and store the single
// rounded, clamped output pixel to *dst.
static inline void convolve_line(uint8_t *dst, const int16x8_t s,
                                 const int16x8_t f) {
  // Multiply-sum adjacent s16 pairs of s*f into four partial s32 sums.
  const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0));
  // Rounding bias: 1 << (FILTER_BITS - 1), splatted across all lanes.
  const int32x4_t bias =
      vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1));
  // vec_sums folds the four partials (plus lane 3 of bias) into lane 3;
  // shifting right by FILTER_BITS completes ROUND_POWER_OF_TWO.
  const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS));
  // Pack s32 -> s16 -> saturated u8; the result ends up in byte lane 3, so
  // splat that lane before storing a single byte with vec_ste.
  const uint8x16_t v = vec_splat(
      vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3);
  vec_ste(v, 0, dst);
}
    174 
    175 static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x,
    176                                    const int16_t *const x_filter) {
    177   const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x));
    178   const int16x8_t f = vec_vsx_ld(0, x_filter);
    179 
    180   convolve_line(dst, s, f);
    181 }
    182 
    183 // TODO(lu_zero): Implement 8x8 and bigger block special cases
    184 static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
    185                                   uint8_t *dst, ptrdiff_t dst_stride,
    186                                   const InterpKernel *x_filters, int x0_q4,
    187                                   int x_step_q4, int w, int h) {
    188   int x, y;
    189   src -= SUBPEL_TAPS / 2 - 1;
    190 
    191   for (y = 0; y < h; ++y) {
    192     int x_q4 = x0_q4;
    193     for (x = 0; x < w; ++x) {
    194       convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS],
    195                       x_filters[x_q4 & SUBPEL_MASK]);
    196       x_q4 += x_step_q4;
    197     }
    198     src += src_stride;
    199     dst += dst_stride;
    200   }
    201 }
    202 
    203 static inline void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
    204                                       uint8_t *dst, ptrdiff_t dst_stride,
    205                                       const InterpKernel *x_filters, int x0_q4,
    206                                       int x_step_q4, int w, int h) {
    207   int x, y;
    208   src -= SUBPEL_TAPS / 2 - 1;
    209 
    210   for (y = 0; y < h; ++y) {
    211     int x_q4 = x0_q4;
    212     for (x = 0; x < w; ++x) {
    213       uint8_t v;
    214       convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS],
    215                       x_filters[x_q4 & SUBPEL_MASK]);
    216       dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1);
    217       x_q4 += x_step_q4;
    218     }
    219     src += src_stride;
    220     dst += dst_stride;
    221   }
    222 }
    223 
    224 static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b,
    225                                         uint8x16_t c, uint8x16_t d,
    226                                         uint8x16_t e, uint8x16_t f,
    227                                         uint8x16_t g, uint8x16_t h) {
    228   uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b);
    229   uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d);
    230   uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f);
    231   uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h);
    232 
    233   uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd);
    234   uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh);
    235 
    236   return (uint8x16_t)vec_mergeh(abcd, efgh);
    237 }
    238 
    239 static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y,
    240                                    ptrdiff_t src_stride,
    241                                    const int16_t *const y_filter) {
    242   uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride);
    243   uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride);
    244   uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride);
    245   uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride);
    246   uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride);
    247   uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride);
    248   uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride);
    249   uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride);
    250   const int16x8_t f = vec_vsx_ld(0, y_filter);
    251   uint8_t buf[16];
    252   const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7);
    253 
    254   vec_vsx_st(s, 0, buf);
    255 
    256   convolve_line(dst, unpack_to_s16_h(s), f);
    257 }
    258 
    259 static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
    260                                  uint8_t *dst, ptrdiff_t dst_stride,
    261                                  const InterpKernel *y_filters, int y0_q4,
    262                                  int y_step_q4, int w, int h) {
    263   int x, y;
    264   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
    265 
    266   for (x = 0; x < w; ++x) {
    267     int y_q4 = y0_q4;
    268     for (y = 0; y < h; ++y) {
    269       convolve_line_v(dst + y * dst_stride,
    270                       &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
    271                       y_filters[y_q4 & SUBPEL_MASK]);
    272       y_q4 += y_step_q4;
    273     }
    274     ++src;
    275     ++dst;
    276   }
    277 }
    278 
    279 static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
    280                                      uint8_t *dst, ptrdiff_t dst_stride,
    281                                      const InterpKernel *y_filters, int y0_q4,
    282                                      int y_step_q4, int w, int h) {
    283   int x, y;
    284   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
    285 
    286   for (x = 0; x < w; ++x) {
    287     int y_q4 = y0_q4;
    288     for (y = 0; y < h; ++y) {
    289       uint8_t v;
    290       convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
    291                       y_filters[y_q4 & SUBPEL_MASK]);
    292       dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1);
    293       y_q4 += y_step_q4;
    294     }
    295     ++src;
    296     ++dst;
    297   }
    298 }
    299 
// Full 2-D subpel filtering: horizontal pass into a scratch buffer, then
// vertical pass into dst.
static inline void convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *const x_filters, int x0_q4,
                            int x_step_q4, const InterpKernel *const y_filters,
                            int y0_q4, int y_step_q4, int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);
  // Number of source rows the horizontal pass must produce so the vertical
  // pass has all its taps available.
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  // Horizontal pass starts SUBPEL_TAPS/2 - 1 rows above the block so the
  // vertical filter tails are present in temp (stride 64).
  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                 x_filters, x0_q4, x_step_q4, w, intermediate_height);
  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
                y_filters, y0_q4, y_step_q4, w, h);
}
    331 
    332 void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
    333                              uint8_t *dst, ptrdiff_t dst_stride,
    334                              const int16_t *filter_x, int x_step_q4,
    335                              const int16_t *filter_y, int y_step_q4, int w,
    336                              int h) {
    337   const InterpKernel *const filters_x = get_filter_base(filter_x);
    338   const int x0_q4 = get_filter_offset(filter_x, filters_x);
    339 
    340   (void)filter_y;
    341   (void)y_step_q4;
    342 
    343   convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
    344                  w, h);
    345 }
    346 
    347 void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
    348                                  uint8_t *dst, ptrdiff_t dst_stride,
    349                                  const int16_t *filter_x, int x_step_q4,
    350                                  const int16_t *filter_y, int y_step_q4, int w,
    351                                  int h) {
    352   const InterpKernel *const filters_x = get_filter_base(filter_x);
    353   const int x0_q4 = get_filter_offset(filter_x, filters_x);
    354 
    355   (void)filter_y;
    356   (void)y_step_q4;
    357 
    358   convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
    359                      x_step_q4, w, h);
    360 }
    361 
    362 void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
    363                             uint8_t *dst, ptrdiff_t dst_stride,
    364                             const int16_t *filter_x, int x_step_q4,
    365                             const int16_t *filter_y, int y_step_q4, int w,
    366                             int h) {
    367   const InterpKernel *const filters_y = get_filter_base(filter_y);
    368   const int y0_q4 = get_filter_offset(filter_y, filters_y);
    369 
    370   (void)filter_x;
    371   (void)x_step_q4;
    372 
    373   convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
    374                 w, h);
    375 }
    376 
    377 void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
    378                                 uint8_t *dst, ptrdiff_t dst_stride,
    379                                 const int16_t *filter_x, int x_step_q4,
    380                                 const int16_t *filter_y, int y_step_q4, int w,
    381                                 int h) {
    382   const InterpKernel *const filters_y = get_filter_base(filter_y);
    383   const int y0_q4 = get_filter_offset(filter_y, filters_y);
    384 
    385   (void)filter_x;
    386   (void)x_step_q4;
    387 
    388   convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
    389                     y_step_q4, w, h);
    390 }
    391 
    392 void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    393                        ptrdiff_t dst_stride, const int16_t *filter_x,
    394                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
    395                        int w, int h) {
    396   const InterpKernel *const filters_x = get_filter_base(filter_x);
    397   const int x0_q4 = get_filter_offset(filter_x, filters_x);
    398   const InterpKernel *const filters_y = get_filter_base(filter_y);
    399   const int y0_q4 = get_filter_offset(filter_y, filters_y);
    400 
    401   convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
    402            filters_y, y0_q4, y_step_q4, w, h);
    403 }
    404 
// Public entry point: full 2-D 8-tap subpel filter, averaged into dst.
// Implemented as a filter into a scratch buffer followed by an averaging
// copy.
void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  // Filter into temp (stride 64), then blend temp with dst; the NULL/0
  // filter arguments are ignored by vpx_convolve_avg_vsx.
  vpx_convolve8_vsx(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
                    y_step_q4, w, h);
  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
}
    419