/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/ppc/types_vsx.h"

// TODO(lu_zero): unroll
static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
    vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4,
                           int x_step_q4, int y0_q4, int32_t y_step_q4,
                           int32_t w, int32_t h) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  switch (w) {
    case 16: {
      copy_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      copy_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      copy_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      int i;
      for (i = h; i--;) {
        memcpy(dst, src, w);
        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
  }
}

static inline void avg_w16(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    vec_vsx_st(v, 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void avg_w32(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst));
    const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    vec_vsx_st(v2, 32, dst);
    vec_vsx_st(v3, 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}
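/* Note on the averaging kernels above: for unsigned vectors, vec_avg()
 * computes the rounded average (a + b + 1) >> 1 in every lane, which matches
 * the ROUND_POWER_OF_TWO(dst[x] + v, 1) expression used by the scalar
 * averaging paths later in this file. A minimal per-byte sketch of one
 * avg_w16 row, for illustration only (j indexes the 16 bytes of a vector):
 *
 *   for (j = 0; j < 16; j++)
 *     dst[j] = (uint8_t)((src[j] + dst[j] + 1) >> 1);
 */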
void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *filter, int x0_q4, int x_step_q4,
                          int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
  switch (w) {
    case 16: {
      avg_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      avg_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      avg_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                         x_step_q4, y0_q4, y_step_q4, w, h);
      break;
    }
  }
}

static inline void convolve_line(uint8_t *dst, const int16x8_t s,
                                 const int16x8_t f) {
  const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0));
  const int32x4_t bias =
      vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1));
  const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS));
  const uint8x16_t v = vec_splat(
      vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3);
  vec_ste(v, 0, dst);
}
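/* Scalar reference for what convolve_line() computes: an 8-tap dot product
 * with the usual fixed-point rounding (add 1 << (FILTER_BITS - 1), shift
 * right by FILTER_BITS) and a clamp to 8 bits. This is an illustrative
 * sketch mirroring the generic C convolve kernel; it is not called by the
 * VSX paths below.
 */
static inline uint8_t convolve_line_scalar(const uint8_t *const s,
                                           const int16_t *const f) {
  int k, sum = 0;
  // Accumulate the 8 taps, then round, shift and clamp to [0, 255].
  for (k = 0; k < SUBPEL_TAPS; ++k) sum += s[k] * f[k];
  return clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
}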
static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x,
                                   const int16_t *const x_filter) {
  const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x));
  const int16x8_t f = vec_vsx_ld(0, x_filter);

  convolve_line(dst, s, f);
}

// TODO(lu_zero): Implement 8x8 and bigger block special cases
static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      uint8_t v;
      convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b,
                                        uint8x16_t c, uint8x16_t d,
                                        uint8x16_t e, uint8x16_t f,
                                        uint8x16_t g, uint8x16_t h) {
  uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b);
  uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d);
  uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f);
  uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h);

  uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd);
  uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh);

  return (uint8x16_t)vec_mergeh(abcd, efgh);
}

static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y,
                                   ptrdiff_t src_stride,
                                   const int16_t *const y_filter) {
  uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride);
  uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride);
  uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride);
  uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride);
  uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride);
  uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride);
  uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride);
  uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride);
  const int16x8_t f = vec_vsx_ld(0, y_filter);
  const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7);

  convolve_line(dst, unpack_to_s16_h(s), f);
}
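/* convolve_line_v() above loads the same 16-byte position from 8 consecutive
 * rows and transposes them with transpose_line_u8_8x8(), so bytes 0..7 of
 * the result hold one column: s0[0], s1[0], ..., s7[0]. The 8 vertical taps
 * then sit contiguously and unpack_to_s16_h() can feed them to
 * convolve_line() exactly as in the horizontal case.
 */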
static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      convolve_line_v(dst + y * dst_stride,
                      &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      uint8_t v;
      convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static inline void convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *const filter, int x0_q4,
                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                 filter, x0_q4, x_step_q4, w, intermediate_height);
  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
                y0_q4, y_step_q4, w, h);
}

void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  (void)y0_q4;
  (void)y_step_q4;

  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
                 h);
}

void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  (void)y0_q4;
  (void)y_step_q4;

  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                     w, h);
}

void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  (void)x0_q4;
  (void)x_step_q4;

  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
                h);
}

void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h) {
  (void)x0_q4;
  (void)x_step_q4;

  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
                    w, h);
}

void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                       int w, int h) {
  convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
           y_step_q4, w, h);
}

void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int y_step_q4, int w, int h) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
                    y_step_q4, w, h);
  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
}
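/* Hypothetical call site, for illustration only: an unscaled pass
 * (x_step_q4 == y_step_q4 == 16 means one full source pixel per output
 * pixel) over a 16x16 block with a half-pel phase (8/16) in both
 * directions. The kernel table name is an assumption borrowed from the
 * VP9 code; any SUBPEL_SHIFTS-entry InterpKernel table works.
 *
 *   vpx_convolve8_vsx(src, src_stride, dst, dst_stride,
 *                     vp9_filter_kernels[EIGHTTAP], 8, 16, 8, 16, 16, 16);
 */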