/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <assert.h>
#include <string.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/ppc/types_vsx.h"

// TODO(lu_zero): unroll
static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
    vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int32_t filter_x_stride,
                           const int16_t *filter_y, int32_t filter_y_stride,
                           int32_t w, int32_t h) {
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;

  switch (w) {
    case 16: {
      copy_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      copy_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      copy_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      int i;
      for (i = h; i--;) {
        memcpy(dst, src, w);
        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
  }
}

static inline void avg_w16(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    vec_vsx_st(v, 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void avg_w32(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst));
    const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    vec_vsx_st(v2, 32, dst);
    vec_vsx_st(v3, 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int32_t filter_x_stride,
                          const int16_t *filter_y, int32_t filter_y_stride,
                          int32_t w, int32_t h) {
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;

  switch (w) {
    case 16: {
      avg_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      avg_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      avg_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x,
                         filter_x_stride, filter_y, filter_y_stride, w, h);
      break;
    }
  }
}

// Apply the 8 filter taps in f to the 8 source pixels in s: multiply-sum the
// taps against the pixels, fold in the rounding bias (1 << (FILTER_BITS - 1))
// while summing across lanes, shift right by FILTER_BITS and store one byte,
// saturated to 0..255, at dst.
static inline void convolve_line(uint8_t *dst, const int16x8_t s,
                                 const int16x8_t f) {
  const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0));
  const int32x4_t bias =
      vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1));
  const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS));
  const uint8x16_t v = vec_splat(
      vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3);
  vec_ste(v, 0, dst);
}

static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x,
                                   const int16_t *const x_filter) {
  const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x));
  const int16x8_t f = vec_vsx_ld(0, x_filter);

  convolve_line(dst, s, f);
}

// TODO(lu_zero): Implement 8x8 and bigger block special cases
static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static inline void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      uint8_t v;
      convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

// Interleave the first byte of each of the eight inputs into the first eight
// lanes of the result, i.e. produce one row of an 8x8 byte transpose.
static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b,
                                        uint8x16_t c, uint8x16_t d,
                                        uint8x16_t e, uint8x16_t f,
                                        uint8x16_t g, uint8x16_t h) {
  uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b);
  uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d);
  uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f);
  uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h);

  uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd);
  uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh);

  return (uint8x16_t)vec_mergeh(abcd, efgh);
}
static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y,
                                   ptrdiff_t src_stride,
                                   const int16_t *const y_filter) {
  uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride);
  uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride);
  uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride);
  uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride);
  uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride);
  uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride);
  uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride);
  uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride);
  const int16x8_t f = vec_vsx_ld(0, y_filter);
  // Transpose the vertical column of 8 pixels into a single vector so the
  // vertical tap can reuse convolve_line().
  const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7);

  convolve_line(dst, unpack_to_s16_h(s), f);
}

static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      convolve_line_v(dst + y * dst_stride,
                      &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      uint8_t v;
      convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static inline void convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *const x_filters, int x0_q4,
                            int x_step_q4, const InterpKernel *const y_filters,
                            int y0_q4, int y_step_q4, int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
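  // Worked through for the worst case: h = 64, y_step_q4 = 32 and the largest
  // sub-pixel phase y0_q4 = 15 give (63 * 32 + 15) / 16 = 126.94, rounded up
  // to 127; adding SUBPEL_TAPS (8) yields the 135 rows that size temp below.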
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                 x_filters, x0_q4, x_step_q4, w, intermediate_height);
  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
                y_filters, y0_q4, y_step_q4, w, h);
}

void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  (void)filter_y;
  (void)y_step_q4;

  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
                 w, h);
}

void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  (void)filter_y;
  (void)y_step_q4;

  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                     x_step_q4, w, h);
}

void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  (void)filter_x;
  (void)x_step_q4;

  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
                w, h);
}

void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  (void)filter_x;
  (void)x_step_q4;

  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                    y_step_q4, w, h);
}

void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
           filters_y, y0_q4, y_step_q4, w, h);
}

void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  // Fixed size intermediate buffer places limits on parameters.
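  // Two-pass scheme: run the full 8-tap convolve into temp, then blend temp
  // into dst with the averaging helper above.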
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_convolve8_vsx(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
                    y_step_q4, w, h);
  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
}
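
#if 0
// Usage sketch (illustrative only, not part of the library): the copy and avg
// predictors ignore their filter arguments, so NULL/0 placeholders suffice.
// The 32x32 block size, buffers and strides here are hypothetical.
static void example_copy_then_avg(const uint8_t *pred, ptrdiff_t pred_stride,
                                  uint8_t *recon, ptrdiff_t recon_stride) {
  // First prediction: straight 32x32 copy into the destination.
  vpx_convolve_copy_vsx(pred, pred_stride, recon, recon_stride, NULL, 0, NULL,
                        0, 32, 32);
  // Compound prediction: average a second predictor into the same block.
  vpx_convolve_avg_vsx(pred, pred_stride, recon, recon_stride, NULL, 0, NULL,
                       0, 32, 32);
}
#endif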