1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <string.h> 14 15 #include "config/aom_dsp_rtcd.h" 16 #include "config/av1_rtcd.h" 17 18 #include "av1/common/blockd.h" 19 #include "av1/common/convolve.h" 20 #include "av1/common/filter.h" 21 #include "av1/common/onyxc_int.h" 22 #include "av1/common/resize.h" 23 #include "aom_dsp/aom_dsp_common.h" 24 #include "aom_ports/mem.h" 25 26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, 27 int dst_stride, int w, int h, 28 const int16_t *x_filters, int x0_qn, 29 int x_step_qn) { 30 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; 31 for (int y = 0; y < h; ++y) { 32 int x_qn = x0_qn; 33 for (int x = 0; x < w; ++x) { 34 const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; 35 const int x_filter_idx = 36 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; 37 assert(x_filter_idx <= RS_SUBPEL_MASK); 38 const int16_t *const x_filter = 39 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; 40 int sum = 0; 41 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) 42 sum += src_x[k] * x_filter[k]; 43 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); 44 x_qn += x_step_qn; 45 } 46 src += src_stride; 47 dst += dst_stride; 48 } 49 } 50 51 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, 52 uint16_t *dst, int dst_stride, int w, int h, 53 const int16_t *x_filters, int x0_qn, 54 int x_step_qn, int bd) { 55 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; 56 for (int y = 0; y < h; ++y) { 57 int x_qn = x0_qn; 58 for (int x = 0; x < w; ++x) { 59 const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; 60 const int x_filter_idx = 61 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; 62 assert(x_filter_idx <= RS_SUBPEL_MASK); 63 const int16_t *const x_filter = 64 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; 65 int sum = 0; 66 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) 67 sum += src_x[k] * x_filter[k]; 68 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); 69 x_qn += x_step_qn; 70 } 71 src += src_stride; 72 dst += dst_stride; 73 } 74 } 75 76 void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst, 77 int dst_stride, int w, int h, int dir, 78 double norm) { 79 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 80 DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 }; 81 DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 }; 82 const int taps = 3; 83 int im_h = h + taps - 1; 84 int im_stride = w; 85 const int fo_vert = 1; 86 const int fo_horiz = 1; 87 88 // horizontal filter 89 const uint8_t *src_horiz = src - fo_vert * src_stride; 90 const int16_t *x_filter = dir ? sobel_a : sobel_b; 91 for (int y = 0; y < im_h; ++y) { 92 for (int x = 0; x < w; ++x) { 93 int16_t sum = 0; 94 for (int k = 0; k < taps; ++k) { 95 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; 96 } 97 im_block[y * im_stride + x] = sum; 98 } 99 } 100 101 // vertical filter 102 int16_t *src_vert = im_block + fo_vert * im_stride; 103 const int16_t *y_filter = dir ? sobel_b : sobel_a; 104 for (int y = 0; y < h; ++y) { 105 for (int x = 0; x < w; ++x) { 106 int16_t sum = 0; 107 for (int k = 0; k < taps; ++k) { 108 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; 109 } 110 dst[y * dst_stride + x] = sum * norm; 111 } 112 } 113 } 114 115 void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, 116 int dst_stride, int w, int h, 117 const InterpFilterParams *filter_params_x, 118 const InterpFilterParams *filter_params_y, 119 const int subpel_x_q4, const int subpel_y_q4, 120 ConvolveParams *conv_params) { 121 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 122 int im_h = h + filter_params_y->taps - 1; 123 int im_stride = w; 124 const int fo_vert = filter_params_y->taps / 2 - 1; 125 const int fo_horiz = filter_params_x->taps / 2 - 1; 126 const int bd = 8; 127 const int bits = 128 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 129 130 // horizontal filter 131 const uint8_t *src_horiz = src - fo_vert * src_stride; 132 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 133 filter_params_x, subpel_x_q4 & SUBPEL_MASK); 134 for (int y = 0; y < im_h; ++y) { 135 for (int x = 0; x < w; ++x) { 136 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 137 for (int k = 0; k < filter_params_x->taps; ++k) { 138 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; 139 } 140 assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); 141 im_block[y * im_stride + x] = 142 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); 143 } 144 } 145 146 // vertical filter 147 int16_t *src_vert = im_block + fo_vert * im_stride; 148 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 149 filter_params_y, subpel_y_q4 & SUBPEL_MASK); 150 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 151 for (int y = 0; y < h; ++y) { 152 for (int x = 0; x < w; ++x) { 153 int32_t sum = 1 << offset_bits; 154 for (int k = 0; k < filter_params_y->taps; ++k) { 155 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; 156 } 157 assert(0 <= sum && sum < (1 << (offset_bits + 2))); 158 int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - 159 ((1 << (offset_bits - conv_params->round_1)) + 160 (1 << (offset_bits - conv_params->round_1 - 1))); 161 dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); 162 } 163 } 164 } 165 166 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, 167 int dst_stride, int w, int h, 168 const InterpFilterParams *filter_params_x, 169 const InterpFilterParams *filter_params_y, 170 const int subpel_x_q4, const int subpel_y_q4, 171 ConvolveParams *conv_params) { 172 const int fo_vert = filter_params_y->taps / 2 - 1; 173 (void)filter_params_x; 174 (void)subpel_x_q4; 175 (void)conv_params; 176 177 assert(conv_params->round_0 <= FILTER_BITS); 178 assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || 179 ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); 180 181 // vertical filter 182 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 183 filter_params_y, subpel_y_q4 & SUBPEL_MASK); 184 for (int y = 0; y < h; ++y) { 185 for (int x = 0; x < w; ++x) { 186 int32_t res = 0; 187 for (int k = 0; k < filter_params_y->taps; ++k) { 188 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; 189 } 190 dst[y * dst_stride + x] = 191 clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS)); 192 } 193 } 194 } 195 196 void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, 197 int dst_stride, int w, int h, 198 const InterpFilterParams *filter_params_x, 199 const InterpFilterParams *filter_params_y, 200 const int subpel_x_q4, const int subpel_y_q4, 201 ConvolveParams *conv_params) { 202 const int fo_horiz = filter_params_x->taps / 2 - 1; 203 const int bits = FILTER_BITS - conv_params->round_0; 204 (void)filter_params_y; 205 (void)subpel_y_q4; 206 (void)conv_params; 207 208 assert(bits >= 0); 209 assert((FILTER_BITS - conv_params->round_1) >= 0 || 210 ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); 211 212 // horizontal filter 213 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 214 filter_params_x, subpel_x_q4 & SUBPEL_MASK); 215 216 for (int y = 0; y < h; ++y) { 217 for (int x = 0; x < w; ++x) { 218 int32_t res = 0; 219 for (int k = 0; k < filter_params_x->taps; ++k) { 220 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; 221 } 222 res = ROUND_POWER_OF_TWO(res, conv_params->round_0); 223 dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); 224 } 225 } 226 } 227 228 void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, 229 int dst_stride, int w, int h, 230 const InterpFilterParams *filter_params_x, 231 const InterpFilterParams *filter_params_y, 232 const int subpel_x_q4, const int subpel_y_q4, 233 ConvolveParams *conv_params) { 234 (void)filter_params_x; 235 (void)filter_params_y; 236 (void)subpel_x_q4; 237 (void)subpel_y_q4; 238 (void)conv_params; 239 240 for (int y = 0; y < h; ++y) { 241 memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); 242 } 243 } 244 245 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, 246 uint8_t *dst8, int dst8_stride, int w, int h, 247 const InterpFilterParams *filter_params_x, 248 const InterpFilterParams *filter_params_y, 249 const int subpel_x_q4, const int subpel_y_q4, 250 ConvolveParams *conv_params) { 251 CONV_BUF_TYPE *dst = conv_params->dst; 252 int dst_stride = conv_params->dst_stride; 253 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 254 int im_h = h + filter_params_y->taps - 1; 255 int im_stride = w; 256 const int fo_vert = filter_params_y->taps / 2 - 1; 257 const int fo_horiz = filter_params_x->taps / 2 - 1; 258 const int bd = 8; 259 const int round_bits = 260 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 261 262 // horizontal filter 263 const uint8_t *src_horiz = src - fo_vert * src_stride; 264 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 265 filter_params_x, subpel_x_q4 & SUBPEL_MASK); 266 for (int y = 0; y < im_h; ++y) { 267 for (int x = 0; x < w; ++x) { 268 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 269 for (int k = 0; k < filter_params_x->taps; ++k) { 270 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; 271 } 272 assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); 273 im_block[y * im_stride + x] = 274 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); 275 } 276 } 277 278 // vertical filter 279 int16_t *src_vert = im_block + fo_vert * im_stride; 280 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 281 filter_params_y, subpel_y_q4 & SUBPEL_MASK); 282 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 283 for (int y = 0; y < h; ++y) { 284 for (int x = 0; x < w; ++x) { 285 int32_t sum = 1 << offset_bits; 286 for (int k = 0; k < filter_params_y->taps; ++k) { 287 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; 288 } 289 assert(0 <= sum && sum < (1 << (offset_bits + 2))); 290 CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); 291 if (conv_params->do_average) { 292 int32_t tmp = dst[y * dst_stride + x]; 293 if (conv_params->use_dist_wtd_comp_avg) { 294 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 295 tmp = tmp >> DIST_PRECISION_BITS; 296 } else { 297 tmp += res; 298 tmp = tmp >> 1; 299 } 300 tmp -= (1 << (offset_bits - conv_params->round_1)) + 301 (1 << (offset_bits - conv_params->round_1 - 1)); 302 dst8[y * dst8_stride + x] = 303 clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); 304 } else { 305 dst[y * dst_stride + x] = res; 306 } 307 } 308 } 309 } 310 311 void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, 312 uint8_t *dst8, int dst8_stride, int w, int h, 313 const InterpFilterParams *filter_params_x, 314 const InterpFilterParams *filter_params_y, 315 const int subpel_x_q4, const int subpel_y_q4, 316 ConvolveParams *conv_params) { 317 CONV_BUF_TYPE *dst = conv_params->dst; 318 int dst_stride = conv_params->dst_stride; 319 const int fo_vert = filter_params_y->taps / 2 - 1; 320 const int bits = FILTER_BITS - conv_params->round_0; 321 const int bd = 8; 322 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 323 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 324 (1 << (offset_bits - conv_params->round_1 - 1)); 325 const int round_bits = 326 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 327 (void)filter_params_x; 328 (void)subpel_x_q4; 329 330 // vertical filter 331 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 332 filter_params_y, subpel_y_q4 & SUBPEL_MASK); 333 for (int y = 0; y < h; ++y) { 334 for (int x = 0; x < w; ++x) { 335 int32_t res = 0; 336 for (int k = 0; k < filter_params_y->taps; ++k) { 337 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; 338 } 339 res *= (1 << bits); 340 res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; 341 342 if (conv_params->do_average) { 343 int32_t tmp = dst[y * dst_stride + x]; 344 if (conv_params->use_dist_wtd_comp_avg) { 345 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 346 tmp = tmp >> DIST_PRECISION_BITS; 347 } else { 348 tmp += res; 349 tmp = tmp >> 1; 350 } 351 tmp -= round_offset; 352 dst8[y * dst8_stride + x] = 353 clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); 354 } else { 355 dst[y * dst_stride + x] = res; 356 } 357 } 358 } 359 } 360 361 void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, 362 uint8_t *dst8, int dst8_stride, int w, int h, 363 const InterpFilterParams *filter_params_x, 364 const InterpFilterParams *filter_params_y, 365 const int subpel_x_q4, const int subpel_y_q4, 366 ConvolveParams *conv_params) { 367 CONV_BUF_TYPE *dst = conv_params->dst; 368 int dst_stride = conv_params->dst_stride; 369 const int fo_horiz = filter_params_x->taps / 2 - 1; 370 const int bits = FILTER_BITS - conv_params->round_1; 371 const int bd = 8; 372 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 373 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 374 (1 << (offset_bits - conv_params->round_1 - 1)); 375 const int round_bits = 376 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 377 (void)filter_params_y; 378 (void)subpel_y_q4; 379 380 // horizontal filter 381 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 382 filter_params_x, subpel_x_q4 & SUBPEL_MASK); 383 for (int y = 0; y < h; ++y) { 384 for (int x = 0; x < w; ++x) { 385 int32_t res = 0; 386 for (int k = 0; k < filter_params_x->taps; ++k) { 387 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; 388 } 389 res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); 390 res += round_offset; 391 392 if (conv_params->do_average) { 393 int32_t tmp = dst[y * dst_stride + x]; 394 if (conv_params->use_dist_wtd_comp_avg) { 395 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 396 tmp = tmp >> DIST_PRECISION_BITS; 397 } else { 398 tmp += res; 399 tmp = tmp >> 1; 400 } 401 tmp -= round_offset; 402 dst8[y * dst8_stride + x] = 403 clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); 404 } else { 405 dst[y * dst_stride + x] = res; 406 } 407 } 408 } 409 } 410 411 void av1_dist_wtd_convolve_2d_copy_c( 412 const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, 413 int h, const InterpFilterParams *filter_params_x, 414 const InterpFilterParams *filter_params_y, const int subpel_x_q4, 415 const int subpel_y_q4, ConvolveParams *conv_params) { 416 CONV_BUF_TYPE *dst = conv_params->dst; 417 int dst_stride = conv_params->dst_stride; 418 const int bits = 419 FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; 420 const int bd = 8; 421 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 422 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 423 (1 << (offset_bits - conv_params->round_1 - 1)); 424 (void)filter_params_x; 425 (void)filter_params_y; 426 (void)subpel_x_q4; 427 (void)subpel_y_q4; 428 429 for (int y = 0; y < h; ++y) { 430 for (int x = 0; x < w; ++x) { 431 CONV_BUF_TYPE res = src[y * src_stride + x] << bits; 432 res += round_offset; 433 434 if (conv_params->do_average) { 435 int32_t tmp = dst[y * dst_stride + x]; 436 if (conv_params->use_dist_wtd_comp_avg) { 437 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 438 tmp = tmp >> DIST_PRECISION_BITS; 439 } else { 440 tmp += res; 441 tmp = tmp >> 1; 442 } 443 tmp -= round_offset; 444 dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); 445 } else { 446 dst[y * dst_stride + x] = res; 447 } 448 } 449 } 450 } 451 452 void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, 453 int dst8_stride, int w, int h, 454 const InterpFilterParams *filter_params_x, 455 const InterpFilterParams *filter_params_y, 456 const int subpel_x_qn, const int x_step_qn, 457 const int subpel_y_qn, const int y_step_qn, 458 ConvolveParams *conv_params) { 459 int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; 460 int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + 461 filter_params_y->taps; 462 CONV_BUF_TYPE *dst16 = conv_params->dst; 463 const int dst16_stride = conv_params->dst_stride; 464 const int bits = 465 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 466 assert(bits >= 0); 467 int im_stride = w; 468 const int fo_vert = filter_params_y->taps / 2 - 1; 469 const int fo_horiz = filter_params_x->taps / 2 - 1; 470 const int bd = 8; 471 472 // horizontal filter 473 const uint8_t *src_horiz = src - fo_vert * src_stride; 474 for (int y = 0; y < im_h; ++y) { 475 int x_qn = subpel_x_qn; 476 for (int x = 0; x < w; ++x, x_qn += x_step_qn) { 477 const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; 478 const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; 479 assert(x_filter_idx < SUBPEL_SHIFTS); 480 const int16_t *x_filter = 481 av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); 482 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 483 for (int k = 0; k < filter_params_x->taps; ++k) { 484 sum += x_filter[k] * src_x[k - fo_horiz]; 485 } 486 assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); 487 im_block[y * im_stride + x] = 488 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); 489 } 490 src_horiz += src_stride; 491 } 492 493 // vertical filter 494 int16_t *src_vert = im_block + fo_vert * im_stride; 495 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 496 for (int x = 0; x < w; ++x) { 497 int y_qn = subpel_y_qn; 498 for (int y = 0; y < h; ++y, y_qn += y_step_qn) { 499 const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; 500 const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; 501 assert(y_filter_idx < SUBPEL_SHIFTS); 502 const int16_t *y_filter = 503 av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); 504 int32_t sum = 1 << offset_bits; 505 for (int k = 0; k < filter_params_y->taps; ++k) { 506 sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; 507 } 508 assert(0 <= sum && sum < (1 << (offset_bits + 2))); 509 CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); 510 if (conv_params->is_compound) { 511 if (conv_params->do_average) { 512 int32_t tmp = dst16[y * dst16_stride + x]; 513 if (conv_params->use_dist_wtd_comp_avg) { 514 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 515 tmp = tmp >> DIST_PRECISION_BITS; 516 } else { 517 tmp += res; 518 tmp = tmp >> 1; 519 } 520 /* Subtract round offset and convolve round */ 521 tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + 522 (1 << (offset_bits - conv_params->round_1 - 1))); 523 dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); 524 } else { 525 dst16[y * dst16_stride + x] = res; 526 } 527 } else { 528 /* Subtract round offset and convolve round */ 529 int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + 530 (1 << (offset_bits - conv_params->round_1 - 1))); 531 dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); 532 } 533 } 534 src_vert++; 535 } 536 } 537 538 static void convolve_2d_scale_wrapper( 539 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, 540 int h, const InterpFilterParams *filter_params_x, 541 const InterpFilterParams *filter_params_y, const int subpel_x_qn, 542 const int x_step_qn, const int subpel_y_qn, const int y_step_qn, 543 ConvolveParams *conv_params) { 544 if (conv_params->is_compound) { 545 assert(conv_params->dst != NULL); 546 } 547 av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x, 548 filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn, 549 y_step_qn, conv_params); 550 } 551 552 // TODO(huisu (at) google.com): bilinear filtering only needs 2 taps in general. So 553 // we may create optimized code to do 2-tap filtering for all bilinear filtering 554 // usages, not just IntraBC. 555 static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride, 556 uint8_t *dst, int dst_stride, int w, int h, 557 int subpel_x_q4, int subpel_y_q4, 558 ConvolveParams *conv_params) { 559 const InterpFilterParams *filter_params_x = 560 subpel_x_q4 ? &av1_intrabc_filter_params : NULL; 561 const InterpFilterParams *filter_params_y = 562 subpel_y_q4 ? &av1_intrabc_filter_params : NULL; 563 if (subpel_x_q4 != 0 && subpel_y_q4 != 0) { 564 av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, 565 filter_params_x, filter_params_y, 0, 0, conv_params); 566 } else if (subpel_x_q4 != 0) { 567 av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, 568 filter_params_y, 0, 0, conv_params); 569 } else { 570 av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, 571 filter_params_y, 0, 0, conv_params); 572 } 573 } 574 575 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, 576 int dst_stride, int w, int h, 577 InterpFilters interp_filters, const int subpel_x_q4, 578 int x_step_q4, const int subpel_y_q4, int y_step_q4, 579 int scaled, ConvolveParams *conv_params, 580 const struct scale_factors *sf, int is_intrabc) { 581 assert(IMPLIES(is_intrabc, !scaled)); 582 (void)x_step_q4; 583 (void)y_step_q4; 584 (void)dst; 585 (void)dst_stride; 586 587 if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) { 588 convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4, 589 subpel_y_q4, conv_params); 590 return; 591 } 592 593 InterpFilter filter_x = 0; 594 InterpFilter filter_y = 0; 595 const int need_filter_params_x = (subpel_x_q4 != 0) | scaled; 596 const int need_filter_params_y = (subpel_y_q4 != 0) | scaled; 597 if (need_filter_params_x) 598 filter_x = av1_extract_interp_filter(interp_filters, 1); 599 if (need_filter_params_y) 600 filter_y = av1_extract_interp_filter(interp_filters, 0); 601 const InterpFilterParams *filter_params_x = 602 need_filter_params_x 603 ? av1_get_interp_filter_params_with_block_size(filter_x, w) 604 : NULL; 605 const InterpFilterParams *filter_params_y = 606 need_filter_params_y 607 ? av1_get_interp_filter_params_with_block_size(filter_y, h) 608 : NULL; 609 610 if (scaled) { 611 convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, 612 filter_params_x, filter_params_y, subpel_x_q4, 613 x_step_q4, subpel_y_q4, y_step_q4, conv_params); 614 } else { 615 sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound]( 616 src, src_stride, dst, dst_stride, w, h, filter_params_x, 617 filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); 618 } 619 } 620 621 void av1_highbd_convolve_2d_copy_sr_c( 622 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 623 int h, const InterpFilterParams *filter_params_x, 624 const InterpFilterParams *filter_params_y, const int subpel_x_q4, 625 const int subpel_y_q4, ConvolveParams *conv_params, int bd) { 626 (void)filter_params_x; 627 (void)filter_params_y; 628 (void)subpel_x_q4; 629 (void)subpel_y_q4; 630 (void)conv_params; 631 (void)bd; 632 633 for (int y = 0; y < h; ++y) { 634 memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); 635 } 636 } 637 638 void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, 639 uint16_t *dst, int dst_stride, int w, int h, 640 const InterpFilterParams *filter_params_x, 641 const InterpFilterParams *filter_params_y, 642 const int subpel_x_q4, const int subpel_y_q4, 643 ConvolveParams *conv_params, int bd) { 644 const int fo_horiz = filter_params_x->taps / 2 - 1; 645 const int bits = FILTER_BITS - conv_params->round_0; 646 (void)filter_params_y; 647 (void)subpel_y_q4; 648 649 assert(bits >= 0); 650 assert((FILTER_BITS - conv_params->round_1) >= 0 || 651 ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); 652 653 // horizontal filter 654 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 655 filter_params_x, subpel_x_q4 & SUBPEL_MASK); 656 for (int y = 0; y < h; ++y) { 657 for (int x = 0; x < w; ++x) { 658 int32_t res = 0; 659 for (int k = 0; k < filter_params_x->taps; ++k) { 660 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; 661 } 662 res = ROUND_POWER_OF_TWO(res, conv_params->round_0); 663 dst[y * dst_stride + x] = 664 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); 665 } 666 } 667 } 668 669 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, 670 uint16_t *dst, int dst_stride, int w, int h, 671 const InterpFilterParams *filter_params_x, 672 const InterpFilterParams *filter_params_y, 673 const int subpel_x_q4, const int subpel_y_q4, 674 ConvolveParams *conv_params, int bd) { 675 const int fo_vert = filter_params_y->taps / 2 - 1; 676 (void)filter_params_x; 677 (void)subpel_x_q4; 678 (void)conv_params; 679 680 assert(conv_params->round_0 <= FILTER_BITS); 681 assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || 682 ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); 683 // vertical filter 684 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 685 filter_params_y, subpel_y_q4 & SUBPEL_MASK); 686 for (int y = 0; y < h; ++y) { 687 for (int x = 0; x < w; ++x) { 688 int32_t res = 0; 689 for (int k = 0; k < filter_params_y->taps; ++k) { 690 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; 691 } 692 dst[y * dst_stride + x] = 693 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd); 694 } 695 } 696 } 697 698 void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, 699 uint16_t *dst, int dst_stride, int w, int h, 700 const InterpFilterParams *filter_params_x, 701 const InterpFilterParams *filter_params_y, 702 const int subpel_x_q4, const int subpel_y_q4, 703 ConvolveParams *conv_params, int bd) { 704 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 705 int im_h = h + filter_params_y->taps - 1; 706 int im_stride = w; 707 const int fo_vert = filter_params_y->taps / 2 - 1; 708 const int fo_horiz = filter_params_x->taps / 2 - 1; 709 const int bits = 710 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 711 assert(bits >= 0); 712 713 // horizontal filter 714 const uint16_t *src_horiz = src - fo_vert * src_stride; 715 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 716 filter_params_x, subpel_x_q4 & SUBPEL_MASK); 717 for (int y = 0; y < im_h; ++y) { 718 for (int x = 0; x < w; ++x) { 719 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 720 for (int k = 0; k < filter_params_x->taps; ++k) { 721 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; 722 } 723 assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); 724 im_block[y * im_stride + x] = 725 ROUND_POWER_OF_TWO(sum, conv_params->round_0); 726 } 727 } 728 729 // vertical filter 730 int16_t *src_vert = im_block + fo_vert * im_stride; 731 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 732 filter_params_y, subpel_y_q4 & SUBPEL_MASK); 733 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 734 for (int y = 0; y < h; ++y) { 735 for (int x = 0; x < w; ++x) { 736 int32_t sum = 1 << offset_bits; 737 for (int k = 0; k < filter_params_y->taps; ++k) { 738 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; 739 } 740 assert(0 <= sum && sum < (1 << (offset_bits + 2))); 741 int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - 742 ((1 << (offset_bits - conv_params->round_1)) + 743 (1 << (offset_bits - conv_params->round_1 - 1))); 744 dst[y * dst_stride + x] = 745 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); 746 } 747 } 748 } 749 750 void av1_highbd_dist_wtd_convolve_2d_c( 751 const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, 752 int w, int h, const InterpFilterParams *filter_params_x, 753 const InterpFilterParams *filter_params_y, const int subpel_x_q4, 754 const int subpel_y_q4, ConvolveParams *conv_params, int bd) { 755 int x, y, k; 756 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; 757 CONV_BUF_TYPE *dst = conv_params->dst; 758 int dst_stride = conv_params->dst_stride; 759 int im_h = h + filter_params_y->taps - 1; 760 int im_stride = w; 761 const int fo_vert = filter_params_y->taps / 2 - 1; 762 const int fo_horiz = filter_params_x->taps / 2 - 1; 763 const int round_bits = 764 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 765 assert(round_bits >= 0); 766 767 // horizontal filter 768 const uint16_t *src_horiz = src - fo_vert * src_stride; 769 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 770 filter_params_x, subpel_x_q4 & SUBPEL_MASK); 771 for (y = 0; y < im_h; ++y) { 772 for (x = 0; x < w; ++x) { 773 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 774 for (k = 0; k < filter_params_x->taps; ++k) { 775 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; 776 } 777 assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); 778 (void)bd; 779 im_block[y * im_stride + x] = 780 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); 781 } 782 } 783 784 // vertical filter 785 int16_t *src_vert = im_block + fo_vert * im_stride; 786 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 787 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 788 filter_params_y, subpel_y_q4 & SUBPEL_MASK); 789 for (y = 0; y < h; ++y) { 790 for (x = 0; x < w; ++x) { 791 int32_t sum = 1 << offset_bits; 792 for (k = 0; k < filter_params_y->taps; ++k) { 793 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; 794 } 795 assert(0 <= sum && sum < (1 << (offset_bits + 2))); 796 CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); 797 if (conv_params->do_average) { 798 int32_t tmp = dst[y * dst_stride + x]; 799 if (conv_params->use_dist_wtd_comp_avg) { 800 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 801 tmp = tmp >> DIST_PRECISION_BITS; 802 } else { 803 tmp += res; 804 tmp = tmp >> 1; 805 } 806 tmp -= (1 << (offset_bits - conv_params->round_1)) + 807 (1 << (offset_bits - conv_params->round_1 - 1)); 808 dst16[y * dst16_stride + x] = 809 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); 810 } else { 811 dst[y * dst_stride + x] = res; 812 } 813 } 814 } 815 } 816 817 void av1_highbd_dist_wtd_convolve_x_c( 818 const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, 819 int w, int h, const InterpFilterParams *filter_params_x, 820 const InterpFilterParams *filter_params_y, const int subpel_x_q4, 821 const int subpel_y_q4, ConvolveParams *conv_params, int bd) { 822 CONV_BUF_TYPE *dst = conv_params->dst; 823 int dst_stride = conv_params->dst_stride; 824 const int fo_horiz = filter_params_x->taps / 2 - 1; 825 const int bits = FILTER_BITS - conv_params->round_1; 826 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 827 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 828 (1 << (offset_bits - conv_params->round_1 - 1)); 829 const int round_bits = 830 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 831 assert(round_bits >= 0); 832 (void)filter_params_y; 833 (void)subpel_y_q4; 834 assert(bits >= 0); 835 // horizontal filter 836 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( 837 filter_params_x, subpel_x_q4 & SUBPEL_MASK); 838 for (int y = 0; y < h; ++y) { 839 for (int x = 0; x < w; ++x) { 840 int32_t res = 0; 841 for (int k = 0; k < filter_params_x->taps; ++k) { 842 res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; 843 } 844 res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); 845 res += round_offset; 846 847 if (conv_params->do_average) { 848 int32_t tmp = dst[y * dst_stride + x]; 849 if (conv_params->use_dist_wtd_comp_avg) { 850 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 851 tmp = tmp >> DIST_PRECISION_BITS; 852 } else { 853 tmp += res; 854 tmp = tmp >> 1; 855 } 856 tmp -= round_offset; 857 dst16[y * dst16_stride + x] = 858 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); 859 } else { 860 dst[y * dst_stride + x] = res; 861 } 862 } 863 } 864 } 865 866 void av1_highbd_dist_wtd_convolve_y_c( 867 const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, 868 int w, int h, const InterpFilterParams *filter_params_x, 869 const InterpFilterParams *filter_params_y, const int subpel_x_q4, 870 const int subpel_y_q4, ConvolveParams *conv_params, int bd) { 871 CONV_BUF_TYPE *dst = conv_params->dst; 872 int dst_stride = conv_params->dst_stride; 873 const int fo_vert = filter_params_y->taps / 2 - 1; 874 const int bits = FILTER_BITS - conv_params->round_0; 875 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 876 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 877 (1 << (offset_bits - conv_params->round_1 - 1)); 878 const int round_bits = 879 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 880 assert(round_bits >= 0); 881 (void)filter_params_x; 882 (void)subpel_x_q4; 883 assert(bits >= 0); 884 // vertical filter 885 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( 886 filter_params_y, subpel_y_q4 & SUBPEL_MASK); 887 for (int y = 0; y < h; ++y) { 888 for (int x = 0; x < w; ++x) { 889 int32_t res = 0; 890 for (int k = 0; k < filter_params_y->taps; ++k) { 891 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; 892 } 893 res *= (1 << bits); 894 res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; 895 896 if (conv_params->do_average) { 897 int32_t tmp = dst[y * dst_stride + x]; 898 if (conv_params->use_dist_wtd_comp_avg) { 899 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 900 tmp = tmp >> DIST_PRECISION_BITS; 901 } else { 902 tmp += res; 903 tmp = tmp >> 1; 904 } 905 tmp -= round_offset; 906 dst16[y * dst16_stride + x] = 907 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); 908 } else { 909 dst[y * dst_stride + x] = res; 910 } 911 } 912 } 913 } 914 915 void av1_highbd_dist_wtd_convolve_2d_copy_c( 916 const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, 917 int w, int h, const InterpFilterParams *filter_params_x, 918 const InterpFilterParams *filter_params_y, const int subpel_x_q4, 919 const int subpel_y_q4, ConvolveParams *conv_params, int bd) { 920 CONV_BUF_TYPE *dst = conv_params->dst; 921 int dst_stride = conv_params->dst_stride; 922 const int bits = 923 FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; 924 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 925 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 926 (1 << (offset_bits - conv_params->round_1 - 1)); 927 assert(bits >= 0); 928 (void)filter_params_x; 929 (void)filter_params_y; 930 (void)subpel_x_q4; 931 (void)subpel_y_q4; 932 933 for (int y = 0; y < h; ++y) { 934 for (int x = 0; x < w; ++x) { 935 CONV_BUF_TYPE res = src[y * src_stride + x] << bits; 936 res += round_offset; 937 if (conv_params->do_average) { 938 int32_t tmp = dst[y * dst_stride + x]; 939 if (conv_params->use_dist_wtd_comp_avg) { 940 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 941 tmp = tmp >> DIST_PRECISION_BITS; 942 } else { 943 tmp += res; 944 tmp = tmp >> 1; 945 } 946 tmp -= round_offset; 947 dst16[y * dst16_stride + x] = 948 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); 949 } else { 950 dst[y * dst_stride + x] = res; 951 } 952 } 953 } 954 } 955 956 void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, 957 uint16_t *dst, int dst_stride, int w, int h, 958 const InterpFilterParams *filter_params_x, 959 const InterpFilterParams *filter_params_y, 960 const int subpel_x_qn, const int x_step_qn, 961 const int subpel_y_qn, const int y_step_qn, 962 ConvolveParams *conv_params, int bd) { 963 int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; 964 int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + 965 filter_params_y->taps; 966 int im_stride = w; 967 const int fo_vert = filter_params_y->taps / 2 - 1; 968 const int fo_horiz = filter_params_x->taps / 2 - 1; 969 CONV_BUF_TYPE *dst16 = conv_params->dst; 970 const int dst16_stride = conv_params->dst_stride; 971 const int bits = 972 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 973 assert(bits >= 0); 974 // horizontal filter 975 const uint16_t *src_horiz = src - fo_vert * src_stride; 976 for (int y = 0; y < im_h; ++y) { 977 int x_qn = subpel_x_qn; 978 for (int x = 0; x < w; ++x, x_qn += x_step_qn) { 979 const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; 980 const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; 981 assert(x_filter_idx < SUBPEL_SHIFTS); 982 const int16_t *x_filter = 983 av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); 984 int32_t sum = (1 << (bd + FILTER_BITS - 1)); 985 for (int k = 0; k < filter_params_x->taps; ++k) { 986 sum += x_filter[k] * src_x[k - fo_horiz]; 987 } 988 assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); 989 im_block[y * im_stride + x] = 990 (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); 991 } 992 src_horiz += src_stride; 993 } 994 995 // vertical filter 996 int16_t *src_vert = im_block + fo_vert * im_stride; 997 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 998 for (int x = 0; x < w; ++x) { 999 int y_qn = subpel_y_qn; 1000 for (int y = 0; y < h; ++y, y_qn += y_step_qn) { 1001 const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; 1002 const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; 1003 assert(y_filter_idx < SUBPEL_SHIFTS); 1004 const int16_t *y_filter = 1005 av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); 1006 int32_t sum = 1 << offset_bits; 1007 for (int k = 0; k < filter_params_y->taps; ++k) { 1008 sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; 1009 } 1010 assert(0 <= sum && sum < (1 << (offset_bits + 2))); 1011 CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); 1012 if (conv_params->is_compound) { 1013 if (conv_params->do_average) { 1014 int32_t tmp = dst16[y * dst16_stride + x]; 1015 if (conv_params->use_dist_wtd_comp_avg) { 1016 tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; 1017 tmp = tmp >> DIST_PRECISION_BITS; 1018 } else { 1019 tmp += res; 1020 tmp = tmp >> 1; 1021 } 1022 /* Subtract round offset and convolve round */ 1023 tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + 1024 (1 << (offset_bits - conv_params->round_1 - 1))); 1025 dst[y * dst_stride + x] = 1026 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); 1027 } else { 1028 dst16[y * dst16_stride + x] = res; 1029 } 1030 } else { 1031 /* Subtract round offset and convolve round */ 1032 int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + 1033 (1 << (offset_bits - conv_params->round_1 - 1))); 1034 dst[y * dst_stride + x] = 1035 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); 1036 } 1037 } 1038 src_vert++; 1039 } 1040 } 1041 1042 static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride, 1043 uint16_t *dst, int dst_stride, int w, 1044 int h, int subpel_x_q4, 1045 int subpel_y_q4, 1046 ConvolveParams *conv_params, 1047 int bd) { 1048 const InterpFilterParams *filter_params_x = 1049 subpel_x_q4 ? &av1_intrabc_filter_params : NULL; 1050 const InterpFilterParams *filter_params_y = 1051 subpel_y_q4 ? &av1_intrabc_filter_params : NULL; 1052 if (subpel_x_q4 != 0 && subpel_y_q4 != 0) { 1053 av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, 1054 filter_params_x, filter_params_y, 0, 0, 1055 conv_params, bd); 1056 } else if (subpel_x_q4 != 0) { 1057 av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, 1058 filter_params_x, filter_params_y, 0, 0, 1059 conv_params, bd); 1060 } else { 1061 av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, 1062 filter_params_x, filter_params_y, 0, 0, 1063 conv_params, bd); 1064 } 1065 } 1066 1067 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, 1068 uint8_t *dst8, int dst_stride, int w, int h, 1069 InterpFilters interp_filters, 1070 const int subpel_x_q4, int x_step_q4, 1071 const int subpel_y_q4, int y_step_q4, 1072 int scaled, ConvolveParams *conv_params, 1073 const struct scale_factors *sf, 1074 int is_intrabc, int bd) { 1075 assert(IMPLIES(is_intrabc, !scaled)); 1076 (void)x_step_q4; 1077 (void)y_step_q4; 1078 (void)dst_stride; 1079 const uint16_t *src = CONVERT_TO_SHORTPTR(src8); 1080 1081 if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) { 1082 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); 1083 highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, 1084 subpel_x_q4, subpel_y_q4, conv_params, bd); 1085 return; 1086 } 1087 1088 InterpFilter filter_x = 0; 1089 InterpFilter filter_y = 0; 1090 const int need_filter_params_x = (subpel_x_q4 != 0) | scaled; 1091 const int need_filter_params_y = (subpel_y_q4 != 0) | scaled; 1092 if (need_filter_params_x) 1093 filter_x = av1_extract_interp_filter(interp_filters, 1); 1094 if (need_filter_params_y) 1095 filter_y = av1_extract_interp_filter(interp_filters, 0); 1096 const InterpFilterParams *filter_params_x = 1097 need_filter_params_x 1098 ? av1_get_interp_filter_params_with_block_size(filter_x, w) 1099 : NULL; 1100 const InterpFilterParams *filter_params_y = 1101 need_filter_params_y 1102 ? av1_get_interp_filter_params_with_block_size(filter_y, h) 1103 : NULL; 1104 1105 if (scaled) { 1106 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); 1107 if (conv_params->is_compound) { 1108 assert(conv_params->dst != NULL); 1109 } 1110 av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, 1111 filter_params_x, filter_params_y, subpel_x_q4, 1112 x_step_q4, subpel_y_q4, y_step_q4, conv_params, 1113 bd); 1114 } else { 1115 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); 1116 1117 sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 != 1118 0][conv_params->is_compound]( 1119 src, src_stride, dst, dst_stride, w, h, filter_params_x, 1120 filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd); 1121 } 1122 } 1123 1124 // Note: Fixed size intermediate buffers, place limits on parameters 1125 // of some functions. 2d filtering proceeds in 2 steps: 1126 // (1) Interpolate horizontally into an intermediate buffer, temp. 1127 // (2) Interpolate temp vertically to derive the sub-pixel result. 1128 // Deriving the maximum number of rows in the temp buffer (135): 1129 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). 1130 // --Largest block size is 128x128 pixels. 1131 // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the 1132 // original frame (in 1/16th pixel units). 1133 // --Must round-up because block may be located at sub-pixel position. 1134 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. 1135 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263. 1136 #define WIENER_MAX_EXT_SIZE 263 1137 1138 static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { 1139 int sum = 0; 1140 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; 1141 return sum; 1142 } 1143 1144 static INLINE int highbd_horz_scalar_product(const uint16_t *a, 1145 const int16_t *b) { 1146 int sum = 0; 1147 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; 1148 return sum; 1149 } 1150 1151 static INLINE int highbd_vert_scalar_product(const uint16_t *a, 1152 ptrdiff_t a_stride, 1153 const int16_t *b) { 1154 int sum = 0; 1155 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; 1156 return sum; 1157 } 1158 1159 static const InterpKernel *get_filter_base(const int16_t *filter) { 1160 // NOTE: This assumes that the filter table is 256-byte aligned. 1161 // TODO(agrange) Modify to make independent of table alignment. 1162 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); 1163 } 1164 1165 static int get_filter_offset(const int16_t *f, const InterpKernel *base) { 1166 return (int)((const InterpKernel *)(intptr_t)f - base); 1167 } 1168 1169 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, 1170 uint16_t *dst, ptrdiff_t dst_stride, 1171 const InterpKernel *x_filters, int x0_q4, 1172 int x_step_q4, int w, int h, 1173 int round0_bits) { 1174 const int bd = 8; 1175 src -= SUBPEL_TAPS / 2 - 1; 1176 for (int y = 0; y < h; ++y) { 1177 int x_q4 = x0_q4; 1178 for (int x = 0; x < w; ++x) { 1179 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 1180 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 1181 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + 1182 (1 << (bd + FILTER_BITS - 1)); 1183 const int sum = horz_scalar_product(src_x, x_filter) + rounding; 1184 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, 1185 WIENER_CLAMP_LIMIT(round0_bits, bd) - 1); 1186 x_q4 += x_step_q4; 1187 } 1188 src += src_stride; 1189 dst += dst_stride; 1190 } 1191 } 1192 1193 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, 1194 uint8_t *dst, ptrdiff_t dst_stride, 1195 const InterpKernel *y_filters, int y0_q4, 1196 int y_step_q4, int w, int h, 1197 int round1_bits) { 1198 const int bd = 8; 1199 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 1200 1201 for (int x = 0; x < w; ++x) { 1202 int y_q4 = y0_q4; 1203 for (int y = 0; y < h; ++y) { 1204 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 1205 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 1206 const int rounding = 1207 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - 1208 (1 << (bd + round1_bits - 1)); 1209 const int sum = 1210 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; 1211 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits)); 1212 y_q4 += y_step_q4; 1213 } 1214 ++src; 1215 ++dst; 1216 } 1217 } 1218 1219 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, 1220 uint8_t *dst, ptrdiff_t dst_stride, 1221 const int16_t *filter_x, int x_step_q4, 1222 const int16_t *filter_y, int y_step_q4, 1223 int w, int h, 1224 const ConvolveParams *conv_params) { 1225 const InterpKernel *const filters_x = get_filter_base(filter_x); 1226 const int x0_q4 = get_filter_offset(filter_x, filters_x); 1227 1228 const InterpKernel *const filters_y = get_filter_base(filter_y); 1229 const int y0_q4 = get_filter_offset(filter_y, filters_y); 1230 1231 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; 1232 const int intermediate_height = 1233 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1; 1234 memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); 1235 1236 assert(w <= MAX_SB_SIZE); 1237 assert(h <= MAX_SB_SIZE); 1238 assert(y_step_q4 <= 32); 1239 assert(x_step_q4 <= 32); 1240 1241 convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), 1242 src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4, 1243 x_step_q4, w, intermediate_height, 1244 conv_params->round_0); 1245 convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), 1246 MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, 1247 y_step_q4, w, h, conv_params->round_1); 1248 } 1249 1250 static void highbd_convolve_add_src_horiz_hip( 1251 const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, 1252 ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, 1253 int x_step_q4, int w, int h, int round0_bits, int bd) { 1254 const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd); 1255 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 1256 src -= SUBPEL_TAPS / 2 - 1; 1257 for (int y = 0; y < h; ++y) { 1258 int x_q4 = x0_q4; 1259 for (int x = 0; x < w; ++x) { 1260 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 1261 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 1262 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + 1263 (1 << (bd + FILTER_BITS - 1)); 1264 const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding; 1265 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, 1266 extraprec_clamp_limit - 1); 1267 x_q4 += x_step_q4; 1268 } 1269 src += src_stride; 1270 dst += dst_stride; 1271 } 1272 } 1273 1274 static void highbd_convolve_add_src_vert_hip( 1275 const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, 1276 ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, 1277 int y_step_q4, int w, int h, int round1_bits, int bd) { 1278 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); 1279 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 1280 for (int x = 0; x < w; ++x) { 1281 int y_q4 = y0_q4; 1282 for (int y = 0; y < h; ++y) { 1283 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 1284 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 1285 const int rounding = 1286 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - 1287 (1 << (bd + round1_bits - 1)); 1288 const int sum = 1289 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; 1290 dst[y * dst_stride] = 1291 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd); 1292 y_q4 += y_step_q4; 1293 } 1294 ++src; 1295 ++dst; 1296 } 1297 } 1298 1299 void av1_highbd_wiener_convolve_add_src_c( 1300 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 1301 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, 1302 const int16_t *filter_y, int y_step_q4, int w, int h, 1303 const ConvolveParams *conv_params, int bd) { 1304 const InterpKernel *const filters_x = get_filter_base(filter_x); 1305 const int x0_q4 = get_filter_offset(filter_x, filters_x); 1306 1307 const InterpKernel *const filters_y = get_filter_base(filter_y); 1308 const int y0_q4 = get_filter_offset(filter_y, filters_y); 1309 1310 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; 1311 const int intermediate_height = 1312 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; 1313 1314 assert(w <= MAX_SB_SIZE); 1315 assert(h <= MAX_SB_SIZE); 1316 assert(y_step_q4 <= 32); 1317 assert(x_step_q4 <= 32); 1318 assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); 1319 1320 highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), 1321 src_stride, temp, MAX_SB_SIZE, filters_x, 1322 x0_q4, x_step_q4, w, intermediate_height, 1323 conv_params->round_0, bd); 1324 highbd_convolve_add_src_vert_hip( 1325 temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, 1326 filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); 1327 } 1328