1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 * 11 */ 12 13 #include <math.h> 14 15 #include "config/aom_config.h" 16 #include "config/aom_dsp_rtcd.h" 17 #include "config/aom_scale_rtcd.h" 18 19 #include "aom_mem/aom_mem.h" 20 #include "av1/common/onyxc_int.h" 21 #include "av1/common/resize.h" 22 #include "av1/common/restoration.h" 23 #include "aom_dsp/aom_dsp_common.h" 24 #include "aom_mem/aom_mem.h" 25 26 #include "aom_ports/mem.h" 27 28 // The 's' values are calculated based on original 'r' and 'e' values in the 29 // spec using GenSgrprojVtable(). 30 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid). 
31 const sgr_params_type sgr_params[SGRPROJ_PARAMS] = { 32 { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } }, 33 { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } }, 34 { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } }, 35 { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } }, 36 { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } }, 37 { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } }, 38 { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } }, 39 { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } }, 40 }; 41 42 AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) { 43 AV1PixelRect rect; 44 45 int ss_x = is_uv && cm->seq_params.subsampling_x; 46 int ss_y = is_uv && cm->seq_params.subsampling_y; 47 48 rect.top = 0; 49 rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y); 50 rect.left = 0; 51 rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); 52 return rect; 53 } 54 55 // Count horizontal or vertical units per tile (use a width or height for 56 // tile_size, respectively). We basically want to divide the tile size by the 57 // size of a restoration unit. Rather than rounding up unconditionally as you 58 // might expect, we round to nearest, which models the way a right or bottom 59 // restoration unit can extend to up to 150% its normal width or height. The 60 // max with 1 is to deal with tiles that are smaller than half of a restoration 61 // unit. 62 int av1_lr_count_units_in_tile(int unit_size, int tile_size) { 63 return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1); 64 } 65 66 void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi, 67 int is_uv) { 68 // We need to allocate enough space for restoration units to cover the 69 // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the 70 // top-left and we can use av1_get_tile_rect(). 
With CONFIG_MAX_TILE, we have 71 // to do the computation ourselves, iterating over the tiles and keeping 72 // track of the largest width and height, then upscaling. 73 const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv); 74 const int max_tile_w = tile_rect.right - tile_rect.left; 75 const int max_tile_h = tile_rect.bottom - tile_rect.top; 76 77 // To calculate hpertile and vpertile (horizontal and vertical units per 78 // tile), we basically want to divide the largest tile width or height by the 79 // size of a restoration unit. Rather than rounding up unconditionally as you 80 // might expect, we round to nearest, which models the way a right or bottom 81 // restoration unit can extend to up to 150% its normal width or height. The 82 // max with 1 is to deal with tiles that are smaller than half of a 83 // restoration unit. 84 const int unit_size = rsi->restoration_unit_size; 85 const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w); 86 const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h); 87 88 rsi->units_per_tile = hpertile * vpertile; 89 rsi->horz_units_per_tile = hpertile; 90 rsi->vert_units_per_tile = vpertile; 91 92 const int ntiles = 1; 93 const int nunits = ntiles * rsi->units_per_tile; 94 95 aom_free(rsi->unit_info); 96 CHECK_MEM_ERROR(cm, rsi->unit_info, 97 (RestorationUnitInfo *)aom_memalign( 98 16, sizeof(*rsi->unit_info) * nunits)); 99 } 100 101 void av1_free_restoration_struct(RestorationInfo *rst_info) { 102 aom_free(rst_info->unit_info); 103 rst_info->unit_info = NULL; 104 } 105 106 #if 0 107 // Pair of values for each sgrproj parameter: 108 // Index 0 corresponds to r[0], e[0] 109 // Index 1 corresponds to r[1], e[1] 110 int sgrproj_mtable[SGRPROJ_PARAMS][2]; 111 112 static void GenSgrprojVtable() { 113 for (int i = 0; i < SGRPROJ_PARAMS; ++i) { 114 const sgr_params_type *const params = &sgr_params[i]; 115 for (int j = 0; j < 2; ++j) { 116 const int e = params->e[j]; 117 const int r = params->r[j]; 
118 if (r == 0) { // filter is disabled 119 sgrproj_mtable[i][j] = -1; // mark invalid 120 } else { // filter is enabled 121 const int n = (2 * r + 1) * (2 * r + 1); 122 const int n2e = n * n * e; 123 assert(n2e != 0); 124 sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e); 125 } 126 } 127 } 128 } 129 #endif 130 131 void av1_loop_restoration_precal() { 132 #if 0 133 GenSgrprojVtable(); 134 #endif 135 } 136 137 static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride, 138 int border_horz, int border_vert) { 139 uint8_t *data_p; 140 int i; 141 for (i = 0; i < height; ++i) { 142 data_p = data + i * stride; 143 memset(data_p - border_horz, data_p[0], border_horz); 144 memset(data_p + width, data_p[width - 1], border_horz); 145 } 146 data_p = data - border_horz; 147 for (i = -border_vert; i < 0; ++i) { 148 memcpy(data_p + i * stride, data_p, width + 2 * border_horz); 149 } 150 for (i = height; i < height + border_vert; ++i) { 151 memcpy(data_p + i * stride, data_p + (height - 1) * stride, 152 width + 2 * border_horz); 153 } 154 } 155 156 static void extend_frame_highbd(uint16_t *data, int width, int height, 157 int stride, int border_horz, int border_vert) { 158 uint16_t *data_p; 159 int i, j; 160 for (i = 0; i < height; ++i) { 161 data_p = data + i * stride; 162 for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0]; 163 for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1]; 164 } 165 data_p = data - border_horz; 166 for (i = -border_vert; i < 0; ++i) { 167 memcpy(data_p + i * stride, data_p, 168 (width + 2 * border_horz) * sizeof(uint16_t)); 169 } 170 for (i = height; i < height + border_vert; ++i) { 171 memcpy(data_p + i * stride, data_p + (height - 1) * stride, 172 (width + 2 * border_horz) * sizeof(uint16_t)); 173 } 174 } 175 176 void extend_frame(uint8_t *data, int width, int height, int stride, 177 int border_horz, int border_vert, int highbd) { 178 if (highbd) 179 
extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride, 180 border_horz, border_vert); 181 else 182 extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); 183 } 184 185 static void copy_tile_lowbd(int width, int height, const uint8_t *src, 186 int src_stride, uint8_t *dst, int dst_stride) { 187 for (int i = 0; i < height; ++i) 188 memcpy(dst + i * dst_stride, src + i * src_stride, width); 189 } 190 191 static void copy_tile_highbd(int width, int height, const uint16_t *src, 192 int src_stride, uint16_t *dst, int dst_stride) { 193 for (int i = 0; i < height; ++i) 194 memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); 195 } 196 197 static void copy_tile(int width, int height, const uint8_t *src, int src_stride, 198 uint8_t *dst, int dst_stride, int highbd) { 199 if (highbd) 200 copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride, 201 CONVERT_TO_SHORTPTR(dst), dst_stride); 202 else 203 copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride); 204 } 205 206 #define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d)) 207 208 // With striped loop restoration, the filtering for each 64-pixel stripe gets 209 // most of its input from the output of CDEF (stored in data8), but we need to 210 // fill out a border of 3 pixels above/below the stripe according to the 211 // following 212 // rules: 213 // 214 // * At a frame boundary, we copy the outermost row of CDEF pixels three times. 215 // This extension is done by a call to extend_frame() at the start of the loop 216 // restoration process, so the value of copy_above/copy_below doesn't strictly 217 // matter. 218 // However, by setting *copy_above = *copy_below = 1 whenever loop filtering 219 // across tiles is disabled, we can allow 220 // {setup,restore}_processing_stripe_boundary to assume that the top/bottom 221 // data has always been copied, simplifying the behaviour at the left and 222 // right edges of tiles. 
223 // 224 // * If we're at a tile boundary and loop filtering across tiles is enabled, 225 // then there is a logical stripe which is 64 pixels high, but which is split 226 // into an 8px high and a 56px high stripe so that the processing (and 227 // coefficient set usage) can be aligned to tiles. 228 // In this case, we use the 3 rows of CDEF output across the boundary for 229 // context; this corresponds to leaving the frame buffer as-is. 230 // 231 // * If we're at a tile boundary and loop filtering across tiles is disabled, 232 // then we take the outermost row of CDEF pixels *within the current tile* 233 // and copy it three times. Thus we behave exactly as if the tile were a full 234 // frame. 235 // 236 // * Otherwise, we're at a stripe boundary within a tile. In that case, we 237 // take 2 rows of deblocked pixels and extend them to 3 rows of context. 238 // 239 // The distinction between the latter two cases is handled by the 240 // av1_loop_restoration_save_boundary_lines() function, so here we just need 241 // to decide if we're overwriting the above/below boundary pixels or not. 242 static void get_stripe_boundary_info(const RestorationTileLimits *limits, 243 const AV1PixelRect *tile_rect, int ss_y, 244 int *copy_above, int *copy_below) { 245 *copy_above = 1; 246 *copy_below = 1; 247 248 const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; 249 const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; 250 251 const int first_stripe_in_tile = (limits->v_start == tile_rect->top); 252 const int this_stripe_height = 253 full_stripe_height - (first_stripe_in_tile ? runit_offset : 0); 254 const int last_stripe_in_tile = 255 (limits->v_start + this_stripe_height >= tile_rect->bottom); 256 257 if (first_stripe_in_tile) *copy_above = 0; 258 if (last_stripe_in_tile) *copy_below = 0; 259 } 260 261 // Overwrite the border pixels around a processing stripe so that the conditions 262 // listed above get_stripe_boundary_info() are preserved. 
263 // We save the pixels which get overwritten into a temporary buffer, so that 264 // they can be restored by restore_processing_stripe_boundary() after we've 265 // processed the stripe. 266 // 267 // limits gives the rectangular limits of the remaining stripes for the current 268 // restoration unit. rsb is the stored stripe boundaries (taken from either 269 // deblock or CDEF output as necessary). 270 // 271 // tile_rect is the limits of the current tile and tile_stripe0 is the index of 272 // the first stripe in this tile (needed to convert the tile-relative stripe 273 // index we get from limits into something we can look up in rsb). 274 static void setup_processing_stripe_boundary( 275 const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb, 276 int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride, 277 RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) { 278 // Offsets within the line buffers. The buffer logically starts at column 279 // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ) 280 // has column x0 in the buffer. 281 const int buf_stride = rsb->stripe_boundary_stride; 282 const int buf_x0_off = limits->h_start; 283 const int line_width = 284 (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; 285 const int line_size = line_width << use_highbd; 286 287 const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; 288 289 // Replace RESTORATION_BORDER pixels above the top of the stripe 290 // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above 291 // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by 292 // duplicating the topmost of the 2 lines (see the AOMMAX call when 293 // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1). 
294 // 295 // Special case: If we're at the top of a tile, which isn't on the topmost 296 // tile row, and we're allowed to loop filter across tiles, then we have a 297 // logical 64-pixel-high stripe which has been split into an 8-pixel high 298 // stripe and a 56-pixel high stripe (the current one). So, in this case, 299 // we want to leave the boundary alone! 300 if (!opt) { 301 if (copy_above) { 302 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; 303 304 for (int i = -RESTORATION_BORDER; i < 0; ++i) { 305 const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0); 306 const int buf_off = buf_x0_off + buf_row * buf_stride; 307 const uint8_t *buf = 308 rsb->stripe_boundary_above + (buf_off << use_highbd); 309 uint8_t *dst8 = data8_tl + i * data_stride; 310 // Save old pixels, then replace with data from stripe_boundary_above 311 memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER], 312 REAL_PTR(use_highbd, dst8), line_size); 313 memcpy(REAL_PTR(use_highbd, dst8), buf, line_size); 314 } 315 } 316 317 // Replace RESTORATION_BORDER pixels below the bottom of the stripe. 318 // The second buffer row is repeated, so src_row gets the values 0, 1, 1 319 // for i = 0, 1, 2. 
320 if (copy_below) { 321 const int stripe_end = limits->v_start + h; 322 uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; 323 324 for (int i = 0; i < RESTORATION_BORDER; ++i) { 325 const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1); 326 const int buf_off = buf_x0_off + buf_row * buf_stride; 327 const uint8_t *src = 328 rsb->stripe_boundary_below + (buf_off << use_highbd); 329 330 uint8_t *dst8 = data8_bl + i * data_stride; 331 // Save old pixels, then replace with data from stripe_boundary_below 332 memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size); 333 memcpy(REAL_PTR(use_highbd, dst8), src, line_size); 334 } 335 } 336 } else { 337 if (copy_above) { 338 uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; 339 340 // Only save and overwrite i=-RESTORATION_BORDER line. 341 uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; 342 // Save old pixels, then replace with data from stripe_boundary_above 343 memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size); 344 memcpy(REAL_PTR(use_highbd, dst8), 345 REAL_PTR(use_highbd, 346 data8_tl + (-RESTORATION_BORDER + 1) * data_stride), 347 line_size); 348 } 349 350 if (copy_below) { 351 const int stripe_end = limits->v_start + h; 352 uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; 353 354 // Only save and overwrite i=2 line. 355 uint8_t *dst8 = data8_bl + 2 * data_stride; 356 // Save old pixels, then replace with data from stripe_boundary_below 357 memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size); 358 memcpy(REAL_PTR(use_highbd, dst8), 359 REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size); 360 } 361 } 362 } 363 364 // This function restores the boundary lines modified by 365 // setup_processing_stripe_boundary. 366 // 367 // Note: We need to be careful when handling the corners of the processing 368 // unit, because (eg.) 
// the top-left corner is considered to be part of
// both the left and top borders. This means that, depending on the
// loop_filter_across_tiles_enabled flag, the corner pixels might get
// overwritten twice, once as part of the "top" border and once as part
// of the "left" border (or similar for other corners).
//
// Everything works out fine as long as we make sure to reverse the order
// when restoring, ie. we need to restore the left/right borders followed
// by the top/bottom borders.
//
// The copy_above / copy_below / opt arguments select which rows are written
// back from rlbs->tmp_save_above / rlbs->tmp_save_below. Rows below the stripe
// are only written back while they lie above row
// limits->v_end + RESTORATION_BORDER.
static void restore_processing_stripe_boundary(
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
    int copy_below, int opt) {
  const int line_width =
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
  // Row length in bytes: doubled for high bit depth.
  const int line_size = line_width << use_highbd;

  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;

  if (!opt) {
    if (copy_above) {
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
        uint8_t *dst8 = data8_tl + i * data_stride;
        memcpy(REAL_PTR(use_highbd, dst8),
               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
      }
    }

    if (copy_below) {
      const int stripe_bottom = limits->v_start + h;
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;

      for (int i = 0; i < RESTORATION_BORDER; ++i) {
        // Stop once the row index reaches limits->v_end + RESTORATION_BORDER.
        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;

        uint8_t *dst8 = data8_bl + i * data_stride;
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
      }
    }
  } else {
    if (copy_above) {
      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;

      // Only restore i=-RESTORATION_BORDER line.
      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
    }

    if (copy_below) {
      const int stripe_bottom = limits->v_start + h;
      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;

      // Only restore i=2 line.
      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
        uint8_t *dst8 = data8_bl + 2 * data_stride;
        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
      }
    }
  }
}

// Applies the Wiener filter stored in rui->wiener_info to one stripe of 8-bit
// pixels. The stripe is processed in column groups of procunit_width pixels;
// the width handed to av1_wiener_convolve_add_src() for the last group is the
// remaining width rounded up to a multiple of 16 (capped at procunit_width).
// tmpbuf is unused and bit_depth must be 8.
static void wiener_filter_stripe(const RestorationUnitInfo *rui,
                                 int stripe_width, int stripe_height,
                                 int procunit_width, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride,
                                 int32_t *tmpbuf, int bit_depth) {
  (void)tmpbuf;
  (void)bit_depth;
  assert(bit_depth == 8);
  const ConvolveParams conv_params = get_conv_params_wiener(8);

  for (int j = 0; j < stripe_width; j += procunit_width) {
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
    const uint8_t *src_p = src + j;
    uint8_t *dst_p = dst + j;
    av1_wiener_convolve_add_src(
        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
  }
}

/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1). Only r = 1
   (boxsum1) and r = 2 (boxsum2) are implemented; boxsum() asserts on any
   other radius.

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
   Near the edges of the region the window is truncated, so fewer terms
   are summed there.
*/
static void boxsum1(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int i, j, a, b, c;
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
  assert(height > 2 * SGRPROJ_BORDER_VERT);

  // Vertical sum over 3-pixel regions, from src into dst.
  if (!sqr) {
    for (j = 0; j < width; ++j) {
      a = src[j];
      b = src[src_stride + j];
      c = src[2 * src_stride + j];

      dst[j] = a + b;
      for (i = 1; i < height - 2; ++i) {
        // Loop invariant: At the start of each iteration,
        // a = src[(i - 1) * src_stride + j]
        // b = src[(i    ) * src_stride + j]
        // c = src[(i + 1) * src_stride + j]
        dst[i * dst_stride + j] = a + b + c;
        a = b;
        b = c;
        c = src[(i + 2) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c;
      dst[(i + 1) * dst_stride + j] = b + c;
    }
  } else {
    // Same traversal as above, but a, b, c hold squared source values.
    for (j = 0; j < width; ++j) {
      a = src[j] * src[j];
      b = src[src_stride + j] * src[src_stride + j];
      c = src[2 * src_stride + j] * src[2 * src_stride + j];

      dst[j] = a + b;
      for (i = 1; i < height - 2; ++i) {
        dst[i * dst_stride + j] = a + b + c;
        a = b;
        b = c;
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c;
      dst[(i + 1) * dst_stride + j] = b + c;
    }
  }

  // Horizontal sum over 3-pixel regions of dst
  for (i = 0; i < height; ++i) {
    a = dst[i * dst_stride];
    b = dst[i * dst_stride + 1];
    c = dst[i * dst_stride + 2];

    dst[i * dst_stride] = a + b;
    for (j = 1; j < width - 2; ++j) {
      // Loop invariant: At the start of each iteration, a, b, c hold the
      // vertical sums that were stored at
      // a = dst[i * dst_stride + (j - 1)]
      // b = dst[i * dst_stride + (j    )]
      // c = dst[i * dst_stride + (j + 1)]
      dst[i * dst_stride + j] = a + b + c;
      a = b;
      b = c;
      c = dst[i * dst_stride + (j + 2)];
    }
    dst[i * dst_stride + j] = a + b + c;
    dst[i * dst_stride + (j + 1)] = b + c;
  }
}

// 5x5 (r = 2) variant of boxsum1(): a vertical pass from src into dst
// followed by an in-place horizontal pass over dst.
static void boxsum2(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int i, j, a, b, c, d, e;
  assert(width > 2 * SGRPROJ_BORDER_HORZ);
  assert(height > 2 * SGRPROJ_BORDER_VERT);

  // Vertical sum over 5-pixel regions, from src into dst.
  if (!sqr) {
    for (j = 0; j < width; ++j) {
      a = src[j];
      b = src[src_stride + j];
      c = src[2 * src_stride + j];
      d = src[3 * src_stride + j];
      e = src[4 * src_stride + j];

      dst[j] = a + b + c;
      dst[dst_stride + j] = a + b + c + d;
      for (i = 2; i < height - 3; ++i) {
        // Loop invariant: At the start of each iteration,
        // a = src[(i - 2) * src_stride + j]
        // b = src[(i - 1) * src_stride + j]
        // c = src[(i    ) * src_stride + j]
        // d = src[(i + 1) * src_stride + j]
        // e = src[(i + 2) * src_stride + j]
        dst[i * dst_stride + j] = a + b + c + d + e;
        a = b;
        b = c;
        c = d;
        d = e;
        e = src[(i + 3) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c + d + e;
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
      dst[(i + 2) * dst_stride + j] = c + d + e;
    }
  } else {
    // Same traversal as above, but a..e hold squared source values.
    for (j = 0; j < width; ++j) {
      a = src[j] * src[j];
      b = src[src_stride + j] * src[src_stride + j];
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
      e = src[4 * src_stride + j] * src[4 * src_stride + j];

      dst[j] = a + b + c;
      dst[dst_stride + j] = a + b + c + d;
      for (i = 2; i < height - 3; ++i) {
        dst[i * dst_stride + j] = a + b + c + d + e;
        a = b;
        b = c;
        c = d;
        d = e;
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c + d + e;
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
      dst[(i + 2) * dst_stride + j] = c + d + e;
    }
  }

  // Horizontal sum over 5-pixel regions of dst
  for (i = 0; i < height; ++i) {
    a = dst[i * dst_stride];
    b = dst[i * dst_stride + 1];
    c = dst[i * dst_stride + 2];
    d = dst[i * dst_stride + 3];
    e = dst[i * dst_stride + 4];

    dst[i * dst_stride] = a + b + c;
    dst[i * dst_stride + 1] = a + b + c + d;
    for (j = 2; j < width - 3; ++j) {
      // Loop invariant: At the start of each iteration, a..e hold the
      // vertical sums that were stored at
      // a = dst[i * dst_stride + (j - 2)]
      // b = dst[i * dst_stride + (j - 1)]
      // c = dst[i * dst_stride + (j    )]
      // d = dst[i * dst_stride + (j + 1)]
      // e = dst[i * dst_stride + (j + 2)]
      dst[i * dst_stride + j] = a + b + c + d + e;
      a = b;
      b = c;
      c = d;
      d = e;
      e = dst[i * dst_stride + (j + 3)];
    }
    dst[i * dst_stride + j] = a + b + c + d + e;
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
    dst[i * dst_stride + (j + 2)] = c + d + e;
  }
}

// Dispatches to the box-sum implementation for radius r. Only r == 1 and
// r == 2 are supported; any other value trips the assert (and does nothing
// when asserts are compiled out).
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  if (r == 1)
    boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
  else if (r == 2)
    boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
  else
    assert(0 && "Invalid value of r in self-guided filter");
}

// Converts the coded projection coefficients xqd[0..1] into the weights
// xq[0..1] for the given parameter set:
//  * r[0] == 0: xq[0] = 0 and xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1].
//  * r[1] == 0: xq[0] = xqd[0] and xq[1] = 0.
//  * otherwise: xq[0] = xqd[0] and
//               xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1].
void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
  if (params->r[0] == 0) {
    xq[0] = 0;
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
  } else if (params->r[1] == 0) {
    xq[0] = xqd[0];
    xq[1] = 0;
  } else {
    xq[0] = xqd[0];
    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
  }
}

// Lookup table used by calculate_intermediate_result() to map z (clamped to
// 255) to the blend factor A[k], which lies in [1, 256].
const int32_t x_by_xplus1[256] = {
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
  // instead of 0. See comments in calculate_intermediate_result() for why
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};

// one_by_x[n - 1] = round(2^12 / n); indexed with n - 1 where n is the number
// of pixels in the box window.
const int32_t one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
};

// Computes the per-pixel intermediate arrays A[] and B[] of the self-guided
// filter for the radius params->r[radius_idx] of parameter set sgr_params_idx.
//
// dgd points at the top-left pixel of the unit inside a buffer that has at
// least SGRPROJ_BORDER_VERT rows / SGRPROJ_BORDER_HORZ columns of valid data
// on every side (the box sums read from that border). A and B are scratch
// buffers laid out with the buf_stride computed below; on return they hold the
// final values for rows -1..height and columns -1..width. When pass is
// non-zero only every second row (starting at row -1) is computed.
static void calculate_intermediate_result(int32_t *dgd, int width, int height,
                                          int dgd_stride, int bit_depth,
                                          int sgr_params_idx, int radius_idx,
                                          int pass, int32_t *A, int32_t *B) {
  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
  const int r = params->r[radius_idx];
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 16 bytes, for consistency
  // with the SIMD version of this function.
  int buf_stride = ((width_ext + 3) & ~3) + 16;
  const int step = pass == 0 ? 1 : 2;
  int i, j;

  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
         "Need SGRPROJ_BORDER_* >= r+1");

  // B receives the windowed sums, A the windowed sums of squares, both over
  // the unit extended by the SGRPROJ border.
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
  // Re-base A and B so that index 0 corresponds to the unit's top-left pixel.
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
  for (i = -1; i < height + 1; i += step) {
    for (j = -1; j < width + 1; ++j) {
      const int k = i * buf_stride + j;
      const int n = (2 * r + 1) * (2 * r + 1);

      // a < 2^16 * n < 2^22 regardless of bit depth
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
      // b < 2^8 * n < 2^14 regardless of bit depth
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);

      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
      // This bound on p is due to:
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
      //
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
      // This is an artefact of rounding, and can only happen if all pixels
      // are (almost) identical, so in this case we saturate to p=0.
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;

      const uint32_t s = params->s[radius_idx];

      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
      // (this holds even after accounting for the rounding in s)
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);

      // Note: We have to be quite careful about the value of A[k].
      // This is used as a blend factor between individual pixel values and the
      // local mean. So it logically has a range of [0, 256], including both
      // endpoints.
      //
      // This is a pain for hardware, as we'd like something which can be stored
      // in exactly 8 bits.
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
      // slightly above 2^(8 + bit depth), due to rounding in the value of
      // one_by_x[25-1].
      //
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
      // overflow), without significantly affecting the final result: z == 0
      // implies that the image is essentially "flat", so the local mean and
      // individual pixel values are very similar.
      //
      // Note that saturating on the other side, ie. requiring A[k] <= 255,
      // would be a bad idea, as that corresponds to the case where the image
      // is very variable, when we want to preserve the local pixel value as
      // much as possible.
      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]

      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
      // one_by_x[n - 1] = round(2^12 / n)
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
      // and B[k] is set to a value < 2^(8 + bit depth)
      // This holds even with the rounding in one_by_x and in the overall
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
                                             (uint32_t)B[k] *
                                             (uint32_t)one_by_x[n - 1],
                                         SGRPROJ_RECIP_BITS);
    }
  }
}

// "Fast" self-guided filter pass. A[] and B[] are computed on every second
// row only (pass = 1 in calculate_intermediate_result()), so even output rows
// combine the rows directly above and below, while odd output rows use their
// own row. The radius selected by radius_idx is expected to be 2 (asserted).
// The filtered values are written to dst.
static void selfguided_restoration_fast_internal(
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
  const int r = params->r[radius_idx];
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 16 bytes, for consistency
  // with the SIMD version of this function.
  int buf_stride = ((width_ext + 3) & ~3) + 16;
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
  int32_t *A = A_;
  int32_t *B = B_;
  int i, j;
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
                                sgr_params_idx, radius_idx, 1, A, B);
  // Re-base A and B so that index 0 corresponds to the unit's top-left pixel.
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;

  // Use the A[] and B[] arrays to calculate the filtered image
  (void)r;
  assert(r == 2);
  for (i = 0; i < height; ++i) {
    if (!(i & 1)) {  // even row
      for (j = 0; j < width; ++j) {
        const int k = i * buf_stride + j;
        const int l = i * dgd_stride + j;
        const int m = i * dst_stride + j;
        // The six weights below sum to 2 * 6 + 4 * 5 = 32 = 2^nb.
        const int nb = 5;
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
                              5;
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
                              5;
        const int32_t v = a * dgd[l] + b;
        dst[m] =
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
      }
    } else {  // odd row
      for (j = 0; j < width; ++j) {
        const int k = i * buf_stride + j;
        const int l = i * dgd_stride + j;
        const int m = i * dst_stride + j;
        // The three weights below sum to 6 + 2 * 5 = 16 = 2^nb.
        const int nb = 4;
        const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
        const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
        const int32_t v = a * dgd[l] + b;
        dst[m] =
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
      }
    }
  }
}

// Full self-guided filter pass: A[] and B[] are computed on every row
// (pass = 0) and each output pixel combines its 3x3 neighbourhood of A/B
// values with weight 4 for the centre and edge neighbours and weight 3 for
// the corner neighbours. The filtered values are written to dst.
static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
                                            int dgd_stride, int32_t *dst,
                                            int dst_stride, int bit_depth,
                                            int sgr_params_idx,
                                            int radius_idx) {
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 16 bytes, for consistency
  // with the SIMD version of this function.
  int buf_stride = ((width_ext + 3) & ~3) + 16;
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
  int32_t *A = A_;
  int32_t *B = B_;
  int i, j;
  calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
                                sgr_params_idx, radius_idx, 0, A, B);
  // Re-base A and B so that index 0 corresponds to the unit's top-left pixel.
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;

  // Use the A[] and B[] arrays to calculate the filtered image
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int k = i * buf_stride + j;
      const int l = i * dgd_stride + j;
      const int m = i * dst_stride + j;
      // The nine weights below sum to 5 * 4 + 4 * 3 = 32 = 2^nb.
      const int nb = 5;
      const int32_t a =
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
              4 +
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
              3;
      const int32_t b =
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
              4 +
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
              3;
      const int32_t v = a * dgd[l] + b;
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
    }
  }
}

// C implementation of the self-guided restoration filter for one unit.
//
// The source pixels (8-bit, or 16-bit when highbd is non-zero), including an
// SGRPROJ_BORDER_VERT / SGRPROJ_BORDER_HORZ border on every side, are first
// widened into a local int32_t buffer. flt0 is then filled by the fast pass if
// r[0] > 0 and flt1 by the full pass if r[1] > 0; a buffer whose radius is 0
// is left untouched. Always returns 0.
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                 int dgd_stride, int32_t *flt0, int32_t *flt1,
                                 int flt_stride, int sgr_params_idx,
                                 int bit_depth, int highbd) {
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
  // dgd32 points at the unit's top-left pixel inside the bordered buffer.
  int32_t *dgd32 =
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;

  if (highbd) {
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
      }
    }
  } else {
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
      }
    }
  }

  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
  // If params->r == 0 we skip the corresponding filter. We only allow one of
  // the radii to be 0, as having both equal to 0 would be equivalent to
  // skipping SGR entirely.
  assert(!(params->r[0] == 0 && params->r[1] == 0));

  if (params->r[0] > 0)
    selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
                                         flt0, flt_stride, bit_depth,
                                         sgr_params_idx, 0);
  if (params->r[1] > 0)
    selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
                                    flt_stride, bit_depth, sgr_params_idx, 1);
  return 0;
}

void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
                                    int stride, int eps, const int *xqd,
                                    uint8_t *dst8, int dst_stride,
                                    int32_t *tmpbuf, int bit_depth,
                                    int highbd) {
  int32_t *flt0 = tmpbuf;
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
  assert(width * height <= RESTORATION_UNITPELS_MAX);

  const int ret = av1_selfguided_restoration_c(
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
  (void)ret;
  assert(!ret);
  const sgr_params_type *const params = &sgr_params[eps];
  int xq[2];
  decode_xq(xqd, xq, params);
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const int k = i * width + j;
      uint8_t *dst8ij = dst8 + i * dst_stride + j;
      const uint8_t *dat8ij = dat8 + i * stride + j;
924 const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij; 925 const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS; 926 int32_t v = u << SGRPROJ_PRJ_BITS; 927 // If params->r == 0 then we skipped the filtering in 928 // av1_selfguided_restoration_c, i.e. flt[k] == u 929 if (params->r[0] > 0) v += xq[0] * (flt0[k] - u); 930 if (params->r[1] > 0) v += xq[1] * (flt1[k] - u); 931 const int16_t w = 932 (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); 933 934 const uint16_t out = clip_pixel_highbd(w, bit_depth); 935 if (highbd) 936 *CONVERT_TO_SHORTPTR(dst8ij) = out; 937 else 938 *dst8ij = (uint8_t)out; 939 } 940 } 941 } 942 943 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui, 944 int stripe_width, int stripe_height, 945 int procunit_width, const uint8_t *src, 946 int src_stride, uint8_t *dst, int dst_stride, 947 int32_t *tmpbuf, int bit_depth) { 948 (void)bit_depth; 949 assert(bit_depth == 8); 950 951 for (int j = 0; j < stripe_width; j += procunit_width) { 952 int w = AOMMIN(procunit_width, stripe_width - j); 953 apply_selfguided_restoration(src + j, w, stripe_height, src_stride, 954 rui->sgrproj_info.ep, rui->sgrproj_info.xqd, 955 dst + j, dst_stride, tmpbuf, bit_depth, 0); 956 } 957 } 958 959 static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui, 960 int stripe_width, int stripe_height, 961 int procunit_width, const uint8_t *src8, 962 int src_stride, uint8_t *dst8, 963 int dst_stride, int32_t *tmpbuf, 964 int bit_depth) { 965 (void)tmpbuf; 966 const ConvolveParams conv_params = get_conv_params_wiener(bit_depth); 967 968 for (int j = 0; j < stripe_width; j += procunit_width) { 969 int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); 970 const uint8_t *src8_p = src8 + j; 971 uint8_t *dst8_p = dst8 + j; 972 av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride, 973 rui->wiener_info.hfilter, 16, 974 rui->wiener_info.vfilter, 16, w, 975 stripe_height, &conv_params, 
bit_depth); 976 } 977 } 978 979 static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui, 980 int stripe_width, int stripe_height, 981 int procunit_width, 982 const uint8_t *src8, int src_stride, 983 uint8_t *dst8, int dst_stride, 984 int32_t *tmpbuf, int bit_depth) { 985 for (int j = 0; j < stripe_width; j += procunit_width) { 986 int w = AOMMIN(procunit_width, stripe_width - j); 987 apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride, 988 rui->sgrproj_info.ep, rui->sgrproj_info.xqd, 989 dst8 + j, dst_stride, tmpbuf, bit_depth, 1); 990 } 991 } 992 993 typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui, 994 int stripe_width, int stripe_height, 995 int procunit_width, const uint8_t *src, 996 int src_stride, uint8_t *dst, int dst_stride, 997 int32_t *tmpbuf, int bit_depth); 998 999 #define NUM_STRIPE_FILTERS 4 1000 1001 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { 1002 wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd, 1003 sgrproj_filter_stripe_highbd 1004 }; 1005 1006 // Filter one restoration unit 1007 void av1_loop_restoration_filter_unit( 1008 const RestorationTileLimits *limits, const RestorationUnitInfo *rui, 1009 const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs, 1010 const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y, 1011 int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8, 1012 int dst_stride, int32_t *tmpbuf, int optimized_lr) { 1013 RestorationType unit_rtype = rui->restoration_type; 1014 1015 int unit_h = limits->v_end - limits->v_start; 1016 int unit_w = limits->h_end - limits->h_start; 1017 uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start; 1018 uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start; 1019 1020 if (unit_rtype == RESTORE_NONE) { 1021 copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd); 1022 return; 1023 } 1024 1025 const int 
filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ); 1026 assert(filter_idx < NUM_STRIPE_FILTERS); 1027 const stripe_filter_fun stripe_filter = stripe_filters[filter_idx]; 1028 1029 const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; 1030 1031 // Convolve the whole tile one stripe at a time 1032 RestorationTileLimits remaining_stripes = *limits; 1033 int i = 0; 1034 while (i < unit_h) { 1035 int copy_above, copy_below; 1036 remaining_stripes.v_start = limits->v_start + i; 1037 1038 get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, ©_above, 1039 ©_below); 1040 1041 const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; 1042 const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; 1043 1044 // Work out where this stripe's boundaries are within 1045 // rsb->stripe_boundary_{above,below} 1046 const int tile_stripe = 1047 (remaining_stripes.v_start - tile_rect->top + runit_offset) / 1048 full_stripe_height; 1049 const int frame_stripe = tile_stripe0 + tile_stripe; 1050 const int rsb_row = RESTORATION_CTX_VERT * frame_stripe; 1051 1052 // Calculate this stripe's height, based on two rules: 1053 // * The topmost stripe in each tile is 8 luma pixels shorter than usual. 1054 // * We can't extend past the end of the current restoration unit 1055 const int nominal_stripe_height = 1056 full_stripe_height - ((tile_stripe == 0) ? 
runit_offset : 0); 1057 const int h = AOMMIN(nominal_stripe_height, 1058 remaining_stripes.v_end - remaining_stripes.v_start); 1059 1060 setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd, 1061 h, data8, stride, rlbs, copy_above, 1062 copy_below, optimized_lr); 1063 1064 stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride, 1065 dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth); 1066 1067 restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h, 1068 data8, stride, copy_above, copy_below, 1069 optimized_lr); 1070 1071 i += h; 1072 } 1073 } 1074 1075 static void filter_frame_on_unit(const RestorationTileLimits *limits, 1076 const AV1PixelRect *tile_rect, 1077 int rest_unit_idx, void *priv, int32_t *tmpbuf, 1078 RestorationLineBuffers *rlbs) { 1079 FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv; 1080 const RestorationInfo *rsi = ctxt->rsi; 1081 1082 av1_loop_restoration_filter_unit( 1083 limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect, 1084 ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth, 1085 ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf, 1086 rsi->optimized_lr); 1087 } 1088 1089 void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, 1090 YV12_BUFFER_CONFIG *frame, 1091 AV1_COMMON *cm, int optimized_lr, 1092 int num_planes) { 1093 const SequenceHeader *const seq_params = &cm->seq_params; 1094 const int bit_depth = seq_params->bit_depth; 1095 const int highbd = seq_params->use_highbitdepth; 1096 lr_ctxt->dst = &cm->rst_frame; 1097 1098 const int frame_width = frame->crop_widths[0]; 1099 const int frame_height = frame->crop_heights[0]; 1100 if (aom_realloc_frame_buffer( 1101 lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, 1102 seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, 1103 cm->byte_alignment, NULL, NULL, NULL) < 0) 1104 aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, 
1105 "Failed to allocate restoration dst buffer"); 1106 1107 lr_ctxt->on_rest_unit = filter_frame_on_unit; 1108 lr_ctxt->frame = frame; 1109 for (int plane = 0; plane < num_planes; ++plane) { 1110 RestorationInfo *rsi = &cm->rst_info[plane]; 1111 RestorationType rtype = rsi->frame_restoration_type; 1112 rsi->optimized_lr = optimized_lr; 1113 1114 if (rtype == RESTORE_NONE) { 1115 continue; 1116 } 1117 1118 const int is_uv = plane > 0; 1119 const int plane_width = frame->crop_widths[is_uv]; 1120 const int plane_height = frame->crop_heights[is_uv]; 1121 FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane]; 1122 1123 extend_frame(frame->buffers[plane], plane_width, plane_height, 1124 frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER, 1125 highbd); 1126 1127 lr_plane_ctxt->rsi = rsi; 1128 lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x; 1129 lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y; 1130 lr_plane_ctxt->highbd = highbd; 1131 lr_plane_ctxt->bit_depth = bit_depth; 1132 lr_plane_ctxt->data8 = frame->buffers[plane]; 1133 lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane]; 1134 lr_plane_ctxt->data_stride = frame->strides[is_uv]; 1135 lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv]; 1136 lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv); 1137 lr_plane_ctxt->tile_stripe0 = 0; 1138 } 1139 } 1140 1141 void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, 1142 AV1_COMMON *cm, int num_planes) { 1143 typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, 1144 YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, 1145 int vstart, int vend); 1146 static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, 1147 aom_yv12_partial_coloc_copy_u, 1148 aom_yv12_partial_coloc_copy_v }; 1149 1150 for (int plane = 0; plane < num_planes; ++plane) { 1151 if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; 1152 AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect; 1153 
copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left, 1154 tile_rect.right, tile_rect.top, tile_rect.bottom); 1155 } 1156 } 1157 1158 static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm, 1159 int num_planes) { 1160 FilterFrameCtxt *ctxt = lr_ctxt->ctxt; 1161 1162 for (int plane = 0; plane < num_planes; ++plane) { 1163 if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) { 1164 continue; 1165 } 1166 1167 av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, 1168 &ctxt[plane], &ctxt[plane].tile_rect, 1169 cm->rst_tmpbuf, cm->rlbs); 1170 } 1171 } 1172 1173 void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, 1174 AV1_COMMON *cm, int optimized_lr, 1175 void *lr_ctxt) { 1176 assert(!cm->all_lossless); 1177 const int num_planes = av1_num_planes(cm); 1178 1179 AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; 1180 1181 av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, 1182 optimized_lr, num_planes); 1183 1184 foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes); 1185 1186 av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes); 1187 } 1188 1189 void av1_foreach_rest_unit_in_row( 1190 RestorationTileLimits *limits, const AV1PixelRect *tile_rect, 1191 rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, 1192 int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane, 1193 void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, 1194 sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write, 1195 struct AV1LrSyncData *const lr_sync) { 1196 const int tile_w = tile_rect->right - tile_rect->left; 1197 const int ext_size = unit_size * 3 / 2; 1198 int x0 = 0, j = 0; 1199 while (x0 < tile_w) { 1200 int remaining_w = tile_w - x0; 1201 int w = (remaining_w < ext_size) ? 
remaining_w : unit_size; 1202 1203 limits->h_start = tile_rect->left + x0; 1204 limits->h_end = tile_rect->left + x0 + w; 1205 assert(limits->h_end <= tile_rect->right); 1206 1207 const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j; 1208 1209 // No sync for even numbered rows 1210 // For odd numbered rows, Loop Restoration of current block requires the LR 1211 // of top-right and bottom-right blocks to be completed 1212 1213 // top-right sync 1214 on_sync_read(lr_sync, row_number, j, plane); 1215 if ((row_number + 1) < vunits_per_tile) 1216 // bottom-right sync 1217 on_sync_read(lr_sync, row_number + 2, j, plane); 1218 1219 on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs); 1220 1221 on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane); 1222 1223 x0 += w; 1224 ++j; 1225 } 1226 } 1227 1228 void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) { 1229 (void)lr_sync; 1230 (void)r; 1231 (void)c; 1232 (void)plane; 1233 } 1234 1235 void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, 1236 const int sb_cols, int plane) { 1237 (void)lr_sync; 1238 (void)r; 1239 (void)c; 1240 (void)sb_cols; 1241 (void)plane; 1242 } 1243 1244 static void foreach_rest_unit_in_tile( 1245 const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols, 1246 int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size, 1247 int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv, 1248 int32_t *tmpbuf, RestorationLineBuffers *rlbs) { 1249 const int tile_h = tile_rect->bottom - tile_rect->top; 1250 const int ext_size = unit_size * 3 / 2; 1251 1252 const int tile_idx = tile_col + tile_row * tile_cols; 1253 const int unit_idx0 = tile_idx * units_per_tile; 1254 1255 int y0 = 0, i = 0; 1256 while (y0 < tile_h) { 1257 int remaining_h = tile_h - y0; 1258 int h = (remaining_h < ext_size) ? 
remaining_h : unit_size; 1259 1260 RestorationTileLimits limits; 1261 limits.v_start = tile_rect->top + y0; 1262 limits.v_end = tile_rect->top + y0 + h; 1263 assert(limits.v_end <= tile_rect->bottom); 1264 // Offset the tile upwards to align with the restoration processing stripe 1265 const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; 1266 limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset); 1267 if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset; 1268 1269 av1_foreach_rest_unit_in_row( 1270 &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0, 1271 hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs, 1272 av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL); 1273 1274 y0 += h; 1275 ++i; 1276 } 1277 } 1278 1279 void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, 1280 rest_unit_visitor_t on_rest_unit, 1281 void *priv, AV1PixelRect *tile_rect, 1282 int32_t *tmpbuf, 1283 RestorationLineBuffers *rlbs) { 1284 const int is_uv = plane > 0; 1285 const int ss_y = is_uv && cm->seq_params.subsampling_y; 1286 1287 const RestorationInfo *rsi = &cm->rst_info[plane]; 1288 1289 foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS, 1290 rsi->horz_units_per_tile, rsi->vert_units_per_tile, 1291 rsi->units_per_tile, rsi->restoration_unit_size, 1292 ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs); 1293 } 1294 1295 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, 1296 int mi_row, int mi_col, BLOCK_SIZE bsize, 1297 int *rcol0, int *rcol1, int *rrow0, 1298 int *rrow1) { 1299 assert(rcol0 && rcol1 && rrow0 && rrow1); 1300 1301 if (bsize != cm->seq_params.sb_size) return 0; 1302 if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0; 1303 1304 assert(!cm->all_lossless); 1305 1306 const int is_uv = plane > 0; 1307 1308 const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv); 1309 const int tile_w = tile_rect.right - tile_rect.left; 1310 const int tile_h 
= tile_rect.bottom - tile_rect.top; 1311 1312 const int mi_top = 0; 1313 const int mi_left = 0; 1314 1315 // Compute the mi-unit corners of the superblock relative to the top-left of 1316 // the tile 1317 const int mi_rel_row0 = mi_row - mi_top; 1318 const int mi_rel_col0 = mi_col - mi_left; 1319 const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize]; 1320 const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize]; 1321 1322 const RestorationInfo *rsi = &cm->rst_info[plane]; 1323 const int size = rsi->restoration_unit_size; 1324 1325 // Calculate the number of restoration units in this tile (which might be 1326 // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile) 1327 const int horz_units = av1_lr_count_units_in_tile(size, tile_w); 1328 const int vert_units = av1_lr_count_units_in_tile(size, tile_h); 1329 1330 // The size of an MI-unit on this plane of the image 1331 const int ss_x = is_uv && cm->seq_params.subsampling_x; 1332 const int ss_y = is_uv && cm->seq_params.subsampling_y; 1333 const int mi_size_x = MI_SIZE >> ss_x; 1334 const int mi_size_y = MI_SIZE >> ss_y; 1335 1336 // Write m for the relative mi column or row, D for the superres denominator 1337 // and N for the superres numerator. If u is the upscaled pixel offset then 1338 // we can write the downscaled pixel offset in two ways as: 1339 // 1340 // MI_SIZE * m = N / D u 1341 // 1342 // from which we get u = D * MI_SIZE * m / N 1343 const int mi_to_num_x = av1_superres_scaled(cm) 1344 ? mi_size_x * cm->superres_scale_denominator 1345 : mi_size_x; 1346 const int mi_to_num_y = mi_size_y; 1347 const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size; 1348 const int denom_y = size; 1349 1350 const int rnd_x = denom_x - 1; 1351 const int rnd_y = denom_y - 1; 1352 1353 // rcol0/rrow0 should be the first column/row of restoration units (relative 1354 // to the top-left of the tile) that doesn't start left/below of 1355 // mi_col/mi_row. 
For this calculation, we need to round up the division (if 1356 // the sb starts at runit column 10.1, the first matching runit has column 1357 // index 11) 1358 *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x; 1359 *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y; 1360 1361 // rel_col1/rel_row1 is the equivalent calculation, but for the superblock 1362 // below-right. If we're at the bottom or right of the tile, this restoration 1363 // unit might not exist, in which case we'll clamp accordingly. 1364 *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units); 1365 *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units); 1366 1367 return *rcol0 < *rcol1 && *rrow0 < *rrow1; 1368 } 1369 1370 // Extend to left and right 1371 static void extend_lines(uint8_t *buf, int width, int height, int stride, 1372 int extend, int use_highbitdepth) { 1373 for (int i = 0; i < height; ++i) { 1374 if (use_highbitdepth) { 1375 uint16_t *buf16 = (uint16_t *)buf; 1376 aom_memset16(buf16 - extend, buf16[0], extend); 1377 aom_memset16(buf16 + width, buf16[width - 1], extend); 1378 } else { 1379 memset(buf - extend, buf[0], extend); 1380 memset(buf + width, buf[width - 1], extend); 1381 } 1382 buf += stride; 1383 } 1384 } 1385 1386 static void save_deblock_boundary_lines( 1387 const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row, 1388 int stripe, int use_highbd, int is_above, 1389 RestorationStripeBoundaries *boundaries) { 1390 const int is_uv = plane > 0; 1391 const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); 1392 const int src_stride = frame->strides[is_uv] << use_highbd; 1393 const uint8_t *src_rows = src_buf + row * src_stride; 1394 1395 uint8_t *bdry_buf = is_above ? 
boundaries->stripe_boundary_above 1396 : boundaries->stripe_boundary_below; 1397 uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); 1398 const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; 1399 uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; 1400 1401 // There is a rare case in which a processing stripe can end 1px above the 1402 // crop border. In this case, we do want to use deblocked pixels from below 1403 // the stripe (hence why we ended up in this function), but instead of 1404 // fetching 2 "below" rows we need to fetch one and duplicate it. 1405 // This is equivalent to clamping the sample locations against the crop border 1406 const int lines_to_save = 1407 AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row); 1408 assert(lines_to_save == 1 || lines_to_save == 2); 1409 1410 int upscaled_width; 1411 int line_bytes; 1412 if (av1_superres_scaled(cm)) { 1413 const int ss_x = is_uv && cm->seq_params.subsampling_x; 1414 upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x; 1415 line_bytes = upscaled_width << use_highbd; 1416 if (use_highbd) 1417 av1_upscale_normative_rows( 1418 cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv], 1419 CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride, 1420 plane, lines_to_save); 1421 else 1422 av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows, 1423 boundaries->stripe_boundary_stride, plane, 1424 lines_to_save); 1425 } else { 1426 upscaled_width = frame->crop_widths[is_uv]; 1427 line_bytes = upscaled_width << use_highbd; 1428 for (int i = 0; i < lines_to_save; i++) { 1429 memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride, 1430 line_bytes); 1431 } 1432 } 1433 // If we only saved one line, then copy it into the second line buffer 1434 if (lines_to_save == 1) 1435 memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes); 1436 1437 extend_lines(bdry_rows, upscaled_width, 
RESTORATION_CTX_VERT, bdry_stride, 1438 RESTORATION_EXTRA_HORZ, use_highbd); 1439 } 1440 1441 static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame, 1442 const AV1_COMMON *cm, int plane, int row, 1443 int stripe, int use_highbd, int is_above, 1444 RestorationStripeBoundaries *boundaries) { 1445 const int is_uv = plane > 0; 1446 const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); 1447 const int src_stride = frame->strides[is_uv] << use_highbd; 1448 const uint8_t *src_rows = src_buf + row * src_stride; 1449 1450 uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above 1451 : boundaries->stripe_boundary_below; 1452 uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); 1453 const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; 1454 uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; 1455 const int src_width = frame->crop_widths[is_uv]; 1456 1457 // At the point where this function is called, we've already applied 1458 // superres. So we don't need to extend the lines here, we can just 1459 // pull directly from the topmost row of the upscaled frame. 1460 const int ss_x = is_uv && cm->seq_params.subsampling_x; 1461 const int upscaled_width = av1_superres_scaled(cm) 1462 ? (cm->superres_upscaled_width + ss_x) >> ss_x 1463 : src_width; 1464 const int line_bytes = upscaled_width << use_highbd; 1465 for (int i = 0; i < RESTORATION_CTX_VERT; i++) { 1466 // Copy the line at 'row' into both context lines. This is because 1467 // we want to (effectively) extend the outermost row of CDEF data 1468 // from this tile to produce a border, rather than using deblocked 1469 // pixels from the tile above/below. 
1470 memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes); 1471 } 1472 extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, 1473 RESTORATION_EXTRA_HORZ, use_highbd); 1474 } 1475 1476 static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame, 1477 int use_highbd, int plane, 1478 AV1_COMMON *cm, int after_cdef) { 1479 const int is_uv = plane > 0; 1480 const int ss_y = is_uv && cm->seq_params.subsampling_y; 1481 const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; 1482 const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y; 1483 1484 // Get the tile rectangle, with height rounded up to the next multiple of 8 1485 // luma pixels (only relevant for the bottom tile of the frame) 1486 const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv); 1487 const int stripe0 = 0; 1488 1489 RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries; 1490 1491 const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y); 1492 1493 int tile_stripe; 1494 for (tile_stripe = 0;; ++tile_stripe) { 1495 const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off); 1496 const int y0 = tile_rect.top + rel_y0; 1497 if (y0 >= tile_rect.bottom) break; 1498 1499 const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off; 1500 const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom); 1501 1502 const int frame_stripe = stripe0 + tile_stripe; 1503 1504 // In this case, we should only use CDEF pixels at the top 1505 // and bottom of the frame as a whole; internal tile boundaries 1506 // can use deblocked pixels from adjacent tiles for context. 1507 const int use_deblock_above = (frame_stripe > 0); 1508 const int use_deblock_below = (y1 < plane_height); 1509 1510 if (!after_cdef) { 1511 // Save deblocked context where needed. 
1512 if (use_deblock_above) { 1513 save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT, 1514 frame_stripe, use_highbd, 1, boundaries); 1515 } 1516 if (use_deblock_below) { 1517 save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe, 1518 use_highbd, 0, boundaries); 1519 } 1520 } else { 1521 // Save CDEF context where needed. Note that we need to save the CDEF 1522 // context for a particular boundary iff we *didn't* save deblocked 1523 // context for that boundary. 1524 // 1525 // In addition, we need to save copies of the outermost line within 1526 // the tile, rather than using data from outside the tile. 1527 if (!use_deblock_above) { 1528 save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd, 1529 1, boundaries); 1530 } 1531 if (!use_deblock_below) { 1532 save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe, 1533 use_highbd, 0, boundaries); 1534 } 1535 } 1536 } 1537 } 1538 1539 // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan 1540 // lines to be used as boundary in the loop restoration process. The 1541 // lines are saved in rst_internal.stripe_boundary_lines 1542 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, 1543 AV1_COMMON *cm, int after_cdef) { 1544 const int num_planes = av1_num_planes(cm); 1545 const int use_highbd = cm->seq_params.use_highbitdepth; 1546 for (int p = 0; p < num_planes; ++p) { 1547 save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef); 1548 } 1549 } 1550