Home | History | Annotate | Download | only in common
      1 /*
      2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  *
     11  */
     12 
     13 #include <math.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/aom_dsp_rtcd.h"
     17 #include "config/aom_scale_rtcd.h"
     18 
     19 #include "aom_mem/aom_mem.h"
     20 #include "av1/common/onyxc_int.h"
     21 #include "av1/common/resize.h"
     22 #include "av1/common/restoration.h"
     23 #include "aom_dsp/aom_dsp_common.h"
     24 #include "aom_mem/aom_mem.h"
     25 
     26 #include "aom_ports/mem.h"
     27 
     28 // The 's' values are calculated based on original 'r' and 'e' values in the
     29 // spec using GenSgrprojVtable().
     30 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
     31 const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
     32   { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
     33   { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
     34   { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
     35   { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
     36   { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
     37   { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
     38   { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
     39   { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
     40 };
     41 
     42 AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
     43   AV1PixelRect rect;
     44 
     45   int ss_x = is_uv && cm->seq_params.subsampling_x;
     46   int ss_y = is_uv && cm->seq_params.subsampling_y;
     47 
     48   rect.top = 0;
     49   rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
     50   rect.left = 0;
     51   rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
     52   return rect;
     53 }
     54 
     55 // Count horizontal or vertical units per tile (use a width or height for
     56 // tile_size, respectively). We basically want to divide the tile size by the
     57 // size of a restoration unit. Rather than rounding up unconditionally as you
     58 // might expect, we round to nearest, which models the way a right or bottom
     59 // restoration unit can extend to up to 150% its normal width or height. The
     60 // max with 1 is to deal with tiles that are smaller than half of a restoration
     61 // unit.
     62 int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
     63   return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1);
     64 }
     65 
     66 void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
     67                                   int is_uv) {
     68   // We need to allocate enough space for restoration units to cover the
     69   // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the
     70   // top-left and we can use av1_get_tile_rect(). With CONFIG_MAX_TILE, we have
     71   // to do the computation ourselves, iterating over the tiles and keeping
     72   // track of the largest width and height, then upscaling.
     73   const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
     74   const int max_tile_w = tile_rect.right - tile_rect.left;
     75   const int max_tile_h = tile_rect.bottom - tile_rect.top;
     76 
     77   // To calculate hpertile and vpertile (horizontal and vertical units per
     78   // tile), we basically want to divide the largest tile width or height by the
     79   // size of a restoration unit. Rather than rounding up unconditionally as you
     80   // might expect, we round to nearest, which models the way a right or bottom
     81   // restoration unit can extend to up to 150% its normal width or height. The
     82   // max with 1 is to deal with tiles that are smaller than half of a
     83   // restoration unit.
     84   const int unit_size = rsi->restoration_unit_size;
     85   const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w);
     86   const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h);
     87 
     88   rsi->units_per_tile = hpertile * vpertile;
     89   rsi->horz_units_per_tile = hpertile;
     90   rsi->vert_units_per_tile = vpertile;
     91 
     92   const int ntiles = 1;
     93   const int nunits = ntiles * rsi->units_per_tile;
     94 
     95   aom_free(rsi->unit_info);
     96   CHECK_MEM_ERROR(cm, rsi->unit_info,
     97                   (RestorationUnitInfo *)aom_memalign(
     98                       16, sizeof(*rsi->unit_info) * nunits));
     99 }
    100 
    101 void av1_free_restoration_struct(RestorationInfo *rst_info) {
    102   aom_free(rst_info->unit_info);
    103   rst_info->unit_info = NULL;
    104 }
    105 
    106 #if 0
    107 // Pair of values for each sgrproj parameter:
    108 // Index 0 corresponds to r[0], e[0]
    109 // Index 1 corresponds to r[1], e[1]
    110 int sgrproj_mtable[SGRPROJ_PARAMS][2];
    111 
    112 static void GenSgrprojVtable() {
    113   for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
    114     const sgr_params_type *const params = &sgr_params[i];
    115     for (int j = 0; j < 2; ++j) {
    116       const int e = params->e[j];
    117       const int r = params->r[j];
    118       if (r == 0) {                 // filter is disabled
    119         sgrproj_mtable[i][j] = -1;  // mark invalid
    120       } else {                      // filter is enabled
    121         const int n = (2 * r + 1) * (2 * r + 1);
    122         const int n2e = n * n * e;
    123         assert(n2e != 0);
    124         sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    125       }
    126     }
    127   }
    128 }
    129 #endif
    130 
    131 void av1_loop_restoration_precal() {
    132 #if 0
    133   GenSgrprojVtable();
    134 #endif
    135 }
    136 
    137 static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
    138                                int border_horz, int border_vert) {
    139   uint8_t *data_p;
    140   int i;
    141   for (i = 0; i < height; ++i) {
    142     data_p = data + i * stride;
    143     memset(data_p - border_horz, data_p[0], border_horz);
    144     memset(data_p + width, data_p[width - 1], border_horz);
    145   }
    146   data_p = data - border_horz;
    147   for (i = -border_vert; i < 0; ++i) {
    148     memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
    149   }
    150   for (i = height; i < height + border_vert; ++i) {
    151     memcpy(data_p + i * stride, data_p + (height - 1) * stride,
    152            width + 2 * border_horz);
    153   }
    154 }
    155 
    156 static void extend_frame_highbd(uint16_t *data, int width, int height,
    157                                 int stride, int border_horz, int border_vert) {
    158   uint16_t *data_p;
    159   int i, j;
    160   for (i = 0; i < height; ++i) {
    161     data_p = data + i * stride;
    162     for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
    163     for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
    164   }
    165   data_p = data - border_horz;
    166   for (i = -border_vert; i < 0; ++i) {
    167     memcpy(data_p + i * stride, data_p,
    168            (width + 2 * border_horz) * sizeof(uint16_t));
    169   }
    170   for (i = height; i < height + border_vert; ++i) {
    171     memcpy(data_p + i * stride, data_p + (height - 1) * stride,
    172            (width + 2 * border_horz) * sizeof(uint16_t));
    173   }
    174 }
    175 
    176 void extend_frame(uint8_t *data, int width, int height, int stride,
    177                   int border_horz, int border_vert, int highbd) {
    178   if (highbd)
    179     extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
    180                         border_horz, border_vert);
    181   else
    182     extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
    183 }
    184 
    185 static void copy_tile_lowbd(int width, int height, const uint8_t *src,
    186                             int src_stride, uint8_t *dst, int dst_stride) {
    187   for (int i = 0; i < height; ++i)
    188     memcpy(dst + i * dst_stride, src + i * src_stride, width);
    189 }
    190 
    191 static void copy_tile_highbd(int width, int height, const uint16_t *src,
    192                              int src_stride, uint16_t *dst, int dst_stride) {
    193   for (int i = 0; i < height; ++i)
    194     memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
    195 }
    196 
    197 static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
    198                       uint8_t *dst, int dst_stride, int highbd) {
    199   if (highbd)
    200     copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
    201                      CONVERT_TO_SHORTPTR(dst), dst_stride);
    202   else
    203     copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
    204 }
    205 
    206 #define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
    207 
    208 // With striped loop restoration, the filtering for each 64-pixel stripe gets
    209 // most of its input from the output of CDEF (stored in data8), but we need to
// fill out a border of 3 pixels above/below the stripe according to the
// following rules:
    213 //
    214 // * At a frame boundary, we copy the outermost row of CDEF pixels three times.
    215 //   This extension is done by a call to extend_frame() at the start of the loop
    216 //   restoration process, so the value of copy_above/copy_below doesn't strictly
    217 //   matter.
    218 //   However, by setting *copy_above = *copy_below = 1 whenever loop filtering
    219 //   across tiles is disabled, we can allow
    220 //   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
    221 //   data has always been copied, simplifying the behaviour at the left and
    222 //   right edges of tiles.
    223 //
    224 // * If we're at a tile boundary and loop filtering across tiles is enabled,
    225 //   then there is a logical stripe which is 64 pixels high, but which is split
    226 //   into an 8px high and a 56px high stripe so that the processing (and
    227 //   coefficient set usage) can be aligned to tiles.
    228 //   In this case, we use the 3 rows of CDEF output across the boundary for
    229 //   context; this corresponds to leaving the frame buffer as-is.
    230 //
    231 // * If we're at a tile boundary and loop filtering across tiles is disabled,
    232 //   then we take the outermost row of CDEF pixels *within the current tile*
    233 //   and copy it three times. Thus we behave exactly as if the tile were a full
    234 //   frame.
    235 //
    236 // * Otherwise, we're at a stripe boundary within a tile. In that case, we
    237 //   take 2 rows of deblocked pixels and extend them to 3 rows of context.
    238 //
    239 // The distinction between the latter two cases is handled by the
    240 // av1_loop_restoration_save_boundary_lines() function, so here we just need
    241 // to decide if we're overwriting the above/below boundary pixels or not.
    242 static void get_stripe_boundary_info(const RestorationTileLimits *limits,
    243                                      const AV1PixelRect *tile_rect, int ss_y,
    244                                      int *copy_above, int *copy_below) {
    245   *copy_above = 1;
    246   *copy_below = 1;
    247 
    248   const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
    249   const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
    250 
    251   const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
    252   const int this_stripe_height =
    253       full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
    254   const int last_stripe_in_tile =
    255       (limits->v_start + this_stripe_height >= tile_rect->bottom);
    256 
    257   if (first_stripe_in_tile) *copy_above = 0;
    258   if (last_stripe_in_tile) *copy_below = 0;
    259 }
    260 
    261 // Overwrite the border pixels around a processing stripe so that the conditions
    262 // listed above get_stripe_boundary_info() are preserved.
    263 // We save the pixels which get overwritten into a temporary buffer, so that
    264 // they can be restored by restore_processing_stripe_boundary() after we've
    265 // processed the stripe.
    266 //
    267 // limits gives the rectangular limits of the remaining stripes for the current
    268 // restoration unit. rsb is the stored stripe boundaries (taken from either
    269 // deblock or CDEF output as necessary).
    270 //
    271 // tile_rect is the limits of the current tile and tile_stripe0 is the index of
    272 // the first stripe in this tile (needed to convert the tile-relative stripe
    273 // index we get from limits into something we can look up in rsb).
    274 static void setup_processing_stripe_boundary(
    275     const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
    276     int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
    277     RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
    278   // Offsets within the line buffers. The buffer logically starts at column
    279   // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
    280   // has column x0 in the buffer.
    281   const int buf_stride = rsb->stripe_boundary_stride;
    282   const int buf_x0_off = limits->h_start;
    283   const int line_width =
    284       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
    285   const int line_size = line_width << use_highbd;
    286 
    287   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
    288 
    289   // Replace RESTORATION_BORDER pixels above the top of the stripe
    290   // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
    291   // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
    292   // duplicating the topmost of the 2 lines (see the AOMMAX call when
    293   // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1).
    294   //
    295   // Special case: If we're at the top of a tile, which isn't on the topmost
    296   // tile row, and we're allowed to loop filter across tiles, then we have a
    297   // logical 64-pixel-high stripe which has been split into an 8-pixel high
    298   // stripe and a 56-pixel high stripe (the current one). So, in this case,
    299   // we want to leave the boundary alone!
    300   if (!opt) {
    301     if (copy_above) {
    302       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
    303 
    304       for (int i = -RESTORATION_BORDER; i < 0; ++i) {
    305         const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
    306         const int buf_off = buf_x0_off + buf_row * buf_stride;
    307         const uint8_t *buf =
    308             rsb->stripe_boundary_above + (buf_off << use_highbd);
    309         uint8_t *dst8 = data8_tl + i * data_stride;
    310         // Save old pixels, then replace with data from stripe_boundary_above
    311         memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
    312                REAL_PTR(use_highbd, dst8), line_size);
    313         memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
    314       }
    315     }
    316 
    317     // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
    318     // The second buffer row is repeated, so src_row gets the values 0, 1, 1
    319     // for i = 0, 1, 2.
    320     if (copy_below) {
    321       const int stripe_end = limits->v_start + h;
    322       uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
    323 
    324       for (int i = 0; i < RESTORATION_BORDER; ++i) {
    325         const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
    326         const int buf_off = buf_x0_off + buf_row * buf_stride;
    327         const uint8_t *src =
    328             rsb->stripe_boundary_below + (buf_off << use_highbd);
    329 
    330         uint8_t *dst8 = data8_bl + i * data_stride;
    331         // Save old pixels, then replace with data from stripe_boundary_below
    332         memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
    333         memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
    334       }
    335     }
    336   } else {
    337     if (copy_above) {
    338       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
    339 
    340       // Only save and overwrite i=-RESTORATION_BORDER line.
    341       uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
    342       // Save old pixels, then replace with data from stripe_boundary_above
    343       memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
    344       memcpy(REAL_PTR(use_highbd, dst8),
    345              REAL_PTR(use_highbd,
    346                       data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
    347              line_size);
    348     }
    349 
    350     if (copy_below) {
    351       const int stripe_end = limits->v_start + h;
    352       uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
    353 
    354       // Only save and overwrite i=2 line.
    355       uint8_t *dst8 = data8_bl + 2 * data_stride;
    356       // Save old pixels, then replace with data from stripe_boundary_below
    357       memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
    358       memcpy(REAL_PTR(use_highbd, dst8),
    359              REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
    360     }
    361   }
    362 }
    363 
    364 // This function restores the boundary lines modified by
    365 // setup_processing_stripe_boundary.
    366 //
    367 // Note: We need to be careful when handling the corners of the processing
    368 // unit, because (eg.) the top-left corner is considered to be part of
    369 // both the left and top borders. This means that, depending on the
    370 // loop_filter_across_tiles_enabled flag, the corner pixels might get
    371 // overwritten twice, once as part of the "top" border and once as part
    372 // of the "left" border (or similar for other corners).
    373 //
    374 // Everything works out fine as long as we make sure to reverse the order
    375 // when restoring, ie. we need to restore the left/right borders followed
    376 // by the top/bottom borders.
    377 static void restore_processing_stripe_boundary(
    378     const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
    379     int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
    380     int copy_below, int opt) {
    381   const int line_width =
    382       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
    383   const int line_size = line_width << use_highbd;
    384 
    385   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
    386 
    387   if (!opt) {
    388     if (copy_above) {
    389       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
    390       for (int i = -RESTORATION_BORDER; i < 0; ++i) {
    391         uint8_t *dst8 = data8_tl + i * data_stride;
    392         memcpy(REAL_PTR(use_highbd, dst8),
    393                rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
    394       }
    395     }
    396 
    397     if (copy_below) {
    398       const int stripe_bottom = limits->v_start + h;
    399       uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
    400 
    401       for (int i = 0; i < RESTORATION_BORDER; ++i) {
    402         if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
    403 
    404         uint8_t *dst8 = data8_bl + i * data_stride;
    405         memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
    406       }
    407     }
    408   } else {
    409     if (copy_above) {
    410       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
    411 
    412       // Only restore i=-RESTORATION_BORDER line.
    413       uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
    414       memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
    415     }
    416 
    417     if (copy_below) {
    418       const int stripe_bottom = limits->v_start + h;
    419       uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
    420 
    421       // Only restore i=2 line.
    422       if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
    423         uint8_t *dst8 = data8_bl + 2 * data_stride;
    424         memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
    425       }
    426     }
    427   }
    428 }
    429 
    430 static void wiener_filter_stripe(const RestorationUnitInfo *rui,
    431                                  int stripe_width, int stripe_height,
    432                                  int procunit_width, const uint8_t *src,
    433                                  int src_stride, uint8_t *dst, int dst_stride,
    434                                  int32_t *tmpbuf, int bit_depth) {
    435   (void)tmpbuf;
    436   (void)bit_depth;
    437   assert(bit_depth == 8);
    438   const ConvolveParams conv_params = get_conv_params_wiener(8);
    439 
    440   for (int j = 0; j < stripe_width; j += procunit_width) {
    441     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
    442     const uint8_t *src_p = src + j;
    443     uint8_t *dst_p = dst + j;
    444     av1_wiener_convolve_add_src(
    445         src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
    446         rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
    447   }
    448 }
    449 
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1 (boxsum1) and r = 2 (boxsum2). No other radius is
   supported: boxsum() asserts on any other value of r.

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
    457 static void boxsum1(int32_t *src, int width, int height, int src_stride,
    458                     int sqr, int32_t *dst, int dst_stride) {
    459   int i, j, a, b, c;
    460   assert(width > 2 * SGRPROJ_BORDER_HORZ);
    461   assert(height > 2 * SGRPROJ_BORDER_VERT);
    462 
    463   // Vertical sum over 3-pixel regions, from src into dst.
    464   if (!sqr) {
    465     for (j = 0; j < width; ++j) {
    466       a = src[j];
    467       b = src[src_stride + j];
    468       c = src[2 * src_stride + j];
    469 
    470       dst[j] = a + b;
    471       for (i = 1; i < height - 2; ++i) {
    472         // Loop invariant: At the start of each iteration,
    473         // a = src[(i - 1) * src_stride + j]
    474         // b = src[(i    ) * src_stride + j]
    475         // c = src[(i + 1) * src_stride + j]
    476         dst[i * dst_stride + j] = a + b + c;
    477         a = b;
    478         b = c;
    479         c = src[(i + 2) * src_stride + j];
    480       }
    481       dst[i * dst_stride + j] = a + b + c;
    482       dst[(i + 1) * dst_stride + j] = b + c;
    483     }
    484   } else {
    485     for (j = 0; j < width; ++j) {
    486       a = src[j] * src[j];
    487       b = src[src_stride + j] * src[src_stride + j];
    488       c = src[2 * src_stride + j] * src[2 * src_stride + j];
    489 
    490       dst[j] = a + b;
    491       for (i = 1; i < height - 2; ++i) {
    492         dst[i * dst_stride + j] = a + b + c;
    493         a = b;
    494         b = c;
    495         c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
    496       }
    497       dst[i * dst_stride + j] = a + b + c;
    498       dst[(i + 1) * dst_stride + j] = b + c;
    499     }
    500   }
    501 
    502   // Horizontal sum over 3-pixel regions of dst
    503   for (i = 0; i < height; ++i) {
    504     a = dst[i * dst_stride];
    505     b = dst[i * dst_stride + 1];
    506     c = dst[i * dst_stride + 2];
    507 
    508     dst[i * dst_stride] = a + b;
    509     for (j = 1; j < width - 2; ++j) {
    510       // Loop invariant: At the start of each iteration,
    511       // a = src[i * src_stride + (j - 1)]
    512       // b = src[i * src_stride + (j    )]
    513       // c = src[i * src_stride + (j + 1)]
    514       dst[i * dst_stride + j] = a + b + c;
    515       a = b;
    516       b = c;
    517       c = dst[i * dst_stride + (j + 2)];
    518     }
    519     dst[i * dst_stride + j] = a + b + c;
    520     dst[i * dst_stride + (j + 1)] = b + c;
    521   }
    522 }
    523 
    524 static void boxsum2(int32_t *src, int width, int height, int src_stride,
    525                     int sqr, int32_t *dst, int dst_stride) {
    526   int i, j, a, b, c, d, e;
    527   assert(width > 2 * SGRPROJ_BORDER_HORZ);
    528   assert(height > 2 * SGRPROJ_BORDER_VERT);
    529 
    530   // Vertical sum over 5-pixel regions, from src into dst.
    531   if (!sqr) {
    532     for (j = 0; j < width; ++j) {
    533       a = src[j];
    534       b = src[src_stride + j];
    535       c = src[2 * src_stride + j];
    536       d = src[3 * src_stride + j];
    537       e = src[4 * src_stride + j];
    538 
    539       dst[j] = a + b + c;
    540       dst[dst_stride + j] = a + b + c + d;
    541       for (i = 2; i < height - 3; ++i) {
    542         // Loop invariant: At the start of each iteration,
    543         // a = src[(i - 2) * src_stride + j]
    544         // b = src[(i - 1) * src_stride + j]
    545         // c = src[(i    ) * src_stride + j]
    546         // d = src[(i + 1) * src_stride + j]
    547         // e = src[(i + 2) * src_stride + j]
    548         dst[i * dst_stride + j] = a + b + c + d + e;
    549         a = b;
    550         b = c;
    551         c = d;
    552         d = e;
    553         e = src[(i + 3) * src_stride + j];
    554       }
    555       dst[i * dst_stride + j] = a + b + c + d + e;
    556       dst[(i + 1) * dst_stride + j] = b + c + d + e;
    557       dst[(i + 2) * dst_stride + j] = c + d + e;
    558     }
    559   } else {
    560     for (j = 0; j < width; ++j) {
    561       a = src[j] * src[j];
    562       b = src[src_stride + j] * src[src_stride + j];
    563       c = src[2 * src_stride + j] * src[2 * src_stride + j];
    564       d = src[3 * src_stride + j] * src[3 * src_stride + j];
    565       e = src[4 * src_stride + j] * src[4 * src_stride + j];
    566 
    567       dst[j] = a + b + c;
    568       dst[dst_stride + j] = a + b + c + d;
    569       for (i = 2; i < height - 3; ++i) {
    570         dst[i * dst_stride + j] = a + b + c + d + e;
    571         a = b;
    572         b = c;
    573         c = d;
    574         d = e;
    575         e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
    576       }
    577       dst[i * dst_stride + j] = a + b + c + d + e;
    578       dst[(i + 1) * dst_stride + j] = b + c + d + e;
    579       dst[(i + 2) * dst_stride + j] = c + d + e;
    580     }
    581   }
    582 
    583   // Horizontal sum over 5-pixel regions of dst
    584   for (i = 0; i < height; ++i) {
    585     a = dst[i * dst_stride];
    586     b = dst[i * dst_stride + 1];
    587     c = dst[i * dst_stride + 2];
    588     d = dst[i * dst_stride + 3];
    589     e = dst[i * dst_stride + 4];
    590 
    591     dst[i * dst_stride] = a + b + c;
    592     dst[i * dst_stride + 1] = a + b + c + d;
    593     for (j = 2; j < width - 3; ++j) {
    594       // Loop invariant: At the start of each iteration,
    595       // a = src[i * src_stride + (j - 2)]
    596       // b = src[i * src_stride + (j - 1)]
    597       // c = src[i * src_stride + (j    )]
    598       // d = src[i * src_stride + (j + 1)]
    599       // e = src[i * src_stride + (j + 2)]
    600       dst[i * dst_stride + j] = a + b + c + d + e;
    601       a = b;
    602       b = c;
    603       c = d;
    604       d = e;
    605       e = dst[i * dst_stride + (j + 3)];
    606     }
    607     dst[i * dst_stride + j] = a + b + c + d + e;
    608     dst[i * dst_stride + (j + 1)] = b + c + d + e;
    609     dst[i * dst_stride + (j + 2)] = c + d + e;
    610   }
    611 }
    612 
    613 static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
    614                    int sqr, int32_t *dst, int dst_stride) {
    615   if (r == 1)
    616     boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
    617   else if (r == 2)
    618     boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
    619   else
    620     assert(0 && "Invalid value of r in self-guided filter");
    621 }
    622 
    623 void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
    624   if (params->r[0] == 0) {
    625     xq[0] = 0;
    626     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
    627   } else if (params->r[1] == 0) {
    628     xq[0] = xqd[0];
    629     xq[1] = 0;
    630   } else {
    631     xq[0] = xqd[0];
    632     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
    633   }
    634 }
    635 
    636 const int32_t x_by_xplus1[256] = {
    637   // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
    638   // instead of 0. See comments in selfguided_restoration_internal() for why
    639   1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
    640   240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
    641   248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
    642   250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
    643   252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
    644   253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
    645   253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
    646   254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    647   254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    648   254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    649   254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
    650   254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    651   255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    652   255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    653   255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    654   255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    655   255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    656   256,
    657 };
    658 
    659 const int32_t one_by_x[MAX_NELEM] = {
          // Fixed-point reciprocals: one_by_x[n - 1] = round(2^12 / n) for
          // n = 1..25. Used to divide a box sum by the number of pixels in
          // the box without an integer division.
    660   4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
    661   293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
    662 };
    663 
    664 static void calculate_intermediate_result(int32_t *dgd, int width, int height,
    665                                           int dgd_stride, int bit_depth,
    666                                           int sgr_params_idx, int radius_idx,
    667                                           int pass, int32_t *A, int32_t *B) {
    668   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
    669   const int r = params->r[radius_idx];
    670   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
    671   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
    672   // Adjusting the stride of A and B here appears to avoid bad cache effects,
    673   // leading to a significant speed improvement.
    674   // We also align the stride to a multiple of 16 bytes, for consistency
    675   // with the SIMD version of this function.
    676   int buf_stride = ((width_ext + 3) & ~3) + 16;
    677   const int step = pass == 0 ? 1 : 2;
    678   int i, j;
    679 
    680   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
    681   assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
    682          "Need SGRPROJ_BORDER_* >= r+1");
    683 
    684   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
    685          width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
    686   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
    687          width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
    688   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    689   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    690   // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
    691   // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
    692   for (i = -1; i < height + 1; i += step) {
    693     for (j = -1; j < width + 1; ++j) {
    694       const int k = i * buf_stride + j;
    695       const int n = (2 * r + 1) * (2 * r + 1);
    696 
    697       // a < 2^16 * n < 2^22 regardless of bit depth
    698       uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
    699       // b < 2^8 * n < 2^14 regardless of bit depth
    700       uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
    701 
    702       // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
    703       // and p itself satisfies p < 2^14 * n^2 < 2^26.
    704       // This bound on p is due to:
    705       // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
    706       //
    707       // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
    708       // This is an artefact of rounding, and can only happen if all pixels
    709       // are (almost) identical, so in this case we saturate to p=0.
    710       uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
    711 
    712       const uint32_t s = params->s[radius_idx];
    713 
    714       // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
    715       // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
    716       // (this holds even after accounting for the rounding in s)
    717       const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
    718 
    719       // Note: We have to be quite careful about the value of A[k].
    720       // This is used as a blend factor between individual pixel values and the
    721       // local mean. So it logically has a range of [0, 256], including both
    722       // endpoints.
    723       //
    724       // This is a pain for hardware, as we'd like something which can be stored
    725       // in exactly 8 bits.
    726       // Further, in the calculation of B[k] below, if z == 0 and r == 2,
    727       // then A[k] "should be" 0. But then we can end up setting B[k] to a value
    728       // slightly above 2^(8 + bit depth), due to rounding in the value of
    729       // one_by_x[25-1].
    730       //
    731       // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
    732       // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
    733       // overflow), without significantly affecting the final result: z == 0
    734       // implies that the image is essentially "flat", so the local mean and
    735       // individual pixel values are very similar.
    736       //
    737       // Note that saturating on the other side, ie. requring A[k] <= 255,
    738       // would be a bad idea, as that corresponds to the case where the image
    739       // is very variable, when we want to preserve the local pixel value as
    740       // much as possible.
    741       A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
    742 
    743       // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
    744       // one_by_x[n - 1] = round(2^12 / n)
    745       // => the product here is < 2^(20 + bit_depth) <= 2^32,
    746       // and B[k] is set to a value < 2^(8 + bit depth)
    747       // This holds even with the rounding in one_by_x and in the overall
    748       // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
    749       B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
    750                                              (uint32_t)B[k] *
    751                                              (uint32_t)one_by_x[n - 1],
    752                                          SGRPROJ_RECIP_BITS);
    753     }
    754   }
    755 }
    756 
    757 static void selfguided_restoration_fast_internal(
    758     int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
    759     int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
    760   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
    761   const int r = params->r[radius_idx];
    762   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
    763   // Adjusting the stride of A and B here appears to avoid bad cache effects,
    764   // leading to a significant speed improvement.
    765   // We also align the stride to a multiple of 16 bytes, for consistency
    766   // with the SIMD version of this function.
    767   int buf_stride = ((width_ext + 3) & ~3) + 16;
    768   int32_t A_[RESTORATION_PROC_UNIT_PELS];
    769   int32_t B_[RESTORATION_PROC_UNIT_PELS];
    770   int32_t *A = A_;
    771   int32_t *B = B_;
    772   int i, j;
    773   calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
    774                                 sgr_params_idx, radius_idx, 1, A, B);
    775   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    776   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    777 
    778   // Use the A[] and B[] arrays to calculate the filtered image
    779   (void)r;
    780   assert(r == 2);
    781   for (i = 0; i < height; ++i) {
    782     if (!(i & 1)) {  // even row
    783       for (j = 0; j < width; ++j) {
    784         const int k = i * buf_stride + j;
    785         const int l = i * dgd_stride + j;
    786         const int m = i * dst_stride + j;
    787         const int nb = 5;
    788         const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
    789                           (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
    790                            A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
    791                               5;
    792         const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
    793                           (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
    794                            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
    795                               5;
    796         const int32_t v = a * dgd[l] + b;
    797         dst[m] =
    798             ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
    799       }
    800     } else {  // odd row
    801       for (j = 0; j < width; ++j) {
    802         const int k = i * buf_stride + j;
    803         const int l = i * dgd_stride + j;
    804         const int m = i * dst_stride + j;
    805         const int nb = 4;
    806         const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
    807         const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
    808         const int32_t v = a * dgd[l] + b;
    809         dst[m] =
    810             ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
    811       }
    812     }
    813   }
    814 }
    815 
    816 static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
    817                                             int dgd_stride, int32_t *dst,
    818                                             int dst_stride, int bit_depth,
    819                                             int sgr_params_idx,
    820                                             int radius_idx) {
    821   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
    822   // Adjusting the stride of A and B here appears to avoid bad cache effects,
    823   // leading to a significant speed improvement.
    824   // We also align the stride to a multiple of 16 bytes, for consistency
    825   // with the SIMD version of this function.
    826   int buf_stride = ((width_ext + 3) & ~3) + 16;
    827   int32_t A_[RESTORATION_PROC_UNIT_PELS];
    828   int32_t B_[RESTORATION_PROC_UNIT_PELS];
    829   int32_t *A = A_;
    830   int32_t *B = B_;
    831   int i, j;
    832   calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
    833                                 sgr_params_idx, radius_idx, 0, A, B);
    834   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    835   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
    836 
    837   // Use the A[] and B[] arrays to calculate the filtered image
    838   for (i = 0; i < height; ++i) {
    839     for (j = 0; j < width; ++j) {
    840       const int k = i * buf_stride + j;
    841       const int l = i * dgd_stride + j;
    842       const int m = i * dst_stride + j;
    843       const int nb = 5;
    844       const int32_t a =
    845           (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
    846               4 +
    847           (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
    848            A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
    849               3;
    850       const int32_t b =
    851           (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
    852               4 +
    853           (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
    854            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
    855               3;
    856       const int32_t v = a * dgd[l] + b;
    857       dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
    858     }
    859   }
    860 }
    861 
    862 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
    863                                  int dgd_stride, int32_t *flt0, int32_t *flt1,
    864                                  int flt_stride, int sgr_params_idx,
    865                                  int bit_depth, int highbd) {
    866   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
    867   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
    868   int32_t *dgd32 =
    869       dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
    870 
    871   if (highbd) {
    872     const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
    873     for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
    874       for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
    875         dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
    876       }
    877     }
    878   } else {
    879     for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
    880       for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
    881         dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
    882       }
    883     }
    884   }
    885 
    886   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
    887   // If params->r == 0 we skip the corresponding filter. We only allow one of
    888   // the radii to be 0, as having both equal to 0 would be equivalent to
    889   // skipping SGR entirely.
    890   assert(!(params->r[0] == 0 && params->r[1] == 0));
    891 
    892   if (params->r[0] > 0)
    893     selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
    894                                          flt0, flt_stride, bit_depth,
    895                                          sgr_params_idx, 0);
    896   if (params->r[1] > 0)
    897     selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
    898                                     flt_stride, bit_depth, sgr_params_idx, 1);
    899   return 0;
    900 }
    901 
    902 void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
    903                                     int stride, int eps, const int *xqd,
    904                                     uint8_t *dst8, int dst_stride,
    905                                     int32_t *tmpbuf, int bit_depth,
    906                                     int highbd) {
    907   int32_t *flt0 = tmpbuf;
    908   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
    909   assert(width * height <= RESTORATION_UNITPELS_MAX);
    910 
    911   const int ret = av1_selfguided_restoration_c(
    912       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
    913   (void)ret;
    914   assert(!ret);
    915   const sgr_params_type *const params = &sgr_params[eps];
    916   int xq[2];
    917   decode_xq(xqd, xq, params);
    918   for (int i = 0; i < height; ++i) {
    919     for (int j = 0; j < width; ++j) {
    920       const int k = i * width + j;
    921       uint8_t *dst8ij = dst8 + i * dst_stride + j;
    922       const uint8_t *dat8ij = dat8 + i * stride + j;
    923 
    924       const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
    925       const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
    926       int32_t v = u << SGRPROJ_PRJ_BITS;
    927       // If params->r == 0 then we skipped the filtering in
    928       // av1_selfguided_restoration_c, i.e. flt[k] == u
    929       if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
    930       if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
    931       const int16_t w =
    932           (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
    933 
    934       const uint16_t out = clip_pixel_highbd(w, bit_depth);
    935       if (highbd)
    936         *CONVERT_TO_SHORTPTR(dst8ij) = out;
    937       else
    938         *dst8ij = (uint8_t)out;
    939     }
    940   }
    941 }
    942 
    943 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
    944                                   int stripe_width, int stripe_height,
    945                                   int procunit_width, const uint8_t *src,
    946                                   int src_stride, uint8_t *dst, int dst_stride,
    947                                   int32_t *tmpbuf, int bit_depth) {
    948   (void)bit_depth;
    949   assert(bit_depth == 8);
    950 
    951   for (int j = 0; j < stripe_width; j += procunit_width) {
    952     int w = AOMMIN(procunit_width, stripe_width - j);
    953     apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
    954                                  rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
    955                                  dst + j, dst_stride, tmpbuf, bit_depth, 0);
    956   }
    957 }
    958 
    959 static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
    960                                         int stripe_width, int stripe_height,
    961                                         int procunit_width, const uint8_t *src8,
    962                                         int src_stride, uint8_t *dst8,
    963                                         int dst_stride, int32_t *tmpbuf,
    964                                         int bit_depth) {
    965   (void)tmpbuf;
    966   const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
    967 
    968   for (int j = 0; j < stripe_width; j += procunit_width) {
    969     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
    970     const uint8_t *src8_p = src8 + j;
    971     uint8_t *dst8_p = dst8 + j;
    972     av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
    973                                        rui->wiener_info.hfilter, 16,
    974                                        rui->wiener_info.vfilter, 16, w,
    975                                        stripe_height, &conv_params, bit_depth);
    976   }
    977 }
    978 
    979 static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
    980                                          int stripe_width, int stripe_height,
    981                                          int procunit_width,
    982                                          const uint8_t *src8, int src_stride,
    983                                          uint8_t *dst8, int dst_stride,
    984                                          int32_t *tmpbuf, int bit_depth) {
    985   for (int j = 0; j < stripe_width; j += procunit_width) {
    986     int w = AOMMIN(procunit_width, stripe_width - j);
    987     apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
    988                                  rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
    989                                  dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
    990   }
    991 }
    992 
    993 typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
    994                                   int stripe_width, int stripe_height,
    995                                   int procunit_width, const uint8_t *src,
    996                                   int src_stride, uint8_t *dst, int dst_stride,
    997                                   int32_t *tmpbuf, int bit_depth);
    998 
    999 #define NUM_STRIPE_FILTERS 4
   1000 
   1001 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
   1002   wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
   1003   sgrproj_filter_stripe_highbd
   1004 };
   1005 
   1006 // Filter one restoration unit
   1007 void av1_loop_restoration_filter_unit(
   1008     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
   1009     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
   1010     const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
   1011     int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
   1012     int dst_stride, int32_t *tmpbuf, int optimized_lr) {
   1013   RestorationType unit_rtype = rui->restoration_type;
   1014 
   1015   int unit_h = limits->v_end - limits->v_start;
   1016   int unit_w = limits->h_end - limits->h_start;
   1017   uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
   1018   uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
   1019 
   1020   if (unit_rtype == RESTORE_NONE) {
   1021     copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
   1022     return;
   1023   }
   1024 
   1025   const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
   1026   assert(filter_idx < NUM_STRIPE_FILTERS);
   1027   const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
   1028 
   1029   const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
   1030 
   1031   // Convolve the whole tile one stripe at a time
   1032   RestorationTileLimits remaining_stripes = *limits;
   1033   int i = 0;
   1034   while (i < unit_h) {
   1035     int copy_above, copy_below;
   1036     remaining_stripes.v_start = limits->v_start + i;
   1037 
   1038     get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
   1039                              &copy_below);
   1040 
   1041     const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
   1042     const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
   1043 
   1044     // Work out where this stripe's boundaries are within
   1045     // rsb->stripe_boundary_{above,below}
   1046     const int tile_stripe =
   1047         (remaining_stripes.v_start - tile_rect->top + runit_offset) /
   1048         full_stripe_height;
   1049     const int frame_stripe = tile_stripe0 + tile_stripe;
   1050     const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
   1051 
   1052     // Calculate this stripe's height, based on two rules:
   1053     // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
   1054     // * We can't extend past the end of the current restoration unit
   1055     const int nominal_stripe_height =
   1056         full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
   1057     const int h = AOMMIN(nominal_stripe_height,
   1058                          remaining_stripes.v_end - remaining_stripes.v_start);
   1059 
   1060     setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
   1061                                      h, data8, stride, rlbs, copy_above,
   1062                                      copy_below, optimized_lr);
   1063 
   1064     stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
   1065                   dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
   1066 
   1067     restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
   1068                                        data8, stride, copy_above, copy_below,
   1069                                        optimized_lr);
   1070 
   1071     i += h;
   1072   }
   1073 }
   1074 
   1075 static void filter_frame_on_unit(const RestorationTileLimits *limits,
   1076                                  const AV1PixelRect *tile_rect,
   1077                                  int rest_unit_idx, void *priv, int32_t *tmpbuf,
   1078                                  RestorationLineBuffers *rlbs) {
   1079   FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
   1080   const RestorationInfo *rsi = ctxt->rsi;
   1081 
   1082   av1_loop_restoration_filter_unit(
   1083       limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect,
   1084       ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth,
   1085       ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
   1086       rsi->optimized_lr);
   1087 }
   1088 
   1089 void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
   1090                                             YV12_BUFFER_CONFIG *frame,
   1091                                             AV1_COMMON *cm, int optimized_lr,
   1092                                             int num_planes) {
   1093   const SequenceHeader *const seq_params = &cm->seq_params;
   1094   const int bit_depth = seq_params->bit_depth;
   1095   const int highbd = seq_params->use_highbitdepth;
   1096   lr_ctxt->dst = &cm->rst_frame;
   1097 
   1098   const int frame_width = frame->crop_widths[0];
   1099   const int frame_height = frame->crop_heights[0];
   1100   if (aom_realloc_frame_buffer(
   1101           lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
   1102           seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
   1103           cm->byte_alignment, NULL, NULL, NULL) < 0)
   1104     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
   1105                        "Failed to allocate restoration dst buffer");
   1106 
   1107   lr_ctxt->on_rest_unit = filter_frame_on_unit;
   1108   lr_ctxt->frame = frame;
   1109   for (int plane = 0; plane < num_planes; ++plane) {
   1110     RestorationInfo *rsi = &cm->rst_info[plane];
   1111     RestorationType rtype = rsi->frame_restoration_type;
   1112     rsi->optimized_lr = optimized_lr;
   1113 
   1114     if (rtype == RESTORE_NONE) {
   1115       continue;
   1116     }
   1117 
   1118     const int is_uv = plane > 0;
   1119     const int plane_width = frame->crop_widths[is_uv];
   1120     const int plane_height = frame->crop_heights[is_uv];
   1121     FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
   1122 
   1123     extend_frame(frame->buffers[plane], plane_width, plane_height,
   1124                  frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER,
   1125                  highbd);
   1126 
   1127     lr_plane_ctxt->rsi = rsi;
   1128     lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
   1129     lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
   1130     lr_plane_ctxt->highbd = highbd;
   1131     lr_plane_ctxt->bit_depth = bit_depth;
   1132     lr_plane_ctxt->data8 = frame->buffers[plane];
   1133     lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
   1134     lr_plane_ctxt->data_stride = frame->strides[is_uv];
   1135     lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
   1136     lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
   1137     lr_plane_ctxt->tile_stripe0 = 0;
   1138   }
   1139 }
   1140 
   1141 void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
   1142                                       AV1_COMMON *cm, int num_planes) {
   1143   typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
   1144                            YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
   1145                            int vstart, int vend);
   1146   static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
   1147                                          aom_yv12_partial_coloc_copy_u,
   1148                                          aom_yv12_partial_coloc_copy_v };
   1149 
   1150   for (int plane = 0; plane < num_planes; ++plane) {
   1151     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
   1152     AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
   1153     copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
   1154                      tile_rect.right, tile_rect.top, tile_rect.bottom);
   1155   }
   1156 }
   1157 
   1158 static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
   1159                                         int num_planes) {
   1160   FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
   1161 
   1162   for (int plane = 0; plane < num_planes; ++plane) {
   1163     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
   1164       continue;
   1165     }
   1166 
   1167     av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
   1168                                    &ctxt[plane], &ctxt[plane].tile_rect,
   1169                                    cm->rst_tmpbuf, cm->rlbs);
   1170   }
   1171 }
   1172 
   1173 void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
   1174                                        AV1_COMMON *cm, int optimized_lr,
   1175                                        void *lr_ctxt) {
   1176   assert(!cm->all_lossless);
   1177   const int num_planes = av1_num_planes(cm);
   1178 
   1179   AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
   1180 
   1181   av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
   1182                                          optimized_lr, num_planes);
   1183 
   1184   foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
   1185 
   1186   av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
   1187 }
   1188 
   1189 void av1_foreach_rest_unit_in_row(
   1190     RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
   1191     rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
   1192     int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
   1193     void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
   1194     sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
   1195     struct AV1LrSyncData *const lr_sync) {
   1196   const int tile_w = tile_rect->right - tile_rect->left;
   1197   const int ext_size = unit_size * 3 / 2;
   1198   int x0 = 0, j = 0;
   1199   while (x0 < tile_w) {
   1200     int remaining_w = tile_w - x0;
   1201     int w = (remaining_w < ext_size) ? remaining_w : unit_size;
   1202 
   1203     limits->h_start = tile_rect->left + x0;
   1204     limits->h_end = tile_rect->left + x0 + w;
   1205     assert(limits->h_end <= tile_rect->right);
   1206 
   1207     const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
   1208 
   1209     // No sync for even numbered rows
   1210     // For odd numbered rows, Loop Restoration of current block requires the LR
   1211     // of top-right and bottom-right blocks to be completed
   1212 
   1213     // top-right sync
   1214     on_sync_read(lr_sync, row_number, j, plane);
   1215     if ((row_number + 1) < vunits_per_tile)
   1216       // bottom-right sync
   1217       on_sync_read(lr_sync, row_number + 2, j, plane);
   1218 
   1219     on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
   1220 
   1221     on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
   1222 
   1223     x0 += w;
   1224     ++j;
   1225   }
   1226 }
   1227 
   1228 void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
   1229   (void)lr_sync;
   1230   (void)r;
   1231   (void)c;
   1232   (void)plane;
   1233 }
   1234 
   1235 void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
   1236                              const int sb_cols, int plane) {
   1237   (void)lr_sync;
   1238   (void)r;
   1239   (void)c;
   1240   (void)sb_cols;
   1241   (void)plane;
   1242 }
   1243 
   1244 static void foreach_rest_unit_in_tile(
   1245     const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
   1246     int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
   1247     int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
   1248     int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
   1249   const int tile_h = tile_rect->bottom - tile_rect->top;
   1250   const int ext_size = unit_size * 3 / 2;
   1251 
   1252   const int tile_idx = tile_col + tile_row * tile_cols;
   1253   const int unit_idx0 = tile_idx * units_per_tile;
   1254 
   1255   int y0 = 0, i = 0;
   1256   while (y0 < tile_h) {
   1257     int remaining_h = tile_h - y0;
   1258     int h = (remaining_h < ext_size) ? remaining_h : unit_size;
   1259 
   1260     RestorationTileLimits limits;
   1261     limits.v_start = tile_rect->top + y0;
   1262     limits.v_end = tile_rect->top + y0 + h;
   1263     assert(limits.v_end <= tile_rect->bottom);
   1264     // Offset the tile upwards to align with the restoration processing stripe
   1265     const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
   1266     limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
   1267     if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
   1268 
   1269     av1_foreach_rest_unit_in_row(
   1270         &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
   1271         hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
   1272         av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
   1273 
   1274     y0 += h;
   1275     ++i;
   1276   }
   1277 }
   1278 
   1279 void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
   1280                                     rest_unit_visitor_t on_rest_unit,
   1281                                     void *priv, AV1PixelRect *tile_rect,
   1282                                     int32_t *tmpbuf,
   1283                                     RestorationLineBuffers *rlbs) {
   1284   const int is_uv = plane > 0;
   1285   const int ss_y = is_uv && cm->seq_params.subsampling_y;
   1286 
   1287   const RestorationInfo *rsi = &cm->rst_info[plane];
   1288 
   1289   foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
   1290                             rsi->horz_units_per_tile, rsi->vert_units_per_tile,
   1291                             rsi->units_per_tile, rsi->restoration_unit_size,
   1292                             ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
   1293 }
   1294 
   1295 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
   1296                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
   1297                                        int *rcol0, int *rcol1, int *rrow0,
   1298                                        int *rrow1) {
   1299   assert(rcol0 && rcol1 && rrow0 && rrow1);
   1300 
   1301   if (bsize != cm->seq_params.sb_size) return 0;
   1302   if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
   1303 
   1304   assert(!cm->all_lossless);
   1305 
   1306   const int is_uv = plane > 0;
   1307 
   1308   const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
   1309   const int tile_w = tile_rect.right - tile_rect.left;
   1310   const int tile_h = tile_rect.bottom - tile_rect.top;
   1311 
   1312   const int mi_top = 0;
   1313   const int mi_left = 0;
   1314 
   1315   // Compute the mi-unit corners of the superblock relative to the top-left of
   1316   // the tile
   1317   const int mi_rel_row0 = mi_row - mi_top;
   1318   const int mi_rel_col0 = mi_col - mi_left;
   1319   const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
   1320   const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];
   1321 
   1322   const RestorationInfo *rsi = &cm->rst_info[plane];
   1323   const int size = rsi->restoration_unit_size;
   1324 
   1325   // Calculate the number of restoration units in this tile (which might be
   1326   // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
   1327   const int horz_units = av1_lr_count_units_in_tile(size, tile_w);
   1328   const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
   1329 
   1330   // The size of an MI-unit on this plane of the image
   1331   const int ss_x = is_uv && cm->seq_params.subsampling_x;
   1332   const int ss_y = is_uv && cm->seq_params.subsampling_y;
   1333   const int mi_size_x = MI_SIZE >> ss_x;
   1334   const int mi_size_y = MI_SIZE >> ss_y;
   1335 
   1336   // Write m for the relative mi column or row, D for the superres denominator
   1337   // and N for the superres numerator. If u is the upscaled pixel offset then
   1338   // we can write the downscaled pixel offset in two ways as:
   1339   //
   1340   //   MI_SIZE * m = N / D u
   1341   //
   1342   // from which we get u = D * MI_SIZE * m / N
   1343   const int mi_to_num_x = av1_superres_scaled(cm)
   1344                               ? mi_size_x * cm->superres_scale_denominator
   1345                               : mi_size_x;
   1346   const int mi_to_num_y = mi_size_y;
   1347   const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size;
   1348   const int denom_y = size;
   1349 
   1350   const int rnd_x = denom_x - 1;
   1351   const int rnd_y = denom_y - 1;
   1352 
   1353   // rcol0/rrow0 should be the first column/row of restoration units (relative
   1354   // to the top-left of the tile) that doesn't start left/below of
   1355   // mi_col/mi_row. For this calculation, we need to round up the division (if
   1356   // the sb starts at runit column 10.1, the first matching runit has column
   1357   // index 11)
   1358   *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
   1359   *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;
   1360 
   1361   // rel_col1/rel_row1 is the equivalent calculation, but for the superblock
   1362   // below-right. If we're at the bottom or right of the tile, this restoration
   1363   // unit might not exist, in which case we'll clamp accordingly.
   1364   *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
   1365   *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
   1366 
   1367   return *rcol0 < *rcol1 && *rrow0 < *rrow1;
   1368 }
   1369 
   1370 // Extend to left and right
   1371 static void extend_lines(uint8_t *buf, int width, int height, int stride,
   1372                          int extend, int use_highbitdepth) {
   1373   for (int i = 0; i < height; ++i) {
   1374     if (use_highbitdepth) {
   1375       uint16_t *buf16 = (uint16_t *)buf;
   1376       aom_memset16(buf16 - extend, buf16[0], extend);
   1377       aom_memset16(buf16 + width, buf16[width - 1], extend);
   1378     } else {
   1379       memset(buf - extend, buf[0], extend);
   1380       memset(buf + width, buf[width - 1], extend);
   1381     }
   1382     buf += stride;
   1383   }
   1384 }
   1385 
   1386 static void save_deblock_boundary_lines(
   1387     const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
   1388     int stripe, int use_highbd, int is_above,
   1389     RestorationStripeBoundaries *boundaries) {
   1390   const int is_uv = plane > 0;
   1391   const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
   1392   const int src_stride = frame->strides[is_uv] << use_highbd;
   1393   const uint8_t *src_rows = src_buf + row * src_stride;
   1394 
   1395   uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
   1396                                : boundaries->stripe_boundary_below;
   1397   uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
   1398   const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
   1399   uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
   1400 
   1401   // There is a rare case in which a processing stripe can end 1px above the
   1402   // crop border. In this case, we do want to use deblocked pixels from below
   1403   // the stripe (hence why we ended up in this function), but instead of
   1404   // fetching 2 "below" rows we need to fetch one and duplicate it.
   1405   // This is equivalent to clamping the sample locations against the crop border
   1406   const int lines_to_save =
   1407       AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
   1408   assert(lines_to_save == 1 || lines_to_save == 2);
   1409 
   1410   int upscaled_width;
   1411   int line_bytes;
   1412   if (av1_superres_scaled(cm)) {
   1413     const int ss_x = is_uv && cm->seq_params.subsampling_x;
   1414     upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
   1415     line_bytes = upscaled_width << use_highbd;
   1416     if (use_highbd)
   1417       av1_upscale_normative_rows(
   1418           cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
   1419           CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
   1420           plane, lines_to_save);
   1421     else
   1422       av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
   1423                                  boundaries->stripe_boundary_stride, plane,
   1424                                  lines_to_save);
   1425   } else {
   1426     upscaled_width = frame->crop_widths[is_uv];
   1427     line_bytes = upscaled_width << use_highbd;
   1428     for (int i = 0; i < lines_to_save; i++) {
   1429       memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
   1430              line_bytes);
   1431     }
   1432   }
   1433   // If we only saved one line, then copy it into the second line buffer
   1434   if (lines_to_save == 1)
   1435     memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
   1436 
   1437   extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
   1438                RESTORATION_EXTRA_HORZ, use_highbd);
   1439 }
   1440 
   1441 static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
   1442                                      const AV1_COMMON *cm, int plane, int row,
   1443                                      int stripe, int use_highbd, int is_above,
   1444                                      RestorationStripeBoundaries *boundaries) {
   1445   const int is_uv = plane > 0;
   1446   const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
   1447   const int src_stride = frame->strides[is_uv] << use_highbd;
   1448   const uint8_t *src_rows = src_buf + row * src_stride;
   1449 
   1450   uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
   1451                                : boundaries->stripe_boundary_below;
   1452   uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
   1453   const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
   1454   uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
   1455   const int src_width = frame->crop_widths[is_uv];
   1456 
   1457   // At the point where this function is called, we've already applied
   1458   // superres. So we don't need to extend the lines here, we can just
   1459   // pull directly from the topmost row of the upscaled frame.
   1460   const int ss_x = is_uv && cm->seq_params.subsampling_x;
   1461   const int upscaled_width = av1_superres_scaled(cm)
   1462                                  ? (cm->superres_upscaled_width + ss_x) >> ss_x
   1463                                  : src_width;
   1464   const int line_bytes = upscaled_width << use_highbd;
   1465   for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
   1466     // Copy the line at 'row' into both context lines. This is because
   1467     // we want to (effectively) extend the outermost row of CDEF data
   1468     // from this tile to produce a border, rather than using deblocked
   1469     // pixels from the tile above/below.
   1470     memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
   1471   }
   1472   extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
   1473                RESTORATION_EXTRA_HORZ, use_highbd);
   1474 }
   1475 
   1476 static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
   1477                                          int use_highbd, int plane,
   1478                                          AV1_COMMON *cm, int after_cdef) {
   1479   const int is_uv = plane > 0;
   1480   const int ss_y = is_uv && cm->seq_params.subsampling_y;
   1481   const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
   1482   const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
   1483 
   1484   // Get the tile rectangle, with height rounded up to the next multiple of 8
   1485   // luma pixels (only relevant for the bottom tile of the frame)
   1486   const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
   1487   const int stripe0 = 0;
   1488 
   1489   RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
   1490 
   1491   const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
   1492 
   1493   int tile_stripe;
   1494   for (tile_stripe = 0;; ++tile_stripe) {
   1495     const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
   1496     const int y0 = tile_rect.top + rel_y0;
   1497     if (y0 >= tile_rect.bottom) break;
   1498 
   1499     const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
   1500     const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
   1501 
   1502     const int frame_stripe = stripe0 + tile_stripe;
   1503 
   1504     // In this case, we should only use CDEF pixels at the top
   1505     // and bottom of the frame as a whole; internal tile boundaries
   1506     // can use deblocked pixels from adjacent tiles for context.
   1507     const int use_deblock_above = (frame_stripe > 0);
   1508     const int use_deblock_below = (y1 < plane_height);
   1509 
   1510     if (!after_cdef) {
   1511       // Save deblocked context where needed.
   1512       if (use_deblock_above) {
   1513         save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
   1514                                     frame_stripe, use_highbd, 1, boundaries);
   1515       }
   1516       if (use_deblock_below) {
   1517         save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
   1518                                     use_highbd, 0, boundaries);
   1519       }
   1520     } else {
   1521       // Save CDEF context where needed. Note that we need to save the CDEF
   1522       // context for a particular boundary iff we *didn't* save deblocked
   1523       // context for that boundary.
   1524       //
   1525       // In addition, we need to save copies of the outermost line within
   1526       // the tile, rather than using data from outside the tile.
   1527       if (!use_deblock_above) {
   1528         save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
   1529                                  1, boundaries);
   1530       }
   1531       if (!use_deblock_below) {
   1532         save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
   1533                                  use_highbd, 0, boundaries);
   1534       }
   1535     }
   1536   }
   1537 }
   1538 
   1539 // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
   1540 // lines to be used as boundary in the loop restoration process. The
   1541 // lines are saved in rst_internal.stripe_boundary_lines
   1542 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
   1543                                               AV1_COMMON *cm, int after_cdef) {
   1544   const int num_planes = av1_num_planes(cm);
   1545   const int use_highbd = cm->seq_params.use_highbitdepth;
   1546   for (int p = 0; p < num_planes; ++p) {
   1547     save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
   1548   }
   1549 }
   1550