/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"

#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_postproc.h"

// TODO(jackychen): Replace this function with SSE2 code. There is
// one SSE2 implementation in vp8, so consider how to share it
// between vp8 and vp9.
static void filter_by_weight(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             int block_size, int src_weight) {
  // Weighted blend of src into dst with MFQE_PRECISION fixed-point weights.
  const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
  const int rounding_bit = 1 << (MFQE_PRECISION - 1);
  int r, c;

  for (r = 0; r < block_size; r++) {
    for (c = 0; c < block_size; c++) {
      dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
               >> MFQE_PRECISION;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride, int src_weight) {
  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
}

void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride,
                                 int src_weight) {
  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
}

static void filter_by_weight32x32(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int weight) {
  vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
  vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
                            weight);
  vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,
                            dst + dst_stride * 16, dst_stride, weight);
  vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
                            dst + dst_stride * 16 + 16, dst_stride, weight);
}

static void filter_by_weight64x64(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int weight) {
  filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
  filter_by_weight32x32(src + 32, src_stride, dst + 32,
                        dst_stride, weight);
  filter_by_weight32x32(src + src_stride * 32, src_stride,
                        dst + dst_stride * 32, dst_stride, weight);
  filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
                        dst + dst_stride * 32 + 32, dst_stride, weight);
}

static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
                          int yd_stride, const uint8_t *u, const uint8_t *v,
                          int uv_stride, uint8_t *ud, uint8_t *vd,
                          int uvd_stride, BLOCK_SIZE block_size,
                          int weight) {
  if (block_size == BLOCK_16X16) {
    vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
    vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
    vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
  } else if (block_size == BLOCK_32X32) {
    filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
    vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
    vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
  } else if (block_size == BLOCK_64X64) {
    filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
    filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
    filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
  }
}

// TODO(jackychen): Determine whether to replace this with assembly code.
static void copy_mem8x8(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride) {
  int r;
  for (r = 0; r < 8; r++) {
    memcpy(dst, src, 8);
    src += src_stride;
    dst += dst_stride;
  }
}

static void copy_mem16x16(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride) {
  int r;
  for (r = 0; r < 16; r++) {
    memcpy(dst, src, 16);
    src += src_stride;
    dst += dst_stride;
  }
}

static void copy_mem32x32(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride) {
  copy_mem16x16(src, src_stride, dst, dst_stride);
  copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
  copy_mem16x16(src + src_stride * 16, src_stride,
                dst + dst_stride * 16, dst_stride);
  copy_mem16x16(src + src_stride * 16 + 16, src_stride,
                dst + dst_stride * 16 + 16, dst_stride);
}

static void copy_mem64x64(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride) {
  copy_mem32x32(src, src_stride, dst, dst_stride);
  copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
  copy_mem32x32(src + src_stride * 32, src_stride,
                dst + dst_stride * 32, dst_stride);
  copy_mem32x32(src + src_stride * 32 + 32, src_stride,
                dst + dst_stride * 32 + 32, dst_stride);
}

static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                       int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
                       uint8_t *vd, int yd_stride, int uvd_stride,
                       BLOCK_SIZE bs) {
  if (bs == BLOCK_16X16) {
    copy_mem16x16(y, y_stride, yd, yd_stride);
    copy_mem8x8(u, uv_stride, ud, uvd_stride);
    copy_mem8x8(v, uv_stride, vd, uvd_stride);
  } else if (bs == BLOCK_32X32) {
    copy_mem32x32(y, y_stride, yd, yd_stride);
    copy_mem16x16(u, uv_stride, ud, uvd_stride);
    copy_mem16x16(v, uv_stride, vd, uvd_stride);
  } else {
    copy_mem64x64(y, y_stride, yd, yd_stride);
    copy_mem32x32(u, uv_stride, ud, uvd_stride);
    copy_mem32x32(v, uv_stride, vd, uvd_stride);
  }
}

// SAD and variance-difference thresholds for a block size; both are loosened
// as the quantizer gap (qdiff) between the two frames grows.
static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
  const int adj = qdiff >> MFQE_PRECISION;
  if (bs == BLOCK_16X16) {
    *sad_thr = 7 + adj;
  } else if (bs == BLOCK_32X32) {
    *sad_thr = 6 + adj;
  } else {  // BLOCK_64X64
    *sad_thr = 5 + adj;
  }
  *vdiff_thr = 125 + qdiff;
}

static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
                       const uint8_t *v, int y_stride, int uv_stride,
                       uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
                       int uvd_stride, int qdiff) {
  int sad, sad_thr, vdiff, vdiff_thr;
  uint32_t sse;

  get_thr(bs, qdiff, &sad_thr, &vdiff_thr);

  // Normalize the variance and SAD to per-pixel values (with rounding).
  if (bs == BLOCK_16X16) {
    vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
    sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
  } else if (bs == BLOCK_32X32) {
    vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
    sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
  } else /* if (bs == BLOCK_64X64) */ {
    vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
    sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
  }

  // vdiff > sad * 3 means vdiff should not be too small; otherwise,
  // it might be a lighting change in a smooth area. When there is a
  // lighting change in a smooth area, it is dangerous to do MFQE.
  if (sad > 1 && vdiff > sad * 3) {
    const int weight = 1 << MFQE_PRECISION;
    int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
    // When ifactor equals weight, no MFQE is done.
    if (ifactor > weight) {
      ifactor = weight;
    }
    apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
                  uvd_stride, bs, ifactor);
  } else {
    // Copy the block from the current frame (i.e., no MFQE is done).
    copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
               yd_stride, uvd_stride, bs);
  }
}

static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
  // Check the motion in the current block (for an inter frame), or the
  // motion in the correlated block in the last frame (for a keyframe).
  const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
                            mi->mbmi.mv[0].as_mv.row +
                            mi->mbmi.mv[0].as_mv.col *
                            mi->mbmi.mv[0].as_mv.col;
  const int mv_threshold = 100;
  return mi->mbmi.mode >= NEARESTMV &&  // Not an intra block
         cur_bs >= BLOCK_16X16 &&
         mv_len_square <= mv_threshold;
}

// Process each partition in a super block, recursively.
static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
                           const uint8_t *y, const uint8_t *u,
                           const uint8_t *v, int y_stride, int uv_stride,
                           uint8_t *yd, uint8_t *ud, uint8_t *vd,
                           int yd_stride, int uvd_stride) {
  int mi_offset, y_offset, uv_offset;
  const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
  const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
  const int bsl = b_width_log2_lookup[bs];
  PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
  const BLOCK_SIZE subsize = get_subsize(bs, partition);

  if (cur_bs < BLOCK_8X8) {
    // If there are blocks smaller than 8x8, they must be on the boundary.
    return;
  }
  // No MFQE on blocks smaller than 16x16.
  if (bs == BLOCK_16X16) {
    partition = PARTITION_NONE;
  }
  if (bs == BLOCK_64X64) {
    mi_offset = 4;
    y_offset = 32;
    uv_offset = 16;
  } else {
    mi_offset = 2;
    y_offset = 16;
    uv_offset = 8;
  }
  switch (partition) {
    BLOCK_SIZE mfqe_bs, bs_tmp;
    case PARTITION_HORZ:
      if (bs == BLOCK_64X64) {
        mfqe_bs = BLOCK_64X32;
        bs_tmp = BLOCK_32X32;
      } else {
        mfqe_bs = BLOCK_32X16;
        bs_tmp = BLOCK_16X16;
      }
      if (mfqe_decision(mi, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
      }
      if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
                   v + uv_offset * uv_stride, y_stride, uv_stride,
                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
                   u + uv_offset * uv_stride + uv_offset,
                   v + uv_offset * uv_stride + uv_offset, y_stride,
                   uv_stride, yd + y_offset * yd_stride + y_offset,
                   ud + uv_offset * uvd_stride + uv_offset,
                   vd + uv_offset * uvd_stride + uv_offset,
                   yd_stride, uvd_stride, qdiff);
      }
      break;
    case PARTITION_VERT:
      if (bs == BLOCK_64X64) {
        mfqe_bs = BLOCK_32X64;
        bs_tmp = BLOCK_32X32;
      } else {
        mfqe_bs = BLOCK_16X32;
        bs_tmp = BLOCK_16X16;
      }
      if (mfqe_decision(mi, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
                   v + uv_offset * uv_stride, y_stride, uv_stride,
                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
      }
      if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
        // Do mfqe on the first square partition.
        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
        // Do mfqe on the second square partition.
        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
                   u + uv_offset * uv_stride + uv_offset,
                   v + uv_offset * uv_stride + uv_offset, y_stride,
                   uv_stride, yd + y_offset * yd_stride + y_offset,
                   ud + uv_offset * uvd_stride + uv_offset,
                   vd + uv_offset * uvd_stride + uv_offset,
                   yd_stride, uvd_stride, qdiff);
      }
      break;
    case PARTITION_NONE:
      if (mfqe_decision(mi, cur_bs)) {
        // Do mfqe on this partition.
        mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
      } else {
        // Copy the block from the current frame (i.e., no MFQE is done).
        copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
                   yd_stride, uvd_stride, bs);
      }
      break;
    case PARTITION_SPLIT:
      // Recurse on the four square partitions, e.g. if bs is 64X64,
      // then look into the four 32X32 blocks in it.
      mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
                     yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
                     v + uv_offset, y_stride, uv_stride, yd + y_offset,
                     ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
                     y + y_offset * y_stride, u + uv_offset * uv_stride,
                     v + uv_offset * uv_stride, y_stride, uv_stride,
                     yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
                     vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
      mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
                     subsize, y + y_offset * y_stride + y_offset,
                     u + uv_offset * uv_stride + uv_offset,
                     v + uv_offset * uv_stride + uv_offset, y_stride,
                     uv_stride, yd + y_offset * yd_stride + y_offset,
                     ud + uv_offset * uvd_stride + uv_offset,
                     vd + uv_offset * uvd_stride + uv_offset,
                     yd_stride, uvd_stride);
      break;
    default:
      assert(0);
  }
}

void vp9_mfqe(VP9_COMMON *cm) {
  int mi_row, mi_col;
  // Current decoded frame.
  const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
  // Last decoded frame; it will also store the MFQE result.
  YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
  // Loop through each super block.
  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
      MODE_INFO *mi;
      MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
      // Motion info in the last frame.
      MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
                           (mi_row * cm->mi_stride + mi_col);
      const uint32_t y_stride = show->y_stride;
      const uint32_t uv_stride = show->uv_stride;
      const uint32_t yd_stride = dest->y_stride;
      const uint32_t uvd_stride = dest->uv_stride;
      const uint32_t row_offset_y = mi_row << 3;
      const uint32_t row_offset_uv = mi_row << 2;
      const uint32_t col_offset_y = mi_col << 3;
      const uint32_t col_offset_uv = mi_col << 2;
      const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
                         col_offset_y;
      const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
                         col_offset_uv;
      const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
                         col_offset_uv;
      uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
      uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
                    col_offset_uv;
      uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
                    col_offset_uv;
      if (frame_is_intra_only(cm)) {
        mi = mi_prev;
      } else {
        mi = mi_local;
      }
      mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
                     vd, yd_stride, uvd_stride);
    }
  }
}