1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12 #include "vpx_ports/config.h" 13 #include "encodemb.h" 14 #include "vp8/common/reconinter.h" 15 #include "quantize.h" 16 #include "tokenize.h" 17 #include "vp8/common/invtrans.h" 18 #include "vp8/common/recon.h" 19 #include "vp8/common/reconintra.h" 20 #include "dct.h" 21 #include "vpx_mem/vpx_mem.h" 22 23 #if CONFIG_RUNTIME_CPU_DETECT 24 #define IF_RTCD(x) (x) 25 #else 26 #define IF_RTCD(x) NULL 27 #endif 28 void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) 29 { 30 unsigned char *src_ptr = (*(be->base_src) + be->src); 31 short *diff_ptr = be->src_diff; 32 unsigned char *pred_ptr = bd->predictor; 33 int src_stride = be->src_stride; 34 35 int r, c; 36 37 for (r = 0; r < 4; r++) 38 { 39 for (c = 0; c < 4; c++) 40 { 41 diff_ptr[c] = src_ptr[c] - pred_ptr[c]; 42 } 43 44 diff_ptr += pitch; 45 pred_ptr += pitch; 46 src_ptr += src_stride; 47 } 48 } 49 50 void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) 51 { 52 short *udiff = diff + 256; 53 short *vdiff = diff + 320; 54 unsigned char *upred = pred + 256; 55 unsigned char *vpred = pred + 320; 56 57 int r, c; 58 59 for (r = 0; r < 8; r++) 60 { 61 for (c = 0; c < 8; c++) 62 { 63 udiff[c] = usrc[c] - upred[c]; 64 } 65 66 udiff += 8; 67 upred += 8; 68 usrc += stride; 69 } 70 71 for (r = 0; r < 8; r++) 72 { 73 for (c = 0; c < 8; c++) 74 { 75 vdiff[c] = vsrc[c] - vpred[c]; 76 } 77 78 vdiff += 8; 79 vpred += 8; 80 vsrc += stride; 81 } 82 } 83 84 void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) 85 { 86 int r, c; 87 88 for (r = 0; r < 16; r++) 89 { 90 for (c = 0; c < 16; c++) 91 { 92 diff[c] = src[c] - pred[c]; 93 } 94 95 diff += 16; 96 pred += 16; 97 src += stride; 98 } 99 } 100 101 static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) 102 { 103 ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride); 104 ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); 105 } 106 107 static void build_dcblock(MACROBLOCK *x) 108 { 109 short *src_diff_ptr = &x->src_diff[384]; 110 int i; 111 112 for (i = 0; i < 16; i++) 113 { 114 src_diff_ptr[i] = x->coeff[i * 16]; 115 } 116 } 117 118 void vp8_transform_mbuv(MACROBLOCK *x) 119 { 120 int i; 121 122 for (i = 16; i < 24; i += 2) 123 { 124 x->vp8_short_fdct8x4(&x->block[i].src_diff[0], 125 &x->block[i].coeff[0], 16); 126 } 127 } 128 129 130 void vp8_transform_intra_mby(MACROBLOCK *x) 131 { 132 int i; 133 134 for (i = 0; i < 16; i += 2) 135 { 136 x->vp8_short_fdct8x4(&x->block[i].src_diff[0], 137 &x->block[i].coeff[0], 32); 138 } 139 140 // build dc block from 16 y dc values 141 build_dcblock(x); 142 143 // do 2nd order transform on the dc block 144 x->short_walsh4x4(&x->block[24].src_diff[0], 145 &x->block[24].coeff[0], 8); 146 147 } 148 149 150 static void transform_mb(MACROBLOCK *x) 151 { 152 int i; 153 154 for (i = 0; i < 16; i += 2) 155 { 156 x->vp8_short_fdct8x4(&x->block[i].src_diff[0], 157 &x->block[i].coeff[0], 32); 158 } 159 160 // build dc block from 16 y dc values 161 if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) 162 build_dcblock(x); 163 164 for (i = 16; i < 24; i += 2) 165 { 166 x->vp8_short_fdct8x4(&x->block[i].src_diff[0], 167 &x->block[i].coeff[0], 16); 168 } 169 170 // do 2nd order transform on the dc block 171 if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) 172 x->short_walsh4x4(&x->block[24].src_diff[0], 173 &x->block[24].coeff[0], 8); 174 175 } 176 177 178 static void transform_mby(MACROBLOCK *x) 179 { 180 int i; 181 182 for (i = 0; i < 16; i += 2) 183 { 184 x->vp8_short_fdct8x4(&x->block[i].src_diff[0], 185 &x->block[i].coeff[0], 32); 186 } 187 188 // build dc block from 16 y dc values 189 if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) 190 { 191 build_dcblock(x); 192 x->short_walsh4x4(&x->block[24].src_diff[0], 193 &x->block[24].coeff[0], 8); 194 } 195 } 196 197 198 void vp8_stuff_inter16x16(MACROBLOCK *x) 199 { 200 vp8_build_inter_predictors_mb_s(&x->e_mbd); 201 /* 202 // recon = copy from predictors to destination 203 { 204 BLOCKD *b = &x->e_mbd.block[0]; 205 unsigned char *pred_ptr = b->predictor; 206 unsigned char *dst_ptr = *(b->base_dst) + b->dst; 207 int stride = b->dst_stride; 208 209 int i; 210 for(i=0;i<16;i++) 211 vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16); 212 213 b = &x->e_mbd.block[16]; 214 pred_ptr = b->predictor; 215 dst_ptr = *(b->base_dst) + b->dst; 216 stride = b->dst_stride; 217 218 for(i=0;i<8;i++) 219 vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8); 220 221 b = &x->e_mbd.block[20]; 222 pred_ptr = b->predictor; 223 dst_ptr = *(b->base_dst) + b->dst; 224 stride = b->dst_stride; 225 226 for(i=0;i<8;i++) 227 vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8); 228 } 229 */ 230 } 231 232 #if !(CONFIG_REALTIME_ONLY) 233 #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) 234 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF ) 235 236 typedef struct vp8_token_state vp8_token_state; 237 238 struct vp8_token_state{ 239 int rate; 240 int error; 241 signed char next; 242 signed char token; 243 short qc; 244 }; 245 246 // TODO: experiments to find optimal multiple numbers 247 #define Y1_RD_MULT 4 248 #define UV_RD_MULT 2 249 #define Y2_RD_MULT 16 250 251 static const int plane_rd_mult[4]= 252 { 253 Y1_RD_MULT, 254 Y2_RD_MULT, 255 UV_RD_MULT, 256 Y1_RD_MULT 257 }; 258 259 static void optimize_b(MACROBLOCK *mb, int ib, int type, 260 ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, 261 const VP8_ENCODER_RTCD *rtcd) 262 { 263 BLOCK *b; 264 BLOCKD *d; 265 vp8_token_state tokens[17][2]; 266 unsigned best_mask[2]; 267 const short *dequant_ptr; 268 const short *coeff_ptr; 269 short *qcoeff_ptr; 270 short *dqcoeff_ptr; 271 int eob; 272 int i0; 273 int rc; 274 int x; 275 int sz; 276 int next; 277 int rdmult; 278 int rddiv; 279 int final_eob; 280 int rd_cost0; 281 int rd_cost1; 282 int rate0; 283 int rate1; 284 int error0; 285 int error1; 286 int t0; 287 int t1; 288 int best; 289 int band; 290 int pt; 291 int i; 292 int err_mult = plane_rd_mult[type]; 293 294 b = &mb->block[ib]; 295 d = &mb->e_mbd.block[ib]; 296 297 /* Enable this to test the effect of RDO as a replacement for the dynamic 298 * zero bin instead of an augmentation of it. 299 */ 300 #if 0 301 vp8_strict_quantize_b(b, d); 302 #endif 303 304 dequant_ptr = d->dequant; 305 coeff_ptr = b->coeff; 306 qcoeff_ptr = d->qcoeff; 307 dqcoeff_ptr = d->dqcoeff; 308 i0 = !type; 309 eob = d->eob; 310 311 /* Now set up a Viterbi trellis to evaluate alternative roundings. */ 312 rdmult = mb->rdmult * err_mult; 313 if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME) 314 rdmult = (rdmult * 9)>>4; 315 316 rddiv = mb->rddiv; 317 best_mask[0] = best_mask[1] = 0; 318 /* Initialize the sentinel node of the trellis. */ 319 tokens[eob][0].rate = 0; 320 tokens[eob][0].error = 0; 321 tokens[eob][0].next = 16; 322 tokens[eob][0].token = DCT_EOB_TOKEN; 323 tokens[eob][0].qc = 0; 324 *(tokens[eob] + 1) = *(tokens[eob] + 0); 325 next = eob; 326 for (i = eob; i-- > i0;) 327 { 328 int base_bits; 329 int d2; 330 int dx; 331 332 rc = vp8_default_zig_zag1d[i]; 333 x = qcoeff_ptr[rc]; 334 /* Only add a trellis state for non-zero coefficients. */ 335 if (x) 336 { 337 int shortcut=0; 338 error0 = tokens[next][0].error; 339 error1 = tokens[next][1].error; 340 /* Evaluate the first possibility for this state. */ 341 rate0 = tokens[next][0].rate; 342 rate1 = tokens[next][1].rate; 343 t0 = (vp8_dct_value_tokens_ptr + x)->Token; 344 /* Consider both possible successor states. */ 345 if (next < 16) 346 { 347 band = vp8_coef_bands[i + 1]; 348 pt = vp8_prev_token_class[t0]; 349 rate0 += 350 mb->token_costs[type][band][pt][tokens[next][0].token]; 351 rate1 += 352 mb->token_costs[type][band][pt][tokens[next][1].token]; 353 } 354 rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); 355 rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); 356 if (rd_cost0 == rd_cost1) 357 { 358 rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); 359 rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); 360 } 361 /* And pick the best. */ 362 best = rd_cost1 < rd_cost0; 363 base_bits = *(vp8_dct_value_cost_ptr + x); 364 dx = dqcoeff_ptr[rc] - coeff_ptr[rc]; 365 d2 = dx*dx; 366 tokens[i][0].rate = base_bits + (best ? rate1 : rate0); 367 tokens[i][0].error = d2 + (best ? error1 : error0); 368 tokens[i][0].next = next; 369 tokens[i][0].token = t0; 370 tokens[i][0].qc = x; 371 best_mask[0] |= best << i; 372 /* Evaluate the second possibility for this state. */ 373 rate0 = tokens[next][0].rate; 374 rate1 = tokens[next][1].rate; 375 376 if((abs(x)*dequant_ptr[rc]>abs(coeff_ptr[rc])) && 377 (abs(x)*dequant_ptr[rc]<abs(coeff_ptr[rc])+dequant_ptr[rc])) 378 shortcut = 1; 379 else 380 shortcut = 0; 381 382 if(shortcut) 383 { 384 sz = -(x < 0); 385 x -= 2*sz + 1; 386 } 387 388 /* Consider both possible successor states. */ 389 if (!x) 390 { 391 /* If we reduced this coefficient to zero, check to see if 392 * we need to move the EOB back here. 393 */ 394 t0 = tokens[next][0].token == DCT_EOB_TOKEN ? 395 DCT_EOB_TOKEN : ZERO_TOKEN; 396 t1 = tokens[next][1].token == DCT_EOB_TOKEN ? 397 DCT_EOB_TOKEN : ZERO_TOKEN; 398 } 399 else 400 { 401 t0=t1 = (vp8_dct_value_tokens_ptr + x)->Token; 402 } 403 if (next < 16) 404 { 405 band = vp8_coef_bands[i + 1]; 406 if(t0!=DCT_EOB_TOKEN) 407 { 408 pt = vp8_prev_token_class[t0]; 409 rate0 += mb->token_costs[type][band][pt][ 410 tokens[next][0].token]; 411 } 412 if(t1!=DCT_EOB_TOKEN) 413 { 414 pt = vp8_prev_token_class[t1]; 415 rate1 += mb->token_costs[type][band][pt][ 416 tokens[next][1].token]; 417 } 418 } 419 420 rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); 421 rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); 422 if (rd_cost0 == rd_cost1) 423 { 424 rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); 425 rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); 426 } 427 /* And pick the best. */ 428 best = rd_cost1 < rd_cost0; 429 base_bits = *(vp8_dct_value_cost_ptr + x); 430 431 if(shortcut) 432 { 433 dx -= (dequant_ptr[rc] + sz) ^ sz; 434 d2 = dx*dx; 435 } 436 tokens[i][1].rate = base_bits + (best ? rate1 : rate0); 437 tokens[i][1].error = d2 + (best ? error1 : error0); 438 tokens[i][1].next = next; 439 tokens[i][1].token =best?t1:t0; 440 tokens[i][1].qc = x; 441 best_mask[1] |= best << i; 442 /* Finally, make this the new head of the trellis. */ 443 next = i; 444 } 445 /* There's no choice to make for a zero coefficient, so we don't 446 * add a new trellis node, but we do need to update the costs. 447 */ 448 else 449 { 450 band = vp8_coef_bands[i + 1]; 451 t0 = tokens[next][0].token; 452 t1 = tokens[next][1].token; 453 /* Update the cost of each path if we're past the EOB token. */ 454 if (t0 != DCT_EOB_TOKEN) 455 { 456 tokens[next][0].rate += mb->token_costs[type][band][0][t0]; 457 tokens[next][0].token = ZERO_TOKEN; 458 } 459 if (t1 != DCT_EOB_TOKEN) 460 { 461 tokens[next][1].rate += mb->token_costs[type][band][0][t1]; 462 tokens[next][1].token = ZERO_TOKEN; 463 } 464 /* Don't update next, because we didn't add a new node. */ 465 } 466 } 467 468 /* Now pick the best path through the whole trellis. */ 469 band = vp8_coef_bands[i + 1]; 470 VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); 471 rate0 = tokens[next][0].rate; 472 rate1 = tokens[next][1].rate; 473 error0 = tokens[next][0].error; 474 error1 = tokens[next][1].error; 475 t0 = tokens[next][0].token; 476 t1 = tokens[next][1].token; 477 rate0 += mb->token_costs[type][band][pt][t0]; 478 rate1 += mb->token_costs[type][band][pt][t1]; 479 rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); 480 rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); 481 if (rd_cost0 == rd_cost1) 482 { 483 rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); 484 rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); 485 } 486 best = rd_cost1 < rd_cost0; 487 final_eob = i0 - 1; 488 for (i = next; i < eob; i = next) 489 { 490 x = tokens[i][best].qc; 491 if (x) 492 final_eob = i; 493 rc = vp8_default_zig_zag1d[i]; 494 qcoeff_ptr[rc] = x; 495 dqcoeff_ptr[rc] = x * dequant_ptr[rc]; 496 next = tokens[i][best].next; 497 best = (best_mask[best] >> i) & 1; 498 } 499 final_eob++; 500 501 d->eob = final_eob; 502 *a = *l = (d->eob != !type); 503 } 504 505 static void optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) 506 { 507 int b; 508 int type; 509 int has_2nd_order; 510 ENTROPY_CONTEXT_PLANES t_above, t_left; 511 ENTROPY_CONTEXT *ta; 512 ENTROPY_CONTEXT *tl; 513 514 vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); 515 vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); 516 517 ta = (ENTROPY_CONTEXT *)&t_above; 518 tl = (ENTROPY_CONTEXT *)&t_left; 519 520 has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED 521 && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); 522 type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC; 523 524 for (b = 0; b < 16; b++) 525 { 526 optimize_b(x, b, type, 527 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd); 528 } 529 530 for (b = 16; b < 24; b++) 531 { 532 optimize_b(x, b, PLANE_TYPE_UV, 533 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd); 534 } 535 536 if (has_2nd_order) 537 { 538 b=24; 539 optimize_b(x, b, PLANE_TYPE_Y2, 540 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd); 541 } 542 } 543 544 545 void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) 546 { 547 int b; 548 int type; 549 int has_2nd_order; 550 551 ENTROPY_CONTEXT_PLANES t_above, t_left; 552 ENTROPY_CONTEXT *ta; 553 ENTROPY_CONTEXT *tl; 554 555 if (!x->e_mbd.above_context) 556 return; 557 558 if (!x->e_mbd.left_context) 559 return; 560 561 vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); 562 vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); 563 564 ta = (ENTROPY_CONTEXT *)&t_above; 565 tl = (ENTROPY_CONTEXT *)&t_left; 566 567 has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED 568 && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); 569 type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC; 570 571 for (b = 0; b < 16; b++) 572 { 573 optimize_b(x, b, type, 574 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd); 575 } 576 577 578 if (has_2nd_order) 579 { 580 b=24; 581 optimize_b(x, b, PLANE_TYPE_Y2, 582 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd); 583 } 584 } 585 586 void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) 587 { 588 int b; 589 ENTROPY_CONTEXT_PLANES t_above, t_left; 590 ENTROPY_CONTEXT *ta; 591 ENTROPY_CONTEXT *tl; 592 593 if (!x->e_mbd.above_context) 594 return; 595 596 if (!x->e_mbd.left_context) 597 return; 598 599 vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); 600 vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); 601 602 ta = (ENTROPY_CONTEXT *)&t_above; 603 tl = (ENTROPY_CONTEXT *)&t_left; 604 605 for (b = 16; b < 24; b++) 606 { 607 optimize_b(x, b, PLANE_TYPE_UV, 608 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd); 609 } 610 } 611 #endif 612 613 void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) 614 { 615 vp8_build_inter_predictors_mb(&x->e_mbd); 616 617 vp8_subtract_mb(rtcd, x); 618 619 transform_mb(x); 620 621 vp8_quantize_mb(x); 622 623 #if !(CONFIG_REALTIME_ONLY) 624 if (x->optimize) 625 optimize_mb(x, rtcd); 626 #endif 627 628 vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd); 629 630 RECON_INVOKE(&rtcd->common->recon, recon_mb) 631 (IF_RTCD(&rtcd->common->recon), &x->e_mbd); 632 } 633 634 635 /* this funciton is used by first pass only */ 636 void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) 637 { 638 vp8_build_inter_predictors_mby(&x->e_mbd); 639 640 ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride); 641 642 transform_mby(x); 643 644 vp8_quantize_mby(x); 645 646 vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); 647 648 RECON_INVOKE(&rtcd->common->recon, recon_mby) 649 (IF_RTCD(&rtcd->common->recon), &x->e_mbd); 650 } 651 652 653 void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) 654 { 655 vp8_build_inter_predictors_mbuv(&x->e_mbd); 656 ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); 657 658 vp8_transform_mbuv(x); 659 660 vp8_quantize_mbuv(x); 661 662 } 663