Home | History | Annotate | Download | only in encoder
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 #include "./vp9_rtcd.h"
     13 #include "./vpx_config.h"
     14 
     15 #include "vpx_mem/vpx_mem.h"
     16 
     17 #include "vp9/common/vp9_idct.h"
     18 #include "vp9/common/vp9_reconinter.h"
     19 #include "vp9/common/vp9_reconintra.h"
     20 #include "vp9/common/vp9_systemdependent.h"
     21 
     22 #include "vp9/encoder/vp9_encodemb.h"
     23 #include "vp9/encoder/vp9_quantize.h"
     24 #include "vp9/encoder/vp9_rdopt.h"
     25 #include "vp9/encoder/vp9_tokenize.h"
     26 
/* Scratch entropy contexts used while encoding one block.
 * In this file, ta[plane] is indexed by a transform block's column offset
 * and tl[plane] by its row offset (both in 4x4 units), and each entry is
 * set to whether the corresponding transform block has a non-zero eob.
 * Presumably "ta"/"tl" stand for the above / left contexts -- confirm
 * against vp9_get_entropy_contexts(). */
struct optimize_ctx {
  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
};
     31 
/* Argument bundle handed (as void *) to the per-transform-block callbacks
 * encode_block() and encode_block_intra(). */
struct encode_b_args {
  MACROBLOCK *x;              /* macroblock being encoded */
  struct optimize_ctx *ctx;   /* entropy context scratch; the intra entry
                               * points in this file pass NULL */
  unsigned char *skip;        /* cleared to 0 by the callbacks as soon as a
                               * transform block ends up with eob != 0 */
};
     37 
     38 void vp9_subtract_block_c(int rows, int cols,
     39                           int16_t *diff, ptrdiff_t diff_stride,
     40                           const uint8_t *src, ptrdiff_t src_stride,
     41                           const uint8_t *pred, ptrdiff_t pred_stride) {
     42   int r, c;
     43 
     44   for (r = 0; r < rows; r++) {
     45     for (c = 0; c < cols; c++)
     46       diff[c] = src[c] - pred[c];
     47 
     48     diff += diff_stride;
     49     pred += pred_stride;
     50     src  += src_stride;
     51   }
     52 }
     53 
     54 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
     55   struct macroblock_plane *const p = &x->plane[plane];
     56   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
     57   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
     58   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
     59   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
     60 
     61   vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
     62                      pd->dst.buf, pd->dst.stride);
     63 }
     64 
     65 #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
     66 typedef struct vp9_token_state vp9_token_state;
     67 
     68 struct vp9_token_state {
     69   int           rate;
     70   int           error;
     71   int           next;
     72   signed char   token;
     73   short         qc;
     74 };
     75 
     76 // TODO(jimbankoski): experiment to find optimal RD numbers.
     77 #define Y1_RD_MULT 4
     78 #define UV_RD_MULT 2
     79 
     80 static const int plane_rd_mult[4] = {
     81   Y1_RD_MULT,
     82   UV_RD_MULT,
     83 };
     84 
     85 #define UPDATE_RD_COST()\
     86 {\
     87   rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
     88   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
     89   if (rd_cost0 == rd_cost1) {\
     90     rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
     91     rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
     92   }\
     93 }
     94 
     95 // This function is a place holder for now but may ultimately need
     96 // to scan previous tokens to work out the correct context.
     97 static int trellis_get_coeff_context(const int16_t *scan,
     98                                      const int16_t *nb,
     99                                      int idx, int token,
    100                                      uint8_t *token_cache) {
    101   int bak = token_cache[scan[idx]], pt;
    102   token_cache[scan[idx]] = vp9_pt_energy_class[token];
    103   pt = get_coef_context(nb, token_cache, idx + 1);
    104   token_cache[scan[idx]] = bak;
    105   return pt;
    106 }
    107 
    108 static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
    109                        TX_SIZE tx_size, MACROBLOCK *mb,
    110                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
    111   MACROBLOCKD *const xd = &mb->e_mbd;
    112   struct macroblock_plane *p = &mb->plane[plane];
    113   struct macroblockd_plane *pd = &xd->plane[plane];
    114   const int ref = is_inter_block(&xd->mi[0]->mbmi);
    115   vp9_token_state tokens[1025][2];
    116   unsigned best_index[1025][2];
    117   const int16_t *coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
    118   int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
    119   int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    120   int eob = p->eobs[block], final_eob, sz = 0;
    121   const int i0 = 0;
    122   int rc, x, next, i;
    123   int64_t rdmult, rddiv, rd_cost0, rd_cost1;
    124   int rate0, rate1, error0, error1, t0, t1;
    125   int best, band, pt;
    126   PLANE_TYPE type = pd->plane_type;
    127   int err_mult = plane_rd_mult[type];
    128   const int default_eob = 16 << (tx_size << 1);
    129   const int mul = 1 + (tx_size == TX_32X32);
    130   uint8_t token_cache[1024];
    131   const int16_t *dequant_ptr = pd->dequant;
    132   const uint8_t *const band_translate = get_band_translate(tx_size);
    133   const scan_order *so = get_scan(xd, tx_size, type, block);
    134   const int16_t *scan = so->scan;
    135   const int16_t *nb = so->neighbors;
    136 
    137   assert((!type && !plane) || (type && plane));
    138   assert(eob <= default_eob);
    139 
    140   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
    141   rdmult = mb->rdmult * err_mult;
    142   if (!is_inter_block(&mb->e_mbd.mi[0]->mbmi))
    143     rdmult = (rdmult * 9) >> 4;
    144   rddiv = mb->rddiv;
    145   /* Initialize the sentinel node of the trellis. */
    146   tokens[eob][0].rate = 0;
    147   tokens[eob][0].error = 0;
    148   tokens[eob][0].next = default_eob;
    149   tokens[eob][0].token = EOB_TOKEN;
    150   tokens[eob][0].qc = 0;
    151   *(tokens[eob] + 1) = *(tokens[eob] + 0);
    152   next = eob;
    153   for (i = 0; i < eob; i++)
    154     token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
    155         qcoeff[scan[i]]].token];
    156 
    157   for (i = eob; i-- > i0;) {
    158     int base_bits, d2, dx;
    159 
    160     rc = scan[i];
    161     x = qcoeff[rc];
    162     /* Only add a trellis state for non-zero coefficients. */
    163     if (x) {
    164       int shortcut = 0;
    165       error0 = tokens[next][0].error;
    166       error1 = tokens[next][1].error;
    167       /* Evaluate the first possibility for this state. */
    168       rate0 = tokens[next][0].rate;
    169       rate1 = tokens[next][1].rate;
    170       t0 = (vp9_dct_value_tokens_ptr + x)->token;
    171       /* Consider both possible successor states. */
    172       if (next < default_eob) {
    173         band = band_translate[i + 1];
    174         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
    175         rate0 +=
    176           mb->token_costs[tx_size][type][ref][band][0][pt]
    177                          [tokens[next][0].token];
    178         rate1 +=
    179           mb->token_costs[tx_size][type][ref][band][0][pt]
    180                          [tokens[next][1].token];
    181       }
    182       UPDATE_RD_COST();
    183       /* And pick the best. */
    184       best = rd_cost1 < rd_cost0;
    185       base_bits = *(vp9_dct_value_cost_ptr + x);
    186       dx = mul * (dqcoeff[rc] - coeff[rc]);
    187       d2 = dx * dx;
    188       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
    189       tokens[i][0].error = d2 + (best ? error1 : error0);
    190       tokens[i][0].next = next;
    191       tokens[i][0].token = t0;
    192       tokens[i][0].qc = x;
    193       best_index[i][0] = best;
    194 
    195       /* Evaluate the second possibility for this state. */
    196       rate0 = tokens[next][0].rate;
    197       rate1 = tokens[next][1].rate;
    198 
    199       if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
    200           (abs(x)*dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
    201                                          dequant_ptr[rc != 0]))
    202         shortcut = 1;
    203       else
    204         shortcut = 0;
    205 
    206       if (shortcut) {
    207         sz = -(x < 0);
    208         x -= 2 * sz + 1;
    209       }
    210 
    211       /* Consider both possible successor states. */
    212       if (!x) {
    213         /* If we reduced this coefficient to zero, check to see if
    214          *  we need to move the EOB back here.
    215          */
    216         t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
    217         t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
    218       } else {
    219         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
    220       }
    221       if (next < default_eob) {
    222         band = band_translate[i + 1];
    223         if (t0 != EOB_TOKEN) {
    224           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
    225           rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
    226                                   [tokens[next][0].token];
    227         }
    228         if (t1 != EOB_TOKEN) {
    229           pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
    230           rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
    231                                   [tokens[next][1].token];
    232         }
    233       }
    234 
    235       UPDATE_RD_COST();
    236       /* And pick the best. */
    237       best = rd_cost1 < rd_cost0;
    238       base_bits = *(vp9_dct_value_cost_ptr + x);
    239 
    240       if (shortcut) {
    241         dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
    242         d2 = dx * dx;
    243       }
    244       tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
    245       tokens[i][1].error = d2 + (best ? error1 : error0);
    246       tokens[i][1].next = next;
    247       tokens[i][1].token = best ? t1 : t0;
    248       tokens[i][1].qc = x;
    249       best_index[i][1] = best;
    250       /* Finally, make this the new head of the trellis. */
    251       next = i;
    252     } else {
    253       /* There's no choice to make for a zero coefficient, so we don't
    254        *  add a new trellis node, but we do need to update the costs.
    255        */
    256       band = band_translate[i + 1];
    257       t0 = tokens[next][0].token;
    258       t1 = tokens[next][1].token;
    259       /* Update the cost of each path if we're past the EOB token. */
    260       if (t0 != EOB_TOKEN) {
    261         tokens[next][0].rate +=
    262             mb->token_costs[tx_size][type][ref][band][1][0][t0];
    263         tokens[next][0].token = ZERO_TOKEN;
    264       }
    265       if (t1 != EOB_TOKEN) {
    266         tokens[next][1].rate +=
    267             mb->token_costs[tx_size][type][ref][band][1][0][t1];
    268         tokens[next][1].token = ZERO_TOKEN;
    269       }
    270       best_index[i][0] = best_index[i][1] = 0;
    271       /* Don't update next, because we didn't add a new node. */
    272     }
    273   }
    274 
    275   /* Now pick the best path through the whole trellis. */
    276   band = band_translate[i + 1];
    277   pt = combine_entropy_contexts(*a, *l);
    278   rate0 = tokens[next][0].rate;
    279   rate1 = tokens[next][1].rate;
    280   error0 = tokens[next][0].error;
    281   error1 = tokens[next][1].error;
    282   t0 = tokens[next][0].token;
    283   t1 = tokens[next][1].token;
    284   rate0 += mb->token_costs[tx_size][type][ref][band][0][pt][t0];
    285   rate1 += mb->token_costs[tx_size][type][ref][band][0][pt][t1];
    286   UPDATE_RD_COST();
    287   best = rd_cost1 < rd_cost0;
    288   final_eob = i0 - 1;
    289   vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
    290   vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
    291   for (i = next; i < eob; i = next) {
    292     x = tokens[i][best].qc;
    293     if (x) {
    294       final_eob = i;
    295     }
    296     rc = scan[i];
    297     qcoeff[rc] = x;
    298     dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
    299 
    300     next = tokens[i][best].next;
    301     best = best_index[i][best];
    302   }
    303   final_eob++;
    304 
    305   mb->plane[plane].eobs[block] = final_eob;
    306   *a = *l = (final_eob > 0);
    307 }
    308 
    309 static INLINE void fdct32x32(int rd_transform,
    310                              const int16_t *src, int16_t *dst, int src_stride) {
    311   if (rd_transform)
    312     vp9_fdct32x32_rd(src, dst, src_stride);
    313   else
    314     vp9_fdct32x32(src, dst, src_stride);
    315 }
    316 
    317 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
    318                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
    319   MACROBLOCKD *const xd = &x->e_mbd;
    320   const struct macroblock_plane *const p = &x->plane[plane];
    321   const struct macroblockd_plane *const pd = &xd->plane[plane];
    322   const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
    323   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
    324   int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
    325   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    326   uint16_t *const eob = &p->eobs[block];
    327   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
    328   int i, j;
    329   const int16_t *src_diff;
    330   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
    331   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
    332 
    333   switch (tx_size) {
    334     case TX_32X32:
    335       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
    336       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
    337                            p->quant, p->quant_shift, qcoeff, dqcoeff,
    338                            pd->dequant, p->zbin_extra, eob, scan_order->scan,
    339                            scan_order->iscan);
    340       break;
    341     case TX_16X16:
    342       vp9_fdct16x16(src_diff, coeff, diff_stride);
    343       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
    344                      p->quant, p->quant_shift, qcoeff, dqcoeff,
    345                      pd->dequant, p->zbin_extra, eob,
    346                      scan_order->scan, scan_order->iscan);
    347       break;
    348     case TX_8X8:
    349       vp9_fdct8x8(src_diff, coeff, diff_stride);
    350       vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
    351                      p->quant, p->quant_shift, qcoeff, dqcoeff,
    352                      pd->dequant, p->zbin_extra, eob,
    353                      scan_order->scan, scan_order->iscan);
    354       break;
    355     case TX_4X4:
    356       x->fwd_txm4x4(src_diff, coeff, diff_stride);
    357       vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
    358                      p->quant, p->quant_shift, qcoeff, dqcoeff,
    359                      pd->dequant, p->zbin_extra, eob,
    360                      scan_order->scan, scan_order->iscan);
    361       break;
    362     default:
    363       assert(0);
    364   }
    365 }
    366 
    367 static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
    368                          TX_SIZE tx_size, void *arg) {
    369   struct encode_b_args *const args = arg;
    370   MACROBLOCK *const x = args->x;
    371   MACROBLOCKD *const xd = &x->e_mbd;
    372   struct optimize_ctx *const ctx = args->ctx;
    373   struct macroblock_plane *const p = &x->plane[plane];
    374   struct macroblockd_plane *const pd = &xd->plane[plane];
    375   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    376   int i, j;
    377   uint8_t *dst;
    378   ENTROPY_CONTEXT *a, *l;
    379   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
    380   dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
    381   a = &ctx->ta[plane][i];
    382   l = &ctx->tl[plane][j];
    383 
    384   // TODO(jingning): per transformed block zero forcing only enabled for
    385   // luma component. will integrate chroma components as well.
    386   if (x->zcoeff_blk[tx_size][block] && plane == 0) {
    387     p->eobs[block] = 0;
    388     *a = *l = 0;
    389     return;
    390   }
    391 
    392   if (!x->skip_recode)
    393     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
    394 
    395   if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
    396     optimize_b(plane, block, plane_bsize, tx_size, x, a, l);
    397   } else {
    398     *a = *l = p->eobs[block] > 0;
    399   }
    400 
    401   if (p->eobs[block])
    402     *(args->skip) = 0;
    403 
    404   if (x->skip_encode || p->eobs[block] == 0)
    405     return;
    406 
    407   switch (tx_size) {
    408     case TX_32X32:
    409       vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
    410       break;
    411     case TX_16X16:
    412       vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
    413       break;
    414     case TX_8X8:
    415       vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
    416       break;
    417     case TX_4X4:
    418       // this is like vp9_short_idct4x4 but has a special case around eob<=1
    419       // which is significant (not just an optimization) for the lossless
    420       // case.
    421       xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
    422       break;
    423     default:
    424       assert(0 && "Invalid transform size");
    425   }
    426 }
    427 
    428 static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
    429                                TX_SIZE tx_size, void *arg) {
    430   MACROBLOCK *const x = (MACROBLOCK *)arg;
    431   MACROBLOCKD *const xd = &x->e_mbd;
    432   struct macroblock_plane *const p = &x->plane[plane];
    433   struct macroblockd_plane *const pd = &xd->plane[plane];
    434   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    435   int i, j;
    436   uint8_t *dst;
    437   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
    438   dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
    439 
    440   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
    441 
    442   if (p->eobs[block] > 0)
    443     xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
    444 }
    445 
    446 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
    447   vp9_subtract_plane(x, bsize, 0);
    448   vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
    449                                          encode_block_pass1, x);
    450 }
    451 
    452 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
    453   MACROBLOCKD *const xd = &x->e_mbd;
    454   struct optimize_ctx ctx;
    455   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
    456   struct encode_b_args arg = {x, &ctx, &mbmi->skip};
    457   int plane;
    458 
    459   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
    460     if (!x->skip_recode)
    461       vp9_subtract_plane(x, bsize, plane);
    462 
    463     if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
    464       const struct macroblockd_plane* const pd = &xd->plane[plane];
    465       const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
    466       vp9_get_entropy_contexts(bsize, tx_size, pd,
    467                                ctx.ta[plane], ctx.tl[plane]);
    468     }
    469 
    470     vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
    471                                            &arg);
    472   }
    473 }
    474 
    475 static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
    476                                TX_SIZE tx_size, void *arg) {
    477   struct encode_b_args* const args = arg;
    478   MACROBLOCK *const x = args->x;
    479   MACROBLOCKD *const xd = &x->e_mbd;
    480   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
    481   struct macroblock_plane *const p = &x->plane[plane];
    482   struct macroblockd_plane *const pd = &xd->plane[plane];
    483   int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
    484   int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
    485   int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    486   const scan_order *scan_order;
    487   TX_TYPE tx_type;
    488   MB_PREDICTION_MODE mode;
    489   const int bwl = b_width_log2(plane_bsize);
    490   const int diff_stride = 4 * (1 << bwl);
    491   uint8_t *src, *dst;
    492   int16_t *src_diff;
    493   uint16_t *eob = &p->eobs[block];
    494   const int src_stride = p->src.stride;
    495   const int dst_stride = pd->dst.stride;
    496   int i, j;
    497   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
    498   dst = &pd->dst.buf[4 * (j * dst_stride + i)];
    499   src = &p->src.buf[4 * (j * src_stride + i)];
    500   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
    501 
    502   switch (tx_size) {
    503     case TX_32X32:
    504       scan_order = &vp9_default_scan_orders[TX_32X32];
    505       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
    506       vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
    507                               x->skip_encode ? src : dst,
    508                               x->skip_encode ? src_stride : dst_stride,
    509                               dst, dst_stride, i, j, plane);
    510       if (!x->skip_recode) {
    511         vp9_subtract_block(32, 32, src_diff, diff_stride,
    512                            src, src_stride, dst, dst_stride);
    513         fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
    514         vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
    515                              p->quant, p->quant_shift, qcoeff, dqcoeff,
    516                              pd->dequant, p->zbin_extra, eob, scan_order->scan,
    517                              scan_order->iscan);
    518       }
    519       if (!x->skip_encode && *eob)
    520         vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
    521       break;
    522     case TX_16X16:
    523       tx_type = get_tx_type(pd->plane_type, xd);
    524       scan_order = &vp9_scan_orders[TX_16X16][tx_type];
    525       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
    526       vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
    527                               x->skip_encode ? src : dst,
    528                               x->skip_encode ? src_stride : dst_stride,
    529                               dst, dst_stride, i, j, plane);
    530       if (!x->skip_recode) {
    531         vp9_subtract_block(16, 16, src_diff, diff_stride,
    532                            src, src_stride, dst, dst_stride);
    533         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
    534         vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
    535                        p->quant, p->quant_shift, qcoeff, dqcoeff,
    536                        pd->dequant, p->zbin_extra, eob, scan_order->scan,
    537                        scan_order->iscan);
    538       }
    539       if (!x->skip_encode && *eob)
    540         vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
    541       break;
    542     case TX_8X8:
    543       tx_type = get_tx_type(pd->plane_type, xd);
    544       scan_order = &vp9_scan_orders[TX_8X8][tx_type];
    545       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
    546       vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
    547                               x->skip_encode ? src : dst,
    548                               x->skip_encode ? src_stride : dst_stride,
    549                               dst, dst_stride, i, j, plane);
    550       if (!x->skip_recode) {
    551         vp9_subtract_block(8, 8, src_diff, diff_stride,
    552                            src, src_stride, dst, dst_stride);
    553         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
    554         vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
    555                        p->quant_shift, qcoeff, dqcoeff,
    556                        pd->dequant, p->zbin_extra, eob, scan_order->scan,
    557                        scan_order->iscan);
    558       }
    559       if (!x->skip_encode && *eob)
    560         vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
    561       break;
    562     case TX_4X4:
    563       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
    564       scan_order = &vp9_scan_orders[TX_4X4][tx_type];
    565       mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
    566       vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
    567                               x->skip_encode ? src : dst,
    568                               x->skip_encode ? src_stride : dst_stride,
    569                               dst, dst_stride, i, j, plane);
    570 
    571       if (!x->skip_recode) {
    572         vp9_subtract_block(4, 4, src_diff, diff_stride,
    573                            src, src_stride, dst, dst_stride);
    574         if (tx_type != DCT_DCT)
    575           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
    576         else
    577           x->fwd_txm4x4(src_diff, coeff, diff_stride);
    578         vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
    579                        p->quant_shift, qcoeff, dqcoeff,
    580                        pd->dequant, p->zbin_extra, eob, scan_order->scan,
    581                        scan_order->iscan);
    582       }
    583 
    584       if (!x->skip_encode && *eob) {
    585         if (tx_type == DCT_DCT)
    586           // this is like vp9_short_idct4x4 but has a special case around eob<=1
    587           // which is significant (not just an optimization) for the lossless
    588           // case.
    589           xd->itxm_add(dqcoeff, dst, dst_stride, *eob);
    590         else
    591           vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
    592       }
    593       break;
    594     default:
    595       assert(0);
    596   }
    597   if (*eob)
    598     *(args->skip) = 0;
    599 }
    600 
    601 void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
    602                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
    603                             unsigned char *skip) {
    604   struct encode_b_args arg = {x, NULL, skip};
    605   encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
    606 }
    607 
    608 
    609 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
    610   const MACROBLOCKD *const xd = &x->e_mbd;
    611   struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};
    612 
    613   vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra,
    614                                          &arg);
    615 }
    616 
    617 int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) {
    618   MB_MODE_INFO * mbmi = &x->e_mbd.mi[0]->mbmi;
    619   x->skip_encode = 0;
    620   mbmi->mode = DC_PRED;
    621   mbmi->ref_frame[0] = INTRA_FRAME;
    622   mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
    623                                                                  : TX_8X8)
    624                                    : TX_4X4;
    625   vp9_encode_intra_block_plane(x, mbmi->sb_type, 0);
    626   return vp9_get_mb_ss(x->plane[0].src_diff);
    627 }
    628