/* Code-viewer navigation header (non-source artifact): Home | History | Annotate | Download | only in encoder */
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vpx_config.h"
     12 #include "vp9/encoder/vp9_encodemb.h"
     13 #include "vp9/common/vp9_reconinter.h"
     14 #include "vp9/encoder/vp9_quantize.h"
     15 #include "vp9/encoder/vp9_tokenize.h"
     16 #include "vp9/common/vp9_reconintra.h"
     17 #include "vpx_mem/vpx_mem.h"
     18 #include "vp9/encoder/vp9_rdopt.h"
     19 #include "vp9/common/vp9_systemdependent.h"
     20 #include "vp9_rtcd.h"
     21 
     22 DECLARE_ALIGNED(16, extern const uint8_t,
     23                 vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
     24 
     25 void vp9_subtract_block_c(int rows, int cols,
     26                           int16_t *diff_ptr, ptrdiff_t diff_stride,
     27                           const uint8_t *src_ptr, ptrdiff_t src_stride,
     28                           const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
     29   int r, c;
     30 
     31   for (r = 0; r < rows; r++) {
     32     for (c = 0; c < cols; c++)
     33       diff_ptr[c] = src_ptr[c] - pred_ptr[c];
     34 
     35     diff_ptr += diff_stride;
     36     pred_ptr += pred_stride;
     37     src_ptr  += src_stride;
     38   }
     39 }
     40 
     41 static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob,
     42                                         int16_t *dqcoeff, uint8_t *dest,
     43                                         int stride) {
     44   if (eob <= 1)
     45     xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
     46   else
     47     xd->inv_txm4x4_add(dqcoeff, dest, stride);
     48 }
     49 
     50 static void inverse_transform_b_8x8_add(int eob,
     51                                         int16_t *dqcoeff, uint8_t *dest,
     52                                         int stride) {
     53   if (eob <= 1)
     54     vp9_short_idct8x8_1_add(dqcoeff, dest, stride);
     55   else if (eob <= 10)
     56     vp9_short_idct10_8x8_add(dqcoeff, dest, stride);
     57   else
     58     vp9_short_idct8x8_add(dqcoeff, dest, stride);
     59 }
     60 
     61 static void inverse_transform_b_16x16_add(int eob,
     62                                           int16_t *dqcoeff, uint8_t *dest,
     63                                           int stride) {
     64   if (eob <= 1)
     65     vp9_short_idct16x16_1_add(dqcoeff, dest, stride);
     66   else if (eob <= 10)
     67     vp9_short_idct10_16x16_add(dqcoeff, dest, stride);
     68   else
     69     vp9_short_idct16x16_add(dqcoeff, dest, stride);
     70 }
     71 
// Computes the residual (source minus prediction) for one plane of the
// given block size and writes it to that plane's src_diff buffer.
static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
  struct macroblock_plane *const p = &x->plane[plane];
  const MACROBLOCKD *const xd = &x->e_mbd;
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  const int bw = plane_block_width(bsize, pd);
  const int bh = plane_block_height(bsize, pd);

  // src_diff is stored densely: its stride equals the plane block width.
  vp9_subtract_block(bh, bw, p->src_diff, bw,
                     p->src.buf, p->src.stride,
                     pd->dst.buf, pd->dst.stride);
}
     83 
// Computes the residual for the luma plane (plane 0) only.
void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
  subtract_plane(x, bsize, 0);
}
     87 
     88 void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) {
     89   int i;
     90 
     91   for (i = 1; i < MAX_MB_PLANE; i++)
     92     subtract_plane(x, bsize, i);
     93 }
     94 
// Computes the residual for all planes (luma followed by chroma).
void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
  vp9_subtract_sby(x, bsize);
  vp9_subtract_sbuv(x, bsize);
}
     99 
    100 
// Deterministic tie-breaker for equal RD costs. Note: only the rate (R)
// and rate multiplier (RM) participate; the DM/D arguments are accepted
// for symmetry with RDCOST but unused.
#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
typedef struct vp9_token_state vp9_token_state;

// One node of the trellis built by optimize_b(): the cost/error
// accumulated from this coefficient position to the end of block, plus a
// link to the successor state.
struct vp9_token_state {
  int           rate;    // total token rate from here to EOB
  int           error;   // total squared reconstruction error
  int           next;    // scan index of the next trellis state
  signed char   token;   // token emitted at this position
  short         qc;      // (possibly adjusted) quantized coefficient
};

// TODO: experiments to find optimal multiple numbers
#define Y1_RD_MULT 4
#define UV_RD_MULT 2

// Error multiplier indexed by PLANE_TYPE (0 = luma, 1 = chroma).
// The remaining two slots are intentionally zero-initialized and unused.
static const int plane_rd_mult[4] = {
  Y1_RD_MULT,
  UV_RD_MULT,
};

// Recomputes both candidate RD costs; on an exact tie it substitutes the
// RDTRUNC values so the subsequent comparison picks a unique winner.
#define UPDATE_RD_COST()\
{\
  rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
  rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
  if (rd_cost0 == rd_cost1) {\
    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
  }\
}
    130 
    131 // This function is a place holder for now but may ultimately need
    132 // to scan previous tokens to work out the correct context.
    133 static int trellis_get_coeff_context(const int16_t *scan,
    134                                      const int16_t *nb,
    135                                      int idx, int token,
    136                                      uint8_t *token_cache) {
    137   int bak = token_cache[scan[idx]], pt;
    138   token_cache[scan[idx]] = vp9_pt_energy_class[token];
    139   pt = get_coef_context(nb, token_cache, idx + 1);
    140   token_cache[scan[idx]] = bak;
    141   return pt;
    142 }
    143 
// Trellis (Viterbi) coefficient optimization for one transform block.
// For each nonzero coefficient it considers keeping the quantized value
// or moving it one step toward zero, and picks the path that minimizes
// rate-distortion cost. On exit qcoeff/dqcoeff hold the chosen values,
// the block's eob is updated, and the entropy contexts *a/*l are set.
static void optimize_b(MACROBLOCK *mb,
                       int plane, int block, BLOCK_SIZE plane_bsize,
                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                       TX_SIZE tx_size) {
  MACROBLOCKD *const xd = &mb->e_mbd;
  struct macroblockd_plane *pd = &xd->plane[plane];
  const int ref = is_inter_block(&xd->this_mi->mbmi);
  // 1025 = max coefficients (32x32 -> 1024) plus the EOB sentinel;
  // two states per position: "keep" and "reduce".
  vp9_token_state tokens[1025][2];
  unsigned best_index[1025][2];
  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block);
  int16_t *qcoeff_ptr;
  int16_t *dqcoeff_ptr;
  int eob = pd->eobs[block], final_eob, sz = 0;
  const int i0 = 0;
  int rc, x, next, i;
  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
  int rate0, rate1, error0, error1, t0, t1;
  int best, band, pt;
  PLANE_TYPE type = pd->plane_type;
  int err_mult = plane_rd_mult[type];
  int default_eob;
  const int16_t *scan, *nb;
  // 32x32 dequantized values carry an extra factor of 2 (see the final
  // division by mul below).
  const int mul = 1 + (tx_size == TX_32X32);
  uint8_t token_cache[1024];
  const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block);
  const int16_t *dequant_ptr = pd->dequant;
  const uint8_t * band_translate;

  // Plane type and plane index must agree (luma <-> plane 0).
  assert((!type && !plane) || (type && plane));
  dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
  qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
  // Select scan order, coefficient count and band table per tx size.
  switch (tx_size) {
    default:
    case TX_4X4:
      default_eob = 16;
      scan = get_scan_4x4(get_tx_type_4x4(type, xd, ib));
      band_translate = vp9_coefband_trans_4x4;
      break;
    case TX_8X8:
      scan = get_scan_8x8(get_tx_type_8x8(type, xd));
      default_eob = 64;
      band_translate = vp9_coefband_trans_8x8plus;
      break;
    case TX_16X16:
      scan = get_scan_16x16(get_tx_type_16x16(type, xd));
      default_eob = 256;
      band_translate = vp9_coefband_trans_8x8plus;
      break;
    case TX_32X32:
      scan = vp9_default_scan_32x32;
      default_eob = 1024;
      band_translate = vp9_coefband_trans_8x8plus;
      break;
  }
  assert(eob <= default_eob);

  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
  rdmult = mb->rdmult * err_mult;
  // Intra blocks get a slightly reduced rate multiplier (9/16).
  if (mb->e_mbd.mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME)
    rdmult = (rdmult * 9) >> 4;
  rddiv = mb->rddiv;
  /* Initialize the sentinel node of the trellis. */
  tokens[eob][0].rate = 0;
  tokens[eob][0].error = 0;
  tokens[eob][0].next = default_eob;
  tokens[eob][0].token = DCT_EOB_TOKEN;
  tokens[eob][0].qc = 0;
  *(tokens[eob] + 1) = *(tokens[eob] + 0);
  next = eob;
  // Seed the token cache with the energy class of every coefficient's
  // current token so context lookups below see the unmodified block.
  for (i = 0; i < eob; i++)
    token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
        qcoeff_ptr[scan[i]]].token];
  nb = vp9_get_coef_neighbors_handle(scan);

  // Walk the coefficients backwards from the last nonzero one.
  for (i = eob; i-- > i0;) {
    int base_bits, d2, dx;

    rc = scan[i];
    x = qcoeff_ptr[rc];
    /* Only add a trellis state for non-zero coefficients. */
    if (x) {
      int shortcut = 0;
      error0 = tokens[next][0].error;
      error1 = tokens[next][1].error;
      /* Evaluate the first possibility for this state. */
      rate0 = tokens[next][0].rate;
      rate1 = tokens[next][1].rate;
      t0 = (vp9_dct_value_tokens_ptr + x)->token;
      /* Consider both possible successor states. */
      if (next < default_eob) {
        band = get_coef_band(band_translate, i + 1);
        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
        rate0 +=
          mb->token_costs[tx_size][type][ref][band][0][pt]
                         [tokens[next][0].token];
        rate1 +=
          mb->token_costs[tx_size][type][ref][band][0][pt]
                         [tokens[next][1].token];
      }
      UPDATE_RD_COST();
      /* And pick the best. */
      best = rd_cost1 < rd_cost0;
      base_bits = *(vp9_dct_value_cost_ptr + x);
      dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]);
      d2 = dx * dx;
      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
      tokens[i][0].error = d2 + (best ? error1 : error0);
      tokens[i][0].next = next;
      tokens[i][0].token = t0;
      tokens[i][0].qc = x;
      best_index[i][0] = best;

      /* Evaluate the second possibility for this state. */
      rate0 = tokens[next][0].rate;
      rate1 = tokens[next][1].rate;

      // The "shortcut" holds when reducing |x| by one still reconstructs
      // within one dequant step of the original coefficient, i.e. the
      // reduced value is a legal alternative rounding.
      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) &&
          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul +
                                         dequant_ptr[rc != 0]))
        shortcut = 1;
      else
        shortcut = 0;

      if (shortcut) {
        // Move x one step toward zero, preserving its sign.
        sz = -(x < 0);
        x -= 2 * sz + 1;
      }

      /* Consider both possible successor states. */
      if (!x) {
        /* If we reduced this coefficient to zero, check to see if
         *  we need to move the EOB back here.
         */
        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
             DCT_EOB_TOKEN : ZERO_TOKEN;
        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
             DCT_EOB_TOKEN : ZERO_TOKEN;
      } else {
        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
      }
      if (next < default_eob) {
        band = get_coef_band(band_translate, i + 1);
        if (t0 != DCT_EOB_TOKEN) {
          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
          rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
                                  [tokens[next][0].token];
        }
        if (t1 != DCT_EOB_TOKEN) {
          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
          rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
                                  [tokens[next][1].token];
        }
      }

      UPDATE_RD_COST();
      /* And pick the best. */
      best = rd_cost1 < rd_cost0;
      base_bits = *(vp9_dct_value_cost_ptr + x);

      if (shortcut) {
        // Distortion of the reduced coefficient: shrink dx by one
        // dequant step in the direction of x's sign.
        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
        d2 = dx * dx;
      }
      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
      tokens[i][1].error = d2 + (best ? error1 : error0);
      tokens[i][1].next = next;
      tokens[i][1].token = best ? t1 : t0;
      tokens[i][1].qc = x;
      best_index[i][1] = best;
      /* Finally, make this the new head of the trellis. */
      next = i;
    }
    /* There's no choice to make for a zero coefficient, so we don't
     *  add a new trellis node, but we do need to update the costs.
     */
    else {
      band = get_coef_band(band_translate, i + 1);
      t0 = tokens[next][0].token;
      t1 = tokens[next][1].token;
      /* Update the cost of each path if we're past the EOB token. */
      if (t0 != DCT_EOB_TOKEN) {
        tokens[next][0].rate +=
            mb->token_costs[tx_size][type][ref][band][1][0][t0];
        tokens[next][0].token = ZERO_TOKEN;
      }
      if (t1 != DCT_EOB_TOKEN) {
        tokens[next][1].rate +=
            mb->token_costs[tx_size][type][ref][band][1][0][t1];
        tokens[next][1].token = ZERO_TOKEN;
      }
      best_index[i][0] = best_index[i][1] = 0;
      /* Don't update next, because we didn't add a new node. */
    }
  }

  /* Now pick the best path through the whole trellis. */
  // After the loop i == i0 - 1 == -1, so band is for position 0 and the
  // starting context comes from the above/left entropy contexts.
  band = get_coef_band(band_translate, i + 1);
  pt = combine_entropy_contexts(*a, *l);
  rate0 = tokens[next][0].rate;
  rate1 = tokens[next][1].rate;
  error0 = tokens[next][0].error;
  error1 = tokens[next][1].error;
  t0 = tokens[next][0].token;
  t1 = tokens[next][1].token;
  rate0 += mb->token_costs[tx_size][type][ref][band][0][pt][t0];
  rate1 += mb->token_costs[tx_size][type][ref][band][0][pt][t1];
  UPDATE_RD_COST();
  best = rd_cost1 < rd_cost0;
  final_eob = i0 - 1;
  // Clear the whole coefficient block, then write back only the values
  // on the winning trellis path.
  vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2)));
  vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2)));
  for (i = next; i < eob; i = next) {
    x = tokens[i][best].qc;
    if (x) {
      final_eob = i;
    }
    rc = scan[i];
    qcoeff_ptr[rc] = x;
    // Undo the extra factor of 2 carried by 32x32 dequant values.
    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul;

    next = tokens[i][best].next;
    best = best_index[i][best];
  }
  final_eob++;

  xd->plane[plane].eobs[block] = final_eob;
  // Nonzero eob means this block contributes context to its neighbors.
  *a = *l = (final_eob > 0);
}
    372 
    373 void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
    374                     TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) {
    375   int x, y;
    376   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
    377   optimize_b(mb, plane, block, plane_bsize,
    378              &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size);
    379 }
    380 
// Seeds the per-plane above/left entropy-context arrays used by the
// trellis optimizer. For tx sizes larger than 4x4 the 4x4 contexts are
// collapsed: one entry per transform block, set if ANY of the covered
// 4x4 contexts is nonzero.
static void optimize_init_b(int plane, BLOCK_SIZE bsize,
                            struct encode_b_args *args) {
  const MACROBLOCKD *xd = &args->x->e_mbd;
  const struct macroblockd_plane* const pd = &xd->plane[plane];
  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
  const MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
  int i;

  switch (tx_size) {
    case TX_4X4:
      // One 4x4 context per entry: straight copy.
      vpx_memcpy(args->ctx->ta[plane], pd->above_context,
                 sizeof(ENTROPY_CONTEXT) * num_4x4_w);
      vpx_memcpy(args->ctx->tl[plane], pd->left_context,
                 sizeof(ENTROPY_CONTEXT) * num_4x4_h);
      break;
    // NOTE(review): the wide-load casts below read 2/4/8 ENTROPY_CONTEXT
    // bytes at once; this relies on ENTROPY_CONTEXT being 1 byte and on
    // the arrays' alignment, and technically violates strict aliasing --
    // presumably compiled with aliasing-safe flags; verify before reuse.
    case TX_8X8:
      for (i = 0; i < num_4x4_w; i += 2)
        args->ctx->ta[plane][i] = !!*(uint16_t *)&pd->above_context[i];
      for (i = 0; i < num_4x4_h; i += 2)
        args->ctx->tl[plane][i] = !!*(uint16_t *)&pd->left_context[i];
      break;
    case TX_16X16:
      for (i = 0; i < num_4x4_w; i += 4)
        args->ctx->ta[plane][i] = !!*(uint32_t *)&pd->above_context[i];
      for (i = 0; i < num_4x4_h; i += 4)
        args->ctx->tl[plane][i] = !!*(uint32_t *)&pd->left_context[i];
      break;
    case TX_32X32:
      for (i = 0; i < num_4x4_w; i += 8)
        args->ctx->ta[plane][i] = !!*(uint64_t *)&pd->above_context[i];
      for (i = 0; i < num_4x4_h; i += 8)
        args->ctx->tl[plane][i] = !!*(uint64_t *)&pd->left_context[i];
      break;
    default:
      assert(0);
  }
}
    421 
// Forward-transforms the residual for one transform block and quantizes
// the result into qcoeff/dqcoeff, updating the block's eob. The block
// index is rescaled per tx size to locate the transform block within the
// plane's 4x4 grid.
void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
                     TX_SIZE tx_size, void *arg) {
  struct encode_b_args* const args = arg;
  MACROBLOCK* const x = args->x;
  MACROBLOCKD* const xd = &x->e_mbd;
  struct macroblock_plane *const p = &x->plane[plane];
  struct macroblockd_plane *const pd = &xd->plane[plane];
  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  const int16_t *scan, *iscan;
  uint16_t *eob = &pd->eobs[block];
  // bw: plane width in 4x4 units; twl/twmask convert a tx-block index
  // into x/y positions within the plane.
  const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
  const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
  int xoff, yoff;
  int16_t *src_diff;

  switch (tx_size) {
    case TX_32X32:
      scan = vp9_default_scan_32x32;
      iscan = vp9_default_iscan_32x32;
      // A 32x32 block spans 64 4x4 blocks, hence the index shift.
      block >>= 6;
      xoff = 32 * (block & twmask);
      yoff = 32 * (block >> twl);
      src_diff = p->src_diff + 4 * bw * yoff + xoff;
      // Optional lower-precision 32x32 forward DCT for RD speed.
      if (x->use_lp32x32fdct)
        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
      else
        vp9_short_fdct32x32(src_diff, coeff, bw * 8);
      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                           p->quant, p->quant_shift, qcoeff, dqcoeff,
                           pd->dequant, p->zbin_extra, eob, scan, iscan);
      break;
    case TX_16X16:
      scan = vp9_default_scan_16x16;
      iscan = vp9_default_iscan_16x16;
      block >>= 4;
      xoff = 16 * (block & twmask);
      yoff = 16 * (block >> twl);
      src_diff = p->src_diff + 4 * bw * yoff + xoff;
      x->fwd_txm16x16(src_diff, coeff, bw * 8);
      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                     p->quant, p->quant_shift, qcoeff, dqcoeff,
                     pd->dequant, p->zbin_extra, eob, scan, iscan);
      break;
    case TX_8X8:
      scan = vp9_default_scan_8x8;
      iscan = vp9_default_iscan_8x8;
      block >>= 2;
      xoff = 8 * (block & twmask);
      yoff = 8 * (block >> twl);
      src_diff = p->src_diff + 4 * bw * yoff + xoff;
      x->fwd_txm8x8(src_diff, coeff, bw * 8);
      vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                     p->quant, p->quant_shift, qcoeff, dqcoeff,
                     pd->dequant, p->zbin_extra, eob, scan, iscan);
      break;
    case TX_4X4:
      scan = vp9_default_scan_4x4;
      iscan = vp9_default_iscan_4x4;
      xoff = 4 * (block & twmask);
      yoff = 4 * (block >> twl);
      src_diff = p->src_diff + 4 * bw * yoff + xoff;
      x->fwd_txm4x4(src_diff, coeff, bw * 8);
      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                     p->quant, p->quant_shift, qcoeff, dqcoeff,
                     pd->dequant, p->zbin_extra, eob, scan, iscan);
      break;
    default:
      assert(0);
  }
}
    494 
// Per-transform-block encode callback: forward transform + quantize,
// optionally trellis-optimize the coefficients, then reconstruct by
// adding the inverse transform into the destination buffer.
static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
                         TX_SIZE tx_size, void *arg) {
  struct encode_b_args *const args = arg;
  MACROBLOCK *const x = args->x;
  MACROBLOCKD *const xd = &x->e_mbd;
  struct macroblockd_plane *const pd = &xd->plane[plane];
  const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
                                                       block);

  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
                                                 pd->dst.buf, pd->dst.stride);
  vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);

  if (x->optimize)
    vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);

  // No reconstruction needed when skipping or when the block quantized
  // to all zeros.
  if (x->skip_encode || pd->eobs[block] == 0)
    return;

  switch (tx_size) {
    case TX_32X32:
      vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
      break;
    case TX_16X16:
      inverse_transform_b_16x16_add(pd->eobs[block], dqcoeff, dst,
                                    pd->dst.stride);
      break;
    case TX_8X8:
      inverse_transform_b_8x8_add(pd->eobs[block], dqcoeff, dst,
                                  pd->dst.stride);
      break;
    case TX_4X4:
      // this is like vp9_short_idct4x4 but has a special case around eob<=1
      // which is significant (not just an optimization) for the lossless
      // case.
      inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
                                  dst, pd->dst.stride);
      break;
    default:
      assert(!"Invalid transform size");
  }
}
    538 
// Encodes the luma plane of a superblock: computes the residual,
// optionally seeds the trellis entropy contexts, then runs encode_block
// over every luma transform block.
void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
  MACROBLOCKD *const xd = &x->e_mbd;
  struct optimize_ctx ctx;
  struct encode_b_args arg = {x, &ctx};

  vp9_subtract_sby(x, bsize);
  if (x->optimize)
    optimize_init_b(0, bsize, &arg);

  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
}
    550 
    551 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
    552   MACROBLOCKD *const xd = &x->e_mbd;
    553   struct optimize_ctx ctx;
    554   struct encode_b_args arg = {x, &ctx};
    555 
    556   vp9_subtract_sb(x, bsize);
    557 
    558   if (x->optimize) {
    559     int i;
    560     for (i = 0; i < MAX_MB_PLANE; ++i)
    561       optimize_init_b(i, bsize, &arg);
    562   }
    563 
    564   foreach_transformed_block(xd, bsize, encode_block, &arg);
    565 }
    566 
// Per-transform-block intra encode callback: predicts the block, forms
// the residual, forward-transforms (DCT or hybrid, per tx_type),
// quantizes, and reconstructs in place so later blocks predict from the
// reconstructed pixels.
void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                            TX_SIZE tx_size, void *arg) {
  struct encode_b_args* const args = arg;
  MACROBLOCK *const x = args->x;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
  struct macroblock_plane *const p = &x->plane[plane];
  struct macroblockd_plane *const pd = &xd->plane[plane];
  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  const int16_t *scan, *iscan;
  TX_TYPE tx_type;
  MB_PREDICTION_MODE mode;
  // bw: plane width in 4x4 units; twl/twmask map a tx-block index to its
  // x/y position within the plane.
  const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
  const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
  int xoff, yoff;
  uint8_t *src, *dst;
  int16_t *src_diff;
  uint16_t *eob = &pd->eobs[block];

  // Pad prediction samples when the block extends past the frame edge.
  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0)
    extend_for_intra(xd, plane_bsize, plane, block, tx_size);

  // if (x->optimize)
  // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);

  switch (tx_size) {
    case TX_32X32:
      scan = vp9_default_scan_32x32;
      iscan = vp9_default_iscan_32x32;
      mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
      // A 32x32 block spans 64 4x4 blocks, hence the index shift.
      block >>= 6;
      xoff = 32 * (block & twmask);
      yoff = 32 * (block >> twl);
      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
      src = p->src.buf + yoff * p->src.stride + xoff;
      src_diff = p->src_diff + 4 * bw * yoff + xoff;
      // Predict into dst, then subtract to form the residual.
      vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
                              dst, pd->dst.stride, dst, pd->dst.stride);
      vp9_subtract_block(32, 32, src_diff, bw * 4,
                         src, p->src.stride, dst, pd->dst.stride);
      if (x->use_lp32x32fdct)
        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
      else
        vp9_short_fdct32x32(src_diff, coeff, bw * 8);
      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                           p->quant, p->quant_shift, qcoeff, dqcoeff,
                           pd->dequant, p->zbin_extra, eob, scan, iscan);
      // Reconstruct so neighboring blocks predict from coded pixels.
      if (!x->skip_encode && *eob)
        vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
      break;
    case TX_16X16:
      tx_type = get_tx_type_16x16(pd->plane_type, xd);
      scan = get_scan_16x16(tx_type);
      iscan = get_iscan_16x16(tx_type);
      mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
      block >>= 4;
      xoff = 16 * (block & twmask);
      yoff = 16 * (block >> twl);
      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
      src = p->src.buf + yoff * p->src.stride + xoff;
      src_diff = p->src_diff + 4 * bw * yoff + xoff;
      vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
                              dst, pd->dst.stride, dst, pd->dst.stride);
      vp9_subtract_block(16, 16, src_diff, bw * 4,
                         src, p->src.stride, dst, pd->dst.stride);
      // Hybrid (ADST-based) transform for directional modes, DCT else.
      if (tx_type != DCT_DCT)
        vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
      else
        x->fwd_txm16x16(src_diff, coeff, bw * 8);
      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                     p->quant, p->quant_shift, qcoeff, dqcoeff,
                     pd->dequant, p->zbin_extra, eob, scan, iscan);
      if (!x->skip_encode && *eob) {
        if (tx_type == DCT_DCT)
          inverse_transform_b_16x16_add(*eob, dqcoeff, dst, pd->dst.stride);
        else
          vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
      }
      break;
    case TX_8X8:
      tx_type = get_tx_type_8x8(pd->plane_type, xd);
      scan = get_scan_8x8(tx_type);
      iscan = get_iscan_8x8(tx_type);
      mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
      block >>= 2;
      xoff = 8 * (block & twmask);
      yoff = 8 * (block >> twl);
      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
      src = p->src.buf + yoff * p->src.stride + xoff;
      src_diff = p->src_diff + 4 * bw * yoff + xoff;
      vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
                              dst, pd->dst.stride, dst, pd->dst.stride);
      vp9_subtract_block(8, 8, src_diff, bw * 4,
                         src, p->src.stride, dst, pd->dst.stride);
      if (tx_type != DCT_DCT)
        vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type);
      else
        x->fwd_txm8x8(src_diff, coeff, bw * 8);
      vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
                     p->quant_shift, qcoeff, dqcoeff,
                     pd->dequant, p->zbin_extra, eob, scan, iscan);
      if (!x->skip_encode && *eob) {
        if (tx_type == DCT_DCT)
          inverse_transform_b_8x8_add(*eob, dqcoeff, dst, pd->dst.stride);
        else
          vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
      }
      break;
    case TX_4X4:
      tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
      scan = get_scan_4x4(tx_type);
      iscan = get_iscan_4x4(tx_type);
      // Sub-8x8 luma blocks carry a per-4x4 mode in bmi.
      if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
        mode = xd->this_mi->bmi[block].as_mode;
      else
        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;

      xoff = 4 * (block & twmask);
      yoff = 4 * (block >> twl);
      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
      src = p->src.buf + yoff * p->src.stride + xoff;
      src_diff = p->src_diff + 4 * bw * yoff + xoff;
      vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
                              dst, pd->dst.stride, dst, pd->dst.stride);
      vp9_subtract_block(4, 4, src_diff, bw * 4,
                         src, p->src.stride, dst, pd->dst.stride);
      if (tx_type != DCT_DCT)
        vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
      else
        x->fwd_txm4x4(src_diff, coeff, bw * 8);
      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
                     p->quant_shift, qcoeff, dqcoeff,
                     pd->dequant, p->zbin_extra, eob, scan, iscan);
      if (!x->skip_encode && *eob) {
        if (tx_type == DCT_DCT)
          // this is like vp9_short_idct4x4 but has a special case around eob<=1
          // which is significant (not just an optimization) for the lossless
          // case.
          inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
        else
          vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
      }
      break;
    default:
      assert(0);
  }
}
    716 
// Intra-encodes every luma transform block of the superblock.
void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) {
  MACROBLOCKD* const xd = &x->e_mbd;
  struct optimize_ctx ctx;
  struct encode_b_args arg = {x, &ctx};

  foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra,
                                     &arg);
}
// Intra-encodes every chroma transform block of the superblock.
void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) {
  MACROBLOCKD* const xd = &x->e_mbd;
  struct optimize_ctx ctx;
  struct encode_b_args arg = {x, &ctx};
  foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg);
}
    731 
    732