/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <limits.h>
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_loopfilter.h"

#if CONFIG_MULTITHREAD
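// Lock a mutex by spinning on trylock a bounded number of times before
// falling back to a blocking lock. The loopfilter's critical sections are
// short, so in the common case this acquires the lock without putting the
// thread to sleep.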
static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
  const int kMaxTryLocks = 4000;
  int locked = 0;
  int i;

  for (i = 0; i < kMaxTryLocks; ++i) {
    if (!pthread_mutex_trylock(mutex)) {
      locked = 1;
      break;
    }
  }

  if (!locked) pthread_mutex_lock(mutex);
}
#endif  // CONFIG_MULTITHREAD

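// Wait until the dependency on the row above is satisfied: a thread filtering
// superblock column c of row r blocks until row r - 1 has been filtered at
// least nsync superblock columns ahead. nsync (sync_range) is a power of two
// (see get_sync_range()), so the (c & (nsync - 1)) test below checks whether
// c is a multiple of nsync.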
static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
#if CONFIG_MULTITHREAD
  const int nsync = lf_sync->sync_range;

  if (r && !(c & (nsync - 1))) {
    pthread_mutex_t *const mutex = &lf_sync->mutex[r - 1];
    mutex_lock(mutex);

    while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
      pthread_cond_wait(&lf_sync->cond[r - 1], mutex);
    }
    pthread_mutex_unlock(mutex);
  }
#else
  (void)lf_sync;
  (void)r;
  (void)c;
#endif  // CONFIG_MULTITHREAD
}

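// Publish this row's progress and wake any thread blocked in sync_read() on
// row r. Progress is only published every nsync columns (and once at the end
// of the row) to limit locking overhead.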
static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
                              const int sb_cols) {
#if CONFIG_MULTITHREAD
  const int nsync = lf_sync->sync_range;
  int cur;
  // Only signal when enough superblocks have been filtered for the next row
  // to run.
  int sig = 1;

  if (c < sb_cols - 1) {
    cur = c;
    if (c % nsync) sig = 0;
  } else {
    cur = sb_cols + nsync;
  }

  if (sig) {
    mutex_lock(&lf_sync->mutex[r]);

    lf_sync->cur_sb_col[r] = cur;

    pthread_cond_signal(&lf_sync->cond[r]);
    pthread_mutex_unlock(&lf_sync->mutex[r]);
  }
#else
  (void)lf_sync;
  (void)r;
  (void)c;
  (void)sb_cols;
#endif  // CONFIG_MULTITHREAD
}

// Row loopfiltering for one worker thread. Superblock rows are assigned to
// workers round-robin: each worker filters rows start, start +
// num_active_workers * MI_BLOCK_SIZE, and so on, using sync_read()/
// sync_write() to stay behind the row above.
static INLINE void thread_loop_filter_rows(
    const YV12_BUFFER_CONFIG *const frame_buffer, VP9_COMMON *const cm,
    struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop,
    int y_only, VP9LfSync *const lf_sync) {
  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
  const int num_active_workers = lf_sync->num_active_workers;
  int mi_row, mi_col;
  enum lf_path path;
  if (y_only)
    path = LF_PATH_444;
  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
    path = LF_PATH_420;
  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
    path = LF_PATH_444;
  else
    path = LF_PATH_SLOW;

  assert(num_active_workers > 0);

  for (mi_row = start; mi_row < stop;
       mi_row += num_active_workers * MI_BLOCK_SIZE) {
    MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
    LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);

    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) {
      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
      int plane;

      sync_read(lf_sync, r, c);

      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

      vp9_adjust_mask(cm, mi_row, mi_col, lfm);

      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
      for (plane = 1; plane < num_planes; ++plane) {
        switch (path) {
          case LF_PATH_420:
            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm);
            break;
          case LF_PATH_444:
            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm);
            break;
          case LF_PATH_SLOW:
            vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
                                          mi_row, mi_col);
            break;
        }
      }

      sync_write(lf_sync, r, c, sb_cols);
    }
  }
}

// Row-based multi-threaded loopfilter hook.
static int loop_filter_row_worker(void *arg1, void *arg2) {
  VP9LfSync *const lf_sync = (VP9LfSync *)arg1;
  LFWorkerData *const lf_data = (LFWorkerData *)arg2;
  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                          lf_data->start, lf_data->stop, lf_data->y_only,
                          lf_sync);
  return 1;
}

static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
                                struct macroblockd_plane planes[MAX_MB_PLANE],
                                int start, int stop, int y_only,
                                VPxWorker *workers, int nworkers,
                                VP9LfSync *lf_sync) {
  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
  // Number of superblock rows.
  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
  const int num_tile_cols = 1 << cm->log2_tile_cols;
  // Limit the number of workers to prevent changes in frame dimensions from
  // causing incorrect sync calculations when sb_rows < threads/tile_cols.
  // Further restrict them by the number of tile columns should the user
  // request more, as this implementation doesn't scale well beyond that.
  const int num_workers = VPXMIN(nworkers, VPXMIN(num_tile_cols, sb_rows));
  int i;

  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
      num_workers > lf_sync->num_workers) {
    vp9_loop_filter_dealloc(lf_sync);
    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
  }
  lf_sync->num_active_workers = num_workers;

  // Initialize cur_sb_col to -1 for all SB rows.
  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

  // Set up loopfilter thread data.
  // The decoder caps num_workers because it has been observed that using
  // more threads on the loopfilter than there are cores will hurt performance
  // on Android. This is because the system will only schedule the tile decode
  // workers on cores equal to the number of tile columns. Then if the decoder
  // tries to use more threads for the loopfilter, it will hurt performance
  // because of contention. If the multithreading code changes in the future
  // then the number of workers used by the loopfilter should be revisited.
  for (i = 0; i < num_workers; ++i) {
    VPxWorker *const worker = &workers[i];
    LFWorkerData *const lf_data = &lf_sync->lfdata[i];

    worker->hook = loop_filter_row_worker;
    worker->data1 = lf_sync;
    worker->data2 = lf_data;

    // Loopfilter data.
    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
    lf_data->start = start + i * MI_BLOCK_SIZE;
    lf_data->stop = stop;
    lf_data->y_only = y_only;

    // Start loopfiltering. The last worker runs on the calling thread so the
    // caller does its share of the work instead of idling.
    if (i == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait until all rows are finished.
  for (i = 0; i < num_workers; ++i) {
    winterface->sync(&workers[i]);
  }
}

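// Multi-threaded loopfilter entry point: compute the mi_row range to filter
// (only a band in the middle of the frame when partial_frame is set),
// initialize the filter levels, and fan the work out to the workers.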
void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
                              struct macroblockd_plane planes[MAX_MB_PLANE],
                              int frame_filter_level, int y_only,
                              int partial_frame, VPxWorker *workers,
                              int num_workers, VP9LfSync *lf_sync) {
  int start_mi_row, end_mi_row, mi_rows_to_filter;

  if (!frame_filter_level) return;

  start_mi_row = 0;
  mi_rows_to_filter = cm->mi_rows;
  if (partial_frame && cm->mi_rows > 8) {
    start_mi_row = cm->mi_rows >> 1;
    start_mi_row &= 0xfffffff8;
    mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
  }
  end_mi_row = start_mi_row + mi_rows_to_filter;
  vp9_loop_filter_frame_init(cm, frame_filter_level);

  loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row, y_only,
                      workers, num_workers, lf_sync);
}

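// Prepare the VP9LfSync structure for loopfiltering that runs interleaved
// with row-based decoding: (re)allocate the sync data if the frame geometry
// or worker count changed, and reset the per-row progress counters.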
void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level,
                     int num_workers) {
  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;

  if (!frame_filter_level) return;

  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
      num_workers > lf_sync->num_workers) {
    vp9_loop_filter_dealloc(lf_sync);
    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
  }

  // Initialize cur_sb_col to -1 for all SB rows.
  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

  lf_sync->corrupted = 0;

  memset(lf_sync->num_tiles_done, 0,
         sizeof(*lf_sync->num_tiles_done) * sb_rows);
  cm->lf_row = 0;
}

// Set up nsync by width.
static INLINE int get_sync_range(int width) {
  // nsync numbers are picked by testing. For example, for 4k
  // video, using 4 gives the best performance.
  // All values are powers of two; sync_read() relies on this.
  if (width < 640)
    return 1;
  else if (width <= 1280)
    return 2;
  else if (width <= 4096)
    return 4;
  else
    return 8;
}

// Allocate memory for lf row synchronization.
void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
                           int width, int num_workers) {
  lf_sync->rows = rows;
#if CONFIG_MULTITHREAD
  {
    int i;

    CHECK_MEM_ERROR(cm, lf_sync->mutex,
                    vpx_malloc(sizeof(*lf_sync->mutex) * rows));
    if (lf_sync->mutex) {
      for (i = 0; i < rows; ++i) {
        pthread_mutex_init(&lf_sync->mutex[i], NULL);
      }
    }

    CHECK_MEM_ERROR(cm, lf_sync->cond,
                    vpx_malloc(sizeof(*lf_sync->cond) * rows));
    if (lf_sync->cond) {
      for (i = 0; i < rows; ++i) {
        pthread_cond_init(&lf_sync->cond[i], NULL);
      }
    }
    pthread_mutex_init(&lf_sync->lf_mutex, NULL);

    CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex,
                    vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows));
    if (lf_sync->recon_done_mutex) {
      for (i = 0; i < rows; ++i) {
        pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL);
      }
    }

    CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond,
                    vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows));
    if (lf_sync->recon_done_cond) {
      for (i = 0; i < rows; ++i) {
        pthread_cond_init(&lf_sync->recon_done_cond[i], NULL);
      }
    }
  }
#endif  // CONFIG_MULTITHREAD

  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
  lf_sync->num_workers = num_workers;
  lf_sync->num_active_workers = lf_sync->num_workers;

  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
                  vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));

  CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done,
                  vpx_malloc(sizeof(*lf_sync->num_tiles_done) *
                             (mi_cols_aligned_to_sb(cm->mi_rows) >>
                              MI_BLOCK_SIZE_LOG2)));

  // Set up nsync.
  lf_sync->sync_range = get_sync_range(width);
}

// Deallocate lf synchronization-related mutexes and data.
void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
  if (lf_sync != NULL) {
#if CONFIG_MULTITHREAD
    int i;

    if (lf_sync->mutex != NULL) {
      for (i = 0; i < lf_sync->rows; ++i) {
        pthread_mutex_destroy(&lf_sync->mutex[i]);
      }
      vpx_free(lf_sync->mutex);
    }
    if (lf_sync->cond != NULL) {
      for (i = 0; i < lf_sync->rows; ++i) {
        pthread_cond_destroy(&lf_sync->cond[i]);
      }
      vpx_free(lf_sync->cond);
    }
    if (lf_sync->recon_done_mutex != NULL) {
      for (i = 0; i < lf_sync->rows; ++i) {
        pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]);
      }
      vpx_free(lf_sync->recon_done_mutex);
    }

    pthread_mutex_destroy(&lf_sync->lf_mutex);
    if (lf_sync->recon_done_cond != NULL) {
      for (i = 0; i < lf_sync->rows; ++i) {
        pthread_cond_destroy(&lf_sync->recon_done_cond[i]);
      }
      vpx_free(lf_sync->recon_done_cond);
    }
#endif  // CONFIG_MULTITHREAD

    vpx_free(lf_sync->lfdata);
    vpx_free(lf_sync->cur_sb_col);
    vpx_free(lf_sync->num_tiles_done);
    // Clear the structure as the source of this call may be a resize, in
    // which case this call will be followed by an _alloc() which may fail.
    vp9_zero(*lf_sync);
  }
}

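// Return the next mi_row to loopfilter, or -1 when no rows remain (or the
// frame is corrupt). Before returning, wait until every tile column has
// finished reconstructing the following superblock row as well, because its
// intra prediction must read unfiltered pixels before the loopfilter
// modifies them.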
static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) {
  int return_val = -1;
  int cur_row;
  const int max_rows = cm->mi_rows;

#if CONFIG_MULTITHREAD
  const int tile_cols = 1 << cm->log2_tile_cols;

  pthread_mutex_lock(&lf_sync->lf_mutex);
  if (cm->lf_row < max_rows) {
    cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;
    return_val = cm->lf_row;
    cm->lf_row += MI_BLOCK_SIZE;
    if (cm->lf_row < max_rows) {
      /* If this is not the last row, make sure the next row is also decoded.
       * This is because intra prediction has to happen before the loop
       * filter. */
      cur_row += 1;
    }
  }
  pthread_mutex_unlock(&lf_sync->lf_mutex);

  if (return_val == -1) return return_val;

  pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]);
  // Re-check the predicate in a loop to guard against spurious wakeups.
  while (lf_sync->num_tiles_done[cur_row] < tile_cols) {
    pthread_cond_wait(&lf_sync->recon_done_cond[cur_row],
                      &lf_sync->recon_done_mutex[cur_row]);
  }
  pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]);
  pthread_mutex_lock(&lf_sync->lf_mutex);
  if (lf_sync->corrupted) {
    int row = return_val >> MI_BLOCK_SIZE_LOG2;
    pthread_mutex_lock(&lf_sync->mutex[row]);
    lf_sync->cur_sb_col[row] = INT_MAX;
    pthread_cond_signal(&lf_sync->cond[row]);
    pthread_mutex_unlock(&lf_sync->mutex[row]);
    return_val = -1;
  }
  pthread_mutex_unlock(&lf_sync->lf_mutex);
#else
  (void)lf_sync;
  if (cm->lf_row < max_rows) {
    cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;
    return_val = cm->lf_row;
    cm->lf_row += MI_BLOCK_SIZE;
    if (cm->lf_row < max_rows) {
      /* If this is not the last row, make sure the next row is also decoded.
       * This is because intra prediction has to happen before the loop
       * filter. */
      cur_row += 1;
    }
  }
#endif  // CONFIG_MULTITHREAD

  return return_val;
}

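// Worker loop for row-based loopfiltering: repeatedly claim the next
// available mi_row and filter it until get_next_row() reports that no rows
// remain.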
void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync) {
  int mi_row;
  VP9_COMMON *cm = lf_data->cm;

  while ((mi_row = get_next_row(cm, lf_sync)) != -1 && mi_row < cm->mi_rows) {
    lf_data->start = mi_row;
    lf_data->stop = mi_row + MI_BLOCK_SIZE;

    thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                            lf_data->start, lf_data->stop, lf_data->y_only,
                            lf_sync);
  }
}

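// Record that one tile column of superblock row 'row' has finished
// reconstruction, propagate any corruption flag, and wake the loopfilter
// thread(s) waiting in get_next_row() once all tile columns are done.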
void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row,
                 int corrupted) {
#if CONFIG_MULTITHREAD
  pthread_mutex_lock(&lf_sync->lf_mutex);
  lf_sync->corrupted |= corrupted;
  pthread_mutex_unlock(&lf_sync->lf_mutex);
  pthread_mutex_lock(&lf_sync->recon_done_mutex[row]);
  lf_sync->num_tiles_done[row] += 1;
  if (num_tiles == lf_sync->num_tiles_done[row]) {
    if (is_last_row) {
      /* The last two rows wait on the last row to be done, so we have to
       * broadcast the signal in this case. */
      pthread_cond_broadcast(&lf_sync->recon_done_cond[row]);
    } else {
      pthread_cond_signal(&lf_sync->recon_done_cond[row]);
    }
  }
  pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]);
#else
  (void)lf_sync;
  (void)num_tiles;
  (void)row;
  (void)is_last_row;
  (void)corrupted;
#endif  // CONFIG_MULTITHREAD
}

// Accumulate frame counts.
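// When is_dec is set (decoder side), the per-band coefficient counts are
// accumulated as well; the encoder only updates coef counts at frame level,
// so they are skipped there.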
void vp9_accumulate_frame_counts(FRAME_COUNTS *accum,
                                 const FRAME_COUNTS *counts, int is_dec) {
  int i, j, k, l, m;

  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
    for (j = 0; j < INTRA_MODES; j++)
      accum->y_mode[i][j] += counts->y_mode[i][j];

  for (i = 0; i < INTRA_MODES; i++)
    for (j = 0; j < INTRA_MODES; j++)
      accum->uv_mode[i][j] += counts->uv_mode[i][j];

  for (i = 0; i < PARTITION_CONTEXTS; i++)
    for (j = 0; j < PARTITION_TYPES; j++)
      accum->partition[i][j] += counts->partition[i][j];

  if (is_dec) {
    int n;
    for (i = 0; i < TX_SIZES; i++)
      for (j = 0; j < PLANE_TYPES; j++)
        for (k = 0; k < REF_TYPES; k++)
          for (l = 0; l < COEF_BANDS; l++)
            for (m = 0; m < COEFF_CONTEXTS; m++) {
              accum->eob_branch[i][j][k][l][m] +=
                  counts->eob_branch[i][j][k][l][m];
              for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
                accum->coef[i][j][k][l][m][n] += counts->coef[i][j][k][l][m][n];
            }
  } else {
    for (i = 0; i < TX_SIZES; i++)
      for (j = 0; j < PLANE_TYPES; j++)
        for (k = 0; k < REF_TYPES; k++)
          for (l = 0; l < COEF_BANDS; l++)
            for (m = 0; m < COEFF_CONTEXTS; m++)
              accum->eob_branch[i][j][k][l][m] +=
                  counts->eob_branch[i][j][k][l][m];
    // In the encoder, coef is only updated at frame level, so there is no
    // need to accumulate it here.
    // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
    //   accum->coef[i][j][k][l][m][n] +=
    //       counts->coef[i][j][k][l][m][n];
  }

  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
    for (j = 0; j < SWITCHABLE_FILTERS; j++)
      accum->switchable_interp[i][j] += counts->switchable_interp[i][j];

  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
    for (j = 0; j < INTER_MODES; j++)
      accum->inter_mode[i][j] += counts->inter_mode[i][j];

  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
    for (j = 0; j < 2; j++)
      accum->intra_inter[i][j] += counts->intra_inter[i][j];

  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
    for (j = 0; j < 2; j++) accum->comp_inter[i][j] += counts->comp_inter[i][j];

  for (i = 0; i < REF_CONTEXTS; i++)
    for (j = 0; j < 2; j++)
      for (k = 0; k < 2; k++)
        accum->single_ref[i][j][k] += counts->single_ref[i][j][k];

  for (i = 0; i < REF_CONTEXTS; i++)
    for (j = 0; j < 2; j++) accum->comp_ref[i][j] += counts->comp_ref[i][j];

  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
    for (j = 0; j < TX_SIZES; j++)
      accum->tx.p32x32[i][j] += counts->tx.p32x32[i][j];

    for (j = 0; j < TX_SIZES - 1; j++)
      accum->tx.p16x16[i][j] += counts->tx.p16x16[i][j];

    for (j = 0; j < TX_SIZES - 2; j++)
      accum->tx.p8x8[i][j] += counts->tx.p8x8[i][j];
  }

  for (i = 0; i < TX_SIZES; i++)
    accum->tx.tx_totals[i] += counts->tx.tx_totals[i];

  for (i = 0; i < SKIP_CONTEXTS; i++)
    for (j = 0; j < 2; j++) accum->skip[i][j] += counts->skip[i][j];

  for (i = 0; i < MV_JOINTS; i++) accum->mv.joints[i] += counts->mv.joints[i];

  for (k = 0; k < 2; k++) {
    nmv_component_counts *const comps = &accum->mv.comps[k];
    const nmv_component_counts *const comps_t = &counts->mv.comps[k];

    for (i = 0; i < 2; i++) {
      comps->sign[i] += comps_t->sign[i];
      comps->class0_hp[i] += comps_t->class0_hp[i];
      comps->hp[i] += comps_t->hp[i];
    }

    for (i = 0; i < MV_CLASSES; i++) comps->classes[i] += comps_t->classes[i];

    for (i = 0; i < CLASS0_SIZE; i++) {
      comps->class0[i] += comps_t->class0[i];
      for (j = 0; j < MV_FP_SIZE; j++)
        comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
    }

    for (i = 0; i < MV_OFFSET_BITS; i++)
      for (j = 0; j < 2; j++) comps->bits[i][j] += comps_t->bits[i][j];

    for (i = 0; i < MV_FP_SIZE; i++) comps->fp[i] += comps_t->fp[i];
  }
}
    593