// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Frame-reconstruction function. Memory allocation.
//
// Author: Skal (pascal.massimino (at) gmail.com)

#include <stdlib.h>
#include "./vp8i.h"
#include "../utils/utils.h"

//------------------------------------------------------------------------------
// Main reconstruction function.

// Offset (in samples, for a buffer of stride BPS) of the top-left sample of
// each of the sixteen 4x4 sub-blocks of a 16x16 luma block, in raster order.
static const int kScan[16] = {
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
};

// Returns the prediction-function index to use for 'mode' at macroblock
// position (mb_x, mb_y). Only B_DC_PRED is remapped: on the left column
// and/or the top row it is replaced by the NOLEFT / NOTOP / NOTOPLEFT
// variant. Any other mode is returned unchanged.
static int CheckMode(int mb_x, int mb_y, int mode) {
  if (mode == B_DC_PRED) {
    if (mb_x == 0) {
      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
    } else {
      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
    }
  }
  return mode;
}

// Copies 4 bytes from 'src' to 'dst' (regions are not expected to overlap).
static void Copy32b(uint8_t* const dst, const uint8_t* const src) {
  memcpy(dst, src, 4);
}

// Adds the inverse transform of the 4x4 coefficient block 'src' to 'dst'.
// The variant is selected by the two most significant bits of 'bits':
//   3 -> VP8Transform (called with 0 as last argument),
//   2 -> VP8TransformAC3, 1 -> VP8TransformDC, 0 -> nothing is done.
// NOTE(review): presumably these two bits classify how many coefficients of
// the block are non-zero; confirm against the code filling non_zero_y_.
static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
                                    uint8_t* const dst) {
  switch (bits >> 30) {
    case 3:
      VP8Transform(src, dst, 0);
      break;
    case 2:
      VP8TransformAC3(src, dst);
      break;
    case 1:
      VP8TransformDC(src, dst);
      break;
    default:
      break;
  }
}

// Chroma counterpart of DoTransform(). Only the low 8 bits of 'bits' are
// examined: nothing is done when they are all zero, the full VP8TransformUV
// is used when any bit of the 0xaa mask is set, VP8TransformDCUV otherwise.
static void DoUVTransform(uint32_t bits, const int16_t* const src,
                          uint8_t* const dst) {
  if (bits & 0xff) {    // any non-zero coeff at all?
    if (bits & 0xaa) {  // any non-zero AC coefficient?
      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
    } else {
      VP8TransformDCUV(src, dst);
    }
  }
}

// Reconstructs (prediction + residuals) every macroblock of row ctx->mb_y_.
// Each macroblock is built inside the dec->yuv_b_ work area, whose left and
// top borders hold the neighbouring samples, and is then copied to the row
// cache (dec->cache_y_/u_/v_) at the cache line selected by ctx->id_.
// The bottom samples of each macroblock are saved in dec->yuv_t_ so they can
// serve as top samples for the next macroblock row.
static void ReconstructRow(const VP8Decoder* const dec,
                           const VP8ThreadContext* ctx) {
  int j;
  int mb_x;
  const int mb_y = ctx->mb_y_;
  const int cache_id = ctx->id_;
  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;

  // Initialize left-most block: the column just left of each plane is set
  // to 129.
  for (j = 0; j < 16; ++j) {
    y_dst[j * BPS - 1] = 129;
  }
  for (j = 0; j < 8; ++j) {
    u_dst[j * BPS - 1] = 129;
    v_dst[j * BPS - 1] = 129;
  }

  // Init top-left sample on left column too.
  if (mb_y > 0) {
    y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
  } else {
    // we only need to do this init once at block (0,0).
    // Afterward, it remains valid for the whole topmost row.
    // Luma: 1 top-left + 16 top + 4 top-right samples. Chroma: 1 + 8.
    memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
    memset(u_dst - BPS - 1, 127, 8 + 1);
    memset(v_dst - BPS - 1, 127, 8 + 1);
  }

  // Reconstruct one row.
  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
    const VP8MBData* const block = ctx->mb_data_ + mb_x;

    // Rotate in the left samples from previously decoded block. We move four
    // pixels at a time for alignment reason, and because of in-loop filter.
    // (j starts at -1 so that the row of top samples is rotated as well.)
    if (mb_x > 0) {
      for (j = -1; j < 16; ++j) {
        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
      }
      for (j = -1; j < 8; ++j) {
        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
      }
    }
    {
      // bring top samples into the cache
      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
      const int16_t* const coeffs = block->coeffs_;
      uint32_t bits = block->non_zero_y_;
      int n;

      if (mb_y > 0) {
        memcpy(y_dst - BPS, top_yuv[0].y, 16);
        memcpy(u_dst - BPS, top_yuv[0].u, 8);
        memcpy(v_dst - BPS, top_yuv[0].v, 8);
      }

      // predict and add residuals
      if (block->is_i4x4_) {   // 4x4
        // The four samples located above and to the right of the macroblock,
        // accessed as one 32-bit word. Indexing this pointer by BPS therefore
        // moves down by four sample rows.
        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);

        if (mb_y > 0) {
          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
            // no macroblock to the right: repeat the last top sample.
            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
          } else {
            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
          }
        }
        // replicate the top-right pixels below
        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];

        // predict and add residuals for all 4x4 blocks in turn.
        // 'bits' is shifted by two at each step, so that DoTransform() always
        // finds the bits of the current sub-block in the top two positions.
        for (n = 0; n < 16; ++n, bits <<= 2) {
          uint8_t* const dst = y_dst + kScan[n];
          VP8PredLuma4[block->imodes_[n]](dst);
          DoTransform(bits, coeffs + n * 16, dst);
        }
      } else {    // 16x16
        const int pred_func = CheckMode(mb_x, mb_y, block->imodes_[0]);
        VP8PredLuma16[pred_func](y_dst);
        if (bits != 0) {
          for (n = 0; n < 16; ++n, bits <<= 2) {
            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
          }
        }
      }
      {
        // Chroma
        // U coefficients start at block #16, V coefficients at block #20.
        // The low 8 bits of non_zero_uv_ are used for U, the next 8 for V.
        const uint32_t bits_uv = block->non_zero_uv_;
        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
        VP8PredChroma8[pred_func](u_dst);
        VP8PredChroma8[pred_func](v_dst);
        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
      }

      // stash away top samples for next block
      // (not needed for the last macroblock row).
      if (mb_y < dec->mb_h_ - 1) {
        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
      }
    }
    // Transfer reconstructed samples from yuv_b_ cache to final destination.
    {
      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
      for (j = 0; j < 16; ++j) {
        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
      }
      for (j = 0; j < 8; ++j) {
        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
      }
    }
  }
}

//------------------------------------------------------------------------------
// Filtering

// kFilterExtraRows[] = How many extra lines are needed on the MB boundary
// for caching, given a filtering level (index is dec->filter_type_).
// Simple filter:  up to 2 luma samples are read and 1 is written.
// Complex filter: up to 4 luma samples are read and 3 are written. Same for
//                 U/V, so it's 8 samples total (because of the 2x upsampling).
static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };

// Applies the in-loop filter to the macroblock at column 'mb_x' of the cache
// line dec->thread_ctx_.id_. 'mb_y' is only used to skip the filtering of the
// top macroblock edge on the first row (likewise, the left edge is skipped
// when mb_x == 0). Nothing is done when the macroblock's f_limit_ is zero.
// For each plane, horizontal filtering is applied before vertical filtering.
static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int cache_id = ctx->id_;
  const int y_bps = dec->cache_y_stride_;
  const VP8FInfo* const f_info = ctx->f_info_ + mb_x;
  uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
  const int ilevel = f_info->f_ilevel_;
  const int limit = f_info->f_limit_;
  if (limit == 0) {
    return;
  }
  // NOTE(review): PrecomputeFilterStrengths() produces non-zero limits as
  // 2 * level + ilevel with level >= 1 and ilevel >= 1, hence >= 3. This
  // presumes f_info_ entries are copies of those values; confirm in parser.
  assert(limit >= 3);
  if (dec->filter_type_ == 1) {   // simple
    // the simple filter only touches the luma plane.
    if (mb_x > 0) {
      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
    }
    if (f_info->f_inner_) {
      VP8SimpleHFilter16i(y_dst, y_bps, limit);
    }
    if (mb_y > 0) {
      VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
    }
    if (f_info->f_inner_) {
      VP8SimpleVFilter16i(y_dst, y_bps, limit);
    }
  } else {    // complex
    const int uv_bps = dec->cache_uv_stride_;
    uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
    uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
    const int hev_thresh = f_info->hev_thresh_;
    if (mb_x > 0) {
      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    }
    if (f_info->f_inner_) {
      VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
      VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    }
    if (mb_y > 0) {
      VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    }
    if (f_info->f_inner_) {
      VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
      VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    }
  }
}

// Filter the decoded macroblock row (if needed)
// Only the macroblocks in the column range [tl_mb_x_, br_mb_x_) are filtered.
static void FilterRow(const VP8Decoder* const dec) {
  int mb_x;
  const int mb_y = dec->thread_ctx_.mb_y_;
  assert(dec->thread_ctx_.filter_row_);
  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
    DoFilter(dec, mb_x, mb_y);
  }
}

//------------------------------------------------------------------------------
// Precompute the filtering strength for each segment and each i4x4/i16x16 mode.

// Fills dec->fstrengths_[segment][is_i4x4] when filtering is enabled
// (dec->filter_type_ > 0). Does nothing otherwise.
static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
  if (dec->filter_type_ > 0) {
    int s;
    const VP8FilterHeader* const hdr = &dec->filter_hdr_;
    for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
      int i4x4;
      // First, compute the initial level
      int base_level;
      if (dec->segment_hdr_.use_segment_) {
        base_level = dec->segment_hdr_.filter_strength_[s];
        if (!dec->segment_hdr_.absolute_delta_) {
          // the per-segment value is a delta over the frame level.
          base_level += hdr->level_;
        }
      } else {
        base_level = hdr->level_;
      }
      for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
        VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
        int level = base_level;
        if (hdr->use_lf_delta_) {
          level += hdr->ref_lf_delta_[0];
          if (i4x4) {
            level += hdr->mode_lf_delta_[0];
          }
        }
        // clamp to [0, 63]
        level = (level < 0) ? 0 : (level > 63) ? 63 : level;
        if (level > 0) {
          // the interior level is reduced according to the sharpness setting,
          // but is never less than 1.
          int ilevel = level;
          if (hdr->sharpness_ > 0) {
            if (hdr->sharpness_ > 4) {
              ilevel >>= 2;
            } else {
              ilevel >>= 1;
            }
            if (ilevel > 9 - hdr->sharpness_) {
              ilevel = 9 - hdr->sharpness_;
            }
          }
          if (ilevel < 1) ilevel = 1;
          info->f_ilevel_ = ilevel;
          info->f_limit_ = 2 * level + ilevel;
          info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
        } else {
          // f_ilevel_ and hev_thresh_ are left untouched in this case.
          info->f_limit_ = 0;  // no filtering
        }
        info->f_inner_ = i4x4;
      }
    }
  }
}

//------------------------------------------------------------------------------
// Dithering

#define DITHER_AMP_TAB_SIZE 12
// Dithering amplitude, indexed by the segment's uv_quant_ value (negative
// values use entry #0). It is scaled by the user strength and divided by 8
// in VP8InitDithering().
static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
  // roughly, it's dqm->uv_mat_[1]
  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
};

// Sets up the dithering parameters of 'dec' from 'options' ('dec' must not be
// NULL). Nothing is done if 'options' is NULL.
//  * options->dithering_strength is mapped from [0, 100] (values outside are
//    clamped) to a factor in [0, (1 << VP8_RANDOM_DITHER_FIX) - 1]. If the
//    factor is non-zero, the amplitude dqm->dither_ of each segment with
//    uv_quant_ < DITHER_AMP_TAB_SIZE is recomputed. dec->dither_ is set to 1
//    and the random generator initialized if any segment amplitude is
//    non-zero.
//  * options->alpha_dithering_strength is clamped to [0, 100] and stored in
//    dec->alpha_dithering_.
// NOTE(review): segments with uv_quant_ >= DITHER_AMP_TAB_SIZE keep their
// previous dither_ value, and dec->dither_ is never reset to 0 here; both
// rely on the caller-side initial state.
void VP8InitDithering(const WebPDecoderOptions* const options,
                      VP8Decoder* const dec) {
  assert(dec != NULL);
  if (options != NULL) {
    const int d = options->dithering_strength;
    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
    if (f > 0) {
      int s;
      int all_amp = 0;   // OR of all segment amplitudes
      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
        VP8QuantMatrix* const dqm = &dec->dqm_[s];
        if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
          // TODO(skal): should we specially dither more for uv_quant_ < 0?
          const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
          dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
        }
        all_amp |= dqm->dither_;
      }
      if (all_amp != 0) {
        VP8InitRandom(&dec->dithering_rg_, 1.0f);
        dec->dither_ = 1;
      }
    }
    // potentially allow alpha dithering
    dec->alpha_dithering_ = options->alpha_dithering_strength;
    if (dec->alpha_dithering_ > 100) {
      dec->alpha_dithering_ = 100;
    } else if (dec->alpha_dithering_ < 0) {
      dec->alpha_dithering_ = 0;
    }
  }
}

// minimal amp that will provide a non-zero dithering effect
#define MIN_DITHER_AMP 4
#define DITHER_DESCALE 4
#define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
#define DITHER_AMP_BITS 8
#define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)

// Adds a pseudo-random offset, drawn from 'rg' with amplitude 'amp', to each
// sample of the 8x8 block starting at 'dst' (row stride: 'bps'). The result
// is clamped to [0, 255].
static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
  int i, j;
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) {
      // TODO: could be made faster with SSE2
      const int bits =
          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
      const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
      const int v = (int)dst[i] + delta;
      dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;
    }
    dst += bps;
  }
}

// Dithers the U and V samples of the current cache line, for each macroblock
// in the column range [tl_mb_x_, br_mb_x_) whose dither_ amplitude is at
// least MIN_DITHER_AMP. The luma plane is not modified.
static void DitherRow(VP8Decoder* const dec) {
  int mb_x;
  assert(dec->dither_);
  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
    const VP8MBData* const data = ctx->mb_data_ + mb_x;
    const int cache_id = ctx->id_;
    const int uv_bps = dec->cache_uv_stride_;
    if (data->dither_ >= MIN_DITHER_AMP) {
      uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
      uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
      Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);
      Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);
    }
  }
}

//------------------------------------------------------------------------------
// This function is called after a row of macroblocks is finished decoding.
// It also takes into account the following restrictions:
//  * In case of in-loop filtering, we must hold off sending some of the bottom
//    pixels as they are yet unfiltered. They will be when the next macroblock
//    row is decoded. Meanwhile, we must preserve them by rotating them in the
//    cache area. This doesn't hold for the very bottom row of the uncropped
//    picture of course.
//  * we must clip the remaining pixels against the cropping area. The VP8Io
//    struct must have the following fields set correctly before calling put():
//    y, u, v (and a, when alpha data is present), mb_y, mb_w and mb_h.

#define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB

// Finalize and transmit a complete row. Return false in case of user-abort.
413 static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { 414 int ok = 1; 415 const VP8ThreadContext* const ctx = &dec->thread_ctx_; 416 const int cache_id = ctx->id_; 417 const int extra_y_rows = kFilterExtraRows[dec->filter_type_]; 418 const int ysize = extra_y_rows * dec->cache_y_stride_; 419 const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_; 420 const int y_offset = cache_id * 16 * dec->cache_y_stride_; 421 const int uv_offset = cache_id * 8 * dec->cache_uv_stride_; 422 uint8_t* const ydst = dec->cache_y_ - ysize + y_offset; 423 uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset; 424 uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset; 425 const int mb_y = ctx->mb_y_; 426 const int is_first_row = (mb_y == 0); 427 const int is_last_row = (mb_y >= dec->br_mb_y_ - 1); 428 429 if (dec->mt_method_ == 2) { 430 ReconstructRow(dec, ctx); 431 } 432 433 if (ctx->filter_row_) { 434 FilterRow(dec); 435 } 436 437 if (dec->dither_) { 438 DitherRow(dec); 439 } 440 441 if (io->put != NULL) { 442 int y_start = MACROBLOCK_VPOS(mb_y); 443 int y_end = MACROBLOCK_VPOS(mb_y + 1); 444 if (!is_first_row) { 445 y_start -= extra_y_rows; 446 io->y = ydst; 447 io->u = udst; 448 io->v = vdst; 449 } else { 450 io->y = dec->cache_y_ + y_offset; 451 io->u = dec->cache_u_ + uv_offset; 452 io->v = dec->cache_v_ + uv_offset; 453 } 454 455 if (!is_last_row) { 456 y_end -= extra_y_rows; 457 } 458 if (y_end > io->crop_bottom) { 459 y_end = io->crop_bottom; // make sure we don't overflow on last row. 460 } 461 io->a = NULL; 462 if (dec->alpha_data_ != NULL && y_start < y_end) { 463 // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a 464 // good idea. 
465 io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start); 466 if (io->a == NULL) { 467 return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, 468 "Could not decode alpha data."); 469 } 470 } 471 if (y_start < io->crop_top) { 472 const int delta_y = io->crop_top - y_start; 473 y_start = io->crop_top; 474 assert(!(delta_y & 1)); 475 io->y += dec->cache_y_stride_ * delta_y; 476 io->u += dec->cache_uv_stride_ * (delta_y >> 1); 477 io->v += dec->cache_uv_stride_ * (delta_y >> 1); 478 if (io->a != NULL) { 479 io->a += io->width * delta_y; 480 } 481 } 482 if (y_start < y_end) { 483 io->y += io->crop_left; 484 io->u += io->crop_left >> 1; 485 io->v += io->crop_left >> 1; 486 if (io->a != NULL) { 487 io->a += io->crop_left; 488 } 489 io->mb_y = y_start - io->crop_top; 490 io->mb_w = io->crop_right - io->crop_left; 491 io->mb_h = y_end - y_start; 492 ok = io->put(io); 493 } 494 } 495 // rotate top samples if needed 496 if (cache_id + 1 == dec->num_caches_) { 497 if (!is_last_row) { 498 memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize); 499 memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize); 500 memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize); 501 } 502 } 503 504 return ok; 505 } 506 507 #undef MACROBLOCK_VPOS 508 509 //------------------------------------------------------------------------------ 510 511 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) { 512 int ok = 1; 513 VP8ThreadContext* const ctx = &dec->thread_ctx_; 514 const int filter_row = 515 (dec->filter_type_ > 0) && 516 (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_); 517 if (dec->mt_method_ == 0) { 518 // ctx->id_ and ctx->f_info_ are already set 519 ctx->mb_y_ = dec->mb_y_; 520 ctx->filter_row_ = filter_row; 521 ReconstructRow(dec, ctx); 522 ok = FinishRow(dec, io); 523 } else { 524 WebPWorker* const worker = &dec->worker_; 525 // Finish previous job *before* updating context 526 ok &= 
WebPGetWorkerInterface()->Sync(worker); 527 assert(worker->status_ == OK); 528 if (ok) { // spawn a new deblocking/output job 529 ctx->io_ = *io; 530 ctx->id_ = dec->cache_id_; 531 ctx->mb_y_ = dec->mb_y_; 532 ctx->filter_row_ = filter_row; 533 if (dec->mt_method_ == 2) { // swap macroblock data 534 VP8MBData* const tmp = ctx->mb_data_; 535 ctx->mb_data_ = dec->mb_data_; 536 dec->mb_data_ = tmp; 537 } else { 538 // perform reconstruction directly in main thread 539 ReconstructRow(dec, ctx); 540 } 541 if (filter_row) { // swap filter info 542 VP8FInfo* const tmp = ctx->f_info_; 543 ctx->f_info_ = dec->f_info_; 544 dec->f_info_ = tmp; 545 } 546 // (reconstruct)+filter in parallel 547 WebPGetWorkerInterface()->Launch(worker); 548 if (++dec->cache_id_ == dec->num_caches_) { 549 dec->cache_id_ = 0; 550 } 551 } 552 } 553 return ok; 554 } 555 556 //------------------------------------------------------------------------------ 557 // Finish setting up the decoding parameter once user's setup() is called. 558 559 VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) { 560 // Call setup() first. This may trigger additional decoding features on 'io'. 561 // Note: Afterward, we must call teardown() no matter what. 562 if (io->setup != NULL && !io->setup(io)) { 563 VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed"); 564 return dec->status_; 565 } 566 567 // Disable filtering per user request 568 if (io->bypass_filtering) { 569 dec->filter_type_ = 0; 570 } 571 // TODO(skal): filter type / strength / sharpness forcing 572 573 // Define the area where we can skip in-loop filtering, in case of cropping. 574 // 575 // 'Simple' filter reads two luma samples outside of the macroblock 576 // and filters one. It doesn't filter the chroma samples. Hence, we can 577 // avoid doing the in-loop filtering before crop_top/crop_left position. 578 // For the 'Complex' filter, 3 samples are read and up to 3 are filtered. 
579 // Means: there's a dependency chain that goes all the way up to the 580 // top-left corner of the picture (MB #0). We must filter all the previous 581 // macroblocks. 582 // TODO(skal): add an 'approximate_decoding' option, that won't produce 583 // a 1:1 bit-exactness for complex filtering? 584 { 585 const int extra_pixels = kFilterExtraRows[dec->filter_type_]; 586 if (dec->filter_type_ == 2) { 587 // For complex filter, we need to preserve the dependency chain. 588 dec->tl_mb_x_ = 0; 589 dec->tl_mb_y_ = 0; 590 } else { 591 // For simple filter, we can filter only the cropped region. 592 // We include 'extra_pixels' on the other side of the boundary, since 593 // vertical or horizontal filtering of the previous macroblock can 594 // modify some abutting pixels. 595 dec->tl_mb_x_ = (io->crop_left - extra_pixels) >> 4; 596 dec->tl_mb_y_ = (io->crop_top - extra_pixels) >> 4; 597 if (dec->tl_mb_x_ < 0) dec->tl_mb_x_ = 0; 598 if (dec->tl_mb_y_ < 0) dec->tl_mb_y_ = 0; 599 } 600 // We need some 'extra' pixels on the right/bottom. 601 dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4; 602 dec->br_mb_x_ = (io->crop_right + 15 + extra_pixels) >> 4; 603 if (dec->br_mb_x_ > dec->mb_w_) { 604 dec->br_mb_x_ = dec->mb_w_; 605 } 606 if (dec->br_mb_y_ > dec->mb_h_) { 607 dec->br_mb_y_ = dec->mb_h_; 608 } 609 } 610 PrecomputeFilterStrengths(dec); 611 return VP8_STATUS_OK; 612 } 613 614 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) { 615 int ok = 1; 616 if (dec->mt_method_ > 0) { 617 ok = WebPGetWorkerInterface()->Sync(&dec->worker_); 618 } 619 620 if (io->teardown != NULL) { 621 io->teardown(io); 622 } 623 return ok; 624 } 625 626 //------------------------------------------------------------------------------ 627 // For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line. 
628 // 629 // Reason is: the deblocking filter cannot deblock the bottom horizontal edges 630 // immediately, and needs to wait for first few rows of the next macroblock to 631 // be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending 632 // on strength). 633 // With two threads, the vertical positions of the rows being decoded are: 634 // Decode: [ 0..15][16..31][32..47][48..63][64..79][... 635 // Deblock: [ 0..11][12..27][28..43][44..59][... 636 // If we use two threads and two caches of 16 pixels, the sequence would be: 637 // Decode: [ 0..15][16..31][ 0..15!!][16..31][ 0..15][... 638 // Deblock: [ 0..11][12..27!!][-4..11][12..27][... 639 // The problem occurs during row [12..15!!] that both the decoding and 640 // deblocking threads are writing simultaneously. 641 // With 3 cache lines, one get a safe write pattern: 642 // Decode: [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0.. 643 // Deblock: [ 0..11][12..27][28..43][-4..11][12..27][28... 644 // Note that multi-threaded output _without_ deblocking can make use of two 645 // cache lines of 16 pixels only, since there's no lagging behind. The decoding 646 // and output process have non-concurrent writing: 647 // Decode: [ 0..15][16..31][ 0..15][16..31][... 648 // io->put: [ 0..15][16..31][ 0..15][... 649 650 #define MT_CACHE_LINES 3 651 #define ST_CACHE_LINES 1 // 1 cache row only for single-threaded case 652 653 // Initialize multi/single-thread worker 654 static int InitThreadContext(VP8Decoder* const dec) { 655 dec->cache_id_ = 0; 656 if (dec->mt_method_ > 0) { 657 WebPWorker* const worker = &dec->worker_; 658 if (!WebPGetWorkerInterface()->Reset(worker)) { 659 return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY, 660 "thread initialization failed."); 661 } 662 worker->data1 = dec; 663 worker->data2 = (void*)&dec->thread_ctx_.io_; 664 worker->hook = (WebPWorkerHook)FinishRow; 665 dec->num_caches_ = 666 (dec->filter_type_ > 0) ? 
MT_CACHE_LINES : MT_CACHE_LINES - 1; 667 } else { 668 dec->num_caches_ = ST_CACHE_LINES; 669 } 670 return 1; 671 } 672 673 int VP8GetThreadMethod(const WebPDecoderOptions* const options, 674 const WebPHeaderStructure* const headers, 675 int width, int height) { 676 if (options == NULL || options->use_threads == 0) { 677 return 0; 678 } 679 (void)headers; 680 (void)width; 681 (void)height; 682 assert(headers == NULL || !headers->is_lossless); 683 #if defined(WEBP_USE_THREAD) 684 if (width < MIN_WIDTH_FOR_THREADS) return 0; 685 // TODO(skal): tune the heuristic further 686 #if 0 687 if (height < 2 * width) return 2; 688 #endif 689 return 2; 690 #else // !WEBP_USE_THREAD 691 return 0; 692 #endif 693 } 694 695 #undef MT_CACHE_LINES 696 #undef ST_CACHE_LINES 697 698 //------------------------------------------------------------------------------ 699 // Memory setup 700 701 static int AllocateMemory(VP8Decoder* const dec) { 702 const int num_caches = dec->num_caches_; 703 const int mb_w = dec->mb_w_; 704 // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise. 705 const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t); 706 const size_t top_size = sizeof(VP8TopSamples) * mb_w; 707 const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB); 708 const size_t f_info_size = 709 (dec->filter_type_ > 0) ? 710 mb_w * (dec->mt_method_ > 0 ? 2 : 1) * sizeof(VP8FInfo) 711 : 0; 712 const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_); 713 const size_t mb_data_size = 714 (dec->mt_method_ == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data_); 715 const size_t cache_height = (16 * num_caches 716 + kFilterExtraRows[dec->filter_type_]) * 3 / 2; 717 const size_t cache_size = top_size * cache_height; 718 // alpha_size is the only one that scales as width x height. 719 const uint64_t alpha_size = (dec->alpha_data_ != NULL) ? 
720 (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL; 721 const uint64_t needed = (uint64_t)intra_pred_mode_size 722 + top_size + mb_info_size + f_info_size 723 + yuv_size + mb_data_size 724 + cache_size + alpha_size + WEBP_ALIGN_CST; 725 uint8_t* mem; 726 727 if (needed != (size_t)needed) return 0; // check for overflow 728 if (needed > dec->mem_size_) { 729 WebPSafeFree(dec->mem_); 730 dec->mem_size_ = 0; 731 dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t)); 732 if (dec->mem_ == NULL) { 733 return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY, 734 "no memory during frame initialization."); 735 } 736 // down-cast is ok, thanks to WebPSafeAlloc() above. 737 dec->mem_size_ = (size_t)needed; 738 } 739 740 mem = (uint8_t*)dec->mem_; 741 dec->intra_t_ = (uint8_t*)mem; 742 mem += intra_pred_mode_size; 743 744 dec->yuv_t_ = (VP8TopSamples*)mem; 745 mem += top_size; 746 747 dec->mb_info_ = ((VP8MB*)mem) + 1; 748 mem += mb_info_size; 749 750 dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL; 751 mem += f_info_size; 752 dec->thread_ctx_.id_ = 0; 753 dec->thread_ctx_.f_info_ = dec->f_info_; 754 if (dec->mt_method_ > 0) { 755 // secondary cache line. The deblocking process need to make use of the 756 // filtering strength from previous macroblock row, while the new ones 757 // are being decoded in parallel. We'll just swap the pointers. 
758 dec->thread_ctx_.f_info_ += mb_w; 759 } 760 761 mem = (uint8_t*)WEBP_ALIGN(mem); 762 assert((yuv_size & WEBP_ALIGN_CST) == 0); 763 dec->yuv_b_ = (uint8_t*)mem; 764 mem += yuv_size; 765 766 dec->mb_data_ = (VP8MBData*)mem; 767 dec->thread_ctx_.mb_data_ = (VP8MBData*)mem; 768 if (dec->mt_method_ == 2) { 769 dec->thread_ctx_.mb_data_ += mb_w; 770 } 771 mem += mb_data_size; 772 773 dec->cache_y_stride_ = 16 * mb_w; 774 dec->cache_uv_stride_ = 8 * mb_w; 775 { 776 const int extra_rows = kFilterExtraRows[dec->filter_type_]; 777 const int extra_y = extra_rows * dec->cache_y_stride_; 778 const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_; 779 dec->cache_y_ = ((uint8_t*)mem) + extra_y; 780 dec->cache_u_ = dec->cache_y_ 781 + 16 * num_caches * dec->cache_y_stride_ + extra_uv; 782 dec->cache_v_ = dec->cache_u_ 783 + 8 * num_caches * dec->cache_uv_stride_ + extra_uv; 784 dec->cache_id_ = 0; 785 } 786 mem += cache_size; 787 788 // alpha plane 789 dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL; 790 mem += alpha_size; 791 assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_); 792 793 // note: left/top-info is initialized once for all. 794 memset(dec->mb_info_ - 1, 0, mb_info_size); 795 VP8InitScanline(dec); // initialize left too. 796 797 // initialize top 798 memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size); 799 800 return 1; 801 } 802 803 static void InitIo(VP8Decoder* const dec, VP8Io* io) { 804 // prepare 'io' 805 io->mb_y = 0; 806 io->y = dec->cache_y_; 807 io->u = dec->cache_u_; 808 io->v = dec->cache_v_; 809 io->y_stride = dec->cache_y_stride_; 810 io->uv_stride = dec->cache_uv_stride_; 811 io->a = NULL; 812 } 813 814 int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io) { 815 if (!InitThreadContext(dec)) return 0; // call first. Sets dec->num_caches_. 816 if (!AllocateMemory(dec)) return 0; 817 InitIo(dec, io); 818 VP8DspInit(); // Init critical function pointers and look-up tables. 
819 return 1; 820 } 821 822 //------------------------------------------------------------------------------ 823