1 // Copyright 2011 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // Speed-critical encoding functions. 11 // 12 // Author: Skal (pascal.massimino (at) gmail.com) 13 14 #include <assert.h> 15 #include <stdlib.h> // for abs() 16 17 #include "./dsp.h" 18 #include "../enc/vp8i_enc.h" 19 20 static WEBP_INLINE uint8_t clip_8b(int v) { 21 return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255; 22 } 23 24 static WEBP_INLINE int clip_max(int v, int max) { 25 return (v > max) ? max : v; 26 } 27 28 //------------------------------------------------------------------------------ 29 // Compute susceptibility based on DCT-coeff histograms: 30 // the higher, the "easier" the macroblock is to compress. 31 32 const int VP8DspScan[16 + 4 + 4] = { 33 // Luma 34 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, 35 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, 36 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, 37 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS, 38 39 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U 40 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V 41 }; 42 43 // general-purpose util function 44 void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1], 45 VP8Histogram* const histo) { 46 int max_value = 0, last_non_zero = 1; 47 int k; 48 for (k = 0; k <= MAX_COEFF_THRESH; ++k) { 49 const int value = distribution[k]; 50 if (value > 0) { 51 if (value > max_value) max_value = value; 52 last_non_zero = k; 53 } 54 } 55 histo->max_value = max_value; 56 histo->last_non_zero = last_non_zero; 57 } 58 59 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, 60 int start_block, int end_block, 61 VP8Histogram* const histo) { 62 int j; 63 int distribution[MAX_COEFF_THRESH + 1] = { 0 }; 64 for (j = start_block; j < end_block; ++j) { 65 int k; 66 int16_t out[16]; 67 68 VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); 69 70 // Convert coefficients to bin. 71 for (k = 0; k < 16; ++k) { 72 const int v = abs(out[k]) >> 3; 73 const int clipped_value = clip_max(v, MAX_COEFF_THRESH); 74 ++distribution[clipped_value]; 75 } 76 } 77 VP8SetHistogramData(distribution, histo); 78 } 79 80 //------------------------------------------------------------------------------ 81 // run-time tables (~4k) 82 83 static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255] 84 85 // We declare this variable 'volatile' to prevent instruction reordering 86 // and make sure it's set to true _last_ (so as to be thread-safe) 87 static volatile int tables_ok = 0; 88 89 static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) { 90 if (!tables_ok) { 91 int i; 92 for (i = -255; i <= 255 + 255; ++i) { 93 clip1[255 + i] = clip_8b(i); 94 } 95 tables_ok = 1; 96 } 97 } 98 99 100 //------------------------------------------------------------------------------ 101 // Transforms (Paragraph 14.4) 102 103 #define STORE(x, y, v) \ 104 dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) 105 106 static const int kC1 = 20091 + (1 << 16); 107 static const int kC2 = 35468; 108 #define MUL(a, b) (((a) * (b)) >> 16) 109 110 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, 111 uint8_t* dst) { 112 int C[4 * 4], *tmp; 113 int i; 114 tmp = C; 115 for (i = 0; i < 4; ++i) { // vertical pass 116 const int a = in[0] + in[8]; 117 const int b = in[0] - in[8]; 118 const int c = MUL(in[4], kC2) - MUL(in[12], kC1); 119 const int d = MUL(in[4], kC1) + MUL(in[12], kC2); 120 tmp[0] = a + d; 121 tmp[1] = b + c; 122 tmp[2] = b - c; 123 tmp[3] = a - d; 124 tmp += 4; 125 in++; 126 } 127 128 tmp = C; 129 for (i = 0; i < 4; ++i) { // horizontal pass 130 const int dc = tmp[0] + 4; 131 const int a = dc + tmp[8]; 132 const int b = dc - tmp[8]; 133 const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1); 134 const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2); 135 STORE(0, i, a + d); 136 STORE(1, i, b + c); 137 STORE(2, i, b - c); 138 STORE(3, i, a - d); 139 tmp++; 140 } 141 } 142 143 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, 144 int do_two) { 145 ITransformOne(ref, in, dst); 146 if (do_two) { 147 ITransformOne(ref + 4, in + 16, dst + 4); 148 } 149 } 150 151 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { 152 int i; 153 int tmp[16]; 154 for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { 155 const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255]) 156 const int d1 = src[1] - ref[1]; 157 const int d2 = src[2] - ref[2]; 158 const int d3 = src[3] - ref[3]; 159 const int a0 = (d0 + d3); // 10b [-510,510] 160 const int a1 = (d1 + d2); 161 const int a2 = (d1 - d2); 162 const int a3 = (d0 - d3); 163 tmp[0 + i * 4] = (a0 + a1) * 8; // 14b [-8160,8160] 164 tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542] 165 tmp[2 + i * 4] = (a0 - a1) * 8; 166 tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9; 167 } 168 for (i = 0; i < 4; ++i) { 169 const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b 170 const int a1 = (tmp[4 + i] + tmp[ 8 + i]); 171 const int a2 = (tmp[4 + i] - tmp[ 8 + i]); 172 const int a3 = (tmp[0 + i] - tmp[12 + i]); 173 out[0 + i] = (a0 + a1 + 7) >> 4; // 12b 174 out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0); 175 out[8 + i] = (a0 - a1 + 7) >> 4; 176 out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16); 177 } 178 } 179 180 static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { 181 VP8FTransform(src, ref, out); 182 VP8FTransform(src + 4, ref + 4, out + 16); 183 } 184 185 static void FTransformWHT(const int16_t* in, int16_t* out) { 186 // input is 12b signed 187 int32_t tmp[16]; 188 int i; 189 for (i = 0; i < 4; ++i, in += 64) { 190 const int a0 = (in[0 * 16] + in[2 * 16]); // 13b 191 const int a1 = (in[1 * 16] + in[3 * 16]); 192 const int a2 = (in[1 * 16] - in[3 * 16]); 193 const int a3 = (in[0 * 16] - in[2 * 16]); 194 tmp[0 + i * 4] = a0 + a1; // 14b 195 tmp[1 + i * 4] = a3 + a2; 196 tmp[2 + i * 4] = a3 - a2; 197 tmp[3 + i * 4] = a0 - a1; 198 } 199 for (i = 0; i < 4; ++i) { 200 const int a0 = (tmp[0 + i] + tmp[8 + i]); // 15b 201 const int a1 = (tmp[4 + i] + tmp[12+ i]); 202 const int a2 = (tmp[4 + i] - tmp[12+ i]); 203 const int a3 = (tmp[0 + i] - tmp[8 + i]); 204 const int b0 = a0 + a1; // 16b 205 const int b1 = a3 + a2; 206 const int b2 = a3 - a2; 207 const int b3 = a0 - a1; 208 out[ 0 + i] = b0 >> 1; // 15b 209 out[ 4 + i] = b1 >> 1; 210 out[ 8 + i] = b2 >> 1; 211 out[12 + i] = b3 >> 1; 212 } 213 } 214 215 #undef MUL 216 #undef STORE 217 218 //------------------------------------------------------------------------------ 219 // Intra predictions 220 221 static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) { 222 int j; 223 for (j = 0; j < size; ++j) { 224 memset(dst + j * BPS, value, size); 225 } 226 } 227 228 static WEBP_INLINE void VerticalPred(uint8_t* dst, 229 const uint8_t* top, int size) { 230 int j; 231 if (top != NULL) { 232 for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size); 233 } else { 234 Fill(dst, 127, size); 235 } 236 } 237 238 static WEBP_INLINE void HorizontalPred(uint8_t* dst, 239 const uint8_t* left, int size) { 240 if (left != NULL) { 241 int j; 242 for (j = 0; j < size; ++j) { 243 memset(dst + j * BPS, left[j], size); 244 } 245 } else { 246 Fill(dst, 129, size); 247 } 248 } 249 250 static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, 251 const uint8_t* top, int size) { 252 int y; 253 if (left != NULL) { 254 if (top != NULL) { 255 const uint8_t* const clip = clip1 + 255 - left[-1]; 256 for (y = 0; y < size; ++y) { 257 const uint8_t* const clip_table = clip + left[y]; 258 int x; 259 for (x = 0; x < size; ++x) { 260 dst[x] = clip_table[top[x]]; 261 } 262 dst += BPS; 263 } 264 } else { 265 HorizontalPred(dst, left, size); 266 } 267 } else { 268 // true motion without left samples (hence: with default 129 value) 269 // is equivalent to VE prediction where you just copy the top samples. 270 // Note that if top samples are not available, the default value is 271 // then 129, and not 127 as in the VerticalPred case. 272 if (top != NULL) { 273 VerticalPred(dst, top, size); 274 } else { 275 Fill(dst, 129, size); 276 } 277 } 278 } 279 280 static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left, 281 const uint8_t* top, 282 int size, int round, int shift) { 283 int DC = 0; 284 int j; 285 if (top != NULL) { 286 for (j = 0; j < size; ++j) DC += top[j]; 287 if (left != NULL) { // top and left present 288 for (j = 0; j < size; ++j) DC += left[j]; 289 } else { // top, but no left 290 DC += DC; 291 } 292 DC = (DC + round) >> shift; 293 } else if (left != NULL) { // left but no top 294 for (j = 0; j < size; ++j) DC += left[j]; 295 DC += DC; 296 DC = (DC + round) >> shift; 297 } else { // no top, no left, nothing. 298 DC = 0x80; 299 } 300 Fill(dst, DC, size); 301 } 302 303 //------------------------------------------------------------------------------ 304 // Chroma 8x8 prediction (paragraph 12.2) 305 306 static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, 307 const uint8_t* top) { 308 // U block 309 DCMode(C8DC8 + dst, left, top, 8, 8, 4); 310 VerticalPred(C8VE8 + dst, top, 8); 311 HorizontalPred(C8HE8 + dst, left, 8); 312 TrueMotion(C8TM8 + dst, left, top, 8); 313 // V block 314 dst += 8; 315 if (top != NULL) top += 8; 316 if (left != NULL) left += 16; 317 DCMode(C8DC8 + dst, left, top, 8, 8, 4); 318 VerticalPred(C8VE8 + dst, top, 8); 319 HorizontalPred(C8HE8 + dst, left, 8); 320 TrueMotion(C8TM8 + dst, left, top, 8); 321 } 322 323 //------------------------------------------------------------------------------ 324 // luma 16x16 prediction (paragraph 12.3) 325 326 static void Intra16Preds(uint8_t* dst, 327 const uint8_t* left, const uint8_t* top) { 328 DCMode(I16DC16 + dst, left, top, 16, 16, 5); 329 VerticalPred(I16VE16 + dst, top, 16); 330 HorizontalPred(I16HE16 + dst, left, 16); 331 TrueMotion(I16TM16 + dst, left, top, 16); 332 } 333 334 //------------------------------------------------------------------------------ 335 // luma 4x4 prediction 336 337 #define DST(x, y) dst[(x) + (y) * BPS] 338 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2)) 339 #define AVG2(a, b) (((a) + (b) + 1) >> 1) 340 341 static void VE4(uint8_t* dst, const uint8_t* top) { // vertical 342 const uint8_t vals[4] = { 343 AVG3(top[-1], top[0], top[1]), 344 AVG3(top[ 0], top[1], top[2]), 345 AVG3(top[ 1], top[2], top[3]), 346 AVG3(top[ 2], top[3], top[4]) 347 }; 348 int i; 349 for (i = 0; i < 4; ++i) { 350 memcpy(dst + i * BPS, vals, 4); 351 } 352 } 353 354 static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal 355 const int X = top[-1]; 356 const int I = top[-2]; 357 const int J = top[-3]; 358 const int K = top[-4]; 359 const int L = top[-5]; 360 WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J)); 361 WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K)); 362 WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L)); 363 WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); 364 } 365 366 static void DC4(uint8_t* dst, const uint8_t* top) { 367 uint32_t dc = 4; 368 int i; 369 for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; 370 Fill(dst, dc >> 3, 4); 371 } 372 373 static void RD4(uint8_t* dst, const uint8_t* top) { 374 const int X = top[-1]; 375 const int I = top[-2]; 376 const int J = top[-3]; 377 const int K = top[-4]; 378 const int L = top[-5]; 379 const int A = top[0]; 380 const int B = top[1]; 381 const int C = top[2]; 382 const int D = top[3]; 383 DST(0, 3) = AVG3(J, K, L); 384 DST(0, 2) = DST(1, 3) = AVG3(I, J, K); 385 DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J); 386 DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I); 387 DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X); 388 DST(2, 0) = DST(3, 1) = AVG3(C, B, A); 389 DST(3, 0) = AVG3(D, C, B); 390 } 391 392 static void LD4(uint8_t* dst, const uint8_t* top) { 393 const int A = top[0]; 394 const int B = top[1]; 395 const int C = top[2]; 396 const int D = top[3]; 397 const int E = top[4]; 398 const int F = top[5]; 399 const int G = top[6]; 400 const int H = top[7]; 401 DST(0, 0) = AVG3(A, B, C); 402 DST(1, 0) = DST(0, 1) = AVG3(B, C, D); 403 DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); 404 DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); 405 DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); 406 DST(3, 2) = DST(2, 3) = AVG3(F, G, H); 407 DST(3, 3) = AVG3(G, H, H); 408 } 409 410 static void VR4(uint8_t* dst, const uint8_t* top) { 411 const int X = top[-1]; 412 const int I = top[-2]; 413 const int J = top[-3]; 414 const int K = top[-4]; 415 const int A = top[0]; 416 const int B = top[1]; 417 const int C = top[2]; 418 const int D = top[3]; 419 DST(0, 0) = DST(1, 2) = AVG2(X, A); 420 DST(1, 0) = DST(2, 2) = AVG2(A, B); 421 DST(2, 0) = DST(3, 2) = AVG2(B, C); 422 DST(3, 0) = AVG2(C, D); 423 424 DST(0, 3) = AVG3(K, J, I); 425 DST(0, 2) = AVG3(J, I, X); 426 DST(0, 1) = DST(1, 3) = AVG3(I, X, A); 427 DST(1, 1) = DST(2, 3) = AVG3(X, A, B); 428 DST(2, 1) = DST(3, 3) = AVG3(A, B, C); 429 DST(3, 1) = AVG3(B, C, D); 430 } 431 432 static void VL4(uint8_t* dst, const uint8_t* top) { 433 const int A = top[0]; 434 const int B = top[1]; 435 const int C = top[2]; 436 const int D = top[3]; 437 const int E = top[4]; 438 const int F = top[5]; 439 const int G = top[6]; 440 const int H = top[7]; 441 DST(0, 0) = AVG2(A, B); 442 DST(1, 0) = DST(0, 2) = AVG2(B, C); 443 DST(2, 0) = DST(1, 2) = AVG2(C, D); 444 DST(3, 0) = DST(2, 2) = AVG2(D, E); 445 446 DST(0, 1) = AVG3(A, B, C); 447 DST(1, 1) = DST(0, 3) = AVG3(B, C, D); 448 DST(2, 1) = DST(1, 3) = AVG3(C, D, E); 449 DST(3, 1) = DST(2, 3) = AVG3(D, E, F); 450 DST(3, 2) = AVG3(E, F, G); 451 DST(3, 3) = AVG3(F, G, H); 452 } 453 454 static void HU4(uint8_t* dst, const uint8_t* top) { 455 const int I = top[-2]; 456 const int J = top[-3]; 457 const int K = top[-4]; 458 const int L = top[-5]; 459 DST(0, 0) = AVG2(I, J); 460 DST(2, 0) = DST(0, 1) = AVG2(J, K); 461 DST(2, 1) = DST(0, 2) = AVG2(K, L); 462 DST(1, 0) = AVG3(I, J, K); 463 DST(3, 0) = DST(1, 1) = AVG3(J, K, L); 464 DST(3, 1) = DST(1, 2) = AVG3(K, L, L); 465 DST(3, 2) = DST(2, 2) = 466 DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; 467 } 468 469 static void HD4(uint8_t* dst, const uint8_t* top) { 470 const int X = top[-1]; 471 const int I = top[-2]; 472 const int J = top[-3]; 473 const int K = top[-4]; 474 const int L = top[-5]; 475 const int A = top[0]; 476 const int B = top[1]; 477 const int C = top[2]; 478 479 DST(0, 0) = DST(2, 1) = AVG2(I, X); 480 DST(0, 1) = DST(2, 2) = AVG2(J, I); 481 DST(0, 2) = DST(2, 3) = AVG2(K, J); 482 DST(0, 3) = AVG2(L, K); 483 484 DST(3, 0) = AVG3(A, B, C); 485 DST(2, 0) = AVG3(X, A, B); 486 DST(1, 0) = DST(3, 1) = AVG3(I, X, A); 487 DST(1, 1) = DST(3, 2) = AVG3(J, I, X); 488 DST(1, 2) = DST(3, 3) = AVG3(K, J, I); 489 DST(1, 3) = AVG3(L, K, J); 490 } 491 492 static void TM4(uint8_t* dst, const uint8_t* top) { 493 int x, y; 494 const uint8_t* const clip = clip1 + 255 - top[-1]; 495 for (y = 0; y < 4; ++y) { 496 const uint8_t* const clip_table = clip + top[-2 - y]; 497 for (x = 0; x < 4; ++x) { 498 dst[x] = clip_table[top[x]]; 499 } 500 dst += BPS; 501 } 502 } 503 504 #undef DST 505 #undef AVG3 506 #undef AVG2 507 508 // Left samples are top[-5 .. -2], top_left is top[-1], top are 509 // located at top[0..3], and top right is top[4..7] 510 static void Intra4Preds(uint8_t* dst, const uint8_t* top) { 511 DC4(I4DC4 + dst, top); 512 TM4(I4TM4 + dst, top); 513 VE4(I4VE4 + dst, top); 514 HE4(I4HE4 + dst, top); 515 RD4(I4RD4 + dst, top); 516 VR4(I4VR4 + dst, top); 517 LD4(I4LD4 + dst, top); 518 VL4(I4VL4 + dst, top); 519 HD4(I4HD4 + dst, top); 520 HU4(I4HU4 + dst, top); 521 } 522 523 //------------------------------------------------------------------------------ 524 // Metric 525 526 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b, 527 int w, int h) { 528 int count = 0; 529 int y, x; 530 for (y = 0; y < h; ++y) { 531 for (x = 0; x < w; ++x) { 532 const int diff = (int)a[x] - b[x]; 533 count += diff * diff; 534 } 535 a += BPS; 536 b += BPS; 537 } 538 return count; 539 } 540 541 static int SSE16x16(const uint8_t* a, const uint8_t* b) { 542 return GetSSE(a, b, 16, 16); 543 } 544 static int SSE16x8(const uint8_t* a, const uint8_t* b) { 545 return GetSSE(a, b, 16, 8); 546 } 547 static int SSE8x8(const uint8_t* a, const uint8_t* b) { 548 return GetSSE(a, b, 8, 8); 549 } 550 static int SSE4x4(const uint8_t* a, const uint8_t* b) { 551 return GetSSE(a, b, 4, 4); 552 } 553 554 static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { 555 int k, x, y; 556 for (k = 0; k < 4; ++k) { 557 uint32_t avg = 0; 558 for (y = 0; y < 4; ++y) { 559 for (x = 0; x < 4; ++x) { 560 avg += ref[x + y * BPS]; 561 } 562 } 563 dc[k] = avg; 564 ref += 4; // go to next 4x4 block. 565 } 566 } 567 568 //------------------------------------------------------------------------------ 569 // Texture distortion 570 // 571 // We try to match the spectral content (weighted) between source and 572 // reconstructed samples. 573 574 // Hadamard transform 575 // Returns the weighted sum of the absolute value of transformed coefficients. 576 // w[] contains a row-major 4 by 4 symmetric matrix. 577 static int TTransform(const uint8_t* in, const uint16_t* w) { 578 int sum = 0; 579 int tmp[16]; 580 int i; 581 // horizontal pass 582 for (i = 0; i < 4; ++i, in += BPS) { 583 const int a0 = in[0] + in[2]; 584 const int a1 = in[1] + in[3]; 585 const int a2 = in[1] - in[3]; 586 const int a3 = in[0] - in[2]; 587 tmp[0 + i * 4] = a0 + a1; 588 tmp[1 + i * 4] = a3 + a2; 589 tmp[2 + i * 4] = a3 - a2; 590 tmp[3 + i * 4] = a0 - a1; 591 } 592 // vertical pass 593 for (i = 0; i < 4; ++i, ++w) { 594 const int a0 = tmp[0 + i] + tmp[8 + i]; 595 const int a1 = tmp[4 + i] + tmp[12+ i]; 596 const int a2 = tmp[4 + i] - tmp[12+ i]; 597 const int a3 = tmp[0 + i] - tmp[8 + i]; 598 const int b0 = a0 + a1; 599 const int b1 = a3 + a2; 600 const int b2 = a3 - a2; 601 const int b3 = a0 - a1; 602 603 sum += w[ 0] * abs(b0); 604 sum += w[ 4] * abs(b1); 605 sum += w[ 8] * abs(b2); 606 sum += w[12] * abs(b3); 607 } 608 return sum; 609 } 610 611 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, 612 const uint16_t* const w) { 613 const int sum1 = TTransform(a, w); 614 const int sum2 = TTransform(b, w); 615 return abs(sum2 - sum1) >> 5; 616 } 617 618 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, 619 const uint16_t* const w) { 620 int D = 0; 621 int x, y; 622 for (y = 0; y < 16 * BPS; y += 4 * BPS) { 623 for (x = 0; x < 16; x += 4) { 624 D += Disto4x4(a + x + y, b + x + y, w); 625 } 626 } 627 return D; 628 } 629 630 //------------------------------------------------------------------------------ 631 // Quantization 632 // 633 634 static const uint8_t kZigzag[16] = { 635 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 636 }; 637 638 // Simple quantization 639 static int QuantizeBlock(int16_t in[16], int16_t out[16], 640 const VP8Matrix* const mtx) { 641 int last = -1; 642 int n; 643 for (n = 0; n < 16; ++n) { 644 const int j = kZigzag[n]; 645 const int sign = (in[j] < 0); 646 const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j]; 647 if (coeff > mtx->zthresh_[j]) { 648 const uint32_t Q = mtx->q_[j]; 649 const uint32_t iQ = mtx->iq_[j]; 650 const uint32_t B = mtx->bias_[j]; 651 int level = QUANTDIV(coeff, iQ, B); 652 if (level > MAX_LEVEL) level = MAX_LEVEL; 653 if (sign) level = -level; 654 in[j] = level * (int)Q; 655 out[n] = level; 656 if (level) last = n; 657 } else { 658 out[n] = 0; 659 in[j] = 0; 660 } 661 } 662 return (last >= 0); 663 } 664 665 static int Quantize2Blocks(int16_t in[32], int16_t out[32], 666 const VP8Matrix* const mtx) { 667 int nz; 668 nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; 669 nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; 670 return nz; 671 } 672 673 //------------------------------------------------------------------------------ 674 // Block copy 675 676 static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) { 677 int y; 678 for (y = 0; y < h; ++y) { 679 memcpy(dst, src, w); 680 src += BPS; 681 dst += BPS; 682 } 683 } 684 685 static void Copy4x4(const uint8_t* src, uint8_t* dst) { 686 Copy(src, dst, 4, 4); 687 } 688 689 static void Copy16x8(const uint8_t* src, uint8_t* dst) { 690 Copy(src, dst, 16, 8); 691 } 692 693 //------------------------------------------------------------------------------ 694 // SSIM / PSNR 695 696 // hat-shaped filter. Sum of coefficients is equal to 16. 697 static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = { 698 1, 2, 3, 4, 3, 2, 1 699 }; 700 static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2 701 702 static WEBP_INLINE double SSIMCalculation( 703 const VP8DistoStats* const stats, uint32_t N /*num samples*/) { 704 const uint32_t w2 = N * N; 705 const uint32_t C1 = 20 * w2; 706 const uint32_t C2 = 60 * w2; 707 const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6 708 const uint64_t xmxm = (uint64_t)stats->xm * stats->xm; 709 const uint64_t ymym = (uint64_t)stats->ym * stats->ym; 710 if (xmxm + ymym >= C3) { 711 const int64_t xmym = (int64_t)stats->xm * stats->ym; 712 const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative 713 const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm; 714 const uint64_t syy = (uint64_t)stats->yym * N - ymym; 715 // we descale by 8 to prevent overflow during the fnum/fden multiply. 716 const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8; 717 const uint64_t den_S = (sxx + syy + C2) >> 8; 718 const uint64_t fnum = (2 * xmym + C1) * num_S; 719 const uint64_t fden = (xmxm + ymym + C1) * den_S; 720 const double r = (double)fnum / fden; 721 assert(r >= 0. && r <= 1.0); 722 return r; 723 } 724 return 1.; // area is too dark to contribute meaningfully 725 } 726 727 double VP8SSIMFromStats(const VP8DistoStats* const stats) { 728 return SSIMCalculation(stats, kWeightSum); 729 } 730 731 double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) { 732 return SSIMCalculation(stats, stats->w); 733 } 734 735 static double SSIMGetClipped_C(const uint8_t* src1, int stride1, 736 const uint8_t* src2, int stride2, 737 int xo, int yo, int W, int H) { 738 VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 }; 739 const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL; 740 const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1 741 : yo + VP8_SSIM_KERNEL; 742 const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL; 743 const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1 744 : xo + VP8_SSIM_KERNEL; 745 int x, y; 746 src1 += ymin * stride1; 747 src2 += ymin * stride2; 748 for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) { 749 for (x = xmin; x <= xmax; ++x) { 750 const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo] 751 * kWeight[VP8_SSIM_KERNEL + y - yo]; 752 const uint32_t s1 = src1[x]; 753 const uint32_t s2 = src2[x]; 754 stats.w += w; 755 stats.xm += w * s1; 756 stats.ym += w * s2; 757 stats.xxm += w * s1 * s1; 758 stats.xym += w * s1 * s2; 759 stats.yym += w * s2 * s2; 760 } 761 } 762 return VP8SSIMFromStatsClipped(&stats); 763 } 764 765 static double SSIMGet_C(const uint8_t* src1, int stride1, 766 const uint8_t* src2, int stride2) { 767 VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 }; 768 int x, y; 769 for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) { 770 for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) { 771 const uint32_t w = kWeight[x] * kWeight[y]; 772 const uint32_t s1 = src1[x]; 773 const uint32_t s2 = src2[x]; 774 stats.xm += w * s1; 775 stats.ym += w * s2; 776 stats.xxm += w * s1 * s1; 777 stats.xym += w * s1 * s2; 778 stats.yym += w * s2 * s2; 779 } 780 } 781 return VP8SSIMFromStats(&stats); 782 } 783 784 //------------------------------------------------------------------------------ 785 786 static uint32_t AccumulateSSE(const uint8_t* src1, 787 const uint8_t* src2, int len) { 788 int i; 789 uint32_t sse2 = 0; 790 assert(len <= 65535); // to ensure that accumulation fits within uint32_t 791 for (i = 0; i < len; ++i) { 792 const int32_t diff = src1[i] - src2[i]; 793 sse2 += diff * diff; 794 } 795 return sse2; 796 } 797 798 //------------------------------------------------------------------------------ 799 800 VP8SSIMGetFunc VP8SSIMGet; 801 VP8SSIMGetClippedFunc VP8SSIMGetClipped; 802 VP8AccumulateSSEFunc VP8AccumulateSSE; 803 804 extern void VP8SSIMDspInitSSE2(void); 805 806 static volatile VP8CPUInfo ssim_last_cpuinfo_used = 807 (VP8CPUInfo)&ssim_last_cpuinfo_used; 808 809 WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) { 810 if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return; 811 812 VP8SSIMGetClipped = SSIMGetClipped_C; 813 VP8SSIMGet = SSIMGet_C; 814 815 VP8AccumulateSSE = AccumulateSSE; 816 if (VP8GetCPUInfo != NULL) { 817 #if defined(WEBP_USE_SSE2) 818 if (VP8GetCPUInfo(kSSE2)) { 819 VP8SSIMDspInitSSE2(); 820 } 821 #endif 822 } 823 824 ssim_last_cpuinfo_used = VP8GetCPUInfo; 825 } 826 827 //------------------------------------------------------------------------------ 828 // Initialization 829 830 // Speed-critical function pointers. We have to initialize them to the default 831 // implementations within VP8EncDspInit(). 832 VP8CHisto VP8CollectHistogram; 833 VP8Idct VP8ITransform; 834 VP8Fdct VP8FTransform; 835 VP8Fdct VP8FTransform2; 836 VP8WHT VP8FTransformWHT; 837 VP8Intra4Preds VP8EncPredLuma4; 838 VP8IntraPreds VP8EncPredLuma16; 839 VP8IntraPreds VP8EncPredChroma8; 840 VP8Metric VP8SSE16x16; 841 VP8Metric VP8SSE8x8; 842 VP8Metric VP8SSE16x8; 843 VP8Metric VP8SSE4x4; 844 VP8WMetric VP8TDisto4x4; 845 VP8WMetric VP8TDisto16x16; 846 VP8MeanMetric VP8Mean16x4; 847 VP8QuantizeBlock VP8EncQuantizeBlock; 848 VP8Quantize2Blocks VP8EncQuantize2Blocks; 849 VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT; 850 VP8BlockCopy VP8Copy4x4; 851 VP8BlockCopy VP8Copy16x8; 852 853 extern void VP8EncDspInitSSE2(void); 854 extern void VP8EncDspInitSSE41(void); 855 extern void VP8EncDspInitAVX2(void); 856 extern void VP8EncDspInitNEON(void); 857 extern void VP8EncDspInitMIPS32(void); 858 extern void VP8EncDspInitMIPSdspR2(void); 859 extern void VP8EncDspInitMSA(void); 860 861 static volatile VP8CPUInfo enc_last_cpuinfo_used = 862 (VP8CPUInfo)&enc_last_cpuinfo_used; 863 864 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) { 865 if (enc_last_cpuinfo_used == VP8GetCPUInfo) return; 866 867 VP8DspInit(); // common inverse transforms 868 InitTables(); 869 870 // default C implementations 871 VP8CollectHistogram = CollectHistogram; 872 VP8ITransform = ITransform; 873 VP8FTransform = FTransform; 874 VP8FTransform2 = FTransform2; 875 VP8FTransformWHT = FTransformWHT; 876 VP8EncPredLuma4 = Intra4Preds; 877 VP8EncPredLuma16 = Intra16Preds; 878 VP8EncPredChroma8 = IntraChromaPreds; 879 VP8SSE16x16 = SSE16x16; 880 VP8SSE8x8 = SSE8x8; 881 VP8SSE16x8 = SSE16x8; 882 VP8SSE4x4 = SSE4x4; 883 VP8TDisto4x4 = Disto4x4; 884 VP8TDisto16x16 = Disto16x16; 885 VP8Mean16x4 = Mean16x4; 886 VP8EncQuantizeBlock = QuantizeBlock; 887 VP8EncQuantize2Blocks = Quantize2Blocks; 888 VP8EncQuantizeBlockWHT = QuantizeBlock; 889 VP8Copy4x4 = Copy4x4; 890 VP8Copy16x8 = Copy16x8; 891 892 // If defined, use CPUInfo() to overwrite some pointers with faster versions. 893 if (VP8GetCPUInfo != NULL) { 894 #if defined(WEBP_USE_SSE2) 895 if (VP8GetCPUInfo(kSSE2)) { 896 VP8EncDspInitSSE2(); 897 #if defined(WEBP_USE_SSE41) 898 if (VP8GetCPUInfo(kSSE4_1)) { 899 VP8EncDspInitSSE41(); 900 } 901 #endif 902 } 903 #endif 904 #if defined(WEBP_USE_AVX2) 905 if (VP8GetCPUInfo(kAVX2)) { 906 VP8EncDspInitAVX2(); 907 } 908 #endif 909 #if defined(WEBP_USE_NEON) 910 if (VP8GetCPUInfo(kNEON)) { 911 VP8EncDspInitNEON(); 912 } 913 #endif 914 #if defined(WEBP_USE_MIPS32) 915 if (VP8GetCPUInfo(kMIPS32)) { 916 VP8EncDspInitMIPS32(); 917 } 918 #endif 919 #if defined(WEBP_USE_MIPS_DSP_R2) 920 if (VP8GetCPUInfo(kMIPSdspR2)) { 921 VP8EncDspInitMIPSdspR2(); 922 } 923 #endif 924 #if defined(WEBP_USE_MSA) 925 if (VP8GetCPUInfo(kMSA)) { 926 VP8EncDspInitMSA(); 927 } 928 #endif 929 } 930 enc_last_cpuinfo_used = VP8GetCPUInfo; 931 } 932