1 // Copyright 2011 Google Inc. All Rights Reserved. 2 // 3 // This code is licensed under the same terms as WebM: 4 // Software License Agreement: http://www.webmproject.org/license/software/ 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ 6 // ----------------------------------------------------------------------------- 7 // 8 // Speed-critical encoding functions. 9 // 10 // Author: Skal (pascal.massimino (at) gmail.com) 11 12 #include <stdlib.h> // for abs() 13 #include "./dsp.h" 14 #include "../enc/vp8enci.h" 15 16 #if defined(__cplusplus) || defined(c_plusplus) 17 extern "C" { 18 #endif 19 20 //------------------------------------------------------------------------------ 21 // Compute susceptibility based on DCT-coeff histograms: 22 // the higher, the "easier" the macroblock is to compress. 23 24 static int ClipAlpha(int alpha) { 25 return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha; 26 } 27 28 int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) { 29 int num = 0, den = 0, val = 0; 30 int k; 31 int alpha; 32 // note: changing this loop to avoid the numerous "k + 1" slows things down. 33 for (k = 0; k < MAX_COEFF_THRESH; ++k) { 34 if (histo[k + 1]) { 35 val += histo[k + 1]; 36 num += val * (k + 1); 37 den += (k + 1) * (k + 1); 38 } 39 } 40 // we scale the value to a usable [0..255] range 41 alpha = den ? 10 * num / den - 5 : 0; 42 return ClipAlpha(alpha); 43 } 44 45 const int VP8DspScan[16 + 4 + 4] = { 46 // Luma 47 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, 48 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, 49 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, 50 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS, 51 52 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U 53 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V 54 }; 55 56 static int CollectHistogram(const uint8_t* ref, const uint8_t* pred, 57 int start_block, int end_block) { 58 int histo[MAX_COEFF_THRESH + 1] = { 0 }; 59 int16_t out[16]; 60 int j, k; 61 for (j = start_block; j < end_block; ++j) { 62 VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); 63 64 // Convert coefficients to bin (within out[]). 65 for (k = 0; k < 16; ++k) { 66 const int v = abs(out[k]) >> 2; 67 out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v; 68 } 69 70 // Use bin to update histogram. 71 for (k = 0; k < 16; ++k) { 72 histo[out[k]]++; 73 } 74 } 75 76 return VP8GetAlpha(histo); 77 } 78 79 //------------------------------------------------------------------------------ 80 // run-time tables (~4k) 81 82 static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255] 83 84 // We declare this variable 'volatile' to prevent instruction reordering 85 // and make sure it's set to true _last_ (so as to be thread-safe) 86 static volatile int tables_ok = 0; 87 88 static void InitTables(void) { 89 if (!tables_ok) { 90 int i; 91 for (i = -255; i <= 255 + 255; ++i) { 92 clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i; 93 } 94 tables_ok = 1; 95 } 96 } 97 98 static WEBP_INLINE uint8_t clip_8b(int v) { 99 return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255; 100 } 101 102 //------------------------------------------------------------------------------ 103 // Transforms (Paragraph 14.4) 104 105 #define STORE(x, y, v) \ 106 dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) 107 108 static const int kC1 = 20091 + (1 << 16); 109 static const int kC2 = 35468; 110 #define MUL(a, b) (((a) * (b)) >> 16) 111 112 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, 113 uint8_t* dst) { 114 int C[4 * 4], *tmp; 115 int i; 116 tmp = C; 117 for (i = 0; i < 4; ++i) { // vertical pass 118 const int a = in[0] + in[8]; 119 const int b = in[0] - in[8]; 120 const int c = MUL(in[4], kC2) - MUL(in[12], kC1); 121 const int d = MUL(in[4], kC1) + MUL(in[12], kC2); 122 tmp[0] = a + d; 123 tmp[1] = b + c; 124 tmp[2] = b - c; 125 tmp[3] = a - d; 126 tmp += 4; 127 in++; 128 } 129 130 tmp = C; 131 for (i = 0; i < 4; ++i) { // horizontal pass 132 const int dc = tmp[0] + 4; 133 const int a = dc + tmp[8]; 134 const int b = dc - tmp[8]; 135 const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1); 136 const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2); 137 STORE(0, i, a + d); 138 STORE(1, i, b + c); 139 STORE(2, i, b - c); 140 STORE(3, i, a - d); 141 tmp++; 142 } 143 } 144 145 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, 146 int do_two) { 147 ITransformOne(ref, in, dst); 148 if (do_two) { 149 ITransformOne(ref + 4, in + 16, dst + 4); 150 } 151 } 152 153 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { 154 int i; 155 int tmp[16]; 156 for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { 157 const int d0 = src[0] - ref[0]; 158 const int d1 = src[1] - ref[1]; 159 const int d2 = src[2] - ref[2]; 160 const int d3 = src[3] - ref[3]; 161 const int a0 = (d0 + d3) << 3; 162 const int a1 = (d1 + d2) << 3; 163 const int a2 = (d1 - d2) << 3; 164 const int a3 = (d0 - d3) << 3; 165 tmp[0 + i * 4] = (a0 + a1); 166 tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12; 167 tmp[2 + i * 4] = (a0 - a1); 168 tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 7500) >> 12; 169 } 170 for (i = 0; i < 4; ++i) { 171 const int a0 = (tmp[0 + i] + tmp[12 + i]); 172 const int a1 = (tmp[4 + i] + tmp[ 8 + i]); 173 const int a2 = (tmp[4 + i] - tmp[ 8 + i]); 174 const int a3 = (tmp[0 + i] - tmp[12 + i]); 175 out[0 + i] = (a0 + a1 + 7) >> 4; 176 out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0); 177 out[8 + i] = (a0 - a1 + 7) >> 4; 178 out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16); 179 } 180 } 181 182 static void ITransformWHT(const int16_t* in, int16_t* out) { 183 int tmp[16]; 184 int i; 185 for (i = 0; i < 4; ++i) { 186 const int a0 = in[0 + i] + in[12 + i]; 187 const int a1 = in[4 + i] + in[ 8 + i]; 188 const int a2 = in[4 + i] - in[ 8 + i]; 189 const int a3 = in[0 + i] - in[12 + i]; 190 tmp[0 + i] = a0 + a1; 191 tmp[8 + i] = a0 - a1; 192 tmp[4 + i] = a3 + a2; 193 tmp[12 + i] = a3 - a2; 194 } 195 for (i = 0; i < 4; ++i) { 196 const int dc = tmp[0 + i * 4] + 3; // w/ rounder 197 const int a0 = dc + tmp[3 + i * 4]; 198 const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4]; 199 const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4]; 200 const int a3 = dc - tmp[3 + i * 4]; 201 out[ 0] = (a0 + a1) >> 3; 202 out[16] = (a3 + a2) >> 3; 203 out[32] = (a0 - a1) >> 3; 204 out[48] = (a3 - a2) >> 3; 205 out += 64; 206 } 207 } 208 209 static void FTransformWHT(const int16_t* in, int16_t* out) { 210 int tmp[16]; 211 int i; 212 for (i = 0; i < 4; ++i, in += 64) { 213 const int a0 = (in[0 * 16] + in[2 * 16]) << 2; 214 const int a1 = (in[1 * 16] + in[3 * 16]) << 2; 215 const int a2 = (in[1 * 16] - in[3 * 16]) << 2; 216 const int a3 = (in[0 * 16] - in[2 * 16]) << 2; 217 tmp[0 + i * 4] = (a0 + a1) + (a0 != 0); 218 tmp[1 + i * 4] = a3 + a2; 219 tmp[2 + i * 4] = a3 - a2; 220 tmp[3 + i * 4] = a0 - a1; 221 } 222 for (i = 0; i < 4; ++i) { 223 const int a0 = (tmp[0 + i] + tmp[8 + i]); 224 const int a1 = (tmp[4 + i] + tmp[12+ i]); 225 const int a2 = (tmp[4 + i] - tmp[12+ i]); 226 const int a3 = (tmp[0 + i] - tmp[8 + i]); 227 const int b0 = a0 + a1; 228 const int b1 = a3 + a2; 229 const int b2 = a3 - a2; 230 const int b3 = a0 - a1; 231 out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3; 232 out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3; 233 out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3; 234 out[12 + i] = (b3 + (b3 > 0) + 3) >> 3; 235 } 236 } 237 238 #undef MUL 239 #undef STORE 240 241 //------------------------------------------------------------------------------ 242 // Intra predictions 243 244 #define DST(x, y) dst[(x) + (y) * BPS] 245 246 static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) { 247 int j; 248 for (j = 0; j < size; ++j) { 249 memset(dst + j * BPS, value, size); 250 } 251 } 252 253 static WEBP_INLINE void VerticalPred(uint8_t* dst, 254 const uint8_t* top, int size) { 255 int j; 256 if (top) { 257 for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size); 258 } else { 259 Fill(dst, 127, size); 260 } 261 } 262 263 static WEBP_INLINE void HorizontalPred(uint8_t* dst, 264 const uint8_t* left, int size) { 265 if (left) { 266 int j; 267 for (j = 0; j < size; ++j) { 268 memset(dst + j * BPS, left[j], size); 269 } 270 } else { 271 Fill(dst, 129, size); 272 } 273 } 274 275 static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, 276 const uint8_t* top, int size) { 277 int y; 278 if (left) { 279 if (top) { 280 const uint8_t* const clip = clip1 + 255 - left[-1]; 281 for (y = 0; y < size; ++y) { 282 const uint8_t* const clip_table = clip + left[y]; 283 int x; 284 for (x = 0; x < size; ++x) { 285 dst[x] = clip_table[top[x]]; 286 } 287 dst += BPS; 288 } 289 } else { 290 HorizontalPred(dst, left, size); 291 } 292 } else { 293 // true motion without left samples (hence: with default 129 value) 294 // is equivalent to VE prediction where you just copy the top samples. 295 // Note that if top samples are not available, the default value is 296 // then 129, and not 127 as in the VerticalPred case. 297 if (top) { 298 VerticalPred(dst, top, size); 299 } else { 300 Fill(dst, 129, size); 301 } 302 } 303 } 304 305 static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left, 306 const uint8_t* top, 307 int size, int round, int shift) { 308 int DC = 0; 309 int j; 310 if (top) { 311 for (j = 0; j < size; ++j) DC += top[j]; 312 if (left) { // top and left present 313 for (j = 0; j < size; ++j) DC += left[j]; 314 } else { // top, but no left 315 DC += DC; 316 } 317 DC = (DC + round) >> shift; 318 } else if (left) { // left but no top 319 for (j = 0; j < size; ++j) DC += left[j]; 320 DC += DC; 321 DC = (DC + round) >> shift; 322 } else { // no top, no left, nothing. 323 DC = 0x80; 324 } 325 Fill(dst, DC, size); 326 } 327 328 //------------------------------------------------------------------------------ 329 // Chroma 8x8 prediction (paragraph 12.2) 330 331 static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, 332 const uint8_t* top) { 333 // U block 334 DCMode(C8DC8 + dst, left, top, 8, 8, 4); 335 VerticalPred(C8VE8 + dst, top, 8); 336 HorizontalPred(C8HE8 + dst, left, 8); 337 TrueMotion(C8TM8 + dst, left, top, 8); 338 // V block 339 dst += 8; 340 if (top) top += 8; 341 if (left) left += 16; 342 DCMode(C8DC8 + dst, left, top, 8, 8, 4); 343 VerticalPred(C8VE8 + dst, top, 8); 344 HorizontalPred(C8HE8 + dst, left, 8); 345 TrueMotion(C8TM8 + dst, left, top, 8); 346 } 347 348 //------------------------------------------------------------------------------ 349 // luma 16x16 prediction (paragraph 12.3) 350 351 static void Intra16Preds(uint8_t* dst, 352 const uint8_t* left, const uint8_t* top) { 353 DCMode(I16DC16 + dst, left, top, 16, 16, 5); 354 VerticalPred(I16VE16 + dst, top, 16); 355 HorizontalPred(I16HE16 + dst, left, 16); 356 TrueMotion(I16TM16 + dst, left, top, 16); 357 } 358 359 //------------------------------------------------------------------------------ 360 // luma 4x4 prediction 361 362 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) 363 #define AVG2(a, b) (((a) + (b) + 1) >> 1) 364 365 static void VE4(uint8_t* dst, const uint8_t* top) { // vertical 366 const uint8_t vals[4] = { 367 AVG3(top[-1], top[0], top[1]), 368 AVG3(top[ 0], top[1], top[2]), 369 AVG3(top[ 1], top[2], top[3]), 370 AVG3(top[ 2], top[3], top[4]) 371 }; 372 int i; 373 for (i = 0; i < 4; ++i) { 374 memcpy(dst + i * BPS, vals, 4); 375 } 376 } 377 378 static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal 379 const int X = top[-1]; 380 const int I = top[-2]; 381 const int J = top[-3]; 382 const int K = top[-4]; 383 const int L = top[-5]; 384 *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J); 385 *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K); 386 *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L); 387 *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L); 388 } 389 390 static void DC4(uint8_t* dst, const uint8_t* top) { 391 uint32_t dc = 4; 392 int i; 393 for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; 394 Fill(dst, dc >> 3, 4); 395 } 396 397 static void RD4(uint8_t* dst, const uint8_t* top) { 398 const int X = top[-1]; 399 const int I = top[-2]; 400 const int J = top[-3]; 401 const int K = top[-4]; 402 const int L = top[-5]; 403 const int A = top[0]; 404 const int B = top[1]; 405 const int C = top[2]; 406 const int D = top[3]; 407 DST(0, 3) = AVG3(J, K, L); 408 DST(0, 2) = DST(1, 3) = AVG3(I, J, K); 409 DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J); 410 DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I); 411 DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X); 412 DST(2, 0) = DST(3, 1) = AVG3(C, B, A); 413 DST(3, 0) = AVG3(D, C, B); 414 } 415 416 static void LD4(uint8_t* dst, const uint8_t* top) { 417 const int A = top[0]; 418 const int B = top[1]; 419 const int C = top[2]; 420 const int D = top[3]; 421 const int E = top[4]; 422 const int F = top[5]; 423 const int G = top[6]; 424 const int H = top[7]; 425 DST(0, 0) = AVG3(A, B, C); 426 DST(1, 0) = DST(0, 1) = AVG3(B, C, D); 427 DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); 428 DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); 429 DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); 430 DST(3, 2) = DST(2, 3) = AVG3(F, G, H); 431 DST(3, 3) = AVG3(G, H, H); 432 } 433 434 static void VR4(uint8_t* dst, const uint8_t* top) { 435 const int X = top[-1]; 436 const int I = top[-2]; 437 const int J = top[-3]; 438 const int K = top[-4]; 439 const int A = top[0]; 440 const int B = top[1]; 441 const int C = top[2]; 442 const int D = top[3]; 443 DST(0, 0) = DST(1, 2) = AVG2(X, A); 444 DST(1, 0) = DST(2, 2) = AVG2(A, B); 445 DST(2, 0) = DST(3, 2) = AVG2(B, C); 446 DST(3, 0) = AVG2(C, D); 447 448 DST(0, 3) = AVG3(K, J, I); 449 DST(0, 2) = AVG3(J, I, X); 450 DST(0, 1) = DST(1, 3) = AVG3(I, X, A); 451 DST(1, 1) = DST(2, 3) = AVG3(X, A, B); 452 DST(2, 1) = DST(3, 3) = AVG3(A, B, C); 453 DST(3, 1) = AVG3(B, C, D); 454 } 455 456 static void VL4(uint8_t* dst, const uint8_t* top) { 457 const int A = top[0]; 458 const int B = top[1]; 459 const int C = top[2]; 460 const int D = top[3]; 461 const int E = top[4]; 462 const int F = top[5]; 463 const int G = top[6]; 464 const int H = top[7]; 465 DST(0, 0) = AVG2(A, B); 466 DST(1, 0) = DST(0, 2) = AVG2(B, C); 467 DST(2, 0) = DST(1, 2) = AVG2(C, D); 468 DST(3, 0) = DST(2, 2) = AVG2(D, E); 469 470 DST(0, 1) = AVG3(A, B, C); 471 DST(1, 1) = DST(0, 3) = AVG3(B, C, D); 472 DST(2, 1) = DST(1, 3) = AVG3(C, D, E); 473 DST(3, 1) = DST(2, 3) = AVG3(D, E, F); 474 DST(3, 2) = AVG3(E, F, G); 475 DST(3, 3) = AVG3(F, G, H); 476 } 477 478 static void HU4(uint8_t* dst, const uint8_t* top) { 479 const int I = top[-2]; 480 const int J = top[-3]; 481 const int K = top[-4]; 482 const int L = top[-5]; 483 DST(0, 0) = AVG2(I, J); 484 DST(2, 0) = DST(0, 1) = AVG2(J, K); 485 DST(2, 1) = DST(0, 2) = AVG2(K, L); 486 DST(1, 0) = AVG3(I, J, K); 487 DST(3, 0) = DST(1, 1) = AVG3(J, K, L); 488 DST(3, 1) = DST(1, 2) = AVG3(K, L, L); 489 DST(3, 2) = DST(2, 2) = 490 DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; 491 } 492 493 static void HD4(uint8_t* dst, const uint8_t* top) { 494 const int X = top[-1]; 495 const int I = top[-2]; 496 const int J = top[-3]; 497 const int K = top[-4]; 498 const int L = top[-5]; 499 const int A = top[0]; 500 const int B = top[1]; 501 const int C = top[2]; 502 503 DST(0, 0) = DST(2, 1) = AVG2(I, X); 504 DST(0, 1) = DST(2, 2) = AVG2(J, I); 505 DST(0, 2) = DST(2, 3) = AVG2(K, J); 506 DST(0, 3) = AVG2(L, K); 507 508 DST(3, 0) = AVG3(A, B, C); 509 DST(2, 0) = AVG3(X, A, B); 510 DST(1, 0) = DST(3, 1) = AVG3(I, X, A); 511 DST(1, 1) = DST(3, 2) = AVG3(J, I, X); 512 DST(1, 2) = DST(3, 3) = AVG3(K, J, I); 513 DST(1, 3) = AVG3(L, K, J); 514 } 515 516 static void TM4(uint8_t* dst, const uint8_t* top) { 517 int x, y; 518 const uint8_t* const clip = clip1 + 255 - top[-1]; 519 for (y = 0; y < 4; ++y) { 520 const uint8_t* const clip_table = clip + top[-2 - y]; 521 for (x = 0; x < 4; ++x) { 522 dst[x] = clip_table[top[x]]; 523 } 524 dst += BPS; 525 } 526 } 527 528 #undef DST 529 #undef AVG3 530 #undef AVG2 531 532 // Left samples are top[-5 .. -2], top_left is top[-1], top are 533 // located at top[0..3], and top right is top[4..7] 534 static void Intra4Preds(uint8_t* dst, const uint8_t* top) { 535 DC4(I4DC4 + dst, top); 536 TM4(I4TM4 + dst, top); 537 VE4(I4VE4 + dst, top); 538 HE4(I4HE4 + dst, top); 539 RD4(I4RD4 + dst, top); 540 VR4(I4VR4 + dst, top); 541 LD4(I4LD4 + dst, top); 542 VL4(I4VL4 + dst, top); 543 HD4(I4HD4 + dst, top); 544 HU4(I4HU4 + dst, top); 545 } 546 547 //------------------------------------------------------------------------------ 548 // Metric 549 550 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b, 551 int w, int h) { 552 int count = 0; 553 int y, x; 554 for (y = 0; y < h; ++y) { 555 for (x = 0; x < w; ++x) { 556 const int diff = (int)a[x] - b[x]; 557 count += diff * diff; 558 } 559 a += BPS; 560 b += BPS; 561 } 562 return count; 563 } 564 565 static int SSE16x16(const uint8_t* a, const uint8_t* b) { 566 return GetSSE(a, b, 16, 16); 567 } 568 static int SSE16x8(const uint8_t* a, const uint8_t* b) { 569 return GetSSE(a, b, 16, 8); 570 } 571 static int SSE8x8(const uint8_t* a, const uint8_t* b) { 572 return GetSSE(a, b, 8, 8); 573 } 574 static int SSE4x4(const uint8_t* a, const uint8_t* b) { 575 return GetSSE(a, b, 4, 4); 576 } 577 578 //------------------------------------------------------------------------------ 579 // Texture distortion 580 // 581 // We try to match the spectral content (weighted) between source and 582 // reconstructed samples. 583 584 // Hadamard transform 585 // Returns the weighted sum of the absolute value of transformed coefficients. 586 static int TTransform(const uint8_t* in, const uint16_t* w) { 587 int sum = 0; 588 int tmp[16]; 589 int i; 590 // horizontal pass 591 for (i = 0; i < 4; ++i, in += BPS) { 592 const int a0 = (in[0] + in[2]) << 2; 593 const int a1 = (in[1] + in[3]) << 2; 594 const int a2 = (in[1] - in[3]) << 2; 595 const int a3 = (in[0] - in[2]) << 2; 596 tmp[0 + i * 4] = a0 + a1 + (a0 != 0); 597 tmp[1 + i * 4] = a3 + a2; 598 tmp[2 + i * 4] = a3 - a2; 599 tmp[3 + i * 4] = a0 - a1; 600 } 601 // vertical pass 602 for (i = 0; i < 4; ++i, ++w) { 603 const int a0 = (tmp[0 + i] + tmp[8 + i]); 604 const int a1 = (tmp[4 + i] + tmp[12+ i]); 605 const int a2 = (tmp[4 + i] - tmp[12+ i]); 606 const int a3 = (tmp[0 + i] - tmp[8 + i]); 607 const int b0 = a0 + a1; 608 const int b1 = a3 + a2; 609 const int b2 = a3 - a2; 610 const int b3 = a0 - a1; 611 // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 612 sum += w[ 0] * ((abs(b0) + 3) >> 3); 613 sum += w[ 4] * ((abs(b1) + 3) >> 3); 614 sum += w[ 8] * ((abs(b2) + 3) >> 3); 615 sum += w[12] * ((abs(b3) + 3) >> 3); 616 } 617 return sum; 618 } 619 620 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, 621 const uint16_t* const w) { 622 const int sum1 = TTransform(a, w); 623 const int sum2 = TTransform(b, w); 624 return (abs(sum2 - sum1) + 8) >> 4; 625 } 626 627 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, 628 const uint16_t* const w) { 629 int D = 0; 630 int x, y; 631 for (y = 0; y < 16 * BPS; y += 4 * BPS) { 632 for (x = 0; x < 16; x += 4) { 633 D += Disto4x4(a + x + y, b + x + y, w); 634 } 635 } 636 return D; 637 } 638 639 //------------------------------------------------------------------------------ 640 // Quantization 641 // 642 643 static const uint8_t kZigzag[16] = { 644 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 645 }; 646 647 // Simple quantization 648 static int QuantizeBlock(int16_t in[16], int16_t out[16], 649 int n, const VP8Matrix* const mtx) { 650 int last = -1; 651 for (; n < 16; ++n) { 652 const int j = kZigzag[n]; 653 const int sign = (in[j] < 0); 654 int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j]; 655 if (coeff > 2047) coeff = 2047; 656 if (coeff > mtx->zthresh_[j]) { 657 const int Q = mtx->q_[j]; 658 const int iQ = mtx->iq_[j]; 659 const int B = mtx->bias_[j]; 660 out[n] = QUANTDIV(coeff, iQ, B); 661 if (sign) out[n] = -out[n]; 662 in[j] = out[n] * Q; 663 if (out[n]) last = n; 664 } else { 665 out[n] = 0; 666 in[j] = 0; 667 } 668 } 669 return (last >= 0); 670 } 671 672 //------------------------------------------------------------------------------ 673 // Block copy 674 675 static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) { 676 int y; 677 for (y = 0; y < size; ++y) { 678 memcpy(dst, src, size); 679 src += BPS; 680 dst += BPS; 681 } 682 } 683 684 static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); } 685 686 //------------------------------------------------------------------------------ 687 // Initialization 688 689 // Speed-critical function pointers. We have to initialize them to the default 690 // implementations within VP8EncDspInit(). 691 VP8CHisto VP8CollectHistogram; 692 VP8Idct VP8ITransform; 693 VP8Fdct VP8FTransform; 694 VP8WHT VP8ITransformWHT; 695 VP8WHT VP8FTransformWHT; 696 VP8Intra4Preds VP8EncPredLuma4; 697 VP8IntraPreds VP8EncPredLuma16; 698 VP8IntraPreds VP8EncPredChroma8; 699 VP8Metric VP8SSE16x16; 700 VP8Metric VP8SSE8x8; 701 VP8Metric VP8SSE16x8; 702 VP8Metric VP8SSE4x4; 703 VP8WMetric VP8TDisto4x4; 704 VP8WMetric VP8TDisto16x16; 705 VP8QuantizeBlock VP8EncQuantizeBlock; 706 VP8BlockCopy VP8Copy4x4; 707 708 extern void VP8EncDspInitSSE2(void); 709 710 void VP8EncDspInit(void) { 711 InitTables(); 712 713 // default C implementations 714 VP8CollectHistogram = CollectHistogram; 715 VP8ITransform = ITransform; 716 VP8FTransform = FTransform; 717 VP8ITransformWHT = ITransformWHT; 718 VP8FTransformWHT = FTransformWHT; 719 VP8EncPredLuma4 = Intra4Preds; 720 VP8EncPredLuma16 = Intra16Preds; 721 VP8EncPredChroma8 = IntraChromaPreds; 722 VP8SSE16x16 = SSE16x16; 723 VP8SSE8x8 = SSE8x8; 724 VP8SSE16x8 = SSE16x8; 725 VP8SSE4x4 = SSE4x4; 726 VP8TDisto4x4 = Disto4x4; 727 VP8TDisto16x16 = Disto16x16; 728 VP8EncQuantizeBlock = QuantizeBlock; 729 VP8Copy4x4 = Copy4x4; 730 731 // If defined, use CPUInfo() to overwrite some pointers with faster versions. 732 if (VP8GetCPUInfo) { 733 #if defined(WEBP_USE_SSE2) 734 if (VP8GetCPUInfo(kSSE2)) { 735 VP8EncDspInitSSE2(); 736 } 737 #endif 738 } 739 } 740 741 #if defined(__cplusplus) || defined(c_plusplus) 742 } // extern "C" 743 #endif 744