1 // Copyright 2011 Google Inc. 2 // 3 // This code is licensed under the same terms as WebM: 4 // Software License Agreement: http://www.webmproject.org/license/software/ 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ 6 // ----------------------------------------------------------------------------- 7 // 8 // speed-critical functions. 9 // 10 // Author: Skal (pascal.massimino (at) gmail.com) 11 12 #include <assert.h> 13 #include "vp8enci.h" 14 15 #if defined(__cplusplus) || defined(c_plusplus) 16 extern "C" { 17 #endif 18 19 //----------------------------------------------------------------------------- 20 // Compute susceptibility based on DCT-coeff histograms: 21 // the higher, the "easier" the macroblock is to compress. 22 23 static int ClipAlpha(int alpha) { 24 return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha; 25 } 26 27 int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) { 28 int num = 0, den = 0, val = 0; 29 int k; 30 int alpha; 31 // note: changing this loop to avoid the numerous "k + 1" slows things down. 32 for (k = 0; k < MAX_COEFF_THRESH; ++k) { 33 if (histo[k + 1]) { 34 val += histo[k + 1]; 35 num += val * (k + 1); 36 den += (k + 1) * (k + 1); 37 } 38 } 39 // we scale the value to a usable [0..255] range 40 alpha = den ? 10 * num / den - 5 : 0; 41 return ClipAlpha(alpha); 42 } 43 44 static int CollectHistogram(const uint8_t* ref, const uint8_t* pred, 45 int start_block, int end_block) { 46 int histo[MAX_COEFF_THRESH + 1] = { 0 }; 47 int16_t out[16]; 48 int j, k; 49 for (j = start_block; j < end_block; ++j) { 50 VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out); 51 52 // Convert coefficients to bin (within out[]). 53 for (k = 0; k < 16; ++k) { 54 const int v = abs(out[k]) >> 2; 55 out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v; 56 } 57 58 // Use bin to update histogram. 59 for (k = 0; k < 16; ++k) { 60 histo[out[k]]++; 61 } 62 } 63 64 return VP8GetAlpha(histo); 65 } 66 67 //----------------------------------------------------------------------------- 68 // run-time tables (~4k) 69 70 static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255] 71 72 // We declare this variable 'volatile' to prevent instruction reordering 73 // and make sure it's set to true _last_ (so as to be thread-safe) 74 static volatile int tables_ok = 0; 75 76 static void InitTables(void) { 77 if (!tables_ok) { 78 int i; 79 for (i = -255; i <= 255 + 255; ++i) { 80 clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i; 81 } 82 tables_ok = 1; 83 } 84 } 85 86 static inline uint8_t clip_8b(int v) { 87 return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255; 88 } 89 90 //----------------------------------------------------------------------------- 91 // Transforms (Paragraph 14.4) 92 93 #define STORE(x, y, v) \ 94 dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) 95 96 static const int kC1 = 20091 + (1 << 16); 97 static const int kC2 = 35468; 98 #define MUL(a, b) (((a) * (b)) >> 16) 99 100 static inline void ITransformOne(const uint8_t* ref, const int16_t* in, 101 uint8_t* dst) { 102 int C[4 * 4], *tmp; 103 int i; 104 tmp = C; 105 for (i = 0; i < 4; ++i) { // vertical pass 106 const int a = in[0] + in[8]; 107 const int b = in[0] - in[8]; 108 const int c = MUL(in[4], kC2) - MUL(in[12], kC1); 109 const int d = MUL(in[4], kC1) + MUL(in[12], kC2); 110 tmp[0] = a + d; 111 tmp[1] = b + c; 112 tmp[2] = b - c; 113 tmp[3] = a - d; 114 tmp += 4; 115 in++; 116 } 117 118 tmp = C; 119 for (i = 0; i < 4; ++i) { // horizontal pass 120 const int dc = tmp[0] + 4; 121 const int a = dc + tmp[8]; 122 const int b = dc - tmp[8]; 123 const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1); 124 const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2); 125 STORE(0, i, a + d); 126 STORE(1, i, b + c); 127 STORE(2, i, b - c); 128 STORE(3, i, a - d); 129 tmp++; 130 } 131 } 132 133 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, 134 int do_two) { 135 ITransformOne(ref, in, dst); 136 if (do_two) { 137 ITransformOne(ref + 4, in + 16, dst + 4); 138 } 139 } 140 141 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { 142 int i; 143 int tmp[16]; 144 for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { 145 const int d0 = src[0] - ref[0]; 146 const int d1 = src[1] - ref[1]; 147 const int d2 = src[2] - ref[2]; 148 const int d3 = src[3] - ref[3]; 149 const int a0 = (d0 + d3) << 3; 150 const int a1 = (d1 + d2) << 3; 151 const int a2 = (d1 - d2) << 3; 152 const int a3 = (d0 - d3) << 3; 153 tmp[0 + i * 4] = (a0 + a1); 154 tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12; 155 tmp[2 + i * 4] = (a0 - a1); 156 tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 7500) >> 12; 157 } 158 for (i = 0; i < 4; ++i) { 159 const int a0 = (tmp[0 + i] + tmp[12 + i]); 160 const int a1 = (tmp[4 + i] + tmp[ 8 + i]); 161 const int a2 = (tmp[4 + i] - tmp[ 8 + i]); 162 const int a3 = (tmp[0 + i] - tmp[12 + i]); 163 out[0 + i] = (a0 + a1 + 7) >> 4; 164 out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0); 165 out[8 + i] = (a0 - a1 + 7) >> 4; 166 out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16); 167 } 168 } 169 170 static void ITransformWHT(const int16_t* in, int16_t* out) { 171 int tmp[16]; 172 int i; 173 for (i = 0; i < 4; ++i) { 174 const int a0 = in[0 + i] + in[12 + i]; 175 const int a1 = in[4 + i] + in[ 8 + i]; 176 const int a2 = in[4 + i] - in[ 8 + i]; 177 const int a3 = in[0 + i] - in[12 + i]; 178 tmp[0 + i] = a0 + a1; 179 tmp[8 + i] = a0 - a1; 180 tmp[4 + i] = a3 + a2; 181 tmp[12 + i] = a3 - a2; 182 } 183 for (i = 0; i < 4; ++i) { 184 const int dc = tmp[0 + i * 4] + 3; // w/ rounder 185 const int a0 = dc + tmp[3 + i * 4]; 186 const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4]; 187 const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4]; 188 const int a3 = dc - tmp[3 + i * 4]; 189 out[ 0] = (a0 + a1) >> 3; 190 out[16] = (a3 + a2) >> 3; 191 out[32] = (a0 - a1) >> 3; 192 out[48] = (a3 - a2) >> 3; 193 out += 64; 194 } 195 } 196 197 static void FTransformWHT(const int16_t* in, int16_t* out) { 198 int tmp[16]; 199 int i; 200 for (i = 0; i < 4; ++i, in += 64) { 201 const int a0 = (in[0 * 16] + in[2 * 16]) << 2; 202 const int a1 = (in[1 * 16] + in[3 * 16]) << 2; 203 const int a2 = (in[1 * 16] - in[3 * 16]) << 2; 204 const int a3 = (in[0 * 16] - in[2 * 16]) << 2; 205 tmp[0 + i * 4] = (a0 + a1) + (a0 != 0); 206 tmp[1 + i * 4] = a3 + a2; 207 tmp[2 + i * 4] = a3 - a2; 208 tmp[3 + i * 4] = a0 - a1; 209 } 210 for (i = 0; i < 4; ++i) { 211 const int a0 = (tmp[0 + i] + tmp[8 + i]); 212 const int a1 = (tmp[4 + i] + tmp[12+ i]); 213 const int a2 = (tmp[4 + i] - tmp[12+ i]); 214 const int a3 = (tmp[0 + i] - tmp[8 + i]); 215 const int b0 = a0 + a1; 216 const int b1 = a3 + a2; 217 const int b2 = a3 - a2; 218 const int b3 = a0 - a1; 219 out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3; 220 out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3; 221 out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3; 222 out[12 + i] = (b3 + (b3 > 0) + 3) >> 3; 223 } 224 } 225 226 #undef MUL 227 #undef STORE 228 229 //----------------------------------------------------------------------------- 230 // Intra predictions 231 232 #define OUT(x, y) dst[(x) + (y) * BPS] 233 234 static inline void Fill(uint8_t* dst, int value, int size) { 235 int j; 236 for (j = 0; j < size; ++j) { 237 memset(dst + j * BPS, value, size); 238 } 239 } 240 241 static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) { 242 int j; 243 if (top) { 244 for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size); 245 } else { 246 Fill(dst, 127, size); 247 } 248 } 249 250 static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) { 251 if (left) { 252 int j; 253 for (j = 0; j < size; ++j) { 254 memset(dst + j * BPS, left[j], size); 255 } 256 } else { 257 Fill(dst, 129, size); 258 } 259 } 260 261 static inline void TrueMotion(uint8_t* dst, const uint8_t* left, 262 const uint8_t* top, int size) { 263 int y; 264 if (left) { 265 if (top) { 266 const uint8_t* const clip = clip1 + 255 - left[-1]; 267 for (y = 0; y < size; ++y) { 268 const uint8_t* const clip_table = clip + left[y]; 269 int x; 270 for (x = 0; x < size; ++x) { 271 dst[x] = clip_table[top[x]]; 272 } 273 dst += BPS; 274 } 275 } else { 276 HorizontalPred(dst, left, size); 277 } 278 } else { 279 // true motion without left samples (hence: with default 129 value) 280 // is equivalent to VE prediction where you just copy the top samples. 281 // Note that if top samples are not available, the default value is 282 // then 129, and not 127 as in the VerticalPred case. 283 if (top) { 284 VerticalPred(dst, top, size); 285 } else { 286 Fill(dst, 129, size); 287 } 288 } 289 } 290 291 static inline void DCMode(uint8_t* dst, const uint8_t* left, 292 const uint8_t* top, 293 int size, int round, int shift) { 294 int DC = 0; 295 int j; 296 if (top) { 297 for (j = 0; j < size; ++j) DC += top[j]; 298 if (left) { // top and left present 299 for (j = 0; j < size; ++j) DC += left[j]; 300 } else { // top, but no left 301 DC += DC; 302 } 303 DC = (DC + round) >> shift; 304 } else if (left) { // left but no top 305 for (j = 0; j < size; ++j) DC += left[j]; 306 DC += DC; 307 DC = (DC + round) >> shift; 308 } else { // no top, no left, nothing. 309 DC = 0x80; 310 } 311 Fill(dst, DC, size); 312 } 313 314 //----------------------------------------------------------------------------- 315 // Chroma 8x8 prediction (paragraph 12.2) 316 317 static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, 318 const uint8_t* top) { 319 // U block 320 DCMode(C8DC8 + dst, left, top, 8, 8, 4); 321 VerticalPred(C8VE8 + dst, top, 8); 322 HorizontalPred(C8HE8 + dst, left, 8); 323 TrueMotion(C8TM8 + dst, left, top, 8); 324 // V block 325 dst += 8; 326 if (top) top += 8; 327 if (left) left += 16; 328 DCMode(C8DC8 + dst, left, top, 8, 8, 4); 329 VerticalPred(C8VE8 + dst, top, 8); 330 HorizontalPred(C8HE8 + dst, left, 8); 331 TrueMotion(C8TM8 + dst, left, top, 8); 332 } 333 334 //----------------------------------------------------------------------------- 335 // luma 16x16 prediction (paragraph 12.3) 336 337 static void Intra16Preds(uint8_t* dst, 338 const uint8_t* left, const uint8_t* top) { 339 DCMode(I16DC16 + dst, left, top, 16, 16, 5); 340 VerticalPred(I16VE16 + dst, top, 16); 341 HorizontalPred(I16HE16 + dst, left, 16); 342 TrueMotion(I16TM16 + dst, left, top, 16); 343 } 344 345 //----------------------------------------------------------------------------- 346 // luma 4x4 prediction 347 348 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) 349 #define AVG2(a, b) (((a) + (b) + 1) >> 1) 350 351 static void VE4(uint8_t* dst, const uint8_t* top) { // vertical 352 const uint8_t vals[4] = { 353 AVG3(top[-1], top[0], top[1]), 354 AVG3(top[ 0], top[1], top[2]), 355 AVG3(top[ 1], top[2], top[3]), 356 AVG3(top[ 2], top[3], top[4]) 357 }; 358 int i; 359 for (i = 0; i < 4; ++i) { 360 memcpy(dst + i * BPS, vals, 4); 361 } 362 } 363 364 static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal 365 const int X = top[-1]; 366 const int I = top[-2]; 367 const int J = top[-3]; 368 const int K = top[-4]; 369 const int L = top[-5]; 370 *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J); 371 *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K); 372 *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L); 373 *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L); 374 } 375 376 static void DC4(uint8_t* dst, const uint8_t* top) { 377 uint32_t dc = 4; 378 int i; 379 for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; 380 Fill(dst, dc >> 3, 4); 381 } 382 383 static void RD4(uint8_t* dst, const uint8_t* top) { 384 const int X = top[-1]; 385 const int I = top[-2]; 386 const int J = top[-3]; 387 const int K = top[-4]; 388 const int L = top[-5]; 389 const int A = top[0]; 390 const int B = top[1]; 391 const int C = top[2]; 392 const int D = top[3]; 393 OUT(0, 3) = AVG3(J, K, L); 394 OUT(0, 2) = OUT(1, 3) = AVG3(I, J, K); 395 OUT(0, 1) = OUT(1, 2) = OUT(2, 3) = AVG3(X, I, J); 396 OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I); 397 OUT(1, 0) = OUT(2, 1) = OUT(3, 2) = AVG3(B, A, X); 398 OUT(2, 0) = OUT(3, 1) = AVG3(C, B, A); 399 OUT(3, 0) = AVG3(D, C, B); 400 } 401 402 static void LD4(uint8_t* dst, const uint8_t* top) { 403 const int A = top[0]; 404 const int B = top[1]; 405 const int C = top[2]; 406 const int D = top[3]; 407 const int E = top[4]; 408 const int F = top[5]; 409 const int G = top[6]; 410 const int H = top[7]; 411 OUT(0, 0) = AVG3(A, B, C); 412 OUT(1, 0) = OUT(0, 1) = AVG3(B, C, D); 413 OUT(2, 0) = OUT(1, 1) = OUT(0, 2) = AVG3(C, D, E); 414 OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F); 415 OUT(3, 1) = OUT(2, 2) = OUT(1, 3) = AVG3(E, F, G); 416 OUT(3, 2) = OUT(2, 3) = AVG3(F, G, H); 417 OUT(3, 3) = AVG3(G, H, H); 418 } 419 420 static void VR4(uint8_t* dst, const uint8_t* top) { 421 const int X = top[-1]; 422 const int I = top[-2]; 423 const int J = top[-3]; 424 const int K = top[-4]; 425 const int A = top[0]; 426 const int B = top[1]; 427 const int C = top[2]; 428 const int D = top[3]; 429 OUT(0, 0) = OUT(1, 2) = AVG2(X, A); 430 OUT(1, 0) = OUT(2, 2) = AVG2(A, B); 431 OUT(2, 0) = OUT(3, 2) = AVG2(B, C); 432 OUT(3, 0) = AVG2(C, D); 433 434 OUT(0, 3) = AVG3(K, J, I); 435 OUT(0, 2) = AVG3(J, I, X); 436 OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A); 437 OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B); 438 OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C); 439 OUT(3, 1) = AVG3(B, C, D); 440 } 441 442 static void VL4(uint8_t* dst, const uint8_t* top) { 443 const int A = top[0]; 444 const int B = top[1]; 445 const int C = top[2]; 446 const int D = top[3]; 447 const int E = top[4]; 448 const int F = top[5]; 449 const int G = top[6]; 450 const int H = top[7]; 451 OUT(0, 0) = AVG2(A, B); 452 OUT(1, 0) = OUT(0, 2) = AVG2(B, C); 453 OUT(2, 0) = OUT(1, 2) = AVG2(C, D); 454 OUT(3, 0) = OUT(2, 2) = AVG2(D, E); 455 456 OUT(0, 1) = AVG3(A, B, C); 457 OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D); 458 OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E); 459 OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F); 460 OUT(3, 2) = AVG3(E, F, G); 461 OUT(3, 3) = AVG3(F, G, H); 462 } 463 464 static void HU4(uint8_t* dst, const uint8_t* top) { 465 const int I = top[-2]; 466 const int J = top[-3]; 467 const int K = top[-4]; 468 const int L = top[-5]; 469 OUT(0, 0) = AVG2(I, J); 470 OUT(2, 0) = OUT(0, 1) = AVG2(J, K); 471 OUT(2, 1) = OUT(0, 2) = AVG2(K, L); 472 OUT(1, 0) = AVG3(I, J, K); 473 OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L); 474 OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L); 475 OUT(3, 2) = OUT(2, 2) = 476 OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L; 477 } 478 479 static void HD4(uint8_t* dst, const uint8_t* top) { 480 const int X = top[-1]; 481 const int I = top[-2]; 482 const int J = top[-3]; 483 const int K = top[-4]; 484 const int L = top[-5]; 485 const int A = top[0]; 486 const int B = top[1]; 487 const int C = top[2]; 488 489 OUT(0, 0) = OUT(2, 1) = AVG2(I, X); 490 OUT(0, 1) = OUT(2, 2) = AVG2(J, I); 491 OUT(0, 2) = OUT(2, 3) = AVG2(K, J); 492 OUT(0, 3) = AVG2(L, K); 493 494 OUT(3, 0) = AVG3(A, B, C); 495 OUT(2, 0) = AVG3(X, A, B); 496 OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A); 497 OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X); 498 OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I); 499 OUT(1, 3) = AVG3(L, K, J); 500 } 501 502 static void TM4(uint8_t* dst, const uint8_t* top) { 503 int x, y; 504 const uint8_t* const clip = clip1 + 255 - top[-1]; 505 for (y = 0; y < 4; ++y) { 506 const uint8_t* const clip_table = clip + top[-2 - y]; 507 for (x = 0; x < 4; ++x) { 508 dst[x] = clip_table[top[x]]; 509 } 510 dst += BPS; 511 } 512 } 513 514 #undef AVG3 515 #undef AVG2 516 517 // Left samples are top[-5 .. -2], top_left is top[-1], top are 518 // located at top[0..3], and top right is top[4..7] 519 static void Intra4Preds(uint8_t* dst, const uint8_t* top) { 520 DC4(I4DC4 + dst, top); 521 TM4(I4TM4 + dst, top); 522 VE4(I4VE4 + dst, top); 523 HE4(I4HE4 + dst, top); 524 RD4(I4RD4 + dst, top); 525 VR4(I4VR4 + dst, top); 526 LD4(I4LD4 + dst, top); 527 VL4(I4VL4 + dst, top); 528 HD4(I4HD4 + dst, top); 529 HU4(I4HU4 + dst, top); 530 } 531 532 //----------------------------------------------------------------------------- 533 // Metric 534 535 static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) { 536 int count = 0; 537 int y, x; 538 for (y = 0; y < h; ++y) { 539 for (x = 0; x < w; ++x) { 540 const int diff = (int)a[x] - b[x]; 541 count += diff * diff; 542 } 543 a += BPS; 544 b += BPS; 545 } 546 return count; 547 } 548 549 static int SSE16x16(const uint8_t* a, const uint8_t* b) { 550 return GetSSE(a, b, 16, 16); 551 } 552 static int SSE16x8(const uint8_t* a, const uint8_t* b) { 553 return GetSSE(a, b, 16, 8); 554 } 555 static int SSE8x8(const uint8_t* a, const uint8_t* b) { 556 return GetSSE(a, b, 8, 8); 557 } 558 static int SSE4x4(const uint8_t* a, const uint8_t* b) { 559 return GetSSE(a, b, 4, 4); 560 } 561 562 //----------------------------------------------------------------------------- 563 // Texture distortion 564 // 565 // We try to match the spectral content (weighted) between source and 566 // reconstructed samples. 567 568 // Hadamard transform 569 // Returns the weighted sum of the absolute value of transformed coefficients. 570 static int TTransform(const uint8_t* in, const uint16_t* w) { 571 int sum = 0; 572 int tmp[16]; 573 int i; 574 // horizontal pass 575 for (i = 0; i < 4; ++i, in += BPS) { 576 const int a0 = (in[0] + in[2]) << 2; 577 const int a1 = (in[1] + in[3]) << 2; 578 const int a2 = (in[1] - in[3]) << 2; 579 const int a3 = (in[0] - in[2]) << 2; 580 tmp[0 + i * 4] = a0 + a1 + (a0 != 0); 581 tmp[1 + i * 4] = a3 + a2; 582 tmp[2 + i * 4] = a3 - a2; 583 tmp[3 + i * 4] = a0 - a1; 584 } 585 // vertical pass 586 for (i = 0; i < 4; ++i, ++w) { 587 const int a0 = (tmp[0 + i] + tmp[8 + i]); 588 const int a1 = (tmp[4 + i] + tmp[12+ i]); 589 const int a2 = (tmp[4 + i] - tmp[12+ i]); 590 const int a3 = (tmp[0 + i] - tmp[8 + i]); 591 const int b0 = a0 + a1; 592 const int b1 = a3 + a2; 593 const int b2 = a3 - a2; 594 const int b3 = a0 - a1; 595 // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 596 sum += w[ 0] * ((abs(b0) + 3) >> 3); 597 sum += w[ 4] * ((abs(b1) + 3) >> 3); 598 sum += w[ 8] * ((abs(b2) + 3) >> 3); 599 sum += w[12] * ((abs(b3) + 3) >> 3); 600 } 601 return sum; 602 } 603 604 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, 605 const uint16_t* const w) { 606 const int sum1 = TTransform(a, w); 607 const int sum2 = TTransform(b, w); 608 return (abs(sum2 - sum1) + 8) >> 4; 609 } 610 611 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, 612 const uint16_t* const w) { 613 int D = 0; 614 int x, y; 615 for (y = 0; y < 16 * BPS; y += 4 * BPS) { 616 for (x = 0; x < 16; x += 4) { 617 D += Disto4x4(a + x + y, b + x + y, w); 618 } 619 } 620 return D; 621 } 622 623 //----------------------------------------------------------------------------- 624 // Quantization 625 // 626 627 // Simple quantization 628 static int QuantizeBlock(int16_t in[16], int16_t out[16], 629 int n, const VP8Matrix* const mtx) { 630 int last = -1; 631 for (; n < 16; ++n) { 632 const int j = VP8Zigzag[n]; 633 const int sign = (in[j] < 0); 634 int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j]; 635 if (coeff > 2047) coeff = 2047; 636 if (coeff > mtx->zthresh_[j]) { 637 const int Q = mtx->q_[j]; 638 const int iQ = mtx->iq_[j]; 639 const int B = mtx->bias_[j]; 640 out[n] = QUANTDIV(coeff, iQ, B); 641 if (sign) out[n] = -out[n]; 642 in[j] = out[n] * Q; 643 if (out[n]) last = n; 644 } else { 645 out[n] = 0; 646 in[j] = 0; 647 } 648 } 649 return (last >= 0); 650 } 651 652 //----------------------------------------------------------------------------- 653 // Block copy 654 655 static inline void Copy(const uint8_t* src, uint8_t* dst, int size) { 656 int y; 657 for (y = 0; y < size; ++y) { 658 memcpy(dst, src, size); 659 src += BPS; 660 dst += BPS; 661 } 662 } 663 664 static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); } 665 static void Copy8x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 8); } 666 static void Copy16x16(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16); } 667 668 //----------------------------------------------------------------------------- 669 // SSE2 detection. 670 // 671 672 #if defined(__pic__) && defined(__i386__) 673 static inline void GetCPUInfo(int cpu_info[4], int info_type) { 674 __asm__ volatile ( 675 "mov %%ebx, %%edi\n" 676 "cpuid\n" 677 "xchg %%edi, %%ebx\n" 678 : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) 679 : "a"(info_type)); 680 } 681 #elif defined(__i386__) || defined(__x86_64__) 682 static inline void GetCPUInfo(int cpu_info[4], int info_type) { 683 __asm__ volatile ( 684 "cpuid\n" 685 : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) 686 : "a"(info_type)); 687 } 688 #elif defined(_MSC_VER) // Visual C++ 689 #define GetCPUInfo __cpuid 690 #endif 691 692 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) 693 static int x86CPUInfo(CPUFeature feature) { 694 int cpu_info[4]; 695 GetCPUInfo(cpu_info, 1); 696 if (feature == kSSE2) { 697 return 0 != (cpu_info[3] & 0x04000000); 698 } 699 if (feature == kSSE3) { 700 return 0 != (cpu_info[2] & 0x00000001); 701 } 702 return 0; 703 } 704 VP8CPUInfo VP8EncGetCPUInfo = x86CPUInfo; 705 #else 706 VP8CPUInfo VP8EncGetCPUInfo = NULL; 707 #endif 708 709 // Speed-critical function pointers. We have to initialize them to the default 710 // implementations within VP8EncDspInit(). 711 VP8CHisto VP8CollectHistogram; 712 VP8Idct VP8ITransform; 713 VP8Fdct VP8FTransform; 714 VP8WHT VP8ITransformWHT; 715 VP8WHT VP8FTransformWHT; 716 VP8Intra4Preds VP8EncPredLuma4; 717 VP8IntraPreds VP8EncPredLuma16; 718 VP8IntraPreds VP8EncPredChroma8; 719 VP8Metric VP8SSE16x16; 720 VP8Metric VP8SSE8x8; 721 VP8Metric VP8SSE16x8; 722 VP8Metric VP8SSE4x4; 723 VP8WMetric VP8TDisto4x4; 724 VP8WMetric VP8TDisto16x16; 725 VP8QuantizeBlock VP8EncQuantizeBlock; 726 VP8BlockCopy VP8Copy4x4; 727 VP8BlockCopy VP8Copy8x8; 728 VP8BlockCopy VP8Copy16x16; 729 730 extern void VP8EncDspInitSSE2(void); 731 732 void VP8EncDspInit(void) { 733 InitTables(); 734 735 // default C implementations 736 VP8CollectHistogram = CollectHistogram; 737 VP8ITransform = ITransform; 738 VP8FTransform = FTransform; 739 VP8ITransformWHT = ITransformWHT; 740 VP8FTransformWHT = FTransformWHT; 741 VP8EncPredLuma4 = Intra4Preds; 742 VP8EncPredLuma16 = Intra16Preds; 743 VP8EncPredChroma8 = IntraChromaPreds; 744 VP8SSE16x16 = SSE16x16; 745 VP8SSE8x8 = SSE8x8; 746 VP8SSE16x8 = SSE16x8; 747 VP8SSE4x4 = SSE4x4; 748 VP8TDisto4x4 = Disto4x4; 749 VP8TDisto16x16 = Disto16x16; 750 VP8EncQuantizeBlock = QuantizeBlock; 751 VP8Copy4x4 = Copy4x4; 752 VP8Copy8x8 = Copy8x8; 753 VP8Copy16x16 = Copy16x16; 754 755 // If defined, use CPUInfo() to overwrite some pointers with faster versions. 756 if (VP8EncGetCPUInfo) { 757 if (VP8EncGetCPUInfo(kSSE2)) { 758 #if defined(__SSE2__) || defined(_MSC_VER) 759 VP8EncDspInitSSE2(); 760 #endif 761 } 762 if (VP8EncGetCPUInfo(kSSE3)) { 763 // later we'll plug some SSE3 variant here 764 } 765 } 766 } 767 768 #if defined(__cplusplus) || defined(c_plusplus) 769 } // extern "C" 770 #endif 771