/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include <float.h>
#include <math.h>
#include <stdio.h>
#include "vpx_mem/vpx_mem.h"
#include "vpxscale_arbitrary.h"

#define FIXED_POINT

#define MAX_IN_WIDTH        800
#define MAX_IN_HEIGHT       600
#define MAX_OUT_WIDTH       800
#define MAX_OUT_HEIGHT      600
#define MAX_OUT_DIMENSION   ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
                             MAX_OUT_WIDTH : MAX_OUT_HEIGHT)

BICUBIC_SCALER_STRUCT g_b_scaler;
static int g_first_time = 1;

#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf, 32);
unsigned char g_hbuf[MAX_OUT_DIMENSION];

#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf_uv, 32);
unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];


#ifdef FIXED_POINT
static int a_i = 0.6 * 65536;
#else
static float a = -0.6;
#endif

#ifdef FIXED_POINT
// C0 = a*t^3 - a*t^2
//
static short c0_fixed(unsigned int t)
{
    // put t in Q16 notation
    unsigned short v1, v2;

    // Q16
    v1 = (a_i * t) >> 16;
    v1 = (v1 * t) >> 16;

    // Q16
    v2 = (a_i * t) >> 16;
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q12
    return -((v1 - v2) >> 4);
}

// C1 = a*t + (3-2*a)*t^2 - (2-a)*t^3
//
static short c1_fixed(unsigned int t)
{
    unsigned short v1, v2, v3;
    unsigned short two, three;

    // Q16
    v1 = (a_i * t) >> 16;

    // Q13
    two = 2 << 13;
    v2 = two - (a_i >> 3);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q13
    three = 3 << 13;
    v3 = three - (2 * (a_i >> 3));
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return (((v1 >> 3) - v2 + v3) >> 1);
}

// C2 = 1 - (3-a)*t^2 + (2-a)*t^3
//
static short c2_fixed(unsigned int t)
{
    unsigned short v1, v2, v3;
    unsigned short two, three;

    // Q13
    v1 = 1 << 13;

    // Q13
    three = 3 << 13;
    v2 = three - (a_i >> 3);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q13
    two = 2 << 13;
    v3 = two - (a_i >> 3);
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return (v1 - v2 + v3) >> 1;
}

// C3 = a*t - 2*a*t^2 + a*t^3
//
static short c3_fixed(unsigned int t)
{
    int v1, v2, v3;

    // Q16
    v1 = (a_i * t) >> 16;

    // Q15
    v2 = 2 * (a_i >> 1);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q16
    v3 = (a_i * t) >> 16;
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return ((v2 - (v1 >> 1) - (v3 >> 1)) >> 3);
}
#else
// C0 = -a*t^3 + a*t^2
//
float C0(float t)
{
    return -a * t * t * t + a * t * t;
}

// C1 = -a*t + (2*a+3)*t^2 - (a+2)*t^3
//
float C1(float t)
{
    return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
}

// C2 = 1 - (a+3)*t^2 + (a+2)*t^3
//
float C2(float t)
{
    return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
}

// C3 = a*t - 2*a*t^2 + a*t^3
//
float C3(float t)
{
    return a * t * t * t - 2.0f * a * t * t + a * t;
}
#endif
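
// Worked example: at t = 0.5 (32768 in Q16) with a_i = 0.6 * 65536 = 39321,
// c0_fixed computes v1 = 9830 (a*t^2 in Q16) and v2 = 4915 (a*t^3 in Q16),
// returning -((9830 - 4915) >> 4) = -307. The float path agrees:
// C0(0.5) = -(-0.6)*0.125 + (-0.6)*0.25 = -0.075, and -0.075 * 4096 = -307.2.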

#if 0
int compare_real_fixed()
{
    int i, errors = 0;
    float mult = 1.0 / 10000.0;
    unsigned int fixed_mult = mult * 4294967296;//65536;
    unsigned int phase_offset_int;
    float phase_offset_real;

    for (i = 0; i < 10000; i++)
    {
        int fixed0, fixed1, fixed2, fixed3, fixed_total;
        int real0, real1, real2, real3, real_total;

        phase_offset_real = (float)i * mult;
        phase_offset_int = (fixed_mult * i) >> 16;
//      phase_offset_int = phase_offset_real * 65536;

        fixed0 = c0_fixed(phase_offset_int);
        real0 = C0(phase_offset_real) * 4096.0;

        if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
            errors++;

        fixed1 = c1_fixed(phase_offset_int);
        real1 = C1(phase_offset_real) * 4096.0;

        if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
            errors++;

        fixed2 = c2_fixed(phase_offset_int);
        real2 = C2(phase_offset_real) * 4096.0;

        if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
            errors++;

        fixed3 = c3_fixed(phase_offset_int);
        real3 = C3(phase_offset_real) * 4096.0;

        if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
            errors++;

        fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
        real_total = real0 + real1 + real2 + real3;

        if ((fixed_total > 4097) || (fixed_total < 4094))
            errors++;

        if ((real_total > 4097) || (real_total < 4095))
            errors++;
    }

    return errors;
}
#endif

// Find the greatest common divisor of two integers. The subtractive method
// used here is slow compared to the division-based Euclidean algorithm, but
// it does not require any division.
int gcd(int a, int b)
{
    // A limitation of this algorithm is that if a or b is 0 the loop never
    // exits. We don't want to return 0, because any computation that used
    // the result as a common divisor would then divide by 0, so the safest
    // fallback is to fail by returning 1.
    if (a <= 0 || b <= 0)
        return 1;

    while (a != b)
    {
        if (b > a)
            b = b - a;
        else
        {
            int tmp = a; // swap large and
            a = b;       // small
            b = tmp;
        }
    }

    return b;
}
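
#if 0
// Illustrative self-test for gcd(), mirroring the compare_real_fixed()
// #if 0 convention above; not part of the build.
#include <assert.h>
static void gcd_self_test(void)
{
    assert(gcd(640, 480) == 160);   // reduces a 4:3 dimension pair
    assert(gcd(7, 13) == 1);        // coprime inputs
    assert(gcd(0, 5) == 1);         // degenerate input falls back to 1
}
#endif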

void bicubic_coefficient_init()
{
    vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
    g_first_time = 0;
}

void bicubic_coefficient_destroy()
{
    if (!g_first_time)
    {
        vpx_free(g_b_scaler.l_w);
        vpx_free(g_b_scaler.l_h);
        vpx_free(g_b_scaler.l_h_uv);

        vpx_free(g_b_scaler.c_w);
        vpx_free(g_b_scaler.c_h);
        vpx_free(g_b_scaler.c_h_uv);

        vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
    }
}

// Create the coefficients that will be used for the cubic interpolation.
// Because the horizontal and vertical scale factors do not have to be equal,
// the phase offsets will be different. There are four coefficients for each
// point, two on each side. The layout is the four coefficients for one phase,
// followed by the four coefficients for the next phase.
int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)
{
    int i;
#ifdef FIXED_POINT
    int phase_offset_int;
    unsigned int fixed_mult;
    int product_val = 0;
#else
    float phase_offset;
#endif
    int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;

    if (g_first_time)
        bicubic_coefficient_init();


    // Check to see if the coefficients have already been set up correctly.
    if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
        && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
        return 0;

    g_b_scaler.in_width = in_width;
    g_b_scaler.in_height = in_height;
    g_b_scaler.out_width = out_width;
    g_b_scaler.out_height = out_height;

    // Don't want to allow crazy scaling; just try to prevent a catastrophic
    // failure here. We want to fail after storing the dimensions, so that if
    // the scaler is called anyway it will not scale.
    if (out_width <= 0 || out_height <= 0)
        return -1;

    // Reduce the in/out width and height ratios using their gcd.
    gcd_w = gcd(out_width, in_width);
    gcd_h = gcd(out_height, in_height);
    gcd_h_uv = gcd(out_height, in_height / 2);

    // The numerator width and height are saved in globals so they can be
    // used during the scaling process without being recalculated.
    g_b_scaler.nw = out_width / gcd_w;
    d_w = in_width / gcd_w;

    g_b_scaler.nh = out_height / gcd_h;
    d_h = in_height / gcd_h;

    g_b_scaler.nh_uv = out_height / gcd_h_uv;
    d_h_uv = (in_height / 2) / gcd_h_uv;

    // Allocate memory for the coefficients. The vertical line maps need
    // out_height + 1 entries because the loop that fills them below runs
    // one past out_height.
    vpx_free(g_b_scaler.l_w);
    vpx_free(g_b_scaler.l_h);
    vpx_free(g_b_scaler.l_h_uv);

    g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * 2);
    g_b_scaler.l_h = (short *)vpx_memalign(32, (out_height + 1) * 2);
    g_b_scaler.l_h_uv = (short *)vpx_memalign(32, (out_height + 1) * 2);

    vpx_free(g_b_scaler.c_w);
    vpx_free(g_b_scaler.c_h);
    vpx_free(g_b_scaler.c_h_uv);

    g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * 2);
    g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * 2);
    g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * 2);

    g_b_scaler.hbuf = g_hbuf;
    g_b_scaler.hbuf_uv = g_hbuf_uv;

    // Set up the polyphase filter taps. This needs to be done before the
    // scaling because of the floating point math required. The coefficients
    // are multiplied by 2^12 so that fixed point math can be used in the
    // main scaling loop.
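    // Illustration: with nw = 5, fixed_mult = (1/5) * 2^32 = 858993459, so
    // at product_val = 2 the phase is (858993459 * 2) >> 16 = 26214,
    // i.e. 0.4 in Q16 (0.4 * 65536 = 26214.4).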
#ifdef FIXED_POINT
    fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296;

    product_val = 0;

    for (i = 0; i < g_b_scaler.nw; i++)
    {
        if (product_val > g_b_scaler.nw)
            product_val -= g_b_scaler.nw;

        phase_offset_int = (fixed_mult * product_val) >> 16;

        g_b_scaler.c_w[i*4]   = c3_fixed(phase_offset_int);
        g_b_scaler.c_w[i*4+1] = c2_fixed(phase_offset_int);
        g_b_scaler.c_w[i*4+2] = c1_fixed(phase_offset_int);
        g_b_scaler.c_w[i*4+3] = c0_fixed(phase_offset_int);

        product_val += d_w;
    }


    fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296;

    product_val = 0;

    for (i = 0; i < g_b_scaler.nh; i++)
    {
        if (product_val > g_b_scaler.nh)
            product_val -= g_b_scaler.nh;

        phase_offset_int = (fixed_mult * product_val) >> 16;

        g_b_scaler.c_h[i*4]   = c0_fixed(phase_offset_int);
        g_b_scaler.c_h[i*4+1] = c1_fixed(phase_offset_int);
        g_b_scaler.c_h[i*4+2] = c2_fixed(phase_offset_int);
        g_b_scaler.c_h[i*4+3] = c3_fixed(phase_offset_int);

        product_val += d_h;
    }

    fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296;

    product_val = 0;

    for (i = 0; i < g_b_scaler.nh_uv; i++)
    {
        if (product_val > g_b_scaler.nh_uv)
            product_val -= g_b_scaler.nh_uv;

        phase_offset_int = (fixed_mult * product_val) >> 16;

        g_b_scaler.c_h_uv[i*4]   = c0_fixed(phase_offset_int);
        g_b_scaler.c_h_uv[i*4+1] = c1_fixed(phase_offset_int);
        g_b_scaler.c_h_uv[i*4+2] = c2_fixed(phase_offset_int);
        g_b_scaler.c_h_uv[i*4+3] = c3_fixed(phase_offset_int);

        product_val += d_h_uv;
    }

#else

    for (i = 0; i < g_b_scaler.nw; i++)
    {
        phase_offset = (float)((i * d_w) % g_b_scaler.nw) / (float)g_b_scaler.nw;
        g_b_scaler.c_w[i*4]   = (C3(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+1] = (C2(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+2] = (C1(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+3] = (C0(phase_offset) * 4096.0);
    }

    for (i = 0; i < g_b_scaler.nh; i++)
    {
        phase_offset = (float)((i * d_h) % g_b_scaler.nh) / (float)g_b_scaler.nh;
        g_b_scaler.c_h[i*4]   = (C0(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+1] = (C1(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+2] = (C2(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+3] = (C3(phase_offset) * 4096.0);
    }

    for (i = 0; i < g_b_scaler.nh_uv; i++)
    {
        phase_offset = (float)((i * d_h_uv) % g_b_scaler.nh_uv) / (float)g_b_scaler.nh_uv;
        g_b_scaler.c_h_uv[i*4]   = (C0(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+1] = (C1(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+2] = (C2(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+3] = (C3(phase_offset) * 4096.0);
    }

#endif

    // Create an array that maps output lines to input lines. This doesn't
    // require floating point math, but it does require a division; because
    // hardware division is not present, each division is a function call.
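    // Illustration: mapping 8 input columns to 6 output columns gives
    // gcd_w = 2, nw = 3, d_w = 4, so the map below is
    // l_w[i] = (i * 4) / 3 = {0, 1, 2, 4, 5, 6}.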
    for (i = 0; i < out_width; i++)
    {
        g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;

        if ((g_b_scaler.l_w[i] + 2) <= in_width)
            g_b_scaler.max_usable_out_width = i;
    }

    for (i = 0; i < out_height + 1; i++)
    {
        g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
        g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
    }

    return 0;
}

int bicubic_scale(int in_width, int in_height, int in_stride,
                  int out_width, int out_height, int out_stride,
                  unsigned char *input_image, unsigned char *output_image)
{
    short *RESTRICT l_w, * RESTRICT l_h;
    short *RESTRICT c_w, * RESTRICT c_h;
    unsigned char *RESTRICT ip, * RESTRICT op;
    unsigned char *RESTRICT hbuf;
    int h, w, lw, lh;
    int temp_sum;
    int phase_offset_w, phase_offset_h;

    c_w = g_b_scaler.c_w;
    c_h = g_b_scaler.c_h;

    op = output_image;

    l_w = g_b_scaler.l_w;
    l_h = g_b_scaler.l_h;

    phase_offset_h = 0;

    for (h = 0; h < out_height; h++)
    {
        // Select the row to work on.
        lh = l_h[h];
        ip = input_image + (in_stride * lh);

        // Filter the row vertically into a temporary buffer. If the phase
        // offset is 0, the multiplications would just reproduce the input,
        // so point the temporary buffer at the input instead. Also handle
        // the boundary condition of not being able to filter the last lines.
        if (phase_offset_h && (lh < in_height - 2))
        {
            hbuf = g_b_scaler.hbuf;

            for (w = 0; w < in_width; w++)
            {
                temp_sum  = c_h[phase_offset_h*4+3] * ip[w - in_stride];
                temp_sum += c_h[phase_offset_h*4+2] * ip[w];
                temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride];
                temp_sum += c_h[phase_offset_h*4]   * ip[w + 2*in_stride];

                hbuf[w] = temp_sum >> 12;
            }
        }
        else
            hbuf = ip;
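
        // Note: at phase 0 the Q12 kernel is {c3, c2, c1, c0} =
        // {0, 4096, 0, 0}, i.e. exactly the identity, so aliasing hbuf to
        // the input row above is exact rather than an approximation.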

        // Increase the phase offset for the next time around.
        if (++phase_offset_h >= g_b_scaler.nh)
            phase_offset_h = 0;

        // Now filter and expand the row horizontally into the final
        // output buffer.
        phase_offset_w = 0;

        for (w = 0; w < out_width; w++)
        {
            // Get the index to use to expand the image.
            lw = l_w[w];

            temp_sum  = c_w[phase_offset_w*4]   * hbuf[lw - 1];
            temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw];
            temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1];
            temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2];
            temp_sum  = temp_sum >> 12;

            if (++phase_offset_w >= g_b_scaler.nw)
                phase_offset_w = 0;

            // Boundary conditions.
            if ((lw + 2) >= in_width)
                temp_sum = hbuf[lw];

            if (lw == 0)
                temp_sum = hbuf[0];

            op[w] = temp_sum;
        }

        op += out_stride;
    }

    return 0;
}

void bicubic_scale_frame_reset()
{
    g_b_scaler.out_width = 0;
    g_b_scaler.out_height = 0;
}

void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                         int new_width, int new_height)
{
    dst->y_width = new_width;
    dst->y_height = new_height;
    dst->uv_width = new_width / 2;
    dst->uv_height = new_height / 2;

    dst->y_stride = dst->y_width;
    dst->uv_stride = dst->uv_width;

    bicubic_scale(src->y_width, src->y_height, src->y_stride,
                  new_width, new_height, dst->y_stride,
                  src->y_buffer, dst->y_buffer);

    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                  new_width / 2, new_height / 2, dst->uv_stride,
                  src->u_buffer, dst->u_buffer);

    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                  new_width / 2, new_height / 2, dst->uv_stride,
                  src->v_buffer, dst->v_buffer);
}
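
#if 0
// Usage sketch (hypothetical driver, following this file's #if 0 convention):
// scale one YV12 frame down to 320x240. Assumes src and dst frames were
// allocated elsewhere; only the calls into this file are real.
static void example_rescale(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst)
{
    // Build the polyphase coefficient tables once per output resolution.
    if (bicubic_coefficient_setup(src->y_width, src->y_height, 320, 240))
        return;  // setup rejected the output dimensions

    // Scale the Y, U, and V planes into dst.
    bicubic_scale_frame(src, dst, 320, 240);

    // Release the coefficient tables when scaling is finished.
    bicubic_coefficient_destroy();
}
#endif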