Home | History | Annotate | Download | only in generic
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 #include <float.h>
     13 #include <math.h>
     14 #include <stdio.h>
     15 #include "vpx_mem/vpx_mem.h"
     16 #include "vpxscale_arbitrary.h"
     17 
     18 #define FIXED_POINT
     19 
     20 #define MAX_IN_WIDTH        800
     21 #define MAX_IN_HEIGHT       600
     22 #define MAX_OUT_WIDTH       800
     23 #define MAX_OUT_HEIGHT      600
     24 #define MAX_OUT_DIMENSION   ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
     25                              MAX_OUT_WIDTH : MAX_OUT_HEIGHT)
     26 
     27 BICUBIC_SCALER_STRUCT g_b_scaler;
     28 static int g_first_time = 1;
     29 
     30 #pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
     31 #pragma DATA_ALIGN (g_hbuf, 32);
     32 unsigned char g_hbuf[MAX_OUT_DIMENSION];
     33 
     34 #pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
     35 #pragma DATA_ALIGN (g_hbuf_uv, 32);
     36 unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];
     37 
     38 
     39 #ifdef FIXED_POINT
     40 static int a_i = 0.6 * 65536;
     41 #else
     42 static float a = -0.6;
     43 #endif
     44 
     45 #ifdef FIXED_POINT
     46 //         3     2
     47 // C0 = a*t - a*t
     48 //
     49 static short c0_fixed(unsigned int t)
     50 {
     51     // put t in Q16 notation
     52     unsigned short v1, v2;
     53 
     54     // Q16
     55     v1 = (a_i * t) >> 16;
     56     v1 = (v1 * t) >> 16;
     57 
     58     // Q16
     59     v2 = (a_i * t) >> 16;
     60     v2 = (v2 * t) >> 16;
     61     v2 = (v2 * t) >> 16;
     62 
     63     // Q12
     64     return -((v1 - v2) >> 4);
     65 }
     66 
     67 //                     2          3
     68 // C1 = a*t + (3-2*a)*t  - (2-a)*t
     69 //
     70 static short c1_fixed(unsigned int t)
     71 {
     72     unsigned short v1, v2, v3;
     73     unsigned short two, three;
     74 
     75     // Q16
     76     v1 = (a_i * t) >> 16;
     77 
     78     // Q13
     79     two = 2 << 13;
     80     v2 = two - (a_i >> 3);
     81     v2 = (v2 * t) >> 16;
     82     v2 = (v2 * t) >> 16;
     83     v2 = (v2 * t) >> 16;
     84 
     85     // Q13
     86     three = 3 << 13;
     87     v3 = three - (2 * (a_i >> 3));
     88     v3 = (v3 * t) >> 16;
     89     v3 = (v3 * t) >> 16;
     90 
     91     // Q12
     92     return (((v1 >> 3) - v2 + v3) >> 1);
     93 
     94 }
     95 
     96 //                 2          3
     97 // C2 = 1 - (3-a)*t  + (2-a)*t
     98 //
     99 static short c2_fixed(unsigned int t)
    100 {
    101     unsigned short v1, v2, v3;
    102     unsigned short two, three;
    103 
    104     // Q13
    105     v1 = 1 << 13;
    106 
    107     // Q13
    108     three = 3 << 13;
    109     v2 = three - (a_i >> 3);
    110     v2 = (v2 * t) >> 16;
    111     v2 = (v2 * t) >> 16;
    112 
    113     // Q13
    114     two = 2 << 13;
    115     v3 = two - (a_i >> 3);
    116     v3 = (v3 * t) >> 16;
    117     v3 = (v3 * t) >> 16;
    118     v3 = (v3 * t) >> 16;
    119 
    120     // Q12
    121     return (v1 - v2 + v3) >> 1;
    122 }
    123 
    124 //                 2      3
    125 // C3 = a*t - 2*a*t  + a*t
    126 //
    127 static short c3_fixed(unsigned int t)
    128 {
    129     int v1, v2, v3;
    130 
    131     // Q16
    132     v1 = (a_i * t) >> 16;
    133 
    134     // Q15
    135     v2 = 2 * (a_i >> 1);
    136     v2 = (v2 * t) >> 16;
    137     v2 = (v2 * t) >> 16;
    138 
    139     // Q16
    140     v3 = (a_i * t) >> 16;
    141     v3 = (v3 * t) >> 16;
    142     v3 = (v3 * t) >> 16;
    143 
    144     // Q12
    145     return ((v2 - (v1 >> 1) - (v3 >> 1)) >> 3);
    146 }
    147 #else
    148 //          3     2
    149 // C0 = -a*t + a*t
    150 //
    151 float C0(float t)
    152 {
    153     return -a * t * t * t + a * t * t;
    154 }
    155 
    156 //                      2          3
    157 // C1 = -a*t + (2*a+3)*t  - (a+2)*t
    158 //
    159 float C1(float t)
    160 {
    161     return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
    162 }
    163 
    164 //                 2          3
    165 // C2 = 1 - (a+3)*t  + (a+2)*t
    166 //
    167 float C2(float t)
    168 {
    169     return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
    170 }
    171 
    172 //                 2      3
    173 // C3 = a*t - 2*a*t  + a*t
    174 //
    175 float C3(float t)
    176 {
    177     return a * t * t * t - 2.0f * a * t * t + a * t;
    178 }
    179 #endif
    180 
    181 #if 0
    182 int compare_real_fixed()
    183 {
    184     int i, errors = 0;
    185     float mult = 1.0 / 10000.0;
    186     unsigned int fixed_mult = mult * 4294967296;//65536;
    187     unsigned int phase_offset_int;
    188     float phase_offset_real;
    189 
    190     for (i = 0; i < 10000; i++)
    191     {
    192         int fixed0, fixed1, fixed2, fixed3, fixed_total;
    193         int real0, real1, real2, real3, real_total;
    194 
    195         phase_offset_real = (float)i * mult;
    196         phase_offset_int = (fixed_mult * i) >> 16;
    197 //      phase_offset_int = phase_offset_real * 65536;
    198 
    199         fixed0 = c0_fixed(phase_offset_int);
    200         real0 = C0(phase_offset_real) * 4096.0;
    201 
    202         if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
    203             errors++;
    204 
    205         fixed1 = c1_fixed(phase_offset_int);
    206         real1 = C1(phase_offset_real) * 4096.0;
    207 
    208         if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
    209             errors++;
    210 
    211         fixed2 = c2_fixed(phase_offset_int);
    212         real2 = C2(phase_offset_real) * 4096.0;
    213 
    214         if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
    215             errors++;
    216 
    217         fixed3 = c3_fixed(phase_offset_int);
    218         real3 = C3(phase_offset_real) * 4096.0;
    219 
    220         if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
    221             errors++;
    222 
    223         fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
    224         real_total = real0 + real1 + real2 + real3;
    225 
    226         if ((fixed_total > 4097) || (fixed_total < 4094))
    227             errors ++;
    228 
    229         if ((real_total > 4097) || (real_total < 4095))
    230             errors ++;
    231     }
    232 
    233     return errors;
    234 }
    235 #endif
    236 
    237 // Find greatest common denominator between two integers.  Method used here is
    238 //  slow compared to Euclid's algorithm, but does not require any division.
    239 int gcd(int a, int b)
    240 {
    241     // Problem with this algorithm is that if a or b = 0 this function
    242     //  will never exit.  Don't want to return 0 because any computation
    243     //  that was based on a common denoninator and tried to reduce by
    244     //  dividing by 0 would fail.  Best solution that could be thought of
    245     //  would to be fail by returing a 1;
    246     if (a <= 0 || b <= 0)
    247         return 1;
    248 
    249     while (a != b)
    250     {
    251         if (b > a)
    252             b = b - a;
    253         else
    254         {
    255             int tmp = a;//swap large and
    256             a = b; //small
    257             b = tmp;
    258         }
    259     }
    260 
    261     return b;
    262 }
    263 
    264 void bicubic_coefficient_init()
    265 {
    266     vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
    267     g_first_time = 0;
    268 }
    269 
    270 void bicubic_coefficient_destroy()
    271 {
    272     if (!g_first_time)
    273     {
    274         vpx_free(g_b_scaler.l_w);
    275 
    276         vpx_free(g_b_scaler.l_h);
    277 
    278         vpx_free(g_b_scaler.l_h_uv);
    279 
    280         vpx_free(g_b_scaler.c_w);
    281 
    282         vpx_free(g_b_scaler.c_h);
    283 
    284         vpx_free(g_b_scaler.c_h_uv);
    285 
    286         vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
    287     }
    288 }
    289 
    290 // Create the coeffients that will be used for the cubic interpolation.
    291 //  Because scaling does not have to be equal in the vertical and horizontal
    292 //  regimes the phase offsets will be different.  There are 4 coefficents
    293 //  for each point, two on each side.  The layout is that there are the
    294 //  4 coefficents for each phase in the array and then the next phase.
    295 int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)
    296 {
    297     int i;
    298 #ifdef FIXED_POINT
    299     int phase_offset_int;
    300     unsigned int fixed_mult;
    301     int product_val = 0;
    302 #else
    303     float phase_offset;
    304 #endif
    305     int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;
    306 
    307     if (g_first_time)
    308         bicubic_coefficient_init();
    309 
    310 
    311     // check to see if the coefficents have already been set up correctly
    312     if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
    313         && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
    314         return 0;
    315 
    316     g_b_scaler.in_width = in_width;
    317     g_b_scaler.in_height = in_height;
    318     g_b_scaler.out_width = out_width;
    319     g_b_scaler.out_height = out_height;
    320 
    321     // Don't want to allow crazy scaling, just try and prevent a catastrophic
    322     //  failure here.  Want to fail after setting the member functions so if
    323     //  if the scaler is called the member functions will not scale.
    324     if (out_width <= 0 || out_height <= 0)
    325         return -1;
    326 
    327     // reduce in/out width and height ratios using the gcd
    328     gcd_w = gcd(out_width, in_width);
    329     gcd_h = gcd(out_height, in_height);
    330     gcd_h_uv = gcd(out_height, in_height / 2);
    331 
    332     // the numerator width and height are to be saved in
    333     //  globals so they can be used during the scaling process
    334     //  without having to be recalculated.
    335     g_b_scaler.nw = out_width / gcd_w;
    336     d_w = in_width / gcd_w;
    337 
    338     g_b_scaler.nh = out_height / gcd_h;
    339     d_h = in_height / gcd_h;
    340 
    341     g_b_scaler.nh_uv = out_height / gcd_h_uv;
    342     d_h_uv = (in_height / 2) / gcd_h_uv;
    343 
    344     // allocate memory for the coefficents
    345     vpx_free(g_b_scaler.l_w);
    346 
    347     vpx_free(g_b_scaler.l_h);
    348 
    349     vpx_free(g_b_scaler.l_h_uv);
    350 
    351     g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * 2);
    352     g_b_scaler.l_h = (short *)vpx_memalign(32, out_height * 2);
    353     g_b_scaler.l_h_uv = (short *)vpx_memalign(32, out_height * 2);
    354 
    355     vpx_free(g_b_scaler.c_w);
    356 
    357     vpx_free(g_b_scaler.c_h);
    358 
    359     vpx_free(g_b_scaler.c_h_uv);
    360 
    361     g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * 2);
    362     g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * 2);
    363     g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * 2);
    364 
    365     g_b_scaler.hbuf = g_hbuf;
    366     g_b_scaler.hbuf_uv = g_hbuf_uv;
    367 
    368     // Set up polyphase filter taps.  This needs to be done before
    369     //  the scaling because of the floating point math required.  The
    370     //  coefficients are multiplied by 2^12 so that fixed point math
    371     //  can be used in the main scaling loop.
    372 #ifdef FIXED_POINT
    373     fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296;
    374 
    375     product_val = 0;
    376 
    377     for (i = 0; i < g_b_scaler.nw; i++)
    378     {
    379         if (product_val > g_b_scaler.nw)
    380             product_val -= g_b_scaler.nw;
    381 
    382         phase_offset_int = (fixed_mult * product_val) >> 16;
    383 
    384         g_b_scaler.c_w[i*4]   = c3_fixed(phase_offset_int);
    385         g_b_scaler.c_w[i*4+1] = c2_fixed(phase_offset_int);
    386         g_b_scaler.c_w[i*4+2] = c1_fixed(phase_offset_int);
    387         g_b_scaler.c_w[i*4+3] = c0_fixed(phase_offset_int);
    388 
    389         product_val += d_w;
    390     }
    391 
    392 
    393     fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296;
    394 
    395     product_val = 0;
    396 
    397     for (i = 0; i < g_b_scaler.nh; i++)
    398     {
    399         if (product_val > g_b_scaler.nh)
    400             product_val -= g_b_scaler.nh;
    401 
    402         phase_offset_int = (fixed_mult * product_val) >> 16;
    403 
    404         g_b_scaler.c_h[i*4]   = c0_fixed(phase_offset_int);
    405         g_b_scaler.c_h[i*4+1] = c1_fixed(phase_offset_int);
    406         g_b_scaler.c_h[i*4+2] = c2_fixed(phase_offset_int);
    407         g_b_scaler.c_h[i*4+3] = c3_fixed(phase_offset_int);
    408 
    409         product_val += d_h;
    410     }
    411 
    412     fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296;
    413 
    414     product_val = 0;
    415 
    416     for (i = 0; i < g_b_scaler.nh_uv; i++)
    417     {
    418         if (product_val > g_b_scaler.nh_uv)
    419             product_val -= g_b_scaler.nh_uv;
    420 
    421         phase_offset_int = (fixed_mult * product_val) >> 16;
    422 
    423         g_b_scaler.c_h_uv[i*4]   = c0_fixed(phase_offset_int);
    424         g_b_scaler.c_h_uv[i*4+1] = c1_fixed(phase_offset_int);
    425         g_b_scaler.c_h_uv[i*4+2] = c2_fixed(phase_offset_int);
    426         g_b_scaler.c_h_uv[i*4+3] = c3_fixed(phase_offset_int);
    427 
    428         product_val += d_h_uv;
    429     }
    430 
    431 #else
    432 
    433     for (i = 0; i < g_nw; i++)
    434     {
    435         phase_offset = (float)((i * d_w) % g_nw) / (float)g_nw;
    436         g_c_w[i*4]   = (C3(phase_offset) * 4096.0);
    437         g_c_w[i*4+1] = (C2(phase_offset) * 4096.0);
    438         g_c_w[i*4+2] = (C1(phase_offset) * 4096.0);
    439         g_c_w[i*4+3] = (C0(phase_offset) * 4096.0);
    440     }
    441 
    442     for (i = 0; i < g_nh; i++)
    443     {
    444         phase_offset = (float)((i * d_h) % g_nh) / (float)g_nh;
    445         g_c_h[i*4]   = (C0(phase_offset) * 4096.0);
    446         g_c_h[i*4+1] = (C1(phase_offset) * 4096.0);
    447         g_c_h[i*4+2] = (C2(phase_offset) * 4096.0);
    448         g_c_h[i*4+3] = (C3(phase_offset) * 4096.0);
    449     }
    450 
    451     for (i = 0; i < g_nh_uv; i++)
    452     {
    453         phase_offset = (float)((i * d_h_uv) % g_nh_uv) / (float)g_nh_uv;
    454         g_c_h_uv[i*4]   = (C0(phase_offset) * 4096.0);
    455         g_c_h_uv[i*4+1] = (C1(phase_offset) * 4096.0);
    456         g_c_h_uv[i*4+2] = (C2(phase_offset) * 4096.0);
    457         g_c_h_uv[i*4+3] = (C3(phase_offset) * 4096.0);
    458     }
    459 
    460 #endif
    461 
    462     // Create an array that corresponds input lines to output lines.
    463     //  This doesn't require floating point math, but it does require
    464     //  a division and because hardware division is not present that
    465     //  is a call.
    466     for (i = 0; i < out_width; i++)
    467     {
    468         g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;
    469 
    470         if ((g_b_scaler.l_w[i] + 2) <= in_width)
    471             g_b_scaler.max_usable_out_width = i;
    472 
    473     }
    474 
    475     for (i = 0; i < out_height + 1; i++)
    476     {
    477         g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
    478         g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
    479     }
    480 
    481     return 0;
    482 }
    483 
    484 int bicubic_scale(int in_width, int in_height, int in_stride,
    485                   int out_width, int out_height, int out_stride,
    486                   unsigned char *input_image, unsigned char *output_image)
    487 {
    488     short *RESTRICT l_w, * RESTRICT l_h;
    489     short *RESTRICT c_w, * RESTRICT c_h;
    490     unsigned char *RESTRICT ip, * RESTRICT op;
    491     unsigned char *RESTRICT hbuf;
    492     int h, w, lw, lh;
    493     int temp_sum;
    494     int phase_offset_w, phase_offset_h;
    495 
    496     c_w = g_b_scaler.c_w;
    497     c_h = g_b_scaler.c_h;
    498 
    499     op = output_image;
    500 
    501     l_w = g_b_scaler.l_w;
    502     l_h = g_b_scaler.l_h;
    503 
    504     phase_offset_h = 0;
    505 
    506     for (h = 0; h < out_height; h++)
    507     {
    508         // select the row to work on
    509         lh = l_h[h];
    510         ip = input_image + (in_stride * lh);
    511 
    512         // vp8_filter the row vertically into an temporary buffer.
    513         //  If the phase offset == 0 then all the multiplication
    514         //  is going to result in the output equalling the input.
    515         //  So instead point the temporary buffer to the input.
    516         //  Also handle the boundry condition of not being able to
    517         //  filter that last lines.
    518         if (phase_offset_h && (lh < in_height - 2))
    519         {
    520             hbuf = g_b_scaler.hbuf;
    521 
    522             for (w = 0; w < in_width; w++)
    523             {
    524                 temp_sum =  c_h[phase_offset_h*4+3] * ip[w - in_stride];
    525                 temp_sum += c_h[phase_offset_h*4+2] * ip[w];
    526                 temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride];
    527                 temp_sum += c_h[phase_offset_h*4]   * ip[w + 2*in_stride];
    528 
    529                 hbuf[w] = temp_sum >> 12;
    530             }
    531         }
    532         else
    533             hbuf = ip;
    534 
    535         // increase the phase offset for the next time around.
    536         if (++phase_offset_h >= g_b_scaler.nh)
    537             phase_offset_h = 0;
    538 
    539         // now filter and expand it horizontally into the final
    540         //  output buffer
    541         phase_offset_w = 0;
    542 
    543         for (w = 0; w < out_width; w++)
    544         {
    545             // get the index to use to expand the image
    546             lw = l_w[w];
    547 
    548             temp_sum =  c_w[phase_offset_w*4]   * hbuf[lw - 1];
    549             temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw];
    550             temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1];
    551             temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2];
    552             temp_sum = temp_sum >> 12;
    553 
    554             if (++phase_offset_w >= g_b_scaler.nw)
    555                 phase_offset_w = 0;
    556 
    557             // boundry conditions
    558             if ((lw + 2) >= in_width)
    559                 temp_sum = hbuf[lw];
    560 
    561             if (lw == 0)
    562                 temp_sum = hbuf[0];
    563 
    564             op[w] = temp_sum;
    565         }
    566 
    567         op += out_stride;
    568     }
    569 
    570     return 0;
    571 }
    572 
    573 void bicubic_scale_frame_reset()
    574 {
    575     g_b_scaler.out_width = 0;
    576     g_b_scaler.out_height = 0;
    577 }
    578 
    579 void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
    580                          int new_width, int new_height)
    581 {
    582 
    583     dst->y_width = new_width;
    584     dst->y_height = new_height;
    585     dst->uv_width = new_width / 2;
    586     dst->uv_height = new_height / 2;
    587 
    588     dst->y_stride = dst->y_width;
    589     dst->uv_stride = dst->uv_width;
    590 
    591     bicubic_scale(src->y_width, src->y_height, src->y_stride,
    592                   new_width, new_height, dst->y_stride,
    593                   src->y_buffer, dst->y_buffer);
    594 
    595     bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
    596                   new_width / 2, new_height / 2, dst->uv_stride,
    597                   src->u_buffer, dst->u_buffer);
    598 
    599     bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
    600                   new_width / 2, new_height / 2, dst->uv_stride,
    601                   src->v_buffer, dst->v_buffer);
    602 }
    603