Home | History | Annotate | Download | only in encoder
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <math.h>
     13 
     14 #include "./vp9_rtcd.h"
     15 #include "./vpx_config.h"
     16 #include "./vpx_dsp_rtcd.h"
     17 
     18 #include "vp9/common/vp9_blockd.h"
     19 #include "vp9/common/vp9_idct.h"
     20 #include "vpx_dsp/fwd_txfm.h"
     21 #include "vpx_ports/mem.h"
     22 
     23 static void fdct4(const tran_low_t *input, tran_low_t *output) {
     24   tran_high_t step[4];
     25   tran_high_t temp1, temp2;
     26 
     27   step[0] = input[0] + input[3];
     28   step[1] = input[1] + input[2];
     29   step[2] = input[1] - input[2];
     30   step[3] = input[0] - input[3];
     31 
     32   temp1 = (step[0] + step[1]) * cospi_16_64;
     33   temp2 = (step[0] - step[1]) * cospi_16_64;
     34   output[0] = (tran_low_t)fdct_round_shift(temp1);
     35   output[2] = (tran_low_t)fdct_round_shift(temp2);
     36   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
     37   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
     38   output[1] = (tran_low_t)fdct_round_shift(temp1);
     39   output[3] = (tran_low_t)fdct_round_shift(temp2);
     40 }
     41 
     42 static void fdct8(const tran_low_t *input, tran_low_t *output) {
     43   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
     44   tran_high_t t0, t1, t2, t3;                  // needs32
     45   tran_high_t x0, x1, x2, x3;                  // canbe16
     46 
     47   // stage 1
     48   s0 = input[0] + input[7];
     49   s1 = input[1] + input[6];
     50   s2 = input[2] + input[5];
     51   s3 = input[3] + input[4];
     52   s4 = input[3] - input[4];
     53   s5 = input[2] - input[5];
     54   s6 = input[1] - input[6];
     55   s7 = input[0] - input[7];
     56 
     57   // fdct4(step, step);
     58   x0 = s0 + s3;
     59   x1 = s1 + s2;
     60   x2 = s1 - s2;
     61   x3 = s0 - s3;
     62   t0 = (x0 + x1) * cospi_16_64;
     63   t1 = (x0 - x1) * cospi_16_64;
     64   t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
     65   t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
     66   output[0] = (tran_low_t)fdct_round_shift(t0);
     67   output[2] = (tran_low_t)fdct_round_shift(t2);
     68   output[4] = (tran_low_t)fdct_round_shift(t1);
     69   output[6] = (tran_low_t)fdct_round_shift(t3);
     70 
     71   // Stage 2
     72   t0 = (s6 - s5) * cospi_16_64;
     73   t1 = (s6 + s5) * cospi_16_64;
     74   t2 = (tran_low_t)fdct_round_shift(t0);
     75   t3 = (tran_low_t)fdct_round_shift(t1);
     76 
     77   // Stage 3
     78   x0 = s4 + t2;
     79   x1 = s4 - t2;
     80   x2 = s7 - t3;
     81   x3 = s7 + t3;
     82 
     83   // Stage 4
     84   t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
     85   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
     86   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
     87   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
     88   output[1] = (tran_low_t)fdct_round_shift(t0);
     89   output[3] = (tran_low_t)fdct_round_shift(t2);
     90   output[5] = (tran_low_t)fdct_round_shift(t1);
     91   output[7] = (tran_low_t)fdct_round_shift(t3);
     92 }
     93 
// One-dimensional 16-point forward DCT. Reads in[0..15] and writes the sixteen
// rounded coefficients to out[0..15].
//
// The even-indexed outputs (out[0], out[2], ..., out[14]) come from an inlined
// 8-point DCT over the sums of mirrored input pairs; the odd-indexed outputs
// come from steps 2-6, which operate on the differences of those pairs.
// All reads of in[] happen in step 1, before anything is stored to out[].
static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
  tran_high_t step1[8];      // canbe16
  tran_high_t step2[8];      // canbe16
  tran_high_t step3[8];      // canbe16
  tran_high_t input[8];      // canbe16
  tran_high_t temp1, temp2;  // needs32

  // step 1
  // input[k] holds the sum of the mirrored pair (k, 15 - k).
  input[0] = in[0] + in[15];
  input[1] = in[1] + in[14];
  input[2] = in[2] + in[13];
  input[3] = in[3] + in[12];
  input[4] = in[4] + in[11];
  input[5] = in[5] + in[10];
  input[6] = in[6] + in[ 9];
  input[7] = in[7] + in[ 8];

  // step1[k] holds the difference of the mirrored pair (7 - k, 8 + k).
  step1[0] = in[7] - in[ 8];
  step1[1] = in[6] - in[ 9];
  step1[2] = in[5] - in[10];
  step1[3] = in[4] - in[11];
  step1[4] = in[3] - in[12];
  step1[5] = in[2] - in[13];
  step1[6] = in[1] - in[14];
  step1[7] = in[0] - in[15];

  // fdct8(step, step);
  // Inlined 8-point DCT over input[]; its k-th result is stored at out[2 * k].
  {
    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    tran_high_t t0, t1, t2, t3;                  // needs32
    tran_high_t x0, x1, x2, x3;                  // canbe16

    // stage 1
    s0 = input[0] + input[7];
    s1 = input[1] + input[6];
    s2 = input[2] + input[5];
    s3 = input[3] + input[4];
    s4 = input[3] - input[4];
    s5 = input[2] - input[5];
    s6 = input[1] - input[6];
    s7 = input[0] - input[7];

    // fdct4(step, step);
    x0 = s0 + s3;
    x1 = s1 + s2;
    x2 = s1 - s2;
    x3 = s0 - s3;
    t0 = (x0 + x1) * cospi_16_64;
    t1 = (x0 - x1) * cospi_16_64;
    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
    out[0] = (tran_low_t)fdct_round_shift(t0);
    out[4] = (tran_low_t)fdct_round_shift(t2);
    out[8] = (tran_low_t)fdct_round_shift(t1);
    out[12] = (tran_low_t)fdct_round_shift(t3);

    // Stage 2
    // Unlike the stand-alone fdct8(), the rounded values here are not cast
    // to tran_low_t; they stay at tran_high_t width.
    t0 = (s6 - s5) * cospi_16_64;
    t1 = (s6 + s5) * cospi_16_64;
    t2 = fdct_round_shift(t0);
    t3 = fdct_round_shift(t1);

    // Stage 3
    x0 = s4 + t2;
    x1 = s4 - t2;
    x2 = s7 - t3;
    x3 = s7 + t3;

    // Stage 4
    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    out[2] = (tran_low_t)fdct_round_shift(t0);
    out[6] = (tran_low_t)fdct_round_shift(t2);
    out[10] = (tran_low_t)fdct_round_shift(t1);
    out[14] = (tran_low_t)fdct_round_shift(t3);
  }

  // step 2
  // Only step2[2..5] are produced here; step1[0], [1], [6] and [7] pass
  // through unchanged to step 3.
  temp1 = (step1[5] - step1[2]) * cospi_16_64;
  temp2 = (step1[4] - step1[3]) * cospi_16_64;
  step2[2] = fdct_round_shift(temp1);
  step2[3] = fdct_round_shift(temp2);
  temp1 = (step1[4] + step1[3]) * cospi_16_64;
  temp2 = (step1[5] + step1[2]) * cospi_16_64;
  step2[4] = fdct_round_shift(temp1);
  step2[5] = fdct_round_shift(temp2);

  // step 3
  step3[0] = step1[0] + step2[3];
  step3[1] = step1[1] + step2[2];
  step3[2] = step1[1] - step2[2];
  step3[3] = step1[0] - step2[3];
  step3[4] = step1[7] - step2[4];
  step3[5] = step1[6] - step2[5];
  step3[6] = step1[6] + step2[5];
  step3[7] = step1[7] + step2[4];

  // step 4
  // step2[] is reused as scratch: entries [1], [2], [5] and [6] are
  // overwritten with rotated values; step3[0], [3], [4] and [7] are used
  // directly in step 5.
  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
  step2[1] = fdct_round_shift(temp1);
  step2[2] = fdct_round_shift(temp2);
  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
  step2[5] = fdct_round_shift(temp1);
  step2[6] = fdct_round_shift(temp2);

  // step 5
  // step1[] is reused as scratch for the final butterfly.
  step1[0] = step3[0] + step2[1];
  step1[1] = step3[0] - step2[1];
  step1[2] = step3[3] + step2[2];
  step1[3] = step3[3] - step2[2];
  step1[4] = step3[4] - step2[5];
  step1[5] = step3[4] + step2[5];
  step1[6] = step3[7] - step2[6];
  step1[7] = step3[7] + step2[6];

  // step 6
  // Final rotations; each pair of products yields two odd-indexed outputs.
  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
  out[1] = (tran_low_t)fdct_round_shift(temp1);
  out[9] = (tran_low_t)fdct_round_shift(temp2);

  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
  out[5] = (tran_low_t)fdct_round_shift(temp1);
  out[13] = (tran_low_t)fdct_round_shift(temp2);

  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
  out[3] = (tran_low_t)fdct_round_shift(temp1);
  out[11] = (tran_low_t)fdct_round_shift(temp2);

  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
  out[7] = (tran_low_t)fdct_round_shift(temp1);
  out[15] = (tran_low_t)fdct_round_shift(temp2);
}
    234 
    235 static void fadst4(const tran_low_t *input, tran_low_t *output) {
    236   tran_high_t x0, x1, x2, x3;
    237   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
    238 
    239   x0 = input[0];
    240   x1 = input[1];
    241   x2 = input[2];
    242   x3 = input[3];
    243 
    244   if (!(x0 | x1 | x2 | x3)) {
    245     output[0] = output[1] = output[2] = output[3] = 0;
    246     return;
    247   }
    248 
    249   s0 = sinpi_1_9 * x0;
    250   s1 = sinpi_4_9 * x0;
    251   s2 = sinpi_2_9 * x1;
    252   s3 = sinpi_1_9 * x1;
    253   s4 = sinpi_3_9 * x2;
    254   s5 = sinpi_4_9 * x3;
    255   s6 = sinpi_2_9 * x3;
    256   s7 = x0 + x1 - x3;
    257 
    258   x0 = s0 + s2 + s5;
    259   x1 = sinpi_3_9 * s7;
    260   x2 = s1 - s3 + s6;
    261   x3 = s4;
    262 
    263   s0 = x0 + x3;
    264   s1 = x1;
    265   s2 = x2 - x3;
    266   s3 = x2 - x0 + x3;
    267 
    268   // 1-D transform scaling factor is sqrt(2).
    269   output[0] = (tran_low_t)fdct_round_shift(s0);
    270   output[1] = (tran_low_t)fdct_round_shift(s1);
    271   output[2] = (tran_low_t)fdct_round_shift(s2);
    272   output[3] = (tran_low_t)fdct_round_shift(s3);
    273 }
    274 
    275 static void fadst8(const tran_low_t *input, tran_low_t *output) {
    276   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
    277 
    278   tran_high_t x0 = input[7];
    279   tran_high_t x1 = input[0];
    280   tran_high_t x2 = input[5];
    281   tran_high_t x3 = input[2];
    282   tran_high_t x4 = input[3];
    283   tran_high_t x5 = input[4];
    284   tran_high_t x6 = input[1];
    285   tran_high_t x7 = input[6];
    286 
    287   // stage 1
    288   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
    289   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
    290   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
    291   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
    292   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
    293   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
    294   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
    295   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
    296 
    297   x0 = fdct_round_shift(s0 + s4);
    298   x1 = fdct_round_shift(s1 + s5);
    299   x2 = fdct_round_shift(s2 + s6);
    300   x3 = fdct_round_shift(s3 + s7);
    301   x4 = fdct_round_shift(s0 - s4);
    302   x5 = fdct_round_shift(s1 - s5);
    303   x6 = fdct_round_shift(s2 - s6);
    304   x7 = fdct_round_shift(s3 - s7);
    305 
    306   // stage 2
    307   s0 = x0;
    308   s1 = x1;
    309   s2 = x2;
    310   s3 = x3;
    311   s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
    312   s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
    313   s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
    314   s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
    315 
    316   x0 = s0 + s2;
    317   x1 = s1 + s3;
    318   x2 = s0 - s2;
    319   x3 = s1 - s3;
    320   x4 = fdct_round_shift(s4 + s6);
    321   x5 = fdct_round_shift(s5 + s7);
    322   x6 = fdct_round_shift(s4 - s6);
    323   x7 = fdct_round_shift(s5 - s7);
    324 
    325   // stage 3
    326   s2 = cospi_16_64 * (x2 + x3);
    327   s3 = cospi_16_64 * (x2 - x3);
    328   s6 = cospi_16_64 * (x6 + x7);
    329   s7 = cospi_16_64 * (x6 - x7);
    330 
    331   x2 = fdct_round_shift(s2);
    332   x3 = fdct_round_shift(s3);
    333   x6 = fdct_round_shift(s6);
    334   x7 = fdct_round_shift(s7);
    335 
    336   output[0] = (tran_low_t)x0;
    337   output[1] = (tran_low_t)-x4;
    338   output[2] = (tran_low_t)x6;
    339   output[3] = (tran_low_t)-x2;
    340   output[4] = (tran_low_t)x3;
    341   output[5] = (tran_low_t)-x7;
    342   output[6] = (tran_low_t)x5;
    343   output[7] = (tran_low_t)-x1;
    344 }
    345 
    346 static void fadst16(const tran_low_t *input, tran_low_t *output) {
    347   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
    348   tran_high_t s9, s10, s11, s12, s13, s14, s15;
    349 
    350   tran_high_t x0 = input[15];
    351   tran_high_t x1 = input[0];
    352   tran_high_t x2 = input[13];
    353   tran_high_t x3 = input[2];
    354   tran_high_t x4 = input[11];
    355   tran_high_t x5 = input[4];
    356   tran_high_t x6 = input[9];
    357   tran_high_t x7 = input[6];
    358   tran_high_t x8 = input[7];
    359   tran_high_t x9 = input[8];
    360   tran_high_t x10 = input[5];
    361   tran_high_t x11 = input[10];
    362   tran_high_t x12 = input[3];
    363   tran_high_t x13 = input[12];
    364   tran_high_t x14 = input[1];
    365   tran_high_t x15 = input[14];
    366 
    367   // stage 1
    368   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
    369   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
    370   s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
    371   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
    372   s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
    373   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
    374   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
    375   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
    376   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
    377   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
    378   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
    379   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
    380   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
    381   s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
    382   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
    383   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
    384 
    385   x0 = fdct_round_shift(s0 + s8);
    386   x1 = fdct_round_shift(s1 + s9);
    387   x2 = fdct_round_shift(s2 + s10);
    388   x3 = fdct_round_shift(s3 + s11);
    389   x4 = fdct_round_shift(s4 + s12);
    390   x5 = fdct_round_shift(s5 + s13);
    391   x6 = fdct_round_shift(s6 + s14);
    392   x7 = fdct_round_shift(s7 + s15);
    393   x8  = fdct_round_shift(s0 - s8);
    394   x9  = fdct_round_shift(s1 - s9);
    395   x10 = fdct_round_shift(s2 - s10);
    396   x11 = fdct_round_shift(s3 - s11);
    397   x12 = fdct_round_shift(s4 - s12);
    398   x13 = fdct_round_shift(s5 - s13);
    399   x14 = fdct_round_shift(s6 - s14);
    400   x15 = fdct_round_shift(s7 - s15);
    401 
    402   // stage 2
    403   s0 = x0;
    404   s1 = x1;
    405   s2 = x2;
    406   s3 = x3;
    407   s4 = x4;
    408   s5 = x5;
    409   s6 = x6;
    410   s7 = x7;
    411   s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
    412   s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
    413   s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
    414   s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
    415   s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
    416   s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
    417   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
    418   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
    419 
    420   x0 = s0 + s4;
    421   x1 = s1 + s5;
    422   x2 = s2 + s6;
    423   x3 = s3 + s7;
    424   x4 = s0 - s4;
    425   x5 = s1 - s5;
    426   x6 = s2 - s6;
    427   x7 = s3 - s7;
    428   x8 = fdct_round_shift(s8 + s12);
    429   x9 = fdct_round_shift(s9 + s13);
    430   x10 = fdct_round_shift(s10 + s14);
    431   x11 = fdct_round_shift(s11 + s15);
    432   x12 = fdct_round_shift(s8 - s12);
    433   x13 = fdct_round_shift(s9 - s13);
    434   x14 = fdct_round_shift(s10 - s14);
    435   x15 = fdct_round_shift(s11 - s15);
    436 
    437   // stage 3
    438   s0 = x0;
    439   s1 = x1;
    440   s2 = x2;
    441   s3 = x3;
    442   s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
    443   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
    444   s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
    445   s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
    446   s8 = x8;
    447   s9 = x9;
    448   s10 = x10;
    449   s11 = x11;
    450   s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
    451   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
    452   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
    453   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
    454 
    455   x0 = s0 + s2;
    456   x1 = s1 + s3;
    457   x2 = s0 - s2;
    458   x3 = s1 - s3;
    459   x4 = fdct_round_shift(s4 + s6);
    460   x5 = fdct_round_shift(s5 + s7);
    461   x6 = fdct_round_shift(s4 - s6);
    462   x7 = fdct_round_shift(s5 - s7);
    463   x8 = s8 + s10;
    464   x9 = s9 + s11;
    465   x10 = s8 - s10;
    466   x11 = s9 - s11;
    467   x12 = fdct_round_shift(s12 + s14);
    468   x13 = fdct_round_shift(s13 + s15);
    469   x14 = fdct_round_shift(s12 - s14);
    470   x15 = fdct_round_shift(s13 - s15);
    471 
    472   // stage 4
    473   s2 = (- cospi_16_64) * (x2 + x3);
    474   s3 = cospi_16_64 * (x2 - x3);
    475   s6 = cospi_16_64 * (x6 + x7);
    476   s7 = cospi_16_64 * (- x6 + x7);
    477   s10 = cospi_16_64 * (x10 + x11);
    478   s11 = cospi_16_64 * (- x10 + x11);
    479   s14 = (- cospi_16_64) * (x14 + x15);
    480   s15 = cospi_16_64 * (x14 - x15);
    481 
    482   x2 = fdct_round_shift(s2);
    483   x3 = fdct_round_shift(s3);
    484   x6 = fdct_round_shift(s6);
    485   x7 = fdct_round_shift(s7);
    486   x10 = fdct_round_shift(s10);
    487   x11 = fdct_round_shift(s11);
    488   x14 = fdct_round_shift(s14);
    489   x15 = fdct_round_shift(s15);
    490 
    491   output[0] = (tran_low_t)x0;
    492   output[1] = (tran_low_t)-x8;
    493   output[2] = (tran_low_t)x12;
    494   output[3] = (tran_low_t)-x4;
    495   output[4] = (tran_low_t)x6;
    496   output[5] = (tran_low_t)x14;
    497   output[6] = (tran_low_t)x10;
    498   output[7] = (tran_low_t)x2;
    499   output[8] = (tran_low_t)x3;
    500   output[9] = (tran_low_t)x11;
    501   output[10] = (tran_low_t)x15;
    502   output[11] = (tran_low_t)x7;
    503   output[12] = (tran_low_t)x5;
    504   output[13] = (tran_low_t)-x13;
    505   output[14] = (tran_low_t)x9;
    506   output[15] = (tran_low_t)-x1;
    507 }
    508 
// Pairs of 1-D 4-point transforms used by vp9_fht4x4_c(), indexed by tx_type.
// vp9_fht4x4_c() calls the selected entry's `cols` member for the column pass
// and its `rows` member for the row pass. Which initializer position maps to
// which member is fixed by the transform_2d declaration, which lives outside
// this file. The DCT_DCT entry is present for completeness:
// vp9_fht4x4_c() handles that type through vpx_fdct4x4_c() instead.
static const transform_2d FHT_4[] = {
  { fdct4,  fdct4  },  // DCT_DCT  = 0
  { fadst4, fdct4  },  // ADST_DCT = 1
  { fdct4,  fadst4 },  // DCT_ADST = 2
  { fadst4, fadst4 }   // ADST_ADST = 3
};
    515 
// Pairs of 1-D 8-point transforms used by vp9_fht8x8_c(), indexed by tx_type.
// vp9_fht8x8_c() calls the selected entry's `cols` member for the column pass
// and its `rows` member for the row pass. Which initializer position maps to
// which member is fixed by the transform_2d declaration, which lives outside
// this file. The DCT_DCT entry is present for completeness:
// vp9_fht8x8_c() handles that type through vpx_fdct8x8_c() instead.
static const transform_2d FHT_8[] = {
  { fdct8,  fdct8  },  // DCT_DCT  = 0
  { fadst8, fdct8  },  // ADST_DCT = 1
  { fdct8,  fadst8 },  // DCT_ADST = 2
  { fadst8, fadst8 }   // ADST_ADST = 3
};
    522 
// Pairs of 1-D 16-point transforms used by vp9_fht16x16_c(), indexed by
// tx_type. vp9_fht16x16_c() calls the selected entry's `cols` member for the
// column pass and its `rows` member for the row pass. Which initializer
// position maps to which member is fixed by the transform_2d declaration,
// which lives outside this file. The DCT_DCT entry is present for
// completeness: vp9_fht16x16_c() handles that type through vpx_fdct16x16_c()
// instead.
static const transform_2d FHT_16[] = {
  { fdct16,  fdct16  },  // DCT_DCT  = 0
  { fadst16, fdct16  },  // ADST_DCT = 1
  { fdct16,  fadst16 },  // DCT_ADST = 2
  { fadst16, fadst16 }   // ADST_ADST = 3
};
    529 
    530 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
    531                   int stride, int tx_type) {
    532   if (tx_type == DCT_DCT) {
    533     vpx_fdct4x4_c(input, output, stride);
    534   } else {
    535     tran_low_t out[4 * 4];
    536     int i, j;
    537     tran_low_t temp_in[4], temp_out[4];
    538     const transform_2d ht = FHT_4[tx_type];
    539 
    540     // Columns
    541     for (i = 0; i < 4; ++i) {
    542       for (j = 0; j < 4; ++j)
    543         temp_in[j] = input[j * stride + i] * 16;
    544       if (i == 0 && temp_in[0])
    545         temp_in[0] += 1;
    546       ht.cols(temp_in, temp_out);
    547       for (j = 0; j < 4; ++j)
    548         out[j * 4 + i] = temp_out[j];
    549     }
    550 
    551     // Rows
    552     for (i = 0; i < 4; ++i) {
    553       for (j = 0; j < 4; ++j)
    554         temp_in[j] = out[j + i * 4];
    555       ht.rows(temp_in, temp_out);
    556       for (j = 0; j < 4; ++j)
    557         output[j + i * 4] = (temp_out[j] + 1) >> 2;
    558     }
    559   }
    560 }
    561 
    562 void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
    563                          tran_low_t *coeff_ptr, intptr_t n_coeffs,
    564                          int skip_block,
    565                          const int16_t *zbin_ptr, const int16_t *round_ptr,
    566                          const int16_t *quant_ptr,
    567                          const int16_t *quant_shift_ptr,
    568                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    569                          const int16_t *dequant_ptr,
    570                          uint16_t *eob_ptr,
    571                          const int16_t *scan, const int16_t *iscan) {
    572   int eob = -1;
    573 
    574   int i, j;
    575   tran_low_t intermediate[64];
    576 
    577   // Transform columns
    578   {
    579     tran_low_t *output = intermediate;
    580     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    581     tran_high_t t0, t1, t2, t3;                  // needs32
    582     tran_high_t x0, x1, x2, x3;                  // canbe16
    583 
    584     int i;
    585     for (i = 0; i < 8; i++) {
    586       // stage 1
    587       s0 = (input[0 * stride] + input[7 * stride]) * 4;
    588       s1 = (input[1 * stride] + input[6 * stride]) * 4;
    589       s2 = (input[2 * stride] + input[5 * stride]) * 4;
    590       s3 = (input[3 * stride] + input[4 * stride]) * 4;
    591       s4 = (input[3 * stride] - input[4 * stride]) * 4;
    592       s5 = (input[2 * stride] - input[5 * stride]) * 4;
    593       s6 = (input[1 * stride] - input[6 * stride]) * 4;
    594       s7 = (input[0 * stride] - input[7 * stride]) * 4;
    595 
    596       // fdct4(step, step);
    597       x0 = s0 + s3;
    598       x1 = s1 + s2;
    599       x2 = s1 - s2;
    600       x3 = s0 - s3;
    601       t0 = (x0 + x1) * cospi_16_64;
    602       t1 = (x0 - x1) * cospi_16_64;
    603       t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
    604       t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
    605       output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
    606       output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
    607       output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
    608       output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
    609 
    610       // Stage 2
    611       t0 = (s6 - s5) * cospi_16_64;
    612       t1 = (s6 + s5) * cospi_16_64;
    613       t2 = fdct_round_shift(t0);
    614       t3 = fdct_round_shift(t1);
    615 
    616       // Stage 3
    617       x0 = s4 + t2;
    618       x1 = s4 - t2;
    619       x2 = s7 - t3;
    620       x3 = s7 + t3;
    621 
    622       // Stage 4
    623       t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    624       t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    625       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    626       t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
    627       output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
    628       output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
    629       output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
    630       output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
    631       input++;
    632       output++;
    633     }
    634   }
    635 
    636   // Rows
    637   for (i = 0; i < 8; ++i) {
    638     fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
    639     for (j = 0; j < 8; ++j)
    640       coeff_ptr[j + i * 8] /= 2;
    641   }
    642 
    643   // TODO(jingning) Decide the need of these arguments after the
    644   // quantization process is completed.
    645   (void)zbin_ptr;
    646   (void)quant_shift_ptr;
    647   (void)iscan;
    648 
    649   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
    650   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
    651 
    652   if (!skip_block) {
    653     // Quantization pass: All coefficients with index >= zero_flag are
    654     // skippable. Note: zero_flag can be zero.
    655     for (i = 0; i < n_coeffs; i++) {
    656       const int rc = scan[i];
    657       const int coeff = coeff_ptr[rc];
    658       const int coeff_sign = (coeff >> 31);
    659       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    660 
    661       int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
    662       tmp = (tmp * quant_ptr[rc != 0]) >> 16;
    663 
    664       qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
    665       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
    666 
    667       if (tmp)
    668         eob = i;
    669     }
    670   }
    671   *eob_ptr = eob + 1;
    672 }
    673 
    674 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
    675                   int stride, int tx_type) {
    676   if (tx_type == DCT_DCT) {
    677     vpx_fdct8x8_c(input, output, stride);
    678   } else {
    679     tran_low_t out[64];
    680     int i, j;
    681     tran_low_t temp_in[8], temp_out[8];
    682     const transform_2d ht = FHT_8[tx_type];
    683 
    684     // Columns
    685     for (i = 0; i < 8; ++i) {
    686       for (j = 0; j < 8; ++j)
    687         temp_in[j] = input[j * stride + i] * 4;
    688       ht.cols(temp_in, temp_out);
    689       for (j = 0; j < 8; ++j)
    690         out[j * 8 + i] = temp_out[j];
    691     }
    692 
    693     // Rows
    694     for (i = 0; i < 8; ++i) {
    695       for (j = 0; j < 8; ++j)
    696         temp_in[j] = out[j + i * 8];
    697       ht.rows(temp_in, temp_out);
    698       for (j = 0; j < 8; ++j)
    699         output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
    700     }
    701   }
    702 }
    703 
    704 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
    705    pixel. */
    706 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
    707   int i;
    708   tran_high_t a1, b1, c1, d1, e1;
    709   const int16_t *ip_pass0 = input;
    710   const tran_low_t *ip = NULL;
    711   tran_low_t *op = output;
    712 
    713   for (i = 0; i < 4; i++) {
    714     a1 = ip_pass0[0 * stride];
    715     b1 = ip_pass0[1 * stride];
    716     c1 = ip_pass0[2 * stride];
    717     d1 = ip_pass0[3 * stride];
    718 
    719     a1 += b1;
    720     d1 = d1 - c1;
    721     e1 = (a1 - d1) >> 1;
    722     b1 = e1 - b1;
    723     c1 = e1 - c1;
    724     a1 -= c1;
    725     d1 += b1;
    726     op[0] = (tran_low_t)a1;
    727     op[4] = (tran_low_t)c1;
    728     op[8] = (tran_low_t)d1;
    729     op[12] = (tran_low_t)b1;
    730 
    731     ip_pass0++;
    732     op++;
    733   }
    734   ip = output;
    735   op = output;
    736 
    737   for (i = 0; i < 4; i++) {
    738     a1 = ip[0];
    739     b1 = ip[1];
    740     c1 = ip[2];
    741     d1 = ip[3];
    742 
    743     a1 += b1;
    744     d1 -= c1;
    745     e1 = (a1 - d1) >> 1;
    746     b1 = e1 - b1;
    747     c1 = e1 - c1;
    748     a1 -= c1;
    749     d1 += b1;
    750     op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
    751     op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
    752     op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
    753     op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
    754 
    755     ip += 4;
    756     op += 4;
    757   }
    758 }
    759 
    760 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
    761                     int stride, int tx_type) {
    762   if (tx_type == DCT_DCT) {
    763     vpx_fdct16x16_c(input, output, stride);
    764   } else {
    765     tran_low_t out[256];
    766     int i, j;
    767     tran_low_t temp_in[16], temp_out[16];
    768     const transform_2d ht = FHT_16[tx_type];
    769 
    770     // Columns
    771     for (i = 0; i < 16; ++i) {
    772       for (j = 0; j < 16; ++j)
    773         temp_in[j] = input[j * stride + i] * 4;
    774       ht.cols(temp_in, temp_out);
    775       for (j = 0; j < 16; ++j)
    776         out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
    777     }
    778 
    779     // Rows
    780     for (i = 0; i < 16; ++i) {
    781       for (j = 0; j < 16; ++j)
    782         temp_in[j] = out[j + i * 16];
    783       ht.rows(temp_in, temp_out);
    784       for (j = 0; j < 16; ++j)
    785         output[j + i * 16] = temp_out[j];
    786     }
    787   }
    788 }
    789 
    790 #if CONFIG_VP9_HIGHBITDEPTH
// High-bitdepth entry point for the 4x4 hybrid transform (only built when
// CONFIG_VP9_HIGHBITDEPTH is set). It forwards all arguments unchanged to
// vp9_fht4x4_c().
void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output,
                         int stride, int tx_type) {
  vp9_fht4x4_c(input, output, stride, tx_type);
}
    795 
// High-bitdepth entry point for the 8x8 hybrid transform (only built when
// CONFIG_VP9_HIGHBITDEPTH is set). It forwards all arguments unchanged to
// vp9_fht8x8_c().
void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
                         int stride, int tx_type) {
  vp9_fht8x8_c(input, output, stride, tx_type);
}
    800 
// High-bitdepth entry point for the 4x4 Walsh-Hadamard transform (only built
// when CONFIG_VP9_HIGHBITDEPTH is set). It forwards all arguments unchanged
// to vp9_fwht4x4_c().
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
                          int stride) {
  vp9_fwht4x4_c(input, output, stride);
}
    805 
// High-bitdepth entry point for the 16x16 hybrid transform (only built when
// CONFIG_VP9_HIGHBITDEPTH is set). It forwards all arguments unchanged to
// vp9_fht16x16_c().
void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
                           int stride, int tx_type) {
  vp9_fht16x16_c(input, output, stride, tx_type);
}
    810 #endif  // CONFIG_VP9_HIGHBITDEPTH
    811