Home | History | Annotate | Download | only in encoder
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <math.h>
     13 
     14 #include "./vp9_rtcd.h"
     15 #include "./vpx_config.h"
     16 #include "./vpx_dsp_rtcd.h"
     17 
     18 #include "vp9/common/vp9_blockd.h"
     19 #include "vp9/common/vp9_idct.h"
     20 #include "vpx_dsp/fwd_txfm.h"
     21 #include "vpx_ports/mem.h"
     22 
     23 static void fdct4(const tran_low_t *input, tran_low_t *output) {
     24   tran_high_t step[4];
     25   tran_high_t temp1, temp2;
     26 
     27   step[0] = input[0] + input[3];
     28   step[1] = input[1] + input[2];
     29   step[2] = input[1] - input[2];
     30   step[3] = input[0] - input[3];
     31 
     32   temp1 = (step[0] + step[1]) * cospi_16_64;
     33   temp2 = (step[0] - step[1]) * cospi_16_64;
     34   output[0] = (tran_low_t)fdct_round_shift(temp1);
     35   output[2] = (tran_low_t)fdct_round_shift(temp2);
     36   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
     37   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
     38   output[1] = (tran_low_t)fdct_round_shift(temp1);
     39   output[3] = (tran_low_t)fdct_round_shift(temp2);
     40 }
     41 
     42 static void fdct8(const tran_low_t *input, tran_low_t *output) {
     43   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
     44   tran_high_t t0, t1, t2, t3;                  // needs32
     45   tran_high_t x0, x1, x2, x3;                  // canbe16
     46 
     47   // stage 1
     48   s0 = input[0] + input[7];
     49   s1 = input[1] + input[6];
     50   s2 = input[2] + input[5];
     51   s3 = input[3] + input[4];
     52   s4 = input[3] - input[4];
     53   s5 = input[2] - input[5];
     54   s6 = input[1] - input[6];
     55   s7 = input[0] - input[7];
     56 
     57   // fdct4(step, step);
     58   x0 = s0 + s3;
     59   x1 = s1 + s2;
     60   x2 = s1 - s2;
     61   x3 = s0 - s3;
     62   t0 = (x0 + x1) * cospi_16_64;
     63   t1 = (x0 - x1) * cospi_16_64;
     64   t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
     65   t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
     66   output[0] = (tran_low_t)fdct_round_shift(t0);
     67   output[2] = (tran_low_t)fdct_round_shift(t2);
     68   output[4] = (tran_low_t)fdct_round_shift(t1);
     69   output[6] = (tran_low_t)fdct_round_shift(t3);
     70 
     71   // Stage 2
     72   t0 = (s6 - s5) * cospi_16_64;
     73   t1 = (s6 + s5) * cospi_16_64;
     74   t2 = (tran_low_t)fdct_round_shift(t0);
     75   t3 = (tran_low_t)fdct_round_shift(t1);
     76 
     77   // Stage 3
     78   x0 = s4 + t2;
     79   x1 = s4 - t2;
     80   x2 = s7 - t3;
     81   x3 = s7 + t3;
     82 
     83   // Stage 4
     84   t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
     85   t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
     86   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
     87   t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
     88   output[1] = (tran_low_t)fdct_round_shift(t0);
     89   output[3] = (tran_low_t)fdct_round_shift(t2);
     90   output[5] = (tran_low_t)fdct_round_shift(t1);
     91   output[7] = (tran_low_t)fdct_round_shift(t3);
     92 }
     93 
     94 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
     95   tran_high_t step1[8];      // canbe16
     96   tran_high_t step2[8];      // canbe16
     97   tran_high_t step3[8];      // canbe16
     98   tran_high_t input[8];      // canbe16
     99   tran_high_t temp1, temp2;  // needs32
    100 
    101   // step 1
    102   input[0] = in[0] + in[15];
    103   input[1] = in[1] + in[14];
    104   input[2] = in[2] + in[13];
    105   input[3] = in[3] + in[12];
    106   input[4] = in[4] + in[11];
    107   input[5] = in[5] + in[10];
    108   input[6] = in[6] + in[9];
    109   input[7] = in[7] + in[8];
    110 
    111   step1[0] = in[7] - in[8];
    112   step1[1] = in[6] - in[9];
    113   step1[2] = in[5] - in[10];
    114   step1[3] = in[4] - in[11];
    115   step1[4] = in[3] - in[12];
    116   step1[5] = in[2] - in[13];
    117   step1[6] = in[1] - in[14];
    118   step1[7] = in[0] - in[15];
    119 
    120   // fdct8(step, step);
    121   {
    122     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    123     tran_high_t t0, t1, t2, t3;                  // needs32
    124     tran_high_t x0, x1, x2, x3;                  // canbe16
    125 
    126     // stage 1
    127     s0 = input[0] + input[7];
    128     s1 = input[1] + input[6];
    129     s2 = input[2] + input[5];
    130     s3 = input[3] + input[4];
    131     s4 = input[3] - input[4];
    132     s5 = input[2] - input[5];
    133     s6 = input[1] - input[6];
    134     s7 = input[0] - input[7];
    135 
    136     // fdct4(step, step);
    137     x0 = s0 + s3;
    138     x1 = s1 + s2;
    139     x2 = s1 - s2;
    140     x3 = s0 - s3;
    141     t0 = (x0 + x1) * cospi_16_64;
    142     t1 = (x0 - x1) * cospi_16_64;
    143     t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
    144     t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
    145     out[0] = (tran_low_t)fdct_round_shift(t0);
    146     out[4] = (tran_low_t)fdct_round_shift(t2);
    147     out[8] = (tran_low_t)fdct_round_shift(t1);
    148     out[12] = (tran_low_t)fdct_round_shift(t3);
    149 
    150     // Stage 2
    151     t0 = (s6 - s5) * cospi_16_64;
    152     t1 = (s6 + s5) * cospi_16_64;
    153     t2 = fdct_round_shift(t0);
    154     t3 = fdct_round_shift(t1);
    155 
    156     // Stage 3
    157     x0 = s4 + t2;
    158     x1 = s4 - t2;
    159     x2 = s7 - t3;
    160     x3 = s7 + t3;
    161 
    162     // Stage 4
    163     t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
    164     t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
    165     t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    166     t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
    167     out[2] = (tran_low_t)fdct_round_shift(t0);
    168     out[6] = (tran_low_t)fdct_round_shift(t2);
    169     out[10] = (tran_low_t)fdct_round_shift(t1);
    170     out[14] = (tran_low_t)fdct_round_shift(t3);
    171   }
    172 
    173   // step 2
    174   temp1 = (step1[5] - step1[2]) * cospi_16_64;
    175   temp2 = (step1[4] - step1[3]) * cospi_16_64;
    176   step2[2] = fdct_round_shift(temp1);
    177   step2[3] = fdct_round_shift(temp2);
    178   temp1 = (step1[4] + step1[3]) * cospi_16_64;
    179   temp2 = (step1[5] + step1[2]) * cospi_16_64;
    180   step2[4] = fdct_round_shift(temp1);
    181   step2[5] = fdct_round_shift(temp2);
    182 
    183   // step 3
    184   step3[0] = step1[0] + step2[3];
    185   step3[1] = step1[1] + step2[2];
    186   step3[2] = step1[1] - step2[2];
    187   step3[3] = step1[0] - step2[3];
    188   step3[4] = step1[7] - step2[4];
    189   step3[5] = step1[6] - step2[5];
    190   step3[6] = step1[6] + step2[5];
    191   step3[7] = step1[7] + step2[4];
    192 
    193   // step 4
    194   temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
    195   temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
    196   step2[1] = fdct_round_shift(temp1);
    197   step2[2] = fdct_round_shift(temp2);
    198   temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
    199   temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
    200   step2[5] = fdct_round_shift(temp1);
    201   step2[6] = fdct_round_shift(temp2);
    202 
    203   // step 5
    204   step1[0] = step3[0] + step2[1];
    205   step1[1] = step3[0] - step2[1];
    206   step1[2] = step3[3] + step2[2];
    207   step1[3] = step3[3] - step2[2];
    208   step1[4] = step3[4] - step2[5];
    209   step1[5] = step3[4] + step2[5];
    210   step1[6] = step3[7] - step2[6];
    211   step1[7] = step3[7] + step2[6];
    212 
    213   // step 6
    214   temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
    215   temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
    216   out[1] = (tran_low_t)fdct_round_shift(temp1);
    217   out[9] = (tran_low_t)fdct_round_shift(temp2);
    218 
    219   temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
    220   temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
    221   out[5] = (tran_low_t)fdct_round_shift(temp1);
    222   out[13] = (tran_low_t)fdct_round_shift(temp2);
    223 
    224   temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
    225   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
    226   out[3] = (tran_low_t)fdct_round_shift(temp1);
    227   out[11] = (tran_low_t)fdct_round_shift(temp2);
    228 
    229   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
    230   temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
    231   out[7] = (tran_low_t)fdct_round_shift(temp1);
    232   out[15] = (tran_low_t)fdct_round_shift(temp2);
    233 }
    234 
    235 static void fadst4(const tran_low_t *input, tran_low_t *output) {
    236   tran_high_t x0, x1, x2, x3;
    237   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
    238 
    239   x0 = input[0];
    240   x1 = input[1];
    241   x2 = input[2];
    242   x3 = input[3];
    243 
    244   if (!(x0 | x1 | x2 | x3)) {
    245     output[0] = output[1] = output[2] = output[3] = 0;
    246     return;
    247   }
    248 
    249   s0 = sinpi_1_9 * x0;
    250   s1 = sinpi_4_9 * x0;
    251   s2 = sinpi_2_9 * x1;
    252   s3 = sinpi_1_9 * x1;
    253   s4 = sinpi_3_9 * x2;
    254   s5 = sinpi_4_9 * x3;
    255   s6 = sinpi_2_9 * x3;
    256   s7 = x0 + x1 - x3;
    257 
    258   x0 = s0 + s2 + s5;
    259   x1 = sinpi_3_9 * s7;
    260   x2 = s1 - s3 + s6;
    261   x3 = s4;
    262 
    263   s0 = x0 + x3;
    264   s1 = x1;
    265   s2 = x2 - x3;
    266   s3 = x2 - x0 + x3;
    267 
    268   // 1-D transform scaling factor is sqrt(2).
    269   output[0] = (tran_low_t)fdct_round_shift(s0);
    270   output[1] = (tran_low_t)fdct_round_shift(s1);
    271   output[2] = (tran_low_t)fdct_round_shift(s2);
    272   output[3] = (tran_low_t)fdct_round_shift(s3);
    273 }
    274 
    275 static void fadst8(const tran_low_t *input, tran_low_t *output) {
    276   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
    277 
    278   tran_high_t x0 = input[7];
    279   tran_high_t x1 = input[0];
    280   tran_high_t x2 = input[5];
    281   tran_high_t x3 = input[2];
    282   tran_high_t x4 = input[3];
    283   tran_high_t x5 = input[4];
    284   tran_high_t x6 = input[1];
    285   tran_high_t x7 = input[6];
    286 
    287   // stage 1
    288   s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
    289   s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
    290   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
    291   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
    292   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
    293   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
    294   s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
    295   s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
    296 
    297   x0 = fdct_round_shift(s0 + s4);
    298   x1 = fdct_round_shift(s1 + s5);
    299   x2 = fdct_round_shift(s2 + s6);
    300   x3 = fdct_round_shift(s3 + s7);
    301   x4 = fdct_round_shift(s0 - s4);
    302   x5 = fdct_round_shift(s1 - s5);
    303   x6 = fdct_round_shift(s2 - s6);
    304   x7 = fdct_round_shift(s3 - s7);
    305 
    306   // stage 2
    307   s0 = x0;
    308   s1 = x1;
    309   s2 = x2;
    310   s3 = x3;
    311   s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
    312   s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
    313   s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
    314   s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
    315 
    316   x0 = s0 + s2;
    317   x1 = s1 + s3;
    318   x2 = s0 - s2;
    319   x3 = s1 - s3;
    320   x4 = fdct_round_shift(s4 + s6);
    321   x5 = fdct_round_shift(s5 + s7);
    322   x6 = fdct_round_shift(s4 - s6);
    323   x7 = fdct_round_shift(s5 - s7);
    324 
    325   // stage 3
    326   s2 = cospi_16_64 * (x2 + x3);
    327   s3 = cospi_16_64 * (x2 - x3);
    328   s6 = cospi_16_64 * (x6 + x7);
    329   s7 = cospi_16_64 * (x6 - x7);
    330 
    331   x2 = fdct_round_shift(s2);
    332   x3 = fdct_round_shift(s3);
    333   x6 = fdct_round_shift(s6);
    334   x7 = fdct_round_shift(s7);
    335 
    336   output[0] = (tran_low_t)x0;
    337   output[1] = (tran_low_t)-x4;
    338   output[2] = (tran_low_t)x6;
    339   output[3] = (tran_low_t)-x2;
    340   output[4] = (tran_low_t)x3;
    341   output[5] = (tran_low_t)-x7;
    342   output[6] = (tran_low_t)x5;
    343   output[7] = (tran_low_t)-x1;
    344 }
    345 
    346 static void fadst16(const tran_low_t *input, tran_low_t *output) {
    347   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
    348   tran_high_t s9, s10, s11, s12, s13, s14, s15;
    349 
    350   tran_high_t x0 = input[15];
    351   tran_high_t x1 = input[0];
    352   tran_high_t x2 = input[13];
    353   tran_high_t x3 = input[2];
    354   tran_high_t x4 = input[11];
    355   tran_high_t x5 = input[4];
    356   tran_high_t x6 = input[9];
    357   tran_high_t x7 = input[6];
    358   tran_high_t x8 = input[7];
    359   tran_high_t x9 = input[8];
    360   tran_high_t x10 = input[5];
    361   tran_high_t x11 = input[10];
    362   tran_high_t x12 = input[3];
    363   tran_high_t x13 = input[12];
    364   tran_high_t x14 = input[1];
    365   tran_high_t x15 = input[14];
    366 
    367   // stage 1
    368   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
    369   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
    370   s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
    371   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
    372   s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
    373   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
    374   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
    375   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
    376   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
    377   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
    378   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
    379   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
    380   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
    381   s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
    382   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
    383   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
    384 
    385   x0 = fdct_round_shift(s0 + s8);
    386   x1 = fdct_round_shift(s1 + s9);
    387   x2 = fdct_round_shift(s2 + s10);
    388   x3 = fdct_round_shift(s3 + s11);
    389   x4 = fdct_round_shift(s4 + s12);
    390   x5 = fdct_round_shift(s5 + s13);
    391   x6 = fdct_round_shift(s6 + s14);
    392   x7 = fdct_round_shift(s7 + s15);
    393   x8 = fdct_round_shift(s0 - s8);
    394   x9 = fdct_round_shift(s1 - s9);
    395   x10 = fdct_round_shift(s2 - s10);
    396   x11 = fdct_round_shift(s3 - s11);
    397   x12 = fdct_round_shift(s4 - s12);
    398   x13 = fdct_round_shift(s5 - s13);
    399   x14 = fdct_round_shift(s6 - s14);
    400   x15 = fdct_round_shift(s7 - s15);
    401 
    402   // stage 2
    403   s0 = x0;
    404   s1 = x1;
    405   s2 = x2;
    406   s3 = x3;
    407   s4 = x4;
    408   s5 = x5;
    409   s6 = x6;
    410   s7 = x7;
    411   s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
    412   s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
    413   s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
    414   s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
    415   s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
    416   s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
    417   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
    418   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
    419 
    420   x0 = s0 + s4;
    421   x1 = s1 + s5;
    422   x2 = s2 + s6;
    423   x3 = s3 + s7;
    424   x4 = s0 - s4;
    425   x5 = s1 - s5;
    426   x6 = s2 - s6;
    427   x7 = s3 - s7;
    428   x8 = fdct_round_shift(s8 + s12);
    429   x9 = fdct_round_shift(s9 + s13);
    430   x10 = fdct_round_shift(s10 + s14);
    431   x11 = fdct_round_shift(s11 + s15);
    432   x12 = fdct_round_shift(s8 - s12);
    433   x13 = fdct_round_shift(s9 - s13);
    434   x14 = fdct_round_shift(s10 - s14);
    435   x15 = fdct_round_shift(s11 - s15);
    436 
    437   // stage 3
    438   s0 = x0;
    439   s1 = x1;
    440   s2 = x2;
    441   s3 = x3;
    442   s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
    443   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
    444   s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
    445   s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
    446   s8 = x8;
    447   s9 = x9;
    448   s10 = x10;
    449   s11 = x11;
    450   s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
    451   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
    452   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
    453   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
    454 
    455   x0 = s0 + s2;
    456   x1 = s1 + s3;
    457   x2 = s0 - s2;
    458   x3 = s1 - s3;
    459   x4 = fdct_round_shift(s4 + s6);
    460   x5 = fdct_round_shift(s5 + s7);
    461   x6 = fdct_round_shift(s4 - s6);
    462   x7 = fdct_round_shift(s5 - s7);
    463   x8 = s8 + s10;
    464   x9 = s9 + s11;
    465   x10 = s8 - s10;
    466   x11 = s9 - s11;
    467   x12 = fdct_round_shift(s12 + s14);
    468   x13 = fdct_round_shift(s13 + s15);
    469   x14 = fdct_round_shift(s12 - s14);
    470   x15 = fdct_round_shift(s13 - s15);
    471 
    472   // stage 4
    473   s2 = (-cospi_16_64) * (x2 + x3);
    474   s3 = cospi_16_64 * (x2 - x3);
    475   s6 = cospi_16_64 * (x6 + x7);
    476   s7 = cospi_16_64 * (-x6 + x7);
    477   s10 = cospi_16_64 * (x10 + x11);
    478   s11 = cospi_16_64 * (-x10 + x11);
    479   s14 = (-cospi_16_64) * (x14 + x15);
    480   s15 = cospi_16_64 * (x14 - x15);
    481 
    482   x2 = fdct_round_shift(s2);
    483   x3 = fdct_round_shift(s3);
    484   x6 = fdct_round_shift(s6);
    485   x7 = fdct_round_shift(s7);
    486   x10 = fdct_round_shift(s10);
    487   x11 = fdct_round_shift(s11);
    488   x14 = fdct_round_shift(s14);
    489   x15 = fdct_round_shift(s15);
    490 
    491   output[0] = (tran_low_t)x0;
    492   output[1] = (tran_low_t)-x8;
    493   output[2] = (tran_low_t)x12;
    494   output[3] = (tran_low_t)-x4;
    495   output[4] = (tran_low_t)x6;
    496   output[5] = (tran_low_t)x14;
    497   output[6] = (tran_low_t)x10;
    498   output[7] = (tran_low_t)x2;
    499   output[8] = (tran_low_t)x3;
    500   output[9] = (tran_low_t)x11;
    501   output[10] = (tran_low_t)x15;
    502   output[11] = (tran_low_t)x7;
    503   output[12] = (tran_low_t)x5;
    504   output[13] = (tran_low_t)-x13;
    505   output[14] = (tran_low_t)x9;
    506   output[15] = (tran_low_t)-x1;
    507 }
    508 
    509 static const transform_2d FHT_4[] = {
    510   { fdct4, fdct4 },   // DCT_DCT  = 0
    511   { fadst4, fdct4 },  // ADST_DCT = 1
    512   { fdct4, fadst4 },  // DCT_ADST = 2
    513   { fadst4, fadst4 }  // ADST_ADST = 3
    514 };
    515 
    516 static const transform_2d FHT_8[] = {
    517   { fdct8, fdct8 },   // DCT_DCT  = 0
    518   { fadst8, fdct8 },  // ADST_DCT = 1
    519   { fdct8, fadst8 },  // DCT_ADST = 2
    520   { fadst8, fadst8 }  // ADST_ADST = 3
    521 };
    522 
    523 static const transform_2d FHT_16[] = {
    524   { fdct16, fdct16 },   // DCT_DCT  = 0
    525   { fadst16, fdct16 },  // ADST_DCT = 1
    526   { fdct16, fadst16 },  // DCT_ADST = 2
    527   { fadst16, fadst16 }  // ADST_ADST = 3
    528 };
    529 
    530 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
    531                   int tx_type) {
    532   if (tx_type == DCT_DCT) {
    533     vpx_fdct4x4_c(input, output, stride);
    534   } else {
    535     tran_low_t out[4 * 4];
    536     int i, j;
    537     tran_low_t temp_in[4], temp_out[4];
    538     const transform_2d ht = FHT_4[tx_type];
    539 
    540     // Columns
    541     for (i = 0; i < 4; ++i) {
    542       for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
    543       if (i == 0 && temp_in[0]) temp_in[0] += 1;
    544       ht.cols(temp_in, temp_out);
    545       for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
    546     }
    547 
    548     // Rows
    549     for (i = 0; i < 4; ++i) {
    550       for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
    551       ht.rows(temp_in, temp_out);
    552       for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
    553     }
    554   }
    555 }
    556 
    557 void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
    558                          tran_low_t *coeff_ptr, intptr_t n_coeffs,
    559                          int skip_block, const int16_t *round_ptr,
    560                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
    561                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
    562                          uint16_t *eob_ptr, const int16_t *scan,
    563                          const int16_t *iscan) {
    564   int eob = -1;
    565 
    566   int i, j;
    567   tran_low_t intermediate[64];
    568 
    569   (void)iscan;
    570 
    571   // Transform columns
    572   {
    573     tran_low_t *output = intermediate;
    574     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    575     tran_high_t t0, t1, t2, t3;                  // needs32
    576     tran_high_t x0, x1, x2, x3;                  // canbe16
    577 
    578     int i;
    579     for (i = 0; i < 8; i++) {
    580       // stage 1
    581       s0 = (input[0 * stride] + input[7 * stride]) * 4;
    582       s1 = (input[1 * stride] + input[6 * stride]) * 4;
    583       s2 = (input[2 * stride] + input[5 * stride]) * 4;
    584       s3 = (input[3 * stride] + input[4 * stride]) * 4;
    585       s4 = (input[3 * stride] - input[4 * stride]) * 4;
    586       s5 = (input[2 * stride] - input[5 * stride]) * 4;
    587       s6 = (input[1 * stride] - input[6 * stride]) * 4;
    588       s7 = (input[0 * stride] - input[7 * stride]) * 4;
    589 
    590       // fdct4(step, step);
    591       x0 = s0 + s3;
    592       x1 = s1 + s2;
    593       x2 = s1 - s2;
    594       x3 = s0 - s3;
    595       t0 = (x0 + x1) * cospi_16_64;
    596       t1 = (x0 - x1) * cospi_16_64;
    597       t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
    598       t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
    599       output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
    600       output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
    601       output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
    602       output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
    603 
    604       // Stage 2
    605       t0 = (s6 - s5) * cospi_16_64;
    606       t1 = (s6 + s5) * cospi_16_64;
    607       t2 = fdct_round_shift(t0);
    608       t3 = fdct_round_shift(t1);
    609 
    610       // Stage 3
    611       x0 = s4 + t2;
    612       x1 = s4 - t2;
    613       x2 = s7 - t3;
    614       x3 = s7 + t3;
    615 
    616       // Stage 4
    617       t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
    618       t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
    619       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    620       t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
    621       output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
    622       output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
    623       output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
    624       output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
    625       input++;
    626       output++;
    627     }
    628   }
    629 
    630   // Rows
    631   for (i = 0; i < 8; ++i) {
    632     fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
    633     for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2;
    634   }
    635 
    636   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
    637   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
    638 
    639   if (!skip_block) {
    640     // Quantization pass: All coefficients with index >= zero_flag are
    641     // skippable. Note: zero_flag can be zero.
    642     for (i = 0; i < n_coeffs; i++) {
    643       const int rc = scan[i];
    644       const int coeff = coeff_ptr[rc];
    645       const int coeff_sign = (coeff >> 31);
    646       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    647 
    648       int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
    649       tmp = (tmp * quant_ptr[rc != 0]) >> 16;
    650 
    651       qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
    652       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
    653 
    654       if (tmp) eob = i;
    655     }
    656   }
    657   *eob_ptr = eob + 1;
    658 }
    659 
    660 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
    661                   int tx_type) {
    662   if (tx_type == DCT_DCT) {
    663     vpx_fdct8x8_c(input, output, stride);
    664   } else {
    665     tran_low_t out[64];
    666     int i, j;
    667     tran_low_t temp_in[8], temp_out[8];
    668     const transform_2d ht = FHT_8[tx_type];
    669 
    670     // Columns
    671     for (i = 0; i < 8; ++i) {
    672       for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
    673       ht.cols(temp_in, temp_out);
    674       for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
    675     }
    676 
    677     // Rows
    678     for (i = 0; i < 8; ++i) {
    679       for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
    680       ht.rows(temp_in, temp_out);
    681       for (j = 0; j < 8; ++j)
    682         output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
    683     }
    684   }
    685 }
    686 
    687 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
    688    pixel. */
    689 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
    690   int i;
    691   tran_high_t a1, b1, c1, d1, e1;
    692   const int16_t *ip_pass0 = input;
    693   const tran_low_t *ip = NULL;
    694   tran_low_t *op = output;
    695 
    696   for (i = 0; i < 4; i++) {
    697     a1 = ip_pass0[0 * stride];
    698     b1 = ip_pass0[1 * stride];
    699     c1 = ip_pass0[2 * stride];
    700     d1 = ip_pass0[3 * stride];
    701 
    702     a1 += b1;
    703     d1 = d1 - c1;
    704     e1 = (a1 - d1) >> 1;
    705     b1 = e1 - b1;
    706     c1 = e1 - c1;
    707     a1 -= c1;
    708     d1 += b1;
    709     op[0] = (tran_low_t)a1;
    710     op[4] = (tran_low_t)c1;
    711     op[8] = (tran_low_t)d1;
    712     op[12] = (tran_low_t)b1;
    713 
    714     ip_pass0++;
    715     op++;
    716   }
    717   ip = output;
    718   op = output;
    719 
    720   for (i = 0; i < 4; i++) {
    721     a1 = ip[0];
    722     b1 = ip[1];
    723     c1 = ip[2];
    724     d1 = ip[3];
    725 
    726     a1 += b1;
    727     d1 -= c1;
    728     e1 = (a1 - d1) >> 1;
    729     b1 = e1 - b1;
    730     c1 = e1 - c1;
    731     a1 -= c1;
    732     d1 += b1;
    733     op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
    734     op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
    735     op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
    736     op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
    737 
    738     ip += 4;
    739     op += 4;
    740   }
    741 }
    742 
    743 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
    744                     int tx_type) {
    745   if (tx_type == DCT_DCT) {
    746     vpx_fdct16x16_c(input, output, stride);
    747   } else {
    748     tran_low_t out[256];
    749     int i, j;
    750     tran_low_t temp_in[16], temp_out[16];
    751     const transform_2d ht = FHT_16[tx_type];
    752 
    753     // Columns
    754     for (i = 0; i < 16; ++i) {
    755       for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
    756       ht.cols(temp_in, temp_out);
    757       for (j = 0; j < 16; ++j)
    758         out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
    759     }
    760 
    761     // Rows
    762     for (i = 0; i < 16; ++i) {
    763       for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
    764       ht.rows(temp_in, temp_out);
    765       for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
    766     }
    767   }
    768 }
    769 
    770 #if CONFIG_VP9_HIGHBITDEPTH
    771 void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
    772                          int tx_type) {
    773   vp9_fht4x4_c(input, output, stride, tx_type);
    774 }
    775 
    776 void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
    777                          int tx_type) {
    778   vp9_fht8x8_c(input, output, stride, tx_type);
    779 }
    780 
    781 void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
    782                           int stride) {
    783   vp9_fwht4x4_c(input, output, stride);
    784 }
    785 
    786 void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
    787                            int tx_type) {
    788   vp9_fht16x16_c(input, output, stride, tx_type);
    789 }
    790 #endif  // CONFIG_VP9_HIGHBITDEPTH
    791