Home | History | Annotate | Download | only in common
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19  *******************************************************************************
     20  * @file
     21  *  ihevc_itrans_recon_16x16.c
     22  *
     23  * @brief
     24  *  Contains function definitions for inverse transform  and reconstruction 16x16
     25  *
     26  *
     27  * @author
     28  *  100470
     29  *
     30  * @par List of Functions:
     31  *  - ihevc_itrans_recon_16x16()
     32  *
     33  * @remarks
     34  *  None
     35  *
     36  *******************************************************************************
     37  */
     38 #include <stdio.h>
     39 #include <string.h>
     40 #include "ihevc_typedefs.h"
     41 #include "ihevc_macros.h"
     42 #include "ihevc_platform_macros.h"
     43 #include "ihevc_defs.h"
     44 #include "ihevc_trans_tables.h"
     45 #include "ihevc_itrans_recon.h"
     46 #include "ihevc_func_selector.h"
     47 #include "ihevc_trans_macros.h"
     48 
     49 /**
     50  *******************************************************************************
     51  *
     52  * @brief
     53  *  This function performs Inverse transform  and reconstruction for 16x16
     54  * input block
     55  *
     56  * @par Description:
     57  *  Performs inverse transform and adds the prediction  data and clips output
     58  * to 8 bit
     59  *
     60  * @param[in] pi2_src
     61  *  Input 16x16 coefficients
     62  *
     63  * @param[in] pi2_tmp
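     64  *  Temporary 16x16 buffer that stores the output of the
     65  *  1st stage of the inverse transform. The function writes
     66  *  this buffer in the 1st stage and reads it back in the
     67  *  2nd stage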
     68  *
     69  * @param[in] pu1_pred
     70  *  Prediction 16x16 block
     71  *
     72  * @param[out] pu1_dst
     73  *  Output 16x16 block
     74  *
     75  * @param[in] src_strd
     76  *  Input stride
     77  *
     78  * @param[in] pred_strd
     79  *  Prediction stride
     80  *
     81  * @param[in] dst_strd
     82  *  Output Stride
     83  *
     84  * @param[in] zero_cols
     85  *  Bitmask of zero columns in pi2_src (bit n set: column n is zero)
     86  *
     87  * @param[in] zero_rows
     88  *  Bitmask of zero rows in pi2_src (bit n set: row n is zero)
     89  *
     90  * @returns  Void
     91  *
     92  * @remarks
     93  *  None
     94  *
     95  *******************************************************************************
     96  */
     97 
     98 void ihevc_itrans_recon_16x16(WORD16 *pi2_src,
     99                               WORD16 *pi2_tmp,
    100                               UWORD8 *pu1_pred,
    101                               UWORD8 *pu1_dst,
    102                               WORD32 src_strd,
    103                               WORD32 pred_strd,
    104                               WORD32 dst_strd,
    105                               WORD32 zero_cols,
    106                               WORD32 zero_rows)
    107 {
    108     WORD32 j, k;
    109     WORD32 e[8], o[8];
    110     WORD32 ee[4], eo[4];
    111     WORD32 eee[2], eeo[2];
    112     WORD32 add;
    113     WORD32 shift;
    114     WORD16 *pi2_tmp_orig;
    115     WORD32 trans_size;
    116     WORD32 zero_rows_2nd_stage = zero_cols;
    117     WORD32 row_limit_2nd_stage;
    118 
    119     if((zero_cols & 0xFFF0) == 0xFFF0)
    120         row_limit_2nd_stage = 4;
    121     else if((zero_cols & 0xFF00) == 0xFF00)
    122         row_limit_2nd_stage = 8;
    123     else
    124         row_limit_2nd_stage = TRANS_SIZE_16;
    125 
    126     trans_size = TRANS_SIZE_16;
    127     pi2_tmp_orig = pi2_tmp;
    128     if((zero_rows & 0xFFF0) == 0xFFF0)  /* First 4 rows of input are non-zero */
    129     {
    130         /* Inverse Transform 1st stage */
    131         /************************************************************************************************/
    132         /**********************************START - IT_RECON_16x16****************************************/
    133         /************************************************************************************************/
    134 
    135         shift = IT_SHIFT_STAGE_1;
    136         add = 1 << (shift - 1);
    137 
    138         for(j = 0; j < row_limit_2nd_stage; j++)
    139         {
    140             /* Checking for Zero Cols */
    141             if((zero_cols & 1) == 1)
    142             {
    143                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
    144             }
    145             else
    146             {
    147                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    148                 for(k = 0; k < 8; k++)
    149                 {
    150                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
    151                                     + g_ai2_ihevc_trans_16[3][k]
    152                                                     * pi2_src[3 * src_strd];
    153                 }
    154                 for(k = 0; k < 4; k++)
    155                 {
    156                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
    157                 }
    158                 eeo[0] = 0;
    159                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
    160                 eeo[1] = 0;
    161                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
    162 
    163                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    164                 for(k = 0; k < 2; k++)
    165                 {
    166                     ee[k] = eee[k] + eeo[k];
    167                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    168                 }
    169                 for(k = 0; k < 4; k++)
    170                 {
    171                     e[k] = ee[k] + eo[k];
    172                     e[k + 4] = ee[3 - k] - eo[3 - k];
    173                 }
    174                 for(k = 0; k < 8; k++)
    175                 {
    176                     pi2_tmp[k] =
    177                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    178                     pi2_tmp[k + 8] =
    179                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    180                 }
    181             }
    182             pi2_src++;
    183             pi2_tmp += trans_size;
    184             zero_cols = zero_cols >> 1;
    185         }
    186 
    187         pi2_tmp = pi2_tmp_orig;
    188 
    189         /* Inverse Transform 2nd stage */
    190         shift = IT_SHIFT_STAGE_2;
    191         add = 1 << (shift - 1);
    192 
    193         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
    194         {
    195             for(j = 0; j < trans_size; j++)
    196             {
    197                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    198                 for(k = 0; k < 8; k++)
    199                 {
    200                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    201                                     + g_ai2_ihevc_trans_16[3][k]
    202                                                     * pi2_tmp[3 * trans_size];
    203                 }
    204                 for(k = 0; k < 4; k++)
    205                 {
    206                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
    207                 }
    208                 eeo[0] = 0;
    209                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    210                 eeo[1] = 0;
    211                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    212 
    213                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    214                 for(k = 0; k < 2; k++)
    215                 {
    216                     ee[k] = eee[k] + eeo[k];
    217                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    218                 }
    219                 for(k = 0; k < 4; k++)
    220                 {
    221                     e[k] = ee[k] + eo[k];
    222                     e[k + 4] = ee[3 - k] - eo[3 - k];
    223                 }
    224                 for(k = 0; k < 8; k++)
    225                 {
    226                     WORD32 itrans_out;
    227                     itrans_out =
    228                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    229                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
    230                     itrans_out =
    231                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    232                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
    233                 }
    234                 pi2_tmp++;
    235                 pu1_pred += pred_strd;
    236                 pu1_dst += dst_strd;
    237             }
    238         }
    239         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
    240         {
    241             for(j = 0; j < trans_size; j++)
    242             {
    243                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    244                 for(k = 0; k < 8; k++)
    245                 {
    246                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    247                                     + g_ai2_ihevc_trans_16[3][k]
    248                                                     * pi2_tmp[3 * trans_size]
    249                                     + g_ai2_ihevc_trans_16[5][k]
    250                                                     * pi2_tmp[5 * trans_size]
    251                                     + g_ai2_ihevc_trans_16[7][k]
    252                                                     * pi2_tmp[7 * trans_size];
    253                 }
    254                 for(k = 0; k < 4; k++)
    255                 {
    256                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    257                                     + g_ai2_ihevc_trans_16[6][k]
    258                                                     * pi2_tmp[6 * trans_size];
    259                 }
    260                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
    261                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    262                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
    263                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    264 
    265                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    266                 for(k = 0; k < 2; k++)
    267                 {
    268                     ee[k] = eee[k] + eeo[k];
    269                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    270                 }
    271                 for(k = 0; k < 4; k++)
    272                 {
    273                     e[k] = ee[k] + eo[k];
    274                     e[k + 4] = ee[3 - k] - eo[3 - k];
    275                 }
    276                 for(k = 0; k < 8; k++)
    277                 {
    278                     WORD32 itrans_out;
    279                     itrans_out =
    280                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    281                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
    282                     itrans_out =
    283                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    284                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
    285                 }
    286                 pi2_tmp++;
    287                 pu1_pred += pred_strd;
    288                 pu1_dst += dst_strd;
    289             }
    290         }
    291         else /* All rows of output of 1st stage are non-zero */
    292         {
    293             for(j = 0; j < trans_size; j++)
    294             {
    295                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    296                 for(k = 0; k < 8; k++)
    297                 {
    298                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    299                                     + g_ai2_ihevc_trans_16[3][k]
    300                                                     * pi2_tmp[3 * trans_size]
    301                                     + g_ai2_ihevc_trans_16[5][k]
    302                                                     * pi2_tmp[5 * trans_size]
    303                                     + g_ai2_ihevc_trans_16[7][k]
    304                                                     * pi2_tmp[7 * trans_size]
    305                                     + g_ai2_ihevc_trans_16[9][k]
    306                                                     * pi2_tmp[9 * trans_size]
    307                                     + g_ai2_ihevc_trans_16[11][k]
    308                                                     * pi2_tmp[11 * trans_size]
    309                                     + g_ai2_ihevc_trans_16[13][k]
    310                                                     * pi2_tmp[13 * trans_size]
    311                                     + g_ai2_ihevc_trans_16[15][k]
    312                                                     * pi2_tmp[15 * trans_size];
    313                 }
    314                 for(k = 0; k < 4; k++)
    315                 {
    316                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    317                                     + g_ai2_ihevc_trans_16[6][k]
    318                                                     * pi2_tmp[6 * trans_size]
    319                                     + g_ai2_ihevc_trans_16[10][k]
    320                                                     * pi2_tmp[10 * trans_size]
    321                                     + g_ai2_ihevc_trans_16[14][k]
    322                                                     * pi2_tmp[14 * trans_size];
    323                 }
    324                 eeo[0] =
    325                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
    326                                                 + g_ai2_ihevc_trans_16[12][0]
    327                                                                 * pi2_tmp[12
    328                                                                                 * trans_size];
    329                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
    330                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
    331                 eeo[1] =
    332                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
    333                                                 + g_ai2_ihevc_trans_16[12][1]
    334                                                                 * pi2_tmp[12
    335                                                                                 * trans_size];
    336                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
    337                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
    338 
    339                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    340                 for(k = 0; k < 2; k++)
    341                 {
    342                     ee[k] = eee[k] + eeo[k];
    343                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    344                 }
    345                 for(k = 0; k < 4; k++)
    346                 {
    347                     e[k] = ee[k] + eo[k];
    348                     e[k + 4] = ee[3 - k] - eo[3 - k];
    349                 }
    350                 for(k = 0; k < 8; k++)
    351                 {
    352                     WORD32 itrans_out;
    353                     itrans_out =
    354                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    355                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
    356                     itrans_out =
    357                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    358                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
    359                 }
    360                 pi2_tmp++;
    361                 pu1_pred += pred_strd;
    362                 pu1_dst += dst_strd;
    363             }
    364         }
    365         /************************************************************************************************/
    366         /************************************END - IT_RECON_16x16****************************************/
    367         /************************************************************************************************/
    368     }
    369     else if((zero_rows & 0xFF00) == 0xFF00)  /* First 8 rows of input are non-zero */
    370     {
    371         /* Inverse Transform 1st stage */
    372         /************************************************************************************************/
    373         /**********************************START - IT_RECON_16x16****************************************/
    374         /************************************************************************************************/
    375 
    376         shift = IT_SHIFT_STAGE_1;
    377         add = 1 << (shift - 1);
    378 
    379         for(j = 0; j < row_limit_2nd_stage; j++)
    380         {
    381             /* Checking for Zero Cols */
    382             if((zero_cols & 1) == 1)
    383             {
    384                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
    385             }
    386             else
    387             {
    388                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    389                 for(k = 0; k < 8; k++)
    390                 {
    391                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
    392                                     + g_ai2_ihevc_trans_16[3][k]
    393                                                     * pi2_src[3 * src_strd]
    394                                     + g_ai2_ihevc_trans_16[5][k]
    395                                                     * pi2_src[5 * src_strd]
    396                                     + g_ai2_ihevc_trans_16[7][k]
    397                                                     * pi2_src[7 * src_strd];
    398                 }
    399                 for(k = 0; k < 4; k++)
    400                 {
    401                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
    402                                     + g_ai2_ihevc_trans_16[6][k]
    403                                                     * pi2_src[6 * src_strd];
    404                 }
    405                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
    406                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
    407                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
    408                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
    409 
    410                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    411                 for(k = 0; k < 2; k++)
    412                 {
    413                     ee[k] = eee[k] + eeo[k];
    414                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    415                 }
    416                 for(k = 0; k < 4; k++)
    417                 {
    418                     e[k] = ee[k] + eo[k];
    419                     e[k + 4] = ee[3 - k] - eo[3 - k];
    420                 }
    421                 for(k = 0; k < 8; k++)
    422                 {
    423                     pi2_tmp[k] =
    424                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    425                     pi2_tmp[k + 8] =
    426                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    427                 }
    428             }
    429             pi2_src++;
    430             pi2_tmp += trans_size;
    431             zero_cols = zero_cols >> 1;
    432         }
    433 
    434         pi2_tmp = pi2_tmp_orig;
    435 
    436         /* Inverse Transform 2nd stage */
    437         shift = IT_SHIFT_STAGE_2;
    438         add = 1 << (shift - 1);
    439 
    440         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
    441         {
    442             for(j = 0; j < trans_size; j++)
    443             {
    444                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    445                 for(k = 0; k < 8; k++)
    446                 {
    447                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    448                                     + g_ai2_ihevc_trans_16[3][k]
    449                                                     * pi2_tmp[3 * trans_size];
    450                 }
    451                 for(k = 0; k < 4; k++)
    452                 {
    453                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
    454                 }
    455                 eeo[0] = 0;
    456                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    457                 eeo[1] = 0;
    458                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    459 
    460                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    461                 for(k = 0; k < 2; k++)
    462                 {
    463                     ee[k] = eee[k] + eeo[k];
    464                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    465                 }
    466                 for(k = 0; k < 4; k++)
    467                 {
    468                     e[k] = ee[k] + eo[k];
    469                     e[k + 4] = ee[3 - k] - eo[3 - k];
    470                 }
    471                 for(k = 0; k < 8; k++)
    472                 {
    473                     WORD32 itrans_out;
    474                     itrans_out =
    475                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    476                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
    477                     itrans_out =
    478                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    479                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
    480                 }
    481                 pi2_tmp++;
    482                 pu1_pred += pred_strd;
    483                 pu1_dst += dst_strd;
    484             }
    485         }
    486         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
    487         {
    488             for(j = 0; j < trans_size; j++)
    489             {
    490                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    491                 for(k = 0; k < 8; k++)
    492                 {
    493                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    494                                     + g_ai2_ihevc_trans_16[3][k]
    495                                                     * pi2_tmp[3 * trans_size]
    496                                     + g_ai2_ihevc_trans_16[5][k]
    497                                                     * pi2_tmp[5 * trans_size]
    498                                     + g_ai2_ihevc_trans_16[7][k]
    499                                                     * pi2_tmp[7 * trans_size];
    500                 }
    501                 for(k = 0; k < 4; k++)
    502                 {
    503                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    504                                     + g_ai2_ihevc_trans_16[6][k]
    505                                                     * pi2_tmp[6 * trans_size];
    506                 }
    507                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
    508                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    509                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
    510                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    511 
    512                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    513                 for(k = 0; k < 2; k++)
    514                 {
    515                     ee[k] = eee[k] + eeo[k];
    516                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    517                 }
    518                 for(k = 0; k < 4; k++)
    519                 {
    520                     e[k] = ee[k] + eo[k];
    521                     e[k + 4] = ee[3 - k] - eo[3 - k];
    522                 }
    523                 for(k = 0; k < 8; k++)
    524                 {
    525                     WORD32 itrans_out;
    526                     itrans_out =
    527                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    528                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
    529                     itrans_out =
    530                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    531                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
    532                 }
    533                 pi2_tmp++;
    534                 pu1_pred += pred_strd;
    535                 pu1_dst += dst_strd;
    536             }
    537         }
    538         else /* All rows of output of 1st stage are non-zero */
    539         {
    540             for(j = 0; j < trans_size; j++)
    541             {
    542                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    543                 for(k = 0; k < 8; k++)
    544                 {
    545                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    546                                     + g_ai2_ihevc_trans_16[3][k]
    547                                                     * pi2_tmp[3 * trans_size]
    548                                     + g_ai2_ihevc_trans_16[5][k]
    549                                                     * pi2_tmp[5 * trans_size]
    550                                     + g_ai2_ihevc_trans_16[7][k]
    551                                                     * pi2_tmp[7 * trans_size]
    552                                     + g_ai2_ihevc_trans_16[9][k]
    553                                                     * pi2_tmp[9 * trans_size]
    554                                     + g_ai2_ihevc_trans_16[11][k]
    555                                                     * pi2_tmp[11 * trans_size]
    556                                     + g_ai2_ihevc_trans_16[13][k]
    557                                                     * pi2_tmp[13 * trans_size]
    558                                     + g_ai2_ihevc_trans_16[15][k]
    559                                                     * pi2_tmp[15 * trans_size];
    560                 }
    561                 for(k = 0; k < 4; k++)
    562                 {
    563                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    564                                     + g_ai2_ihevc_trans_16[6][k]
    565                                                     * pi2_tmp[6 * trans_size]
    566                                     + g_ai2_ihevc_trans_16[10][k]
    567                                                     * pi2_tmp[10 * trans_size]
    568                                     + g_ai2_ihevc_trans_16[14][k]
    569                                                     * pi2_tmp[14 * trans_size];
    570                 }
    571                 eeo[0] =
    572                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
    573                                                 + g_ai2_ihevc_trans_16[12][0]
    574                                                                 * pi2_tmp[12
    575                                                                                 * trans_size];
    576                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
    577                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
    578                 eeo[1] =
    579                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
    580                                                 + g_ai2_ihevc_trans_16[12][1]
    581                                                                 * pi2_tmp[12
    582                                                                                 * trans_size];
    583                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
    584                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
    585 
    586                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    587                 for(k = 0; k < 2; k++)
    588                 {
    589                     ee[k] = eee[k] + eeo[k];
    590                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    591                 }
    592                 for(k = 0; k < 4; k++)
    593                 {
    594                     e[k] = ee[k] + eo[k];
    595                     e[k + 4] = ee[3 - k] - eo[3 - k];
    596                 }
    597                 for(k = 0; k < 8; k++)
    598                 {
    599                     WORD32 itrans_out;
    600                     itrans_out =
    601                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    602                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
    603                     itrans_out =
    604                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    605                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
    606                 }
    607                 pi2_tmp++;
    608                 pu1_pred += pred_strd;
    609                 pu1_dst += dst_strd;
    610             }
    611         }
    612         /************************************************************************************************/
    613         /************************************END - IT_RECON_16x16****************************************/
    614         /************************************************************************************************/
    615     }
    616     else  /* All rows of input are non-zero */
    617     {
    618         /* Inverse Transform 1st stage */
    619         /************************************************************************************************/
    620         /**********************************START - IT_RECON_16x16****************************************/
    621         /************************************************************************************************/
    622 
    623         shift = IT_SHIFT_STAGE_1;
    624         add = 1 << (shift - 1);
    625 
    626         for(j = 0; j < row_limit_2nd_stage; j++)
    627         {
    628             /* Checking for Zero Cols */
    629             if((zero_cols & 1) == 1)
    630             {
    631                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
    632             }
    633             else
    634             {
    635                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    636                 for(k = 0; k < 8; k++)
    637                 {
    638                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
    639                                     + g_ai2_ihevc_trans_16[3][k]
    640                                                     * pi2_src[3 * src_strd]
    641                                     + g_ai2_ihevc_trans_16[5][k]
    642                                                     * pi2_src[5 * src_strd]
    643                                     + g_ai2_ihevc_trans_16[7][k]
    644                                                     * pi2_src[7 * src_strd]
    645                                     + g_ai2_ihevc_trans_16[9][k]
    646                                                     * pi2_src[9 * src_strd]
    647                                     + g_ai2_ihevc_trans_16[11][k]
    648                                                     * pi2_src[11 * src_strd]
    649                                     + g_ai2_ihevc_trans_16[13][k]
    650                                                     * pi2_src[13 * src_strd]
    651                                     + g_ai2_ihevc_trans_16[15][k]
    652                                                     * pi2_src[15 * src_strd];
    653                 }
    654                 for(k = 0; k < 4; k++)
    655                 {
    656                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
    657                                     + g_ai2_ihevc_trans_16[6][k]
    658                                                     * pi2_src[6 * src_strd]
    659                                     + g_ai2_ihevc_trans_16[10][k]
    660                                                     * pi2_src[10 * src_strd]
    661                                     + g_ai2_ihevc_trans_16[14][k]
    662                                                     * pi2_src[14 * src_strd];
    663                 }
    664                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
    665                                 + g_ai2_ihevc_trans_16[12][0]
    666                                                 * pi2_src[12 * src_strd];
    667                 eee[0] =
    668                                 g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
    669                                                 + g_ai2_ihevc_trans_16[8][0]
    670                                                                 * pi2_src[8
    671                                                                                 * src_strd];
    672                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
    673                                 + g_ai2_ihevc_trans_16[12][1]
    674                                                 * pi2_src[12 * src_strd];
    675                 eee[1] =
    676                                 g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
    677                                                 + g_ai2_ihevc_trans_16[8][1]
    678                                                                 * pi2_src[8
    679                                                                                 * src_strd];
    680 
    681                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
    682                 for(k = 0; k < 2; k++)
    683                 {
    684                     ee[k] = eee[k] + eeo[k];
    685                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    686                 }
    687                 for(k = 0; k < 4; k++)
    688                 {
    689                     e[k] = ee[k] + eo[k];
    690                     e[k + 4] = ee[3 - k] - eo[3 - k];
    691                 }
    692                 for(k = 0; k < 8; k++)
    693                 {
    694                     pi2_tmp[k] =
    695                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    696                     pi2_tmp[k + 8] =
    697                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    698                 }
    699             }
    700             pi2_src++;
    701             pi2_tmp += trans_size;
    702             zero_cols = zero_cols >> 1;
    703         }
    704 
    705         pi2_tmp = pi2_tmp_orig;
    706 
    707         /* Inverse Transform 2nd stage */
    708         shift = IT_SHIFT_STAGE_2;
    709         add = 1 << (shift - 1);
    710 
    711         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
    712         {
    713             for(j = 0; j < trans_size; j++)
    714             {
    715                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    716                 for(k = 0; k < 8; k++)
    717                 {
    718                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    719                                     + g_ai2_ihevc_trans_16[3][k]
    720                                                     * pi2_tmp[3 * trans_size];
    721                 }
    722                 for(k = 0; k < 4; k++)
    723                 {
    724                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
    725                 }
    726                 eeo[0] = 0;
    727                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    728                 eeo[1] = 0;
    729                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    730 
    731                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
    732                 for(k = 0; k < 2; k++)
    733                 {
    734                     ee[k] = eee[k] + eeo[k];
    735                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    736                 }
    737                 for(k = 0; k < 4; k++)
    738                 {
    739                     e[k] = ee[k] + eo[k];
    740                     e[k + 4] = ee[3 - k] - eo[3 - k];
    741                 }
    742                 for(k = 0; k < 8; k++)
    743                 {
    744                     WORD32 itrans_out;
    745                     itrans_out =
    746                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    747                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
    748                     itrans_out =
    749                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    750                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
    751                 }
    752                 pi2_tmp++;
    753                 pu1_pred += pred_strd;
    754                 pu1_dst += dst_strd;
    755             }
    756         }
    757         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
    758         {
    759             for(j = 0; j < trans_size; j++)
    760             {
    761                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    762                 for(k = 0; k < 8; k++)
    763                 {
    764                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    765                                     + g_ai2_ihevc_trans_16[3][k]
    766                                                     * pi2_tmp[3 * trans_size]
    767                                     + g_ai2_ihevc_trans_16[5][k]
    768                                                     * pi2_tmp[5 * trans_size]
    769                                     + g_ai2_ihevc_trans_16[7][k]
    770                                                     * pi2_tmp[7 * trans_size];
    771                 }
    772                 for(k = 0; k < 4; k++)
    773                 {
    774                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    775                                     + g_ai2_ihevc_trans_16[6][k]
    776                                                     * pi2_tmp[6 * trans_size];
    777                 }
    778                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
    779                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    780                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
    781                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    782 
    783                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
    784                 for(k = 0; k < 2; k++)
    785                 {
    786                     ee[k] = eee[k] + eeo[k];
    787                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    788                 }
    789                 for(k = 0; k < 4; k++)
    790                 {
    791                     e[k] = ee[k] + eo[k];
    792                     e[k + 4] = ee[3 - k] - eo[3 - k];
    793                 }
    794                 for(k = 0; k < 8; k++)
    795                 {
    796                     WORD32 itrans_out;
    797                     itrans_out =
    798                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    799                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
    800                     itrans_out =
    801                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    802                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
    803                 }
    804                 pi2_tmp++;
    805                 pu1_pred += pred_strd;
    806                 pu1_dst += dst_strd;
    807             }
    808         }
    809         else /* All rows of output of 1st stage are non-zero */
    810         {
    811             for(j = 0; j < trans_size; j++)
    812             {
    813                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    814                 for(k = 0; k < 8; k++)
    815                 {
    816                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    817                                     + g_ai2_ihevc_trans_16[3][k]
    818                                                     * pi2_tmp[3 * trans_size]
    819                                     + g_ai2_ihevc_trans_16[5][k]
    820                                                     * pi2_tmp[5 * trans_size]
    821                                     + g_ai2_ihevc_trans_16[7][k]
    822                                                     * pi2_tmp[7 * trans_size]
    823                                     + g_ai2_ihevc_trans_16[9][k]
    824                                                     * pi2_tmp[9 * trans_size]
    825                                     + g_ai2_ihevc_trans_16[11][k]
    826                                                     * pi2_tmp[11 * trans_size]
    827                                     + g_ai2_ihevc_trans_16[13][k]
    828                                                     * pi2_tmp[13 * trans_size]
    829                                     + g_ai2_ihevc_trans_16[15][k]
    830                                                     * pi2_tmp[15 * trans_size];
    831                 }
    832                 for(k = 0; k < 4; k++)
    833                 {
    834                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    835                                     + g_ai2_ihevc_trans_16[6][k]
    836                                                     * pi2_tmp[6 * trans_size]
    837                                     + g_ai2_ihevc_trans_16[10][k]
    838                                                     * pi2_tmp[10 * trans_size]
    839                                     + g_ai2_ihevc_trans_16[14][k]
    840                                                     * pi2_tmp[14 * trans_size];
    841                 }
    842                 eeo[0] =
    843                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
    844                                                 + g_ai2_ihevc_trans_16[12][0]
    845                                                                 * pi2_tmp[12
    846                                                                                 * trans_size];
    847                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
    848                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
    849                 eeo[1] =
    850                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
    851                                                 + g_ai2_ihevc_trans_16[12][1]
    852                                                                 * pi2_tmp[12
    853                                                                                 * trans_size];
    854                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
    855                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
    856 
    857                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
    858                 for(k = 0; k < 2; k++)
    859                 {
    860                     ee[k] = eee[k] + eeo[k];
    861                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    862                 }
    863                 for(k = 0; k < 4; k++)
    864                 {
    865                     e[k] = ee[k] + eo[k];
    866                     e[k + 4] = ee[3 - k] - eo[3 - k];
    867                 }
    868                 for(k = 0; k < 8; k++)
    869                 {
    870                     WORD32 itrans_out;
    871                     itrans_out =
    872                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    873                     pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
    874                     itrans_out =
    875                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    876                     pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
    877                 }
    878                 pi2_tmp++;
    879                 pu1_pred += pred_strd;
    880                 pu1_dst += dst_strd;
    881             }
    882         }
    883         /************************************************************************************************/
    884         /************************************END - IT_RECON_16x16****************************************/
    885         /************************************************************************************************/
    886     }
    887 
    888 }
    889 
    890