Home | History | Annotate | Download | only in common
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19  *******************************************************************************
     20  * @file
     21  *  ihevc_chroma_itrans_recon_16x16.c
     22  *
     23  * @brief
     24  *  Contains function definitions for 16x16 inverse transform  and reconstruction
     25  * of chroma interleaved data.
     26  *
     27  * @author
     28  *  100470
     29  *
     30  * @par List of Functions:
     31  *  - ihevc_chroma_itrans_recon_16x16()
     32  *
     33  * @remarks
     34  *  None
     35  *
     36  *******************************************************************************
     37  */
     38 
     39 #include <stdio.h>
     40 #include <string.h>
     41 #include "ihevc_typedefs.h"
     42 #include "ihevc_macros.h"
     43 #include "ihevc_platform_macros.h"
     44 #include "ihevc_defs.h"
     45 #include "ihevc_trans_tables.h"
     46 #include "ihevc_chroma_itrans_recon.h"
     47 #include "ihevc_func_selector.h"
     48 #include "ihevc_trans_macros.h"
     49 
     50 /* All the functions work on one component (U or V) of interleaved data, depending upon the pointer passed to them */
     51 /* Data visualization */
     52 /* U V U V U V U V */
     53 /* U V U V U V U V */
     54 /* U V U V U V U V */
     55 /* U V U V U V U V */
     56 /* If the pointer points to first byte of above stream (U) , functions will operate on U component */
     57 /* If the pointer points to second byte of above stream (V) , functions will operate on V component */
     58 
     59 
     60 /**
     61  *******************************************************************************
     62  *
     63  * @brief
     64  *  This function performs Inverse transform  and reconstruction for 16x16
     65  * input block
     66  *
     67  * @par Description:
     68  *  Performs inverse transform and adds the prediction  data and clips output
     69  * to 8 bit
     70  *
     71  * @param[in] pi2_src
     72  *  Input 16x16 coefficients
     73  *
     74  * @param[in] pi2_tmp
     75  *  Temporary 16x16 buffer for storing inverse transform
     76  *  1st stage output
     77  *
     78  * @param[in] pu1_pred
     79  *  Prediction 16x16 block
     80  *
     81  * @param[out] pu1_dst
     82  *  Output 16x16 block
     83  *
     84  * @param[in] src_strd
     85  *  Input stride
     86  *
     87  * @param[in] pred_strd
     88  *  Prediction stride
     89  *
     90  * @param[in] dst_strd
     91  *  Output Stride
     92  *
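     93  * @param[in] zero_cols
     94  *  Bitmask of zero columns in pi2_src (bit j set: column j is treated as all zero)
     95  *
     96  * @param[in] zero_rows
     97  *  Bitmask of zero rows in pi2_src (bit i set: row i is treated as all zero)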
     98  *
     99  * @returns  Void
    100  *
    101  * @remarks
    102  *  None
    103  *
    104  *******************************************************************************
    105  */
    106 
    107 
    108 void ihevc_chroma_itrans_recon_16x16(WORD16 *pi2_src,
    109                                      WORD16 *pi2_tmp,
    110                                      UWORD8 *pu1_pred,
    111                                      UWORD8 *pu1_dst,
    112                                      WORD32 src_strd,
    113                                      WORD32 pred_strd,
    114                                      WORD32 dst_strd,
    115                                      WORD32 zero_cols,
    116                                      WORD32 zero_rows)
    117 {
    118     WORD32 j, k;
    119     WORD32 e[8], o[8];
    120     WORD32 ee[4], eo[4];
    121     WORD32 eee[2], eeo[2];
    122     WORD32 add;
    123     WORD32 shift;
    124     WORD16 *pi2_tmp_orig;
    125     WORD32 trans_size;
    126     WORD32 row_limit_2nd_stage, zero_rows_2nd_stage = zero_cols;
    127 
    128     trans_size = TRANS_SIZE_16;
    129     pi2_tmp_orig = pi2_tmp;
    130 
    131     if((zero_cols & 0xFFF0) == 0xFFF0)
    132         row_limit_2nd_stage = 4;
    133     else if((zero_cols & 0xFF00) == 0xFF00)
    134         row_limit_2nd_stage = 8;
    135     else
    136         row_limit_2nd_stage = TRANS_SIZE_16;
    137 
    138     if((zero_rows & 0xFFF0) == 0xFFF0) /* First 4 rows of input are non-zero */
    139     {
    140         /************************************************************************************************/
    141         /**********************************START - IT_RECON_16x16****************************************/
    142         /************************************************************************************************/
    143 
    144         /* Inverse Transform 1st stage */
    145         shift = IT_SHIFT_STAGE_1;
    146         add = 1 << (shift - 1);
    147 
    148         for(j = 0; j < row_limit_2nd_stage; j++)
    149         {
    150             /* Checking for Zero Cols */
    151             if((zero_cols & 1) == 1)
    152             {
    153                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
    154             }
    155             else
    156             {
    157                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    158                 for(k = 0; k < 8; k++)
    159                 {
    160                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
    161                                     + g_ai2_ihevc_trans_16[3][k]
    162                                                     * pi2_src[3 * src_strd];
    163                 }
    164                 for(k = 0; k < 4; k++)
    165                 {
    166                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
    167                 }
    168                 eeo[0] = 0;
    169                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
    170                 eeo[1] = 0;
    171                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
    172 
    173                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    174                 for(k = 0; k < 2; k++)
    175                 {
    176                     ee[k] = eee[k] + eeo[k];
    177                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    178                 }
    179                 for(k = 0; k < 4; k++)
    180                 {
    181                     e[k] = ee[k] + eo[k];
    182                     e[k + 4] = ee[3 - k] - eo[3 - k];
    183                 }
    184                 for(k = 0; k < 8; k++)
    185                 {
    186                     pi2_tmp[k] =
    187                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    188                     pi2_tmp[k + 8] =
    189                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    190                 }
    191             }
    192             pi2_src++;
    193             pi2_tmp += trans_size;
    194             zero_cols = zero_cols >> 1;
    195         }
    196 
    197         pi2_tmp = pi2_tmp_orig;
    198 
    199         /* Inverse Transform 2nd stage */
    200         shift = IT_SHIFT_STAGE_2;
    201         add = 1 << (shift - 1);
    202         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
    203         {
    204             for(j = 0; j < trans_size; j++)
    205             {
    206                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    207                 for(k = 0; k < 8; k++)
    208                 {
    209                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    210                                     + g_ai2_ihevc_trans_16[3][k]
    211                                                     * pi2_tmp[3 * trans_size];
    212                 }
    213                 for(k = 0; k < 4; k++)
    214                 {
    215                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
    216                 }
    217                 eeo[0] = 0;
    218                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    219                 eeo[1] = 0;
    220                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    221 
    222                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    223                 for(k = 0; k < 2; k++)
    224                 {
    225                     ee[k] = eee[k] + eeo[k];
    226                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    227                 }
    228                 for(k = 0; k < 4; k++)
    229                 {
    230                     e[k] = ee[k] + eo[k];
    231                     e[k + 4] = ee[3 - k] - eo[3 - k];
    232                 }
    233                 for(k = 0; k < 8; k++)
    234                 {
    235                     WORD32 itrans_out;
    236                     itrans_out =
    237                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    238                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
    239                     itrans_out =
    240                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    241                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
    242                 }
    243                 pi2_tmp++;
    244                 pu1_pred += pred_strd;
    245                 pu1_dst += dst_strd;
    246             }
    247         }
    248         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
    249         {
    250             for(j = 0; j < trans_size; j++)
    251             {
    252                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    253                 for(k = 0; k < 8; k++)
    254                 {
    255                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    256                                     + g_ai2_ihevc_trans_16[3][k]
    257                                                     * pi2_tmp[3 * trans_size]
    258                                     + g_ai2_ihevc_trans_16[5][k]
    259                                                     * pi2_tmp[5 * trans_size]
    260                                     + g_ai2_ihevc_trans_16[7][k]
    261                                                     * pi2_tmp[7 * trans_size];
    262                 }
    263                 for(k = 0; k < 4; k++)
    264                 {
    265                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    266                                     + g_ai2_ihevc_trans_16[6][k]
    267                                                     * pi2_tmp[6 * trans_size];
    268                 }
    269                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
    270                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    271                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
    272                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    273 
    274                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    275                 for(k = 0; k < 2; k++)
    276                 {
    277                     ee[k] = eee[k] + eeo[k];
    278                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    279                 }
    280                 for(k = 0; k < 4; k++)
    281                 {
    282                     e[k] = ee[k] + eo[k];
    283                     e[k + 4] = ee[3 - k] - eo[3 - k];
    284                 }
    285                 for(k = 0; k < 8; k++)
    286                 {
    287                     WORD32 itrans_out;
    288                     itrans_out =
    289                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    290                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
    291                     itrans_out =
    292                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    293                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
    294                 }
    295                 pi2_tmp++;
    296                 pu1_pred += pred_strd;
    297                 pu1_dst += dst_strd;
    298             }
    299         }
    300         else /* All rows of output of 1st stage are non-zero */
    301         {
    302             for(j = 0; j < trans_size; j++)
    303             {
    304                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    305                 for(k = 0; k < 8; k++)
    306                 {
    307                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    308                                     + g_ai2_ihevc_trans_16[3][k]
    309                                                     * pi2_tmp[3 * trans_size]
    310                                     + g_ai2_ihevc_trans_16[5][k]
    311                                                     * pi2_tmp[5 * trans_size]
    312                                     + g_ai2_ihevc_trans_16[7][k]
    313                                                     * pi2_tmp[7 * trans_size]
    314                                     + g_ai2_ihevc_trans_16[9][k]
    315                                                     * pi2_tmp[9 * trans_size]
    316                                     + g_ai2_ihevc_trans_16[11][k]
    317                                                     * pi2_tmp[11 * trans_size]
    318                                     + g_ai2_ihevc_trans_16[13][k]
    319                                                     * pi2_tmp[13 * trans_size]
    320                                     + g_ai2_ihevc_trans_16[15][k]
    321                                                     * pi2_tmp[15 * trans_size];
    322                 }
    323                 for(k = 0; k < 4; k++)
    324                 {
    325                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    326                                     + g_ai2_ihevc_trans_16[6][k]
    327                                                     * pi2_tmp[6 * trans_size]
    328                                     + g_ai2_ihevc_trans_16[10][k]
    329                                                     * pi2_tmp[10 * trans_size]
    330                                     + g_ai2_ihevc_trans_16[14][k]
    331                                                     * pi2_tmp[14 * trans_size];
    332                 }
    333                 eeo[0] =
    334                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
    335                                                 + g_ai2_ihevc_trans_16[12][0]
    336                                                                 * pi2_tmp[12
    337                                                                                 * trans_size];
    338                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
    339                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
    340                 eeo[1] =
    341                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
    342                                                 + g_ai2_ihevc_trans_16[12][1]
    343                                                                 * pi2_tmp[12
    344                                                                                 * trans_size];
    345                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
    346                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
    347 
    348                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    349                 for(k = 0; k < 2; k++)
    350                 {
    351                     ee[k] = eee[k] + eeo[k];
    352                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    353                 }
    354                 for(k = 0; k < 4; k++)
    355                 {
    356                     e[k] = ee[k] + eo[k];
    357                     e[k + 4] = ee[3 - k] - eo[3 - k];
    358                 }
    359                 for(k = 0; k < 8; k++)
    360                 {
    361                     WORD32 itrans_out;
    362                     itrans_out =
    363                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    364                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
    365                     itrans_out =
    366                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    367                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
    368                 }
    369                 pi2_tmp++;
    370                 pu1_pred += pred_strd;
    371                 pu1_dst += dst_strd;
    372             }
    373         }
    374         /************************************************************************************************/
    375         /************************************END - IT_RECON_16x16****************************************/
    376         /************************************************************************************************/
    377     }
    378     else if((zero_rows & 0xFF00) == 0xFF00) /* First 8 rows of input are non-zero */
    379     {
    380         /************************************************************************************************/
    381         /**********************************START - IT_RECON_16x16****************************************/
    382         /************************************************************************************************/
    383 
    384         /* Inverse Transform 1st stage */
    385         shift = IT_SHIFT_STAGE_1;
    386         add = 1 << (shift - 1);
    387 
    388         for(j = 0; j < row_limit_2nd_stage; j++)
    389         {
    390             /* Checking for Zero Cols */
    391             if((zero_cols & 1) == 1)
    392             {
    393                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
    394             }
    395             else
    396             {
    397                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    398                 for(k = 0; k < 8; k++)
    399                 {
    400                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
    401                                     + g_ai2_ihevc_trans_16[3][k]
    402                                                     * pi2_src[3 * src_strd]
    403                                     + g_ai2_ihevc_trans_16[5][k]
    404                                                     * pi2_src[5 * src_strd]
    405                                     + g_ai2_ihevc_trans_16[7][k]
    406                                                     * pi2_src[7 * src_strd];
    407                 }
    408                 for(k = 0; k < 4; k++)
    409                 {
    410                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
    411                                     + g_ai2_ihevc_trans_16[6][k]
    412                                                     * pi2_src[6 * src_strd];
    413                 }
    414                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
    415                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
    416                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
    417                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
    418 
    419                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    420                 for(k = 0; k < 2; k++)
    421                 {
    422                     ee[k] = eee[k] + eeo[k];
    423                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    424                 }
    425                 for(k = 0; k < 4; k++)
    426                 {
    427                     e[k] = ee[k] + eo[k];
    428                     e[k + 4] = ee[3 - k] - eo[3 - k];
    429                 }
    430                 for(k = 0; k < 8; k++)
    431                 {
    432                     pi2_tmp[k] =
    433                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    434                     pi2_tmp[k + 8] =
    435                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    436                 }
    437             }
    438             pi2_src++;
    439             pi2_tmp += trans_size;
    440             zero_cols = zero_cols >> 1;
    441         }
    442 
    443         pi2_tmp = pi2_tmp_orig;
    444 
    445         /* Inverse Transform 2nd stage */
    446         shift = IT_SHIFT_STAGE_2;
    447         add = 1 << (shift - 1);
    448         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
    449         {
    450             for(j = 0; j < trans_size; j++)
    451             {
    452                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    453                 for(k = 0; k < 8; k++)
    454                 {
    455                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    456                                     + g_ai2_ihevc_trans_16[3][k]
    457                                                     * pi2_tmp[3 * trans_size];
    458                 }
    459                 for(k = 0; k < 4; k++)
    460                 {
    461                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
    462                 }
    463                 eeo[0] = 0;
    464                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    465                 eeo[1] = 0;
    466                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    467 
    468                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    469                 for(k = 0; k < 2; k++)
    470                 {
    471                     ee[k] = eee[k] + eeo[k];
    472                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    473                 }
    474                 for(k = 0; k < 4; k++)
    475                 {
    476                     e[k] = ee[k] + eo[k];
    477                     e[k + 4] = ee[3 - k] - eo[3 - k];
    478                 }
    479                 for(k = 0; k < 8; k++)
    480                 {
    481                     WORD32 itrans_out;
    482                     itrans_out =
    483                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    484                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
    485                     itrans_out =
    486                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    487                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
    488                 }
    489                 pi2_tmp++;
    490                 pu1_pred += pred_strd;
    491                 pu1_dst += dst_strd;
    492             }
    493         }
    494         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
    495         {
    496             for(j = 0; j < trans_size; j++)
    497             {
    498                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    499                 for(k = 0; k < 8; k++)
    500                 {
    501                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    502                                     + g_ai2_ihevc_trans_16[3][k]
    503                                                     * pi2_tmp[3 * trans_size]
    504                                     + g_ai2_ihevc_trans_16[5][k]
    505                                                     * pi2_tmp[5 * trans_size]
    506                                     + g_ai2_ihevc_trans_16[7][k]
    507                                                     * pi2_tmp[7 * trans_size];
    508                 }
    509                 for(k = 0; k < 4; k++)
    510                 {
    511                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    512                                     + g_ai2_ihevc_trans_16[6][k]
    513                                                     * pi2_tmp[6 * trans_size];
    514                 }
    515                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
    516                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    517                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
    518                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    519 
    520                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    521                 for(k = 0; k < 2; k++)
    522                 {
    523                     ee[k] = eee[k] + eeo[k];
    524                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    525                 }
    526                 for(k = 0; k < 4; k++)
    527                 {
    528                     e[k] = ee[k] + eo[k];
    529                     e[k + 4] = ee[3 - k] - eo[3 - k];
    530                 }
    531                 for(k = 0; k < 8; k++)
    532                 {
    533                     WORD32 itrans_out;
    534                     itrans_out =
    535                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    536                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
    537                     itrans_out =
    538                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    539                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
    540                 }
    541                 pi2_tmp++;
    542                 pu1_pred += pred_strd;
    543                 pu1_dst += dst_strd;
    544             }
    545         }
    546         else /* All rows of output of 1st stage are non-zero */
    547         {
    548             for(j = 0; j < trans_size; j++)
    549             {
    550                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    551                 for(k = 0; k < 8; k++)
    552                 {
    553                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    554                                     + g_ai2_ihevc_trans_16[3][k]
    555                                                     * pi2_tmp[3 * trans_size]
    556                                     + g_ai2_ihevc_trans_16[5][k]
    557                                                     * pi2_tmp[5 * trans_size]
    558                                     + g_ai2_ihevc_trans_16[7][k]
    559                                                     * pi2_tmp[7 * trans_size]
    560                                     + g_ai2_ihevc_trans_16[9][k]
    561                                                     * pi2_tmp[9 * trans_size]
    562                                     + g_ai2_ihevc_trans_16[11][k]
    563                                                     * pi2_tmp[11 * trans_size]
    564                                     + g_ai2_ihevc_trans_16[13][k]
    565                                                     * pi2_tmp[13 * trans_size]
    566                                     + g_ai2_ihevc_trans_16[15][k]
    567                                                     * pi2_tmp[15 * trans_size];
    568                 }
    569                 for(k = 0; k < 4; k++)
    570                 {
    571                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    572                                     + g_ai2_ihevc_trans_16[6][k]
    573                                                     * pi2_tmp[6 * trans_size]
    574                                     + g_ai2_ihevc_trans_16[10][k]
    575                                                     * pi2_tmp[10 * trans_size]
    576                                     + g_ai2_ihevc_trans_16[14][k]
    577                                                     * pi2_tmp[14 * trans_size];
    578                 }
    579                 eeo[0] =
    580                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
    581                                                 + g_ai2_ihevc_trans_16[12][0]
    582                                                                 * pi2_tmp[12
    583                                                                                 * trans_size];
    584                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
    585                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
    586                 eeo[1] =
    587                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
    588                                                 + g_ai2_ihevc_trans_16[12][1]
    589                                                                 * pi2_tmp[12
    590                                                                                 * trans_size];
    591                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
    592                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
    593 
    594                 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
    595                 for(k = 0; k < 2; k++)
    596                 {
    597                     ee[k] = eee[k] + eeo[k];
    598                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    599                 }
    600                 for(k = 0; k < 4; k++)
    601                 {
    602                     e[k] = ee[k] + eo[k];
    603                     e[k + 4] = ee[3 - k] - eo[3 - k];
    604                 }
    605                 for(k = 0; k < 8; k++)
    606                 {
    607                     WORD32 itrans_out;
    608                     itrans_out =
    609                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    610                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
    611                     itrans_out =
    612                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    613                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
    614                 }
    615                 pi2_tmp++;
    616                 pu1_pred += pred_strd;
    617                 pu1_dst += dst_strd;
    618             }
    619         }
    620         /************************************************************************************************/
    621         /************************************END - IT_RECON_16x16****************************************/
    622         /************************************************************************************************/
    623     }
    624     else /* All rows of input are non-zero */
    625     {
    626         /************************************************************************************************/
    627         /**********************************START - IT_RECON_16x16****************************************/
    628         /************************************************************************************************/
    629 
    630         /* Inverse Transform 1st stage */
    631         shift = IT_SHIFT_STAGE_1;
    632         add = 1 << (shift - 1);
    633 
    634         for(j = 0; j < row_limit_2nd_stage; j++)
    635         {
    636             /* Checking for Zero Cols */
    637             if((zero_cols & 1) == 1)
    638             {
    639                 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
    640             }
    641             else
    642             {
    643                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    644                 for(k = 0; k < 8; k++)
    645                 {
    646                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
    647                                     + g_ai2_ihevc_trans_16[3][k]
    648                                                     * pi2_src[3 * src_strd]
    649                                     + g_ai2_ihevc_trans_16[5][k]
    650                                                     * pi2_src[5 * src_strd]
    651                                     + g_ai2_ihevc_trans_16[7][k]
    652                                                     * pi2_src[7 * src_strd]
    653                                     + g_ai2_ihevc_trans_16[9][k]
    654                                                     * pi2_src[9 * src_strd]
    655                                     + g_ai2_ihevc_trans_16[11][k]
    656                                                     * pi2_src[11 * src_strd]
    657                                     + g_ai2_ihevc_trans_16[13][k]
    658                                                     * pi2_src[13 * src_strd]
    659                                     + g_ai2_ihevc_trans_16[15][k]
    660                                                     * pi2_src[15 * src_strd];
    661                 }
    662                 for(k = 0; k < 4; k++)
    663                 {
    664                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
    665                                     + g_ai2_ihevc_trans_16[6][k]
    666                                                     * pi2_src[6 * src_strd]
    667                                     + g_ai2_ihevc_trans_16[10][k]
    668                                                     * pi2_src[10 * src_strd]
    669                                     + g_ai2_ihevc_trans_16[14][k]
    670                                                     * pi2_src[14 * src_strd];
    671                 }
    672                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
    673                                 + g_ai2_ihevc_trans_16[12][0]
    674                                                 * pi2_src[12 * src_strd];
    675                 eee[0] =
    676                                 g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
    677                                                 + g_ai2_ihevc_trans_16[8][0]
    678                                                                 * pi2_src[8
    679                                                                                 * src_strd];
    680                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
    681                                 + g_ai2_ihevc_trans_16[12][1]
    682                                                 * pi2_src[12 * src_strd];
    683                 eee[1] =
    684                                 g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
    685                                                 + g_ai2_ihevc_trans_16[8][1]
    686                                                                 * pi2_src[8
    687                                                                                 * src_strd];
    688 
    689                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
    690                 for(k = 0; k < 2; k++)
    691                 {
    692                     ee[k] = eee[k] + eeo[k];
    693                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    694                 }
    695                 for(k = 0; k < 4; k++)
    696                 {
    697                     e[k] = ee[k] + eo[k];
    698                     e[k + 4] = ee[3 - k] - eo[3 - k];
    699                 }
    700                 for(k = 0; k < 8; k++)
    701                 {
    702                     pi2_tmp[k] =
    703                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    704                     pi2_tmp[k + 8] =
    705                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    706                 }
    707             }
    708             pi2_src++;
    709             pi2_tmp += trans_size;
    710             zero_cols = zero_cols >> 1;
    711         }
    712 
    713         pi2_tmp = pi2_tmp_orig;
    714 
    715         /* Inverse Transform 2nd stage */
    716         shift = IT_SHIFT_STAGE_2;
    717         add = 1 << (shift - 1);
    718         if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* Only the first 4 rows of output of 1st stage can be non-zero */
    719         {
    720             for(j = 0; j < trans_size; j++)
    721             {
    722                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    723                 for(k = 0; k < 8; k++)
    724                 {
    725                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    726                                     + g_ai2_ihevc_trans_16[3][k]
    727                                                     * pi2_tmp[3 * trans_size];
    728                 }
    729                 for(k = 0; k < 4; k++)
    730                 {
    731                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
    732                 }
    733                 eeo[0] = 0;
    734                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    735                 eeo[1] = 0;
    736                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    737 
    738                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
    739                 for(k = 0; k < 2; k++)
    740                 {
    741                     ee[k] = eee[k] + eeo[k];
    742                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    743                 }
    744                 for(k = 0; k < 4; k++)
    745                 {
    746                     e[k] = ee[k] + eo[k];
    747                     e[k + 4] = ee[3 - k] - eo[3 - k];
    748                 }
    749                 for(k = 0; k < 8; k++)
    750                 {
    751                     WORD32 itrans_out;
    752                     itrans_out =
    753                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    754                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
    755                     itrans_out =
    756                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    757                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
    758                 }
    759                 pi2_tmp++;
    760                 pu1_pred += pred_strd;
    761                 pu1_dst += dst_strd;
    762             }
    763         }
    764         else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* Only the first 8 rows of output of 1st stage can be non-zero */
    765         {
    766             for(j = 0; j < trans_size; j++)
    767             {
    768                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    769                 for(k = 0; k < 8; k++)
    770                 {
    771                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    772                                     + g_ai2_ihevc_trans_16[3][k]
    773                                                     * pi2_tmp[3 * trans_size]
    774                                     + g_ai2_ihevc_trans_16[5][k]
    775                                                     * pi2_tmp[5 * trans_size]
    776                                     + g_ai2_ihevc_trans_16[7][k]
    777                                                     * pi2_tmp[7 * trans_size];
    778                 }
    779                 for(k = 0; k < 4; k++)
    780                 {
    781                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    782                                     + g_ai2_ihevc_trans_16[6][k]
    783                                                     * pi2_tmp[6 * trans_size];
    784                 }
    785                 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
    786                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
    787                 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
    788                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
    789 
    790                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
    791                 for(k = 0; k < 2; k++)
    792                 {
    793                     ee[k] = eee[k] + eeo[k];
    794                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    795                 }
    796                 for(k = 0; k < 4; k++)
    797                 {
    798                     e[k] = ee[k] + eo[k];
    799                     e[k + 4] = ee[3 - k] - eo[3 - k];
    800                 }
    801                 for(k = 0; k < 8; k++)
    802                 {
    803                     WORD32 itrans_out;
    804                     itrans_out =
    805                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    806                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
    807                     itrans_out =
    808                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    809                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
    810                 }
    811                 pi2_tmp++;
    812                 pu1_pred += pred_strd;
    813                 pu1_dst += dst_strd;
    814             }
    815         }
    816         else /* All rows of output of 1st stage may be non-zero */
    817         {
    818             for(j = 0; j < trans_size; j++)
    819             {
    820                 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    821                 for(k = 0; k < 8; k++)
    822                 {
    823                     o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
    824                                     + g_ai2_ihevc_trans_16[3][k]
    825                                                     * pi2_tmp[3 * trans_size]
    826                                     + g_ai2_ihevc_trans_16[5][k]
    827                                                     * pi2_tmp[5 * trans_size]
    828                                     + g_ai2_ihevc_trans_16[7][k]
    829                                                     * pi2_tmp[7 * trans_size]
    830                                     + g_ai2_ihevc_trans_16[9][k]
    831                                                     * pi2_tmp[9 * trans_size]
    832                                     + g_ai2_ihevc_trans_16[11][k]
    833                                                     * pi2_tmp[11 * trans_size]
    834                                     + g_ai2_ihevc_trans_16[13][k]
    835                                                     * pi2_tmp[13 * trans_size]
    836                                     + g_ai2_ihevc_trans_16[15][k]
    837                                                     * pi2_tmp[15 * trans_size];
    838                 }
    839                 for(k = 0; k < 4; k++)
    840                 {
    841                     eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
    842                                     + g_ai2_ihevc_trans_16[6][k]
    843                                                     * pi2_tmp[6 * trans_size]
    844                                     + g_ai2_ihevc_trans_16[10][k]
    845                                                     * pi2_tmp[10 * trans_size]
    846                                     + g_ai2_ihevc_trans_16[14][k]
    847                                                     * pi2_tmp[14 * trans_size];
    848                 }
    849                 eeo[0] =
    850                                 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
    851                                                 + g_ai2_ihevc_trans_16[12][0]
    852                                                                 * pi2_tmp[12
    853                                                                                 * trans_size];
    854                 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
    855                                 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
    856                 eeo[1] =
    857                                 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
    858                                                 + g_ai2_ihevc_trans_16[12][1]
    859                                                                 * pi2_tmp[12
    860                                                                                 * trans_size];
    861                 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
    862                                 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
    863 
    864                 /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
    865                 for(k = 0; k < 2; k++)
    866                 {
    867                     ee[k] = eee[k] + eeo[k];
    868                     ee[k + 2] = eee[1 - k] - eeo[1 - k];
    869                 }
    870                 for(k = 0; k < 4; k++)
    871                 {
    872                     e[k] = ee[k] + eo[k];
    873                     e[k + 4] = ee[3 - k] - eo[3 - k];
    874                 }
    875                 for(k = 0; k < 8; k++)
    876                 {
    877                     WORD32 itrans_out;
    878                     itrans_out =
    879                                     CLIP_S16(((e[k] + o[k] + add) >> shift));
    880                     pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
    881                     itrans_out =
    882                                     CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
    883                     pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
    884                 }
    885                 pi2_tmp++;
    886                 pu1_pred += pred_strd;
    887                 pu1_dst += dst_strd;
    888             }
    889         }
    890         /************************************************************************************************/
    891         /************************************END - IT_RECON_16x16****************************************/
    892         /************************************************************************************************/
    893     }
    894 }
    895 
    896