Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19  *******************************************************************************
     20  * @file
     21  *  ihevc_itrans_recon_x86_intr.c
     22  *
     23  * @brief
     24  *  Contains function definitions for inverse  quantization, inverse
     25  * transform and reconstruction
     26  *
     27  * @author
     28  *  100470
     29  *  100592 (edited by)
     30  *
     31  * @par List of Functions:
     32  *  - ihevc_itrans_recon_4x4_ttype1_sse42()
     33  *  - ihevc_itrans_recon_4x4_sse42()
     34  *  - ihevc_itrans_recon_8x8_sse42()
     35  *
     36  * @remarks
     37  *  None
     38  *
     39  *******************************************************************************
     40  */
     41 #include <stdio.h>
     42 #include <string.h>
     43 #include "ihevc_typedefs.h"
     44 #include "ihevc_macros.h"
     45 #include "ihevc_platform_macros.h"
     46 #include "ihevc_defs.h"
     47 #include "ihevc_trans_tables.h"
     48 #include "ihevc_iquant_itrans_recon.h"
     49 #include "ihevc_func_selector.h"
     50 #include "ihevc_trans_macros.h"
     51 
     52 #include <immintrin.h>
     53 #include <emmintrin.h>
     54 #include <smmintrin.h>
     55 #include <tmmintrin.h>
     56 
     57 /**
     58  *******************************************************************************
     59  *
     60  * @brief
     61  *  This function performs inverse quantization, inverse  transform
     62  * type1(DST) and reconstruction for 4x4  input block
     63  *
     64  * @par Description:
     65  *  Performs inverse quantization , inverse transform type 1  and adds
     66  * prediction data and clips output to 8 bit
     67  *
     68  * @param[in] pi2_src
     69  *  Input 4x4 coefficients
     70  *
     71  * @param[in] pi2_tmp
     72  *  Temporary 4x4 buffer for storing inverse
     73  *  transform 1st stage output
     74  *
     75  * @param[in] pu1_pred
     76  *  Prediction 4x4 block
     77  *
     78  * @param[in] pi2_dequant_coeff
     79  *  Dequant Coeffs
     80  *
     81  * @param[out] pu1_dst
     82  *  Output 4x4 block
     83  *
     84  * @param[in] qp_div
     85  *  Quantization parameter / 6
     86  *
     87  * @param[in] qp_rem
     88  *  Quantization parameter % 6
     89  *
     90  * @param[in] src_strd
     91  *  Input stride
     92  *
     93  * @param[in] pred_strd
     94  *  Prediction stride
     95  *
     96  * @param[in] dst_strd
     97  *  Output Stride
     98  *
     99  * @param[in] zero_cols
    100  *  Zero columns in pi2_src
    101  *
    102  * @returns  Void
    103  *
    104  * @remarks
    105  *  None
    106  *
    107  *******************************************************************************
    108  */
    109 
    110 
    111 void ihevc_itrans_recon_4x4_ttype1_sse42(WORD16 *pi2_src,
    112                                          WORD16 *pi2_tmp,
    113                                          UWORD8 *pu1_pred,
    114                                          UWORD8 *pu1_dst,
    115                                          WORD32 src_strd,
    116                                          WORD32 pred_strd,
    117                                          WORD32 dst_strd,
    118                                          WORD32 zero_cols,
    119                                          WORD32 zero_rows)
    120 {
    121     __m128i m_temp_reg_0;
    122     __m128i m_temp_reg_1;
    123     __m128i m_temp_reg_2;
    124     __m128i m_temp_reg_3;
    125     __m128i m_temp_reg_4;
    126     __m128i m_temp_reg_10;
    127     __m128i m_temp_reg_11;
    128     __m128i m_temp_reg_12;
    129     __m128i m_temp_reg_13;
    130     __m128i m_temp_reg_14;
    131     __m128i m_temp_reg_20;
    132     __m128i m_temp_reg_21;
    133     __m128i m_temp_reg_22;
    134     __m128i m_temp_reg_23;
    135     __m128i m_temp_reg_24;
    136     __m128i m_temp_reg_25;
    137     __m128i m_temp_reg_30;
    138     __m128i m_temp_reg_31;
    139     __m128i m_temp_reg_32;
    140     __m128i m_temp_reg_33;
    141     __m128i m_temp_reg_34;
    142     __m128i m_temp_reg_35;
    143     __m128i m_temp_reg_36;
    144     __m128i m_coeff1, m_coeff2, m_coeff3;
    145     __m128i m_rdng_factor;
    146     __m128i m_count;
    147 
    148     WORD32 i4_shift = IT_SHIFT_STAGE_1;
    149     UNUSED(zero_rows);
    150     UNUSED(zero_cols);
    151     UNUSED(pi2_tmp);
    152 
    153     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[2][0]); //74
    154 
    155     m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
    156     pi2_src += src_strd;
    157     m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
    158     pi2_src += src_strd;
    159     m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
    160     pi2_src += src_strd;
    161     m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
    162 
    163     m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
    164     m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
    165 
    166     m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
    167     m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
    168 
    169     /* c[4] in m_temp_reg_14 */
    170     /* c[4] = src[0] - src[2] + src[3] */
    171     {
    172         m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2);
    173     }
    174 
    175     /* c[3] in m_temp_reg_13 */
    176     {
    177         m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);
    178     }
    179 
    180     /* c[0] in m_temp_reg_10 */
    181     {
    182         m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2);
    183     }
    184 
    185     /* c[1] in m_temp_reg_11 */
    186     {
    187         m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3);
    188     }
    189 
    190     /* c[2] in m_temp_reg_12 */
    191     {
    192         m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3);
    193     }
    194 
    195     /* c[4] in m_temp_reg_14 */
    196     /* c[4] = src[0] - src[2] + src[3] */
    197     {
    198         m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3);
    199     }
    200 
    201     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[1][0]); //29
    202     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[0][0]); //55
    203 
    204     /* Stage 1 outputs stored in m_temp_reg_20-23 */
    205     {
    206         m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1); //29*c0
    207         m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2); //55*c1
    208 
    209         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    210 
    211         m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1); //29*c1
    212         m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2); //55*c2
    213 
    214         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
    215 
    216         m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2); //55*c0
    217         m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1); //29*c2
    218         m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3); //74*c4
    219 
    220         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
    221         m_count = _mm_cvtsi32_si128(i4_shift);
    222 
    223         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
    224         m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13);
    225         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
    226 
    227         m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
    228         m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
    229 
    230         m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
    231         m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13);
    232         m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
    233 
    234         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
    235 
    236         m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
    237         m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
    238         m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
    239         m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
    240 
    241         m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
    242         m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
    243         m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    244         m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    245 
    246         m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
    247         m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
    248 
    249         m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
    250         m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
    251 
    252     }
    253 
    254     /* Stage 2 */
    255     {
    256         i4_shift = IT_SHIFT_STAGE_2;
    257 
    258         m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    259         m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
    260         m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    261         m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
    262         m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
    263         m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);
    264 
    265         /* c[4] stored in m_temp_reg_4 */
    266         {
    267             m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
    268         }
    269 
    270         /* c[3] stored in m_temp_reg_3 */
    271         {
    272             m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);
    273         }
    274 
    275         /* c[0] stored in m_temp_reg_0 */
    276         {
    277             m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
    278         }
    279 
    280         /* c[1] stored in m_temp_reg_1 */
    281         {
    282             m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21);
    283         }
    284 
    285         /* c[2] stored in m_temp_reg_2 */
    286         {
    287             m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23);
    288         }
    289 
    290         /* c[4] stored in m_temp_reg_4 */
    291         {
    292             m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23);
    293         }
    294 
    295         /* Stage 2 output generation */
    296         {
    297             m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1); //29*c0
    298             m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2); //55*c1
    299 
    300             m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    301 
    302             m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //29*c1
    303             m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2); //55*c2
    304 
    305             m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
    306 
    307             m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2); //55*c0
    308             m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1); //29*c2
    309             m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3); //74*c4
    310 
    311             m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
    312             m_count = _mm_cvtsi32_si128(i4_shift);
    313 
    314             m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3);
    315             m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
    316             m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
    317 
    318             m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
    319             m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
    320 
    321             m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3);
    322             m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
    323             m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
    324 
    325             m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
    326 
    327             m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
    328             m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
    329             m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
    330             m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
    331 
    332             m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
    333             m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
    334             m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    335             m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    336 
    337             m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
    338             m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
    339 
    340             m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
    341             m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
    342         }
    343 
    344         /* Recon and store */
    345         {
    346             WORD32 *pi4_dst = (WORD32 *)pu1_dst;
    347 
    348             m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
    349             pu1_pred += pred_strd;
    350             m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
    351             pu1_pred += pred_strd;
    352             m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
    353             pu1_pred += pred_strd;
    354             m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
    355 
    356             m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
    357             m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
    358             m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
    359             m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
    360             m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
    361             m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
    362 
    363             m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
    364             m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
    365 
    366             m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
    367 
    368             *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
    369             m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
    370             m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
    371             m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
    372             pu1_dst += dst_strd;
    373             pi4_dst = (WORD32 *)(pu1_dst);
    374 
    375             *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
    376             pu1_dst += dst_strd;
    377             pi4_dst = (WORD32 *)(pu1_dst);
    378 
    379             *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
    380             pu1_dst += dst_strd;
    381             pi4_dst = (WORD32 *)(pu1_dst);
    382 
    383             *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
    384         }
    385     }
    386 }
    387 
    388 /**
    389  *******************************************************************************
    390  *
    391  * @brief
    392  *  This function performs inverse quantization, inverse  transform
    393  * (DCT) and reconstruction for 4x4  input block
    394  *
    395  * @par Description:
    396  *  Performs inverse quantization , inverse transform and adds
    397  * prediction data and clips output to 8 bit
    398  *
    399  * @param[in] pi2_src
    400  *  Input 4x4 coefficients
    401  *
    402  * @param[in] pi2_tmp
    403  *  Temporary 4x4 buffer for storing inverse
    404  *  transform 1st stage output
    405  *
    406  * @param[in] pu1_pred
    407  *  Prediction 4x4 block
    408  *
    409  * @param[in] pi2_dequant_coeff
    410  *  Dequant Coeffs
    411  *
    412  * @param[out] pu1_dst
    413  *  Output 4x4 block
    414  *
    415  * @param[in] qp_div
    416  *  Quantization parameter / 6
    417  *
    418  * @param[in] qp_rem
    419  *  Quantization parameter % 6
    420  *
    421  * @param[in] src_strd
    422  *  Input stride
    423  *
    424  * @param[in] pred_strd
    425  *  Prediction stride
    426  *
    427  * @param[in] dst_strd
    428  *  Output Stride
    429  *
    430  * @param[in] zero_cols
    431  *  Zero columns in pi2_src
    432  *
    433  * @returns  Void
    434  *
    435  * @remarks
    436  *  None
    437  *
    438  *******************************************************************************
    439  */
    440 
    441 void ihevc_itrans_recon_4x4_sse42(WORD16 *pi2_src,
    442                                   WORD16 *pi2_tmp,
    443                                   UWORD8 *pu1_pred,
    444                                   UWORD8 *pu1_dst,
    445                                   WORD32 src_strd,
    446                                   WORD32 pred_strd,
    447                                   WORD32 dst_strd,
    448                                   WORD32 zero_cols,
    449                                   WORD32 zero_rows)
    450 {
    451 
    452 
    453     __m128i m_temp_reg_0;
    454     __m128i m_temp_reg_1;
    455     __m128i m_temp_reg_2;
    456     __m128i m_temp_reg_3;
    457     __m128i m_temp_reg_10;
    458     __m128i m_temp_reg_11;
    459     __m128i m_temp_reg_12;
    460     __m128i m_temp_reg_13;
    461     __m128i m_temp_reg_14;
    462     __m128i m_temp_reg_15;
    463     __m128i m_temp_reg_20;
    464     __m128i m_temp_reg_21;
    465     __m128i m_temp_reg_22;
    466     __m128i m_temp_reg_23;
    467     __m128i m_temp_reg_24;
    468     __m128i m_temp_reg_25;
    469     __m128i m_temp_reg_30;
    470     __m128i m_temp_reg_31;
    471     __m128i m_temp_reg_33;
    472     __m128i m_temp_reg_34;
    473     __m128i m_coeff1, m_coeff3;
    474     __m128i m_rdng_factor;
    475     __m128i m_count;
    476 
    477 
    478     WORD32 i4_shift = IT_SHIFT_STAGE_1;
    479     UNUSED(zero_rows);
    480     UNUSED(zero_cols);
    481     UNUSED(pi2_tmp);
    482 
    483 
    484     m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
    485     pi2_src += src_strd;
    486     m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
    487     pi2_src += src_strd;
    488     m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
    489     pi2_src += src_strd;
    490     m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
    491 
    492     m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
    493     m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
    494 
    495     m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
    496     m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
    497 
    498 
    499     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[0][0]); //36
    500     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[2][0]); //83
    501 
    502     /* e */
    503     {
    504         m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
    505         m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6);
    506     }
    507 
    508     /* o */
    509     {
    510         m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //src[1]*36
    511         m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3); //src[3]*83
    512         m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3); //src[1]*83
    513         m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1); //src[3]*36
    514     }
    515 
    516     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    517 
    518     /* e1 stored in m_temp_reg_31 */
    519     {
    520         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
    521     }
    522 
    523     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
    524 
    525     /* e0 stored in m_temp_reg_30 */
    526     {
    527         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
    528     }
    529 
    530     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
    531     m_count = _mm_cvtsi32_si128(i4_shift);
    532 
    533     /* o1 stored in m_temp_reg_33 */
    534     {
    535         m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
    536     }
    537 
    538     /* e1 + add */
    539     {
    540         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    541     }
    542 
    543     /* e0 + add */
    544     {
    545         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    546     }
    547 
    548     /* o0 stored in m_temp_reg_34 */
    549     {
    550         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
    551     }
    552 
    553     /* Stage 1 outputs */
    554     {
    555         m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
    556         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
    557 
    558         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
    559         m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
    560 
    561 
    562         m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
    563         m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
    564         m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
    565         m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
    566 
    567         m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
    568         m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
    569         m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    570         m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    571 
    572         m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
    573         m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
    574 
    575         m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
    576         m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
    577     }
    578 
    579     /* Stage 2 */
    580     {
    581         i4_shift = IT_SHIFT_STAGE_2;
    582 
    583         m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    584         m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    585 
    586         m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
    587         m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
    588 
    589         m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
    590         m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);
    591 
    592         /* e */
    593         {
    594             m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6);
    595         }
    596 
    597         /* o */
    598         {
    599             m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1); //src[1]*36
    600             m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3); //src[1]*83
    601             m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3); //src[3]*83
    602             m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1); //src[3]*36
    603         }
    604 
    605         /* e */
    606         {
    607             m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6);
    608         }
    609 
    610         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    611 
    612         /* e1 stored in m_temp_reg_31 */
    613         {
    614             m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
    615         }
    616 
    617         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
    618 
    619         /* e0 stored in m_temp_reg_30 */
    620         {
    621             m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
    622         }
    623 
    624         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
    625         m_count = _mm_cvtsi32_si128(i4_shift);
    626 
    627         /* o1 stored in m_temp_reg_33 */
    628         {
    629             m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
    630         }
    631 
    632         /* e1 + add */
    633         {
    634             m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    635         }
    636 
    637         /* e0 + add */
    638         {
    639             m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    640         }
    641 
    642         /* o0 stored in m_temp_reg_34 */
    643         {
    644             m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
    645         }
    646 
    647         /* Stage 2 outputs */
    648         {
    649             m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
    650             m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
    651             m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
    652             m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
    653 
    654             m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
    655             m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
    656             m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
    657             m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
    658 
    659             m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
    660             m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
    661             m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    662             m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    663 
    664             m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
    665             m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
    666 
    667             m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
    668             m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
    669         }
    670 
    671         /* Recon and store */
    672         {
    673             UWORD32 *pu4_dst = (UWORD32 *)pu1_dst;
    674 
    675             m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
    676             pu1_pred += pred_strd;
    677             m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
    678             pu1_pred += pred_strd;
    679             m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
    680             pu1_pred += pred_strd;
    681             m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
    682 
    683             m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
    684             m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
    685             m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
    686             m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
    687             m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
    688             m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
    689 
    690             m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
    691             m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
    692 
    693             m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
    694 
    695             *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
    696             m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
    697             m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
    698             m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
    699             pu1_dst += dst_strd;
    700             pu4_dst = (UWORD32 *)(pu1_dst);
    701 
    702             *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
    703             pu1_dst += dst_strd;
    704             pu4_dst = (UWORD32 *)(pu1_dst);
    705 
    706             *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
    707             pu1_dst += dst_strd;
    708             pu4_dst = (UWORD32 *)(pu1_dst);
    709 
    710             *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
    711         }
    712     }
    713 }
    714 
    715 
    716 
    717 /**
    718  *******************************************************************************
    719  *
    720  * @brief
    721  *  This function performs inverse quantization, inverse  transform and
    722  * reconstruction for 8x8 input block
    723  *
    724  * @par Description:
    725  *  Performs inverse quantization , inverse transform  and adds the
    726  * prediction data and clips output to 8 bit
    727  *
    728  * @param[in] pi2_src
    729  *  Input 8x8 coefficients
    730  *
    731  * @param[in] pi2_tmp
    732  *  Temporary 8x8 buffer for storing inverse
    733  *  transform 1st stage output
    734  *
    735  * @param[in] pu1_pred
    736  *  Prediction 8x8 block
    737  *
    738  * @param[in] pi2_dequant_coeff
    739  *  Dequant Coeffs
    740  *
    741  * @param[out] pu1_dst
    742  *  Output 8x8 block
    743  *
    744  * @param[in] src_strd
    745  *  Input stride
    746  *
    747  * @param[in] qp_div
    748  *  Quantization parameter / 6
    749  *
    750  * @param[in] qp_rem
    751  *  Quantization parameter % 6
    752  *
    753  * @param[in] pred_strd
    754  *  Prediction stride
    755  *
    756  * @param[in] dst_strd
    757  *  Output Stride
    758  *
    759  * @param[in] zero_cols
    760  *  Zero columns in pi2_src
    761  *
    762  * @returns  Void
    763  *
    764  * @remarks
    765  *  None
    766  *
    767  *******************************************************************************
    768  */
    769 
    770 
    771 void ihevc_itrans_recon_8x8_sse42(WORD16 *pi2_src,
    772                                   WORD16 *pi2_tmp,
    773                                   UWORD8 *pu1_pred,
    774                                   UWORD8 *pu1_dst,
    775                                   WORD32 src_strd,
    776                                   WORD32 pred_strd,
    777                                   WORD32 dst_strd,
    778                                   WORD32 zero_cols,
    779                                   WORD32 zero_rows)
    780 {
    781     __m128i m_temp_reg_0;
    782     __m128i m_temp_reg_1;
    783     __m128i m_temp_reg_2;
    784     __m128i m_temp_reg_3;
    785     __m128i m_temp_reg_5;
    786     __m128i m_temp_reg_6;
    787     __m128i m_temp_reg_7;
    788     __m128i m_temp_reg_4;
    789     __m128i m_temp_reg_10;
    790     __m128i m_temp_reg_11;
    791     __m128i m_temp_reg_12;
    792     __m128i m_temp_reg_13;
    793     __m128i m_temp_reg_14;
    794     __m128i m_temp_reg_15;
    795     __m128i m_temp_reg_16;
    796     __m128i m_temp_reg_17;
    797     __m128i m_temp_reg_20;
    798     __m128i m_temp_reg_21;
    799     __m128i m_temp_reg_22;
    800     __m128i m_temp_reg_23;
    801     __m128i m_temp_reg_24;
    802     __m128i m_temp_reg_25;
    803     __m128i m_temp_reg_26;
    804     __m128i m_temp_reg_27;
    805     __m128i m_temp_reg_30;
    806     __m128i m_temp_reg_31;
    807     __m128i m_temp_reg_32;
    808     __m128i m_temp_reg_33;
    809     __m128i m_temp_reg_34;
    810     __m128i m_temp_reg_35;
    811     __m128i m_temp_reg_36;
    812     __m128i m_temp_reg_37;
    813     __m128i m_temp_reg_40;
    814     __m128i m_temp_reg_41;
    815     __m128i m_temp_reg_42;
    816     __m128i m_temp_reg_43;
    817     __m128i m_temp_reg_44;
    818     __m128i m_temp_reg_45;
    819     __m128i m_temp_reg_46;
    820     __m128i m_temp_reg_47;
    821     __m128i m_temp_reg_50;
    822     __m128i m_temp_reg_51;
    823     __m128i m_temp_reg_52;
    824     __m128i m_temp_reg_53;
    825     __m128i m_temp_reg_54;
    826     __m128i m_temp_reg_55;
    827     __m128i m_temp_reg_56;
    828     __m128i m_temp_reg_57;
    829     __m128i m_temp_reg_60;
    830     __m128i m_temp_reg_61;
    831     __m128i m_temp_reg_62;
    832     __m128i m_temp_reg_63;
    833     __m128i m_temp_reg_64;
    834     __m128i m_temp_reg_65;
    835     __m128i m_temp_reg_66;
    836     __m128i m_temp_reg_67;
    837     __m128i m_temp_reg_70;
    838     __m128i m_temp_reg_71;
    839     __m128i m_temp_reg_72;
    840     __m128i m_temp_reg_73;
    841     __m128i m_temp_reg_74;
    842     __m128i m_temp_reg_75;
    843     __m128i m_temp_reg_76;
    844     __m128i m_temp_reg_77;
    845     __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
    846 
    847     WORD32 check_row_stage_1;   /* Lokesh */
    848     WORD32 check_row_stage_2;   /* Lokesh */
    849 
    850     __m128i m_rdng_factor;
    851     WORD32 i4_shift = IT_SHIFT_STAGE_1;
    852     UNUSED(pi2_tmp);
    853     check_row_stage_1   = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
    854     check_row_stage_2   = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
    855 
    856     m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
    857     pi2_src += src_strd;
    858     m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
    859     pi2_src += src_strd;
    860     m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
    861     pi2_src += src_strd;
    862     m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
    863     pi2_src += src_strd;
    864 
    865     m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
    866     pi2_src += src_strd;
    867     m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
    868     pi2_src += src_strd;
    869     m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
    870     pi2_src += src_strd;
    871     m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);
    872 
    873     if(!check_row_stage_2)
    874     {
    875         if(!check_row_stage_1)
    876         {
    877             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
    878             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
    879             {
    880                 //Interleaving 0,4 row in 0 , 1 Rishab
    881                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
    882                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
    883                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
    884 
    885                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
    886 
    887                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    888                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
    889 
    890             }
    891 
    892 
    893             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
    894             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
    895             /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
    896             {
    897 
    898                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
    899                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
    900 
    901                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
    902                 //Interleaving 2,6 row in 4, 5 Rishab
    903                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
    904 
    905                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
    906                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
    907 
    908 
    909                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
    910 
    911                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
    912                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
    913 
    914                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
    915                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
    916 
    917 
    918 
    919                 /* e */
    920 
    921                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
    922                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
    923                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
    924                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
    925                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
    926                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
    927 
    928                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
    929                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
    930 
    931             }
    932 
    933             /* o */
    934             {
    935 
    936                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
    937                 {
    938 
    939                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    940                     //o0:1B*89+3B*75,5B*50+7B*18
    941                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
    942 
    943                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    944                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
    945 
    946 
    947 
    948                     /* Column 0 of destination computed here */
    949                     /* It is stored in m_temp_reg_50 */
    950                     /* Column 7 of destination computed here */
    951                     /* It is stored in m_temp_reg_57 */
    952                     /* Upper 8 bytes of both registers are zero due to zero_cols*/
    953 
    954 
    955 
    956                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    957                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    958 
    959                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
    960                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
    961 
    962                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
    963                     m_temp_reg_63 = _mm_setzero_si128();
    964                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
    965 
    966                     //o1:1B*75-3B*18,5B*89+7B*50
    967                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
    968 
    969                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
    970                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
    971 
    972                     /* Loading coeff for computing o2  in the next block */
    973 
    974                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
    975                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
    976 
    977                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
    978 
    979 
    980 
    981                     /* Column 1 of destination computed here */
    982                     /* It is stored in m_temp_reg_51 */
    983                     /* Column 6 of destination computed here */
    984                     /* It is stored in m_temp_reg_56 */
    985 
    986                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
    987                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
    988 
    989                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
    990                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
    991 
    992                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
    993                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
    994 
    995                     //o2:1B*50-3B*89,5B*18+7B*75
    996                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
    997 
    998                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
    999                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1000 
   1001 
   1002                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   1003 
   1004                     /* Loading coeff for computing o3  in the next block */
   1005 
   1006                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   1007                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
   1008 
   1009 
   1010 
   1011                     /* Column 2 of destination computed here */
   1012                     /* It is stored in m_temp_reg_52 */
   1013                     /* Column 5 of destination computed here */
   1014                     /* It is stored in m_temp_reg_55 */
   1015 
   1016                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   1017                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   1018 
   1019                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1020                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1021 
   1022                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1023                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1024 
   1025                     //o3:1B*18-3B*50,5B*75-7B*89
   1026                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1027 
   1028                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1029                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1030 
   1031 
   1032 
   1033                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   1034 
   1035 
   1036 
   1037                     /* Column 3 of destination computed here */
   1038                     /* It is stored in m_temp_reg_53 */
   1039                     /* Column 4 of destination computed here */
   1040                     /* It is stored in m_temp_reg_54 */
   1041 
   1042                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   1043                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   1044 
   1045                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1046                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1047 
   1048                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1049                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1050 
   1051 
   1052                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1053                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1054                 }
   1055             }
   1056 
   1057             /* Transpose of the destination 8x8 matrix done here */
   1058             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   1059             /* respectively */
   1060             {
   1061                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   1062                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   1063                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   1064                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   1065 
   1066                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   1067                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   1068 
   1069                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   1070                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   1071 
   1072                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   1073                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   1074                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   1075                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   1076 
   1077                 m_temp_reg_54 = _mm_setzero_si128();
   1078                 m_temp_reg_55 = _mm_setzero_si128();
   1079                 m_temp_reg_56 = _mm_setzero_si128();
   1080                 m_temp_reg_57 = _mm_setzero_si128();
   1081             }
   1082         }
   1083         else
   1084         {
   1085             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1086             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1087             {
   1088                 //Interleaving 0,4 row in 0 , 1 Rishab
   1089                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
   1090                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
   1091                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
   1092 
   1093                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
   1094 
   1095                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1096                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1097 
   1098             }
   1099 
   1100 
   1101             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   1102             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   1103             /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
   1104             {
   1105 
   1106                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
   1107                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
   1108 
   1109                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
   1110                 //Interleaving 2,6 row in 4, 5 Rishab
   1111                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1112 
   1113                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
   1114                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   1115 
   1116 
   1117                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
   1118 
   1119                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   1120                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
   1121 
   1122                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   1123                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
   1124 
   1125 
   1126 
   1127                 /* e */
   1128 
   1129                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1130                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1131                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1132                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1133                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1134                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1135 
   1136                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1137                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1138 
   1139             }
   1140 
   1141             /* o */
   1142             {
   1143 
   1144                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1145                 {
   1146 
   1147                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1148                     m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   1149                     //o0:1B*89+3B*75,5B*50+7B*18
   1150                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1151                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   1152 
   1153                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1154                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1155 
   1156                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1157 
   1158 
   1159 
   1160                     /* Column 0 of destination computed here */
   1161                     /* It is stored in m_temp_reg_50 */
   1162                     /* Column 7 of destination computed here */
   1163                     /* It is stored in m_temp_reg_57 */
   1164                     /* Upper 8 bytes of both registers are zero due to zero_cols*/
   1165 
   1166 
   1167 
   1168                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1169                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1170 
   1171                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1172                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1173 
   1174                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1175                     m_temp_reg_63 = _mm_setzero_si128();
   1176                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1177 
   1178                     //o1:1B*75-3B*18,5B*89+7B*50
   1179                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1180                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   1181 
   1182                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1183                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1184 
   1185                     /* Loading coeff for computing o2  in the next block */
   1186 
   1187                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   1188                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
   1189 
   1190                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   1191                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
   1192 
   1193 
   1194 
   1195                     /* Column 1 of destination computed here */
   1196                     /* It is stored in m_temp_reg_51 */
   1197                     /* Column 6 of destination computed here */
   1198                     /* It is stored in m_temp_reg_56 */
   1199 
   1200                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   1201                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   1202 
   1203                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1204                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1205 
   1206                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1207                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1208 
   1209                     //o2:1B*50-3B*89,5B*18+7B*75
   1210                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1211                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   1212 
   1213                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1214                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1215 
   1216 
   1217                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   1218 
   1219                     /* Loading coeff for computing o3  in the next block */
   1220 
   1221                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   1222                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
   1223 
   1224                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1225 
   1226 
   1227                     /* Column 2 of destination computed here */
   1228                     /* It is stored in m_temp_reg_52 */
   1229                     /* Column 5 of destination computed here */
   1230                     /* It is stored in m_temp_reg_55 */
   1231 
   1232                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   1233                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   1234 
   1235                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1236                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1237 
   1238                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1239                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1240 
   1241                     //o3:1B*18-3B*50,5B*75-7B*89
   1242                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1243                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   1244 
   1245                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1246                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1247 
   1248 
   1249 
   1250                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   1251 
   1252                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
   1253 
   1254 
   1255                     /* Column 3 of destination computed here */
   1256                     /* It is stored in m_temp_reg_53 */
   1257                     /* Column 4 of destination computed here */
   1258                     /* It is stored in m_temp_reg_54 */
   1259 
   1260                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   1261                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   1262 
   1263                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1264                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1265 
   1266                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1267                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1268 
   1269 
   1270                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1271                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1272                 }
   1273             }
   1274 
   1275             /* Transpose of the destination 8x8 matrix done here */
   1276             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   1277             /* respectively */
   1278             {
   1279                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   1280                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   1281                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   1282                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   1283 
   1284                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   1285                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   1286                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   1287                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   1288 
   1289                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   1290                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   1291                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   1292                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   1293 
   1294                 m_temp_reg_54 = _mm_setzero_si128();
   1295                 m_temp_reg_55 = _mm_setzero_si128();
   1296                 m_temp_reg_56 = _mm_setzero_si128();
   1297                 m_temp_reg_57 = _mm_setzero_si128();
   1298             }
   1299         }
   1300 
   1301         /* Stage 2 */
   1302         i4_shift = IT_SHIFT_STAGE_2;
   1303         {
   1304             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1305             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1306             {
   1307                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
   1308                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
   1309 
   1310                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
   1311                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
   1312 
   1313                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1314                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1315                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1316                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1317 
   1318 
   1319                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
   1320                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
   1321             }
   1322 
   1323 
   1324             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   1325             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   1326             {
   1327 
   1328                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
   1329                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
   1330 
   1331 
   1332                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1333                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1334                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1335                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1336 
   1337                 /* Loading coeff for computing o0 in the next block */
   1338                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   1339 
   1340 
   1341                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
   1342                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
   1343 
   1344 
   1345 
   1346                 /* e */
   1347 
   1348                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1349                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1350                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1351                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1352                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1353                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1354 
   1355                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1356                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1357 
   1358                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   1359                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   1360 
   1361                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   1362                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   1363 
   1364             }
   1365 
   1366             /* o */
   1367             {
   1368 
   1369                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1370                 {
   1371                     //o0:1B*89+3B*75,1T*89+3T*75
   1372                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1373                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1374 
   1375                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1376                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1377                     /* Loading coeff for computing o1 in the next block */
   1378                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   1379 
   1380 
   1381 
   1382                     /* Column 0 of destination computed here */
   1383                     /* It is stored in m_temp_reg_50 */
   1384                     /* Column 7 of destination computed here */
   1385                     /* It is stored in m_temp_reg_57 */
   1386 
   1387                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1388                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1389 
   1390                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   1391                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   1392 
   1393                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   1394                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   1395                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   1396                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   1397 
   1398                     //o1:1B*75-3B*18,1T*75-3T*18
   1399                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   1400                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   1401 
   1402                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   1403                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   1404                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   1405                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   1406 
   1407                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   1408                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   1409 
   1410 
   1411                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   1412 
   1413 
   1414                     /* Loading coeff for computing o2  in the next block */
   1415                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   1416 
   1417 
   1418 
   1419                     /* Column 1 of destination computed here */
   1420                     /* It is stored in m_temp_reg_51 */
   1421                     /* Column 6 of destination computed here */
   1422                     /* It is stored in m_temp_reg_56 */
   1423 
   1424                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   1425                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   1426 
   1427                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   1428                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   1429 
   1430                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   1431                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   1432                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   1433                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   1434 
   1435                     //o2:1B*50-3B*89,1T*50-3T*89
   1436                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1437                     m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1438 
   1439                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   1440                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   1441                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   1442                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   1443 
   1444                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   1445                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   1446 
   1447 
   1448                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   1449 
   1450                     /* Loading coeff for computing o3  in the next block */
   1451 
   1452                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   1453 
   1454 
   1455                     /* Column 2 of destination computed here */
   1456                     /* It is stored in m_temp_reg_52 */
   1457                     /* Column 5 of destination computed here */
   1458                     /* It is stored in m_temp_reg_55 */
   1459 
   1460                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   1461                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   1462 
   1463                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   1464                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   1465 
   1466                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   1467                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   1468                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   1469                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   1470 
   1471                     //o3:1B*18-3B*50,1T*18-3T*50
   1472                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   1473                     m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   1474 
   1475                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   1476                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   1477                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   1478                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   1479 
   1480 
   1481                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   1482                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   1483 
   1484 
   1485 
   1486                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   1487 
   1488 
   1489                     /* Column 3 of destination computed here */
   1490                     /* It is stored in m_temp_reg_53 */
   1491                     /* Column 4 of destination computed here */
   1492                     /* It is stored in m_temp_reg_54 */
   1493 
   1494                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   1495                     m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   1496 
   1497                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   1498                     m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   1499 
   1500                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
   1501                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
   1502                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
   1503                     m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
   1504 
   1505                     m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
   1506                     m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
   1507                     m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
   1508                     m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
   1509 
   1510                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
   1511                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
   1512                 }
   1513             }
   1514 
   1515             /* Transpose of the destination 8x8 matrix done here */
   1516             /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
   1517             /* respectively */
   1518             {
   1519                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   1520                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   1521                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   1522                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   1523                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   1524                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   1525                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   1526                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   1527 
   1528                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   1529                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   1530                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   1531                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   1532                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   1533                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   1534                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   1535                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   1536                 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   1537                 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   1538                 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   1539                 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   1540 
   1541                 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   1542                 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   1543                 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   1544                 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   1545             }
   1546 
   1547             /* Recon and store */
   1548             {
   1549                 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1550                 pu1_pred += pred_strd;
   1551                 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1552                 pu1_pred += pred_strd;
   1553                 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1554                 pu1_pred += pred_strd;
   1555                 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1556                 pu1_pred += pred_strd;
   1557                 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1558                 pu1_pred += pred_strd;
   1559                 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1560                 pu1_pred += pred_strd;
   1561                 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1562                 pu1_pred += pred_strd;
   1563                 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1564 
   1565                 m_temp_reg_50 = _mm_setzero_si128();
   1566                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
   1567                 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
   1568                 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
   1569                 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
   1570                 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
   1571                 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
   1572                 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
   1573                 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
   1574 
   1575                 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
   1576                 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
   1577                 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
   1578                 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
   1579                 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
   1580                 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
   1581                 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
   1582                 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
   1583 
   1584                 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
   1585                 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
   1586                 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
   1587                 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
   1588                 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
   1589                 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
   1590                 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
   1591                 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
   1592 
   1593                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
   1594                 pu1_dst += dst_strd;
   1595                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
   1596                 pu1_dst += dst_strd;
   1597                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
   1598                 pu1_dst += dst_strd;
   1599                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
   1600                 pu1_dst += dst_strd;
   1601                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
   1602                 pu1_dst += dst_strd;
   1603                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
   1604                 pu1_dst += dst_strd;
   1605                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
   1606                 pu1_dst += dst_strd;
   1607                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
   1608                 pu1_dst += dst_strd;
   1609             }
   1610         }
   1611     }
   1612     else
   1613 
   1614     {
   1615 
   1616         /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1617         /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1618         if(!check_row_stage_1)
   1619         {
   1620             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1621             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1622             {
   1623                 //Interleaving 0,4 row in 0 , 1 Rishab
   1624                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
   1625                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
   1626                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
   1627 
   1628                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
   1629                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
   1630 
   1631                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1632                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1633 
   1634 
   1635                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1636                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1637             }
   1638 
   1639 
   1640             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   1641             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   1642             {
   1643 
   1644                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
   1645                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
   1646 
   1647                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
   1648                 //Interleaving 2,6 row in 4, 5 Rishab
   1649                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1650                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
   1651 
   1652                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
   1653                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   1654 
   1655                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
   1656                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   1657 
   1658 
   1659 
   1660                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
   1661 
   1662                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   1663                 //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]);
   1664 
   1665                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   1666                 //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]);
   1667 
   1668             }
   1669 
   1670             /* e */
   1671             {
   1672                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1673                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1674                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1675                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1676                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1677                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1678 
   1679                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1680                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1681 
   1682                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   1683                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   1684 
   1685                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   1686                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   1687 
   1688             }
   1689 
   1690             /* o */
   1691             {
   1692 
   1693                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1694                 {
   1695 
   1696                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1697                     m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
   1698                     //o0:1B*89+3B*75,1T*89+3T*75
   1699                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1700                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   1701 
   1702                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1703                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1704 
   1705                 }
   1706 
   1707                 /* Column 0 of destination computed here */
   1708                 /* It is stored in m_temp_reg_50 */
   1709                 /* Column 7 of destination computed here */
   1710                 /* It is stored in m_temp_reg_57 */
   1711                 {
   1712 
   1713 
   1714                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1715                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1716 
   1717                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   1718                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   1719 
   1720                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1721                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1722                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1723                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1724 
   1725                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1726                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1727                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1728                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1729 
   1730                     //o1:1B*75-3B*18,1T*75-3T*18
   1731                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1732                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   1733 
   1734                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1735                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1736 
   1737                     /* Loading coeff for computing o2  in the next block */
   1738 
   1739                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   1740 
   1741                 }
   1742 
   1743                 /* Column 1 of destination computed here */
   1744                 /* It is stored in m_temp_reg_51 */
   1745                 /* Column 6 of destination computed here */
   1746                 /* It is stored in m_temp_reg_56 */
   1747                 {
   1748                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   1749                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   1750 
   1751                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   1752                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   1753 
   1754                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1755                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1756                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1757                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1758 
   1759                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1760                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1761                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1762                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1763 
   1764                     //o2:1B*50-3B*89,1T*50-3T*89
   1765                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1766                     m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   1767 
   1768                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1769                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1770 
   1771 
   1772                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   1773 
   1774 
   1775                     /* Loading coeff for computing o3  in the next block */
   1776 
   1777                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   1778 
   1779                 }
   1780 
   1781                 /* Column 2 of destination computed here */
   1782                 /* It is stored in m_temp_reg_52 */
   1783                 /* Column 5 of destination computed here */
   1784                 /* It is stored in m_temp_reg_55 */
   1785                 {
   1786                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   1787                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   1788 
   1789                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   1790                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   1791 
   1792                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1793                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1794                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1795                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1796 
   1797                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1798                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1799                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1800                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1801 
   1802                     //o3:1B*18-3B*50,1T*18-3T*50
   1803                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1804                     m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   1805 
   1806                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1807                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1808 
   1809 
   1810 
   1811                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   1812 
   1813 
   1814                 }
   1815 
   1816                 /* Column 3 of destination computed here */
   1817                 /* It is stored in m_temp_reg_53 */
   1818                 /* Column 4 of destination computed here */
   1819                 /* It is stored in m_temp_reg_54 */
   1820                 {
   1821                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   1822                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   1823 
   1824                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   1825                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   1826 
   1827                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1828                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1829                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1830                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1831 
   1832                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1833                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1834                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1835                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1836 
   1837                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1838                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1839                 }
   1840             }
   1841 
   1842             /* Transpose of the destination 8x8 matrix done here */
   1843             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   1844             /* respectively */
   1845             {
   1846 
   1847 
   1848                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   1849                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   1850                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   1851                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   1852                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   1853                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   1854                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   1855                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   1856 
   1857                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   1858                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   1859                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   1860                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   1861                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   1862                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   1863                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   1864                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   1865 
   1866                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   1867                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   1868                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   1869                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   1870 
   1871                 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   1872                 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   1873                 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   1874                 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   1875             }
   1876         }
   1877         else
   1878         {
   1879 
   1880             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1881             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1882             {
   1883                 //Interleaving 0,4 row in 0 , 1 Rishab
   1884                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
   1885                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
   1886                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
   1887 
   1888                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
   1889                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
   1890 
   1891                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1892                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1893 
   1894 
   1895                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1896                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1897             }
   1898 
   1899 
   1900             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   1901             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   1902             {
   1903 
   1904                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
   1905                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
   1906 
   1907                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
   1908                 //Interleaving 2,6 row in 4, 5 Rishab
   1909                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1910                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
   1911 
   1912                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
   1913                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   1914 
   1915                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
   1916                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   1917 
   1918 
   1919 
   1920                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
   1921 
   1922                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   1923                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
   1924 
   1925                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   1926                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
   1927 
   1928             }
   1929 
   1930             /* e */
   1931             {
   1932                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1933                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1934                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1935                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1936                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1937                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1938 
   1939                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1940                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1941 
   1942                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   1943                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   1944 
   1945                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   1946                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   1947 
   1948             }
   1949 
   1950             /* o */
   1951             {
   1952 
   1953                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1954                 {
   1955 
   1956                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1957                     m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
   1958                     m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   1959                     m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
   1960                     //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
   1961                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1962                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   1963                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   1964                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
   1965 
   1966 
   1967                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1968                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1969 
   1970                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1971                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   1972                 }
   1973 
   1974                 /* Column 0 of destination computed here */
   1975                 /* It is stored in m_temp_reg_50 */
   1976                 /* Column 7 of destination computed here */
   1977                 /* It is stored in m_temp_reg_57 */
   1978                 {
   1979 
   1980 
   1981                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1982                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1983 
   1984                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   1985                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   1986 
   1987                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1988                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1989                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1990                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1991 
   1992                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1993                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1994                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1995                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1996 
   1997                     //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
   1998                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1999                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   2000                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   2001                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
   2002 
   2003                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   2004                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   2005 
   2006                     /* Loading coeff for computing o2  in the next block */
   2007 
   2008                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   2009                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
   2010 
   2011                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   2012                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
   2013                     m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
   2014                 }
   2015 
   2016                 /* Column 1 of destination computed here */
   2017                 /* It is stored in m_temp_reg_51 */
   2018                 /* Column 6 of destination computed here */
   2019                 /* It is stored in m_temp_reg_56 */
   2020                 {
   2021                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   2022                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   2023 
   2024                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   2025                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   2026 
   2027                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   2028                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   2029                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   2030                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   2031 
   2032                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   2033                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   2034                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   2035                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   2036 
   2037                     //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
   2038                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   2039                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   2040                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   2041                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
   2042 
   2043                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   2044                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   2045 
   2046 
   2047                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   2048 
   2049 
   2050                     /* Loading coeff for computing o3  in the next block */
   2051 
   2052                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   2053                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
   2054 
   2055                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   2056                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   2057                 }
   2058 
   2059                 /* Column 2 of destination computed here */
   2060                 /* It is stored in m_temp_reg_52 */
   2061                 /* Column 5 of destination computed here */
   2062                 /* It is stored in m_temp_reg_55 */
   2063                 {
   2064                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   2065                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   2066 
   2067                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   2068                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   2069 
   2070                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   2071                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   2072                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   2073                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   2074 
   2075                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   2076                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   2077                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   2078                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   2079 
   2080                     //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
   2081                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   2082                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   2083                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   2084                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
   2085 
   2086                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   2087                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   2088 
   2089 
   2090 
   2091                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   2092 
   2093 
   2094                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
   2095                     m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
   2096                 }
   2097 
   2098                 /* Column 3 of destination computed here */
   2099                 /* It is stored in m_temp_reg_53 */
   2100                 /* Column 4 of destination computed here */
   2101                 /* It is stored in m_temp_reg_54 */
   2102                 {
   2103                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   2104                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   2105 
   2106                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   2107                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   2108 
   2109                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   2110                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   2111                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   2112                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   2113 
   2114                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   2115                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   2116                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   2117                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   2118 
   2119                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   2120                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   2121                 }
   2122             }
   2123 
   2124             /* Transpose of the destination 8x8 matrix done here */
   2125             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   2126             /* respectively */
   2127             {
   2128 
   2129 
   2130                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   2131                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   2132                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   2133                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   2134                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   2135                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   2136                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   2137                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   2138 
   2139                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   2140                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   2141                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   2142                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   2143                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   2144                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   2145                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   2146                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   2147 
   2148                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   2149                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   2150                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   2151                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   2152 
   2153                 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   2154                 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   2155                 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   2156                 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   2157             }
   2158         }
   2159         /* Stage 2 */
   2160 
   2161         i4_shift = IT_SHIFT_STAGE_2;
   2162 
   2163         {
   2164 
   2165             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   2166             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   2167             {
   2168                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
   2169                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
   2170 
   2171                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
   2172                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
   2173 
   2174                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2175                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   2176                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2177                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   2178 
   2179 
   2180                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
   2181                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
   2182             }
   2183 
   2184 
   2185             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   2186             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   2187             {
   2188                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
   2189                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
   2190 
   2191 
   2192                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2193                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   2194                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2195                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   2196 
   2197                 /* Loading coeff for computing o0 in the next block */
   2198                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   2199                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
   2200 
   2201 
   2202                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
   2203                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
   2204             }
   2205 
   2206             /* e */
   2207             {
   2208                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   2209                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   2210                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   2211                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   2212                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   2213                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   2214 
   2215                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   2216                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   2217 
   2218                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   2219                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   2220 
   2221                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   2222                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   2223 
   2224             }
   2225 
   2226             /* o */
   2227             {
   2228                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
   2229                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
   2230 
   2231                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   2232                 {
   2233                     //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
   2234                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2235                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2236                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   2237                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   2238 
   2239                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2240                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   2241                     /* Loading coeff for computing o1 in the next block */
   2242                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   2243                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
   2244 
   2245                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   2246                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   2247                 }
   2248 
   2249                 /* Column 0 of destination computed here */
   2250                 /* It is stored in m_temp_reg_50 */
   2251                 /* Column 7 of destination computed here */
   2252                 /* It is stored in m_temp_reg_57 */
   2253                 {
   2254                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   2255                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   2256 
   2257                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   2258                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   2259 
   2260                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   2261                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   2262                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   2263                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   2264 
   2265                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   2266                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   2267                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   2268                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   2269 
   2270                     //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
   2271                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   2272                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
   2273                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   2274                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
   2275 
   2276                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   2277                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   2278 
   2279 
   2280                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   2281 
   2282 
   2283                     /* Loading coeff for computing o2  in the next block */
   2284                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   2285                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
   2286 
   2287                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
   2288                     m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
   2289                 }
   2290 
   2291                 /* Column 1 of destination computed here */
   2292                 /* It is stored in m_temp_reg_51 */
   2293                 /* Column 6 of destination computed here */
   2294                 /* It is stored in m_temp_reg_56 */
   2295                 {
   2296                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   2297                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   2298 
   2299                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   2300                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   2301 
   2302                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   2303                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   2304                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   2305                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   2306 
   2307                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   2308                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   2309                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   2310                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   2311 
   2312                     //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
   2313                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2314                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   2315                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2316                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   2317 
   2318                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   2319                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   2320 
   2321 
   2322                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   2323 
   2324                     /* Loading coeff for computing o3  in the next block */
   2325 
   2326                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   2327                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
   2328 
   2329                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   2330                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   2331                 }
   2332 
   2333                 /* Column 2 of destination computed here */
   2334                 /* It is stored in m_temp_reg_52 */
   2335                 /* Column 5 of destination computed here */
   2336                 /* It is stored in m_temp_reg_55 */
   2337                 {
   2338                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   2339                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   2340 
   2341                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   2342                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   2343 
   2344                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   2345                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   2346                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   2347                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   2348 
   2349                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   2350                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   2351                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   2352                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   2353 
   2354                     //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
   2355                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   2356                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
   2357                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   2358                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
   2359 
   2360                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   2361                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   2362 
   2363 
   2364 
   2365                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   2366 
   2367 
   2368                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
   2369                     m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
   2370                 }
   2371 
   2372                 /* Column 3 of destination computed here */
   2373                 /* It is stored in m_temp_reg_53 */
   2374                 /* Column 4 of destination computed here */
   2375                 /* It is stored in m_temp_reg_54 */
   2376                 {
   2377                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   2378                     m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   2379 
   2380                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   2381                     m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   2382 
   2383                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
   2384                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
   2385                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
   2386                     m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
   2387 
   2388                     m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
   2389                     m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
   2390                     m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
   2391                     m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
   2392 
   2393                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
   2394                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
   2395                 }
   2396             }
   2397 
   2398             /* Transpose of the destination 8x8 matrix done here */
   2399             /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
   2400             /* respectively */
   2401             {
   2402                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   2403                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   2404                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   2405                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   2406                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   2407                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   2408                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   2409                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   2410 
   2411                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   2412                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   2413                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   2414                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   2415                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   2416                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   2417                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   2418                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   2419                 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   2420                 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   2421                 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   2422                 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   2423 
   2424                 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   2425                 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   2426                 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   2427                 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   2428             }
   2429 
   2430             /* Recon and store */
   2431             {
   2432                 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2433                 pu1_pred += pred_strd;
   2434                 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2435                 pu1_pred += pred_strd;
   2436                 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2437                 pu1_pred += pred_strd;
   2438                 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2439                 pu1_pred += pred_strd;
   2440                 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2441                 pu1_pred += pred_strd;
   2442                 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2443                 pu1_pred += pred_strd;
   2444                 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2445                 pu1_pred += pred_strd;
   2446                 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2447 
   2448 
   2449                 m_temp_reg_50 = _mm_setzero_si128();
   2450                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
   2451                 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
   2452                 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
   2453                 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
   2454                 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
   2455                 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
   2456                 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
   2457                 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
   2458 
   2459                 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
   2460                 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
   2461                 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
   2462                 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
   2463                 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
   2464                 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
   2465                 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
   2466                 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
   2467 
   2468                 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
   2469                 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
   2470                 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
   2471                 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
   2472                 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
   2473                 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
   2474                 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
   2475                 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
   2476 
   2477                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
   2478                 pu1_dst += dst_strd;
   2479                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
   2480                 pu1_dst += dst_strd;
   2481                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
   2482                 pu1_dst += dst_strd;
   2483                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
   2484                 pu1_dst += dst_strd;
   2485                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
   2486                 pu1_dst += dst_strd;
   2487                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
   2488                 pu1_dst += dst_strd;
   2489                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
   2490                 pu1_dst += dst_strd;
   2491                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
   2492                 pu1_dst += dst_strd;
   2493 
   2494             }
   2495 
   2496 
   2497         }
   2498 
   2499 
   2500     }
   2501 }
   2502