Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19  *******************************************************************************
     20  * @file
     21  *  ihevc_itrans_recon_atom_intr.c
     22  *
     23  * @brief
     24  *  Contains function definitions for inverse  quantization, inverse
     25  * transform and reconstruction
     26  *
     27  * @author
     28  *  100470
     29  *  100592 (edited by)
     30  *
     31  * @par List of Functions:
     32  *  - ihevc_itrans_recon_4x4_ttype1_ssse3()
     33  *  - ihevc_itrans_recon_4x4_ssse3()
     34  *  - ihevc_itrans_recon_8x8_ssse3()
     35  *
     36  * @remarks
     37  *  None
     38  *
     39  *******************************************************************************
     40  */
     41 #include <stdio.h>
     42 #include <string.h>
     43 #include "ihevc_typedefs.h"
     44 #include "ihevc_platform_macros.h"
     45 #include "ihevc_macros.h"
     46 #include "ihevc_defs.h"
     47 #include "ihevc_func_selector.h"
     48 #include "ihevc_trans_tables.h"
     49 #include "ihevc_iquant_itrans_recon.h"
     50 #include "ihevc_trans_macros.h"
     51 
     52 
     53 #include <immintrin.h>
     54 #include <emmintrin.h>
     55 
     56 #include <tmmintrin.h>
     57 
     58 
     59 /**
     60  *******************************************************************************
     61  *
     62  * @brief
     63  *  This function performs inverse quantization, inverse  transform
     64  * type1(DST) and reconstruction for 4x4  input block
     65  *
     66  * @par Description:
     67  *  Performs inverse quantization , inverse transform type 1  and adds
     68  * prediction data and clips output to 8 bit
     69  *
     70  * @param[in] pi2_src
     71  *  Input 4x4 coefficients
     72  *
     73  * @param[in] pi2_tmp
     74  *  Temporary 4x4 buffer for storing inverse
     75  *  transform 1st stage output
     76  *
     77  * @param[in] pu1_pred
     78  *  Prediction 4x4 block
     79  *
     80  * @param[in] pi2_dequant_coeff
     81  *  Dequant Coeffs
     82  *
     83  * @param[out] pu1_dst
     84  *  Output 4x4 block
     85  *
     86  * @param[in] qp_div
     87  *  Quantization parameter / 6
     88  *
     89  * @param[in] qp_rem
     90  *  Quantization parameter % 6
     91  *
     92  * @param[in] src_strd
     93  *  Input stride
     94  *
     95  * @param[in] pred_strd
     96  *  Prediction stride
     97  *
     98  * @param[in] dst_strd
     99  *  Output Stride
    100  *
    101  * @param[in] zero_cols
    102  *  Zero columns in pi2_src
    103  *
    104  * @returns  Void
    105  *
    106  * @remarks
    107  *  None
    108  *
    109  *******************************************************************************
    110  */
    111 
    112 void ihevc_itrans_recon_4x4_ttype1_ssse3(WORD16 *pi2_src,
    113                                          WORD16 *pi2_tmp,
    114                                          UWORD8 *pu1_pred,
    115                                          UWORD8 *pu1_dst,
    116                                          WORD32 src_strd,
    117                                          WORD32 pred_strd,
    118                                          WORD32 dst_strd,
    119                                          WORD32 zero_cols,
    120                                          WORD32 zero_rows)
    121 {
    122     __m128i m_temp_reg_0;
    123     __m128i m_temp_reg_1;
    124     __m128i m_temp_reg_2;
    125     __m128i m_temp_reg_3;
    126     __m128i m_temp_reg_4;
    127     __m128i m_temp_reg_10;
    128     __m128i m_temp_reg_11;
    129     __m128i m_temp_reg_12;
    130     __m128i m_temp_reg_13;
    131     __m128i m_temp_reg_14;
    132     __m128i m_temp_reg_20;
    133     __m128i m_temp_reg_21;
    134     __m128i m_temp_reg_22;
    135     __m128i m_temp_reg_23;
    136     __m128i m_temp_reg_24;
    137     __m128i m_temp_reg_25;
    138     __m128i m_temp_reg_30;
    139     __m128i m_temp_reg_31;
    140     __m128i m_temp_reg_32;
    141     __m128i m_temp_reg_33;
    142     __m128i m_temp_reg_34;
    143     __m128i m_temp_reg_35;
    144     __m128i m_temp_reg_36;
    145     __m128i m_rdng_factor;
    146     __m128i m_count;
    147 
    148     __m128i m_ge_zero16b_flag_row0;
    149     __m128i m_ge_zero16b_flag_row1;
    150     __m128i m_ge_zero16b_flag_row2;
    151     __m128i m_ge_zero16b_flag_row3;
    152 
    153     __m128i m_zero = _mm_setzero_si128();
    154 
    155     WORD32 i4_shift = IT_SHIFT_STAGE_1;
    156     UNUSED(zero_cols);
    157     UNUSED(zero_rows);
    158     UNUSED(pi2_tmp);
    159     m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
    160     pi2_src += src_strd;
    161     m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
    162     pi2_src += src_strd;
    163     m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
    164     pi2_src += src_strd;
    165     m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
    166 
    167     m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0);
    168     m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1);
    169     m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2);
    170     m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3);
    171 
    172     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0);
    173     m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1);
    174     m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2);
    175     m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3);
    176 
    177     /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
    178     m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
    179 
    180     m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
    181     m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/
    182 
    183     /* c[4] in m_temp_reg_14 */
    184     /* c[4] = src[0] - src[2] + src[3] */
    185     {
    186         m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2);
    187     }
    188 
    189     /* c[3] in m_temp_reg_13 */
    190     {
    191         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6);
    192         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 3);
    193         m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1);
    194         m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
    195         m_temp_reg_13 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
    196         //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);
    197     }
    198 
    199     /* c[0] in m_temp_reg_10 */
    200     {
    201         m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2);
    202     }
    203 
    204     /* c[1] in m_temp_reg_11 */
    205     {
    206         m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3);
    207     }
    208 
    209     /* c[2] in m_temp_reg_12 */
    210     {
    211         m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3);
    212     }
    213 
    214     /* c[4] in m_temp_reg_14 */
    215     /* c[4] = src[0] - src[2] + src[3] */
    216     {
    217         m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3);
    218     }
    219 
    220     /* Stage 1 outputs stored in m_temp_reg_20-23 */
    221     {
    222         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 5);
    223         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 1);
    224         m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10);
    225         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
    226         //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1);//29*c0
    227 
    228         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 6);
    229         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 3);
    230         m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11);
    231         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
    232         //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2);//55*c1
    233 
    234         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    235 
    236         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 5);
    237         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 1);
    238         m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11);
    239         m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
    240         //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1);//29*c1
    241 
    242         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 6);
    243         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 3);
    244         m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12);
    245         m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
    246         //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2);//55*c2
    247 
    248         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
    249 
    250         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 6);
    251         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 3);
    252         m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10);
    253         m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
    254         //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2);//55*c0
    255 
    256         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 5);
    257         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 1);
    258         m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12);
    259         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
    260         //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1);//29*c2
    261 
    262         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_14, 6);
    263         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_14, 3);
    264         m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_14, 1);
    265         m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
    266         m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
    267         //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3);//74*c4
    268 
    269         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
    270         m_count = _mm_cvtsi32_si128(i4_shift);
    271 
    272         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
    273         m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13);
    274         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
    275 
    276         m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
    277         m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
    278 
    279         m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
    280         m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13);
    281         m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
    282 
    283         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
    284 
    285         m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
    286         m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
    287         m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
    288         m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
    289 
    290         m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
    291         m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
    292         m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    293         m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    294 
    295         m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
    296         m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
    297 
    298         m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
    299         m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
    300 
    301     }
    302 
    303     /* Stage 2 */
    304     {
    305         i4_shift = IT_SHIFT_STAGE_2;
    306 
    307         /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    308         m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
    309         m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    310         m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
    311         m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
    312         m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/
    313 
    314         m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30);
    315         m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31);
    316 
    317         m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
    318         m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
    319         m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
    320         m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
    321 
    322 
    323         /* c[4] stored in m_temp_reg_4 */
    324         {
    325             m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
    326         }
    327 
    328         /* c[3] stored in m_temp_reg_3 */
    329         {
    330             m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_22, 6);
    331             m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_22, 3);
    332             m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_22, 1);
    333             m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
    334             m_temp_reg_3 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13);
    335             //m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);
    336         }
    337 
    338         /* c[0] stored in m_temp_reg_0 */
    339         {
    340             m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
    341         }
    342 
    343         /* c[1] stored in m_temp_reg_1 */
    344         {
    345             m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21);
    346         }
    347 
    348         /* c[2] stored in m_temp_reg_2 */
    349         {
    350             m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23);
    351         }
    352 
    353         /* c[4] stored in m_temp_reg_4 */
    354         {
    355             m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23);
    356         }
    357 
    358         /* Stage 2 output generation */
    359         {
    360             m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 5);
    361             m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 1);
    362             m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0);
    363             m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
    364             //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1);//29*c0
    365 
    366             m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 6);
    367             m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 3);
    368             m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1);
    369             m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
    370             //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2);//55*c1
    371 
    372             m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    373 
    374             m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 5);
    375             m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 1);
    376             m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1);
    377             m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
    378             //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//29*c1
    379 
    380             m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 6);
    381             m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 3);
    382             m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2);
    383             m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
    384             //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2);//55*c2
    385 
    386             m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
    387 
    388             m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
    389             m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 3);
    390             m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0);
    391             m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
    392             //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2);//55*c0
    393 
    394             m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 5);
    395             m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 1);
    396             m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2);
    397             m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
    398             //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1);//29*c2
    399 
    400             m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_4, 6);
    401             m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_4, 3);
    402             m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_4, 1);
    403             m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
    404             m_temp_reg_36 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13);
    405             //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3);//74*c4
    406 
    407             m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
    408             m_count = _mm_cvtsi32_si128(i4_shift);
    409 
    410             m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3);
    411             m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
    412             m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
    413 
    414             m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
    415             m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
    416 
    417             m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3);
    418             m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
    419             m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
    420 
    421             m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
    422 
    423             m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
    424             m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
    425             m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
    426             m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
    427 
    428             m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
    429             m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
    430             m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    431             m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    432 
    433             m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
    434             m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
    435 
    436             m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
    437             m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
    438         }
    439 
    440         /* Recon and store */
    441         {
    442             WORD32 *pi4_dst = (WORD32 *)pu1_dst;
    443 
    444             m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
    445             pu1_pred += pred_strd;
    446             m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
    447             pu1_pred += pred_strd;
    448             m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
    449             pu1_pred += pred_strd;
    450             m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
    451 
    452             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero);
    453             m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero);
    454             m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero);
    455             m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero);
    456 
    457             /*m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
    458             m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
    459             m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
    460             m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);*/
    461 
    462             m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
    463             m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
    464 
    465             m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
    466             m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
    467 
    468             m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
    469 
    470             *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
    471             m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
    472             m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
    473             m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
    474             pu1_dst += dst_strd;
    475             pi4_dst = (WORD32 *)(pu1_dst);
    476 
    477             *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
    478             pu1_dst += dst_strd;
    479             pi4_dst = (WORD32 *)(pu1_dst);
    480 
    481             *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
    482             pu1_dst += dst_strd;
    483             pi4_dst = (WORD32 *)(pu1_dst);
    484 
    485             *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
    486         }
    487     }
    488 }
    489 
    490 
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs inverse transform (DCT) and reconstruction
 * for 4x4 input block
 *
 * @par Description:
 *  Performs inverse transform and adds prediction data and clips
 * output to 8 bit
 *
 * @param[in] pi2_src
 *  Input 4x4 coefficients
 *
 * @param[in] pi2_tmp
 *  Temporary 4x4 buffer for storing inverse
 *  transform 1st stage output
 *
 * @param[in] pu1_pred
 *  Prediction 4x4 block
 *
 * @param[out] pu1_dst
 *  Output 4x4 block
 *
 * @param[in] src_strd
 *  Input stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @param[in] zero_cols
 *  Zero columns in pi2_src
 *
 * @param[in] zero_rows
 *  Zero rows in pi2_src
 *
 * @returns  Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
    543 
    544 void ihevc_itrans_recon_4x4_ssse3(WORD16 *pi2_src,
    545                                   WORD16 *pi2_tmp,
    546                                   UWORD8 *pu1_pred,
    547                                   UWORD8 *pu1_dst,
    548                                   WORD32 src_strd,
    549                                   WORD32 pred_strd,
    550                                   WORD32 dst_strd,
    551                                   WORD32 zero_cols,
    552                                   WORD32 zero_rows)
    553 {
    554     __m128i m_temp_reg_0;
    555     __m128i m_temp_reg_1;
    556     __m128i m_temp_reg_2;
    557     __m128i m_temp_reg_3;
    558     __m128i m_temp_reg_4;
    559     __m128i m_temp_reg_10;
    560     __m128i m_temp_reg_11;
    561     __m128i m_temp_reg_12;
    562     __m128i m_temp_reg_13;
    563     __m128i m_temp_reg_14;
    564     __m128i m_temp_reg_15;
    565     __m128i m_temp_reg_20;
    566     __m128i m_temp_reg_21;
    567     __m128i m_temp_reg_22;
    568     __m128i m_temp_reg_23;
    569     __m128i m_temp_reg_24;
    570     __m128i m_temp_reg_25;
    571     __m128i m_temp_reg_30;
    572     __m128i m_temp_reg_31;
    573     __m128i m_temp_reg_33;
    574     __m128i m_temp_reg_34;
    575     __m128i m_rdng_factor;
    576     __m128i m_count;
    577 
    578     __m128i m_ge_zero16b_flag_row0;
    579     __m128i m_ge_zero16b_flag_row1;
    580     __m128i m_ge_zero16b_flag_row2;
    581     __m128i m_ge_zero16b_flag_row3;
    582 
    583     __m128i m_zero = _mm_setzero_si128();
    584 
    585     WORD32 i4_shift = IT_SHIFT_STAGE_1;
    586     UNUSED(zero_rows);
    587     UNUSED(zero_cols);
    588     UNUSED(pi2_tmp);
    589 
    590     m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
    591     pi2_src += src_strd;
    592     m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
    593     pi2_src += src_strd;
    594     m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
    595     pi2_src += src_strd;
    596     m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
    597 
    598     m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0);
    599     m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1);
    600     m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2);
    601     m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3);
    602 
    603     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0);
    604     m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1);
    605     m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2);
    606     m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3);
    607 
    608     /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
    609     m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
    610 
    611     m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
    612     m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/
    613 
    614     /* e */
    615     {
    616         m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
    617         m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6);
    618     }
    619 
    620     /* o */
    621     {
    622         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 5);
    623         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 2);
    624         m_temp_reg_12 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
    625         //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36
    626 
    627         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 6);
    628         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 4);
    629         m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_3, 1);
    630         m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_3);
    631         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22);
    632         m_temp_reg_13 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24);
    633         //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83
    634 
    635         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6);
    636         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 4);
    637         m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1);
    638         m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_1);
    639         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22);
    640         m_temp_reg_14 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24);
    641         //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83
    642 
    643         m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 5);
    644         m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 2);
    645         m_temp_reg_15 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
    646         //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36
    647     }
    648 
    649     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    650 
    651     /* e1 stored in m_temp_reg_31 */
    652     {
    653         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
    654     }
    655 
    656     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
    657 
    658     /* e0 stored in m_temp_reg_30 */
    659     {
    660         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
    661     }
    662 
    663     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
    664     m_count = _mm_cvtsi32_si128(i4_shift);
    665 
    666     /* o1 stored in m_temp_reg_33 */
    667     {
    668         m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
    669     }
    670 
    671     /* e1 + add */
    672     {
    673         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    674     }
    675 
    676     /* e0 + add */
    677     {
    678         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    679     }
    680 
    681     /* o0 stored in m_temp_reg_34 */
    682     {
    683         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
    684     }
    685 
    686     /* Stage 1 outputs */
    687     {
    688         m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
    689         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
    690 
    691         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
    692         m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
    693 
    694 
    695         m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
    696         m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
    697         m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
    698         m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
    699 
    700         m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
    701         m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
    702         m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    703         m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    704 
    705         m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
    706         m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
    707 
    708         m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
    709         m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
    710     }
    711 
    712     /* Stage 2 */
    713     {
    714         i4_shift = IT_SHIFT_STAGE_2;
    715 
    716         /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    717         m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);*/
    718 
    719         m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30);
    720         m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31);
    721 
    722         m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
    723         m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
    724         m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
    725         m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
    726 
    727         /*m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
    728         m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
    729 
    730         m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
    731         m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/
    732 
    733         /* e */
    734         {
    735             m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6);
    736         }
    737 
    738         /* o */
    739         /*{
    740             m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1);//src[1]*36
    741             m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);//src[1]*83
    742             m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3);//src[3]*83
    743             m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1);//src[3]*36
    744         }*/
    745         {
    746             m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 5);
    747             m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 2);
    748             m_temp_reg_12 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1);
    749             //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36
    750 
    751             m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 6);
    752             m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 4);
    753             m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_23, 1);
    754             m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_23);
    755             m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2);
    756             m_temp_reg_13 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4);
    757             //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83
    758 
    759             m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 6);
    760             m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 4);
    761             m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_22, 1);
    762             m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_22);
    763             m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2);
    764             m_temp_reg_14 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4);
    765             //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83
    766 
    767             m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 5);
    768             m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 2);
    769             m_temp_reg_15 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1);
    770             //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36
    771         }
    772 
    773         /* e */
    774         {
    775             m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6);
    776         }
    777 
    778         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    779 
    780         /* e1 stored in m_temp_reg_31 */
    781         {
    782             m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
    783         }
    784 
    785         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
    786 
    787         /* e0 stored in m_temp_reg_30 */
    788         {
    789             m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
    790         }
    791 
    792         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
    793         m_count = _mm_cvtsi32_si128(i4_shift);
    794 
    795         /* o1 stored in m_temp_reg_33 */
    796         {
    797             m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
    798         }
    799 
    800         /* e1 + add */
    801         {
    802             m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    803         }
    804 
    805         /* e0 + add */
    806         {
    807             m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    808         }
    809 
    810         /* o0 stored in m_temp_reg_34 */
    811         {
    812             m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
    813         }
    814 
    815         /* Stage 2 outputs */
    816         {
    817             m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
    818             m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
    819             m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
    820             m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
    821 
    822             m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
    823             m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
    824             m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
    825             m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
    826 
    827             m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
    828             m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
    829             m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
    830             m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
    831 
    832             m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
    833             m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
    834 
    835             m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
    836             m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
    837         }
    838 
    839         /* Recon and store */
    840         {
    841             UWORD32 *pu4_dst = (UWORD32 *)pu1_dst;
    842 
    843             m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
    844             pu1_pred += pred_strd;
    845             m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
    846             pu1_pred += pred_strd;
    847             m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
    848             pu1_pred += pred_strd;
    849             m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
    850 
    851             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
    852             //m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
    853 
    854             //m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
    855             //m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
    856 
    857             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero);
    858             m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero);
    859             m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero);
    860             m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero);
    861 
    862             m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
    863             m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
    864 
    865             m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
    866             m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
    867 
    868             m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
    869 
    870             *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
    871             m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
    872             m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
    873             m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
    874             pu1_dst += dst_strd;
    875             pu4_dst = (UWORD32 *)(pu1_dst);
    876 
    877             *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
    878             pu1_dst += dst_strd;
    879             pu4_dst = (UWORD32 *)(pu1_dst);
    880 
    881             *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
    882             pu1_dst += dst_strd;
    883             pu4_dst = (UWORD32 *)(pu1_dst);
    884 
    885             *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
    886         }
    887     }
    888 }
    889 
    890 
    891 
    892 /**
    893  *******************************************************************************
    894  *
    895  * @brief
    896  *  This function performs inverse quantization, inverse  transform and
    897  * reconstruction for 8x8 input block
    898  *
    899  * @par Description:
    900  *  Performs inverse quantization, inverse transform and adds the
    901  * prediction data and clips the output to 8 bits
    902  *
    903  * @param[in] pi2_src
    904  *  Input 8x8 coefficients
    905  *
    906  * @param[in] pi2_tmp
    907  *  Temporary 8x8 buffer for storing inverse
    908  *  transform 1st stage output
    909  *
    910  * @param[in] pu1_pred
    911  *  Prediction 8x8 block
    912  *
    913  * @param[in] pi2_dequant_coeff
    914  *  Dequant Coeffs
    915  *
    916  * @param[out] pu1_dst
    917  *  Output 8x8 block
    918  *
    919  * @param[in] src_strd
    920  *  Input stride
    921  *
    922  * @param[in] qp_div
    923  *  Quantization parameter / 6
    924  *
    925  * @param[in] qp_rem
    926  *  Quantization parameter % 6
    927  *
    928  * @param[in] pred_strd
    929  *  Prediction stride
    930  *
    931  * @param[in] dst_strd
    932  *  Output Stride
    933  *
    934  * @param[in] zero_cols
    935  *  Zero columns in pi2_src
    936  *
    937  * @returns  Void
    938  *
    939  * @remarks
    940  *  None
    941  *
    942  *******************************************************************************
    943  */
    944 
    945 
    946 void ihevc_itrans_recon_8x8_ssse3(WORD16 *pi2_src,
    947                                   WORD16 *pi2_tmp,
    948                                   UWORD8 *pu1_pred,
    949                                   UWORD8 *pu1_dst,
    950                                   WORD32 src_strd,
    951                                   WORD32 pred_strd,
    952                                   WORD32 dst_strd,
    953                                   WORD32 zero_cols,
    954                                   WORD32 zero_rows)
    955 {
    956     __m128i m_temp_reg_0;
    957     __m128i m_temp_reg_1;
    958     __m128i m_temp_reg_2;
    959     __m128i m_temp_reg_3;
    960     __m128i m_temp_reg_5;
    961     __m128i m_temp_reg_6;
    962     __m128i m_temp_reg_7;
    963     __m128i m_temp_reg_4;
    964     __m128i m_temp_reg_10;
    965     __m128i m_temp_reg_11;
    966     __m128i m_temp_reg_12;
    967     __m128i m_temp_reg_13;
    968     __m128i m_temp_reg_14;
    969     __m128i m_temp_reg_15;
    970     __m128i m_temp_reg_16;
    971     __m128i m_temp_reg_17;
    972     __m128i m_temp_reg_20;
    973     __m128i m_temp_reg_21;
    974     __m128i m_temp_reg_22;
    975     __m128i m_temp_reg_23;
    976     __m128i m_temp_reg_24;
    977     __m128i m_temp_reg_25;
    978     __m128i m_temp_reg_26;
    979     __m128i m_temp_reg_27;
    980     __m128i m_temp_reg_30;
    981     __m128i m_temp_reg_31;
    982     __m128i m_temp_reg_32;
    983     __m128i m_temp_reg_33;
    984     __m128i m_temp_reg_34;
    985     __m128i m_temp_reg_35;
    986     __m128i m_temp_reg_36;
    987     __m128i m_temp_reg_37;
    988     __m128i m_temp_reg_40;
    989     __m128i m_temp_reg_41;
    990     __m128i m_temp_reg_42;
    991     __m128i m_temp_reg_43;
    992     __m128i m_temp_reg_44;
    993     __m128i m_temp_reg_45;
    994     __m128i m_temp_reg_46;
    995     __m128i m_temp_reg_47;
    996     __m128i m_temp_reg_50;
    997     __m128i m_temp_reg_51;
    998     __m128i m_temp_reg_52;
    999     __m128i m_temp_reg_53;
   1000     __m128i m_temp_reg_54;
   1001     __m128i m_temp_reg_55;
   1002     __m128i m_temp_reg_56;
   1003     __m128i m_temp_reg_57;
   1004     __m128i m_temp_reg_60;
   1005     __m128i m_temp_reg_61;
   1006     __m128i m_temp_reg_62;
   1007     __m128i m_temp_reg_63;
   1008     __m128i m_temp_reg_64;
   1009     __m128i m_temp_reg_65;
   1010     __m128i m_temp_reg_66;
   1011     __m128i m_temp_reg_67;
   1012     __m128i m_temp_reg_70;
   1013     __m128i m_temp_reg_71;
   1014     __m128i m_temp_reg_72;
   1015     __m128i m_temp_reg_73;
   1016     __m128i m_temp_reg_74;
   1017     __m128i m_temp_reg_75;
   1018     __m128i m_temp_reg_76;
   1019     __m128i m_temp_reg_77;
   1020     __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
   1021 
   1022     WORD32 check_row_stage_1;   /* Lokesh */
   1023     WORD32 check_row_stage_2;   /* Lokesh */
   1024 
   1025     __m128i m_rdng_factor;
   1026     //__m128i m_count;
   1027     WORD32 i4_shift = IT_SHIFT_STAGE_1;
   1028     UNUSED(zero_rows);
   1029     UNUSED(zero_cols);
   1030     UNUSED(pi2_tmp);
   1031 
   1032     check_row_stage_1   = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
   1033     check_row_stage_2   = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
   1034 
   1035     m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src);
   1036     pi2_src += src_strd;
   1037     m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src);
   1038     pi2_src += src_strd;
   1039     m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src);
   1040     pi2_src += src_strd;
   1041     m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src);
   1042     pi2_src += src_strd;
   1043 
   1044     m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src);
   1045     pi2_src += src_strd;
   1046     m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src);
   1047     pi2_src += src_strd;
   1048     m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src);
   1049     pi2_src += src_strd;
   1050     m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src);
   1051 
   1052     if(!check_row_stage_2)
   1053     {
   1054         if(!check_row_stage_1)
   1055         {
   1056             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1057             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1058             {
   1059                 //Interleaving 0,4 row in 0 , 1 Rishab
   1060                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
   1061                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
   1062                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
   1063 
   1064                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
   1065 
   1066                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1067                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1068 
   1069             }
   1070 
   1071 
   1072             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   1073             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   1074             /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
   1075             {
   1076 
   1077                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
   1078                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
   1079 
   1080                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
   1081                 //Interleaving 2,6 row in 4, 5 Rishab
   1082                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1083 
   1084                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
   1085                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   1086 
   1087 
   1088                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
   1089 
   1090                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   1091                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
   1092 
   1093                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   1094                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
   1095 
   1096 
   1097 
   1098                 /* e */
   1099 
   1100                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1101                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1102                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1103                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1104                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1105                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1106 
   1107                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1108                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1109 
   1110             }
   1111 
   1112             /* o */
   1113             {
   1114 
   1115                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1116                 {
   1117 
   1118                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1119                     //o0:1B*89+3B*75,5B*50+7B*18
   1120                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1121 
   1122                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1123                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1124 
   1125 
   1126 
   1127                     /* Column 0 of destination computed here */
   1128                     /* It is stored in m_temp_reg_50 */
   1129                     /* Column 7 of destination computed here */
   1130                     /* It is stored in m_temp_reg_57 */
   1131                     /* Upper 8 bytes of both registers are zero due to zero_cols*/
   1132 
   1133 
   1134 
   1135                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1136                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1137 
   1138                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1139                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1140 
   1141                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1142                     m_temp_reg_63 = _mm_setzero_si128();
   1143                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1144 
   1145                     //o1:1B*75-3B*18,5B*89+7B*50
   1146                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1147 
   1148                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1149                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1150 
   1151                     /* Loading coeff for computing o2  in the next block */
   1152 
   1153                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   1154                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
   1155 
   1156                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   1157 
   1158 
   1159 
   1160                     /* Column 1 of destination computed here */
   1161                     /* It is stored in m_temp_reg_51 */
   1162                     /* Column 6 of destination computed here */
   1163                     /* It is stored in m_temp_reg_56 */
   1164 
   1165                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   1166                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   1167 
   1168                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1169                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1170 
   1171                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1172                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1173 
   1174                     //o2:1B*50-3B*89,5B*18+7B*75
   1175                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1176 
   1177                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1178                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1179 
   1180 
   1181                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   1182 
   1183                     /* Loading coeff for computing o3  in the next block */
   1184 
   1185                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   1186                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
   1187 
   1188 
   1189 
   1190                     /* Column 2 of destination computed here */
   1191                     /* It is stored in m_temp_reg_52 */
   1192                     /* Column 5 of destination computed here */
   1193                     /* It is stored in m_temp_reg_55 */
   1194 
   1195                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   1196                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   1197 
   1198                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1199                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1200 
   1201                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1202                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1203 
   1204                     //o3:1B*18-3B*50,5B*75-7B*89
   1205                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1206 
   1207                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1208                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1209 
   1210 
   1211 
   1212                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   1213 
   1214 
   1215 
   1216                     /* Column 3 of destination computed here */
   1217                     /* It is stored in m_temp_reg_53 */
   1218                     /* Column 4 of destination computed here */
   1219                     /* It is stored in m_temp_reg_54 */
   1220 
   1221                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   1222                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   1223 
   1224                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1225                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1226 
   1227                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1228                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1229 
   1230 
   1231                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1232                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1233                 }
   1234             }
   1235 
   1236             /* Transpose of the destination 8x8 matrix done here */
   1237             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   1238             /* respectively */
   1239             {
   1240                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   1241                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   1242                 //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   1243                 //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   1244                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   1245                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   1246                 //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   1247                 //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   1248 
   1249                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   1250                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   1251                 //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   1252                 //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   1253                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   1254                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   1255                 //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   1256                 //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   1257 
   1258                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   1259                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   1260                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   1261                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   1262 
   1263                 /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   1264                 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   1265                 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   1266                 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   1267                 */
   1268                 m_temp_reg_54 = _mm_setzero_si128();
   1269                 m_temp_reg_55 = _mm_setzero_si128();
   1270                 m_temp_reg_56 = _mm_setzero_si128();
   1271                 m_temp_reg_57 = _mm_setzero_si128();
   1272             }
   1273         }
   1274         else
   1275         {
   1276             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1277             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1278             {
   1279                 //Interleaving 0,4 row in 0 , 1 Rishab
   1280                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
   1281                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
   1282                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
   1283 
   1284                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
   1285 
   1286                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1287                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1288 
   1289             }
   1290 
   1291 
   1292             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   1293             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   1294             /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
   1295             {
   1296 
   1297                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
   1298                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
   1299 
   1300                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
   1301                 //Interleaving 2,6 row in 4, 5 Rishab
   1302                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1303 
   1304                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
   1305                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   1306 
   1307 
   1308                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
   1309 
   1310                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   1311                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
   1312 
   1313                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   1314                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
   1315 
   1316 
   1317 
   1318                 /* e */
   1319 
   1320                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1321                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1322                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1323                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1324                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1325                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1326 
   1327                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1328                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1329 
   1330             }
   1331 
   1332             /* o */
   1333             {
   1334 
   1335                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1336                 {
   1337 
   1338                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1339                     m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   1340                     //o0:1B*89+3B*75,5B*50+7B*18
   1341                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1342                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   1343 
   1344                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1345                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1346 
   1347                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1348 
   1349 
   1350 
   1351                     /* Column 0 of destination computed here */
   1352                     /* It is stored in m_temp_reg_50 */
   1353                     /* Column 7 of destination computed here */
   1354                     /* It is stored in m_temp_reg_57 */
   1355                     /* Upper 8 bytes of both registers are zero due to zero_cols*/
   1356 
   1357 
   1358 
   1359                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1360                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1361 
   1362                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1363                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1364 
   1365                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1366                     m_temp_reg_63 = _mm_setzero_si128();
   1367                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1368 
   1369                     //o1:1B*75-3B*18,5B*89+7B*50
   1370                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1371                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   1372 
   1373                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1374                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1375 
   1376                     /* Loading coeff for computing o2  in the next block */
   1377 
   1378                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   1379                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
   1380 
   1381                     /* o1 stored in m_temp_reg_32 (only this register is computed in this path) */
   1382                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
   1383 
   1384 
   1385 
   1386                     /* Column 1 of destination computed here */
   1387                     /* It is stored in m_temp_reg_51 */
   1388                     /* Column 6 of destination computed here */
   1389                     /* It is stored in m_temp_reg_56 */
   1390 
   1391                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   1392                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   1393 
   1394                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1395                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1396 
   1397                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1398                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1399 
   1400                     //o2:1B*50-3B*89,5B*18+7B*75
   1401                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1402                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   1403 
   1404                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1405                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1406 
   1407 
   1408                     /* o2 stored in m_temp_reg_34 (only this register is computed in this path) */
   1409 
   1410                     /* Loading coeff for computing o3  in the next block */
   1411 
   1412                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   1413                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
   1414 
   1415                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1416 
   1417 
   1418                     /* Column 2 of destination computed here */
   1419                     /* It is stored in m_temp_reg_52 */
   1420                     /* Column 5 of destination computed here */
   1421                     /* It is stored in m_temp_reg_55 */
   1422 
   1423                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   1424                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   1425 
   1426                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1427                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1428 
   1429                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1430                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1431 
   1432                     //o3:1B*18-3B*50,5B*75-7B*89
   1433                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1434                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   1435 
   1436                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1437                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1438 
   1439 
   1440 
   1441                     /* o3 stored in m_temp_reg_36 (only this register is computed in this path) */
   1442 
   1443                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
   1444 
   1445 
   1446                     /* Column 3 of destination computed here */
   1447                     /* It is stored in m_temp_reg_53 */
   1448                     /* Column 4 of destination computed here */
   1449                     /* It is stored in m_temp_reg_54 */
   1450 
   1451                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   1452                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   1453 
   1454                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1455                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1456 
   1457                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1458                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1459 
   1460 
   1461                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1462                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
   1463                 }
   1464             }
   1465 
   1466             /* Transpose of the destination 8x8 matrix done here */
   1467             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   1468             /* respectively */
   1469             {
   1470                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   1471                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   1472                 //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   1473                 //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   1474                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   1475                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   1476                 //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   1477                 //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   1478 
   1479                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   1480                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   1481                 //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   1482                 //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   1483                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   1484                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   1485                 //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   1486                 //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   1487 
   1488                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   1489                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   1490                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   1491                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   1492 
   1493                 /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   1494                 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   1495                 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   1496                 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   1497                 */
   1498                 m_temp_reg_54 = _mm_setzero_si128();
   1499                 m_temp_reg_55 = _mm_setzero_si128();
   1500                 m_temp_reg_56 = _mm_setzero_si128();
   1501                 m_temp_reg_57 = _mm_setzero_si128();
   1502             }
   1503         }
   1504 
   1505         /* Stage 2 */
   1506         i4_shift = IT_SHIFT_STAGE_2;
   1507         {
   1508             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1509             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1510             {
   1511                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
   1512                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
   1513 
   1514                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
   1515                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
   1516 
   1517                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1518                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1519                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1520                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1521 
   1522 
   1523                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
   1524                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
   1525             }
   1526 
   1527 
   1528             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   1529             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   1530             {
   1531 
   1532                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
   1533                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
   1534 
   1535 
   1536                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1537                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1538                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1539                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1540 
   1541                 /* Loading coeff for computing o0 in the next block */
   1542                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   1543 
   1544 
   1545                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
   1546                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
   1547 
   1548 
   1549 
   1550                 /* e */
   1551 
   1552                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1553                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1554                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1555                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1556                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1557                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1558 
   1559                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1560                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1561 
   1562                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   1563                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   1564 
   1565                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   1566                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   1567 
   1568             }
   1569 
   1570             /* o */
   1571             {
   1572 
   1573                 //m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55,m_temp_reg_57);
   1574                 //m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55,m_temp_reg_57);
   1575 
   1576                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1577                 {
   1578                     //o0:1B*89+3B*75,1T*89+3T*75
   1579                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1580                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1581 
   1582                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1583                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1584                     /* Loading coeff for computing o1 in the next block */
   1585                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   1586 
   1587 
   1588 
   1589                     /* Column 0 of destination computed here */
   1590                     /* It is stored in m_temp_reg_50 */
   1591                     /* Column 7 of destination computed here */
   1592                     /* It is stored in m_temp_reg_57 */
   1593 
   1594                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1595                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1596 
   1597                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   1598                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   1599 
   1600                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   1601                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   1602                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   1603                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   1604 
   1605                     //o1:1B*75-3B*18,1T*75-3T*18
   1606                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   1607                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   1608 
   1609                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   1610                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   1611                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   1612                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   1613 
   1614                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   1615                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   1616 
   1617 
   1618                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   1619 
   1620 
   1621                     /* Loading coeff for computing o2  in the next block */
   1622                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   1623 
   1624 
   1625 
   1626                     /* Column 1 of destination computed here */
   1627                     /* It is stored in m_temp_reg_51 */
   1628                     /* Column 6 of destination computed here */
   1629                     /* It is stored in m_temp_reg_56 */
   1630 
   1631                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   1632                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   1633 
   1634                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   1635                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   1636 
   1637                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   1638                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   1639                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   1640                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   1641 
   1642                     //o2:1B*50-3B*89,1T*50-3T*89
   1643                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1644                     m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1645 
   1646                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   1647                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   1648                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   1649                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   1650 
   1651                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   1652                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   1653 
   1654 
   1655                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   1656 
   1657                     /* Loading coeff for computing o3  in the next block */
   1658 
   1659                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   1660 
   1661 
   1662                     /* Column 2 of destination computed here */
   1663                     /* It is stored in m_temp_reg_52 */
   1664                     /* Column 5 of destination computed here */
   1665                     /* It is stored in m_temp_reg_55 */
   1666 
   1667                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   1668                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   1669 
   1670                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   1671                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   1672 
   1673                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   1674                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   1675                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   1676                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   1677 
   1678                     //o3:1B*18-3B*50,1T*18-3T*50
   1679                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   1680                     m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   1681 
   1682                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   1683                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   1684                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   1685                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   1686 
   1687 
   1688                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   1689                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   1690 
   1691 
   1692 
   1693                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   1694 
   1695 
   1696                     /* Column 3 of destination computed here */
   1697                     /* It is stored in m_temp_reg_53 */
   1698                     /* Column 4 of destination computed here */
   1699                     /* It is stored in m_temp_reg_54 */
   1700 
   1701                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   1702                     m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   1703 
   1704                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   1705                     m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   1706 
   1707                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
   1708                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
   1709                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
   1710                     m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
   1711 
   1712                     m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
   1713                     m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
   1714                     m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
   1715                     m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
   1716 
   1717                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
   1718                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
   1719                 }
   1720             }
   1721 
   1722             /* Transpose of the destination 8x8 matrix done here */
   1723             /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
   1724             /* respectively */
   1725             {
   1726                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   1727                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   1728                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   1729                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   1730                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   1731                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   1732                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   1733                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   1734 
   1735                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   1736                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   1737                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   1738                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   1739                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   1740                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   1741                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   1742                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   1743                 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   1744                 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   1745                 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   1746                 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   1747 
   1748                 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   1749                 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   1750                 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   1751                 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   1752             }
   1753 
   1754             /* Recon and store */
   1755             {
   1756                 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1757                 pu1_pred += pred_strd;
   1758                 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1759                 pu1_pred += pred_strd;
   1760                 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1761                 pu1_pred += pred_strd;
   1762                 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1763                 pu1_pred += pred_strd;
   1764                 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1765                 pu1_pred += pred_strd;
   1766                 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1767                 pu1_pred += pred_strd;
   1768                 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1769                 pu1_pred += pred_strd;
   1770                 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1771 
   1772                 m_temp_reg_50 = _mm_setzero_si128();
   1773                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
   1774                 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
   1775                 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
   1776                 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
   1777                 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
   1778                 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
   1779                 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
   1780                 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
   1781 
   1782                 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
   1783                 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
   1784                 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
   1785                 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
   1786                 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
   1787                 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
   1788                 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
   1789                 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
   1790 
   1791                 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
   1792                 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
   1793                 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
   1794                 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
   1795                 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
   1796                 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
   1797                 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
   1798                 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
   1799 
   1800                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
   1801                 pu1_dst += dst_strd;
   1802                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
   1803                 pu1_dst += dst_strd;
   1804                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
   1805                 pu1_dst += dst_strd;
   1806                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
   1807                 pu1_dst += dst_strd;
   1808                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
   1809                 pu1_dst += dst_strd;
   1810                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
   1811                 pu1_dst += dst_strd;
   1812                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
   1813                 pu1_dst += dst_strd;
   1814                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
   1815                 pu1_dst += dst_strd;
   1816             }
   1817         }
   1818     }
   1819     else
   1820 
   1821     {
   1822 
   1823         /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1824         /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1825         if(!check_row_stage_1)
   1826         {
   1827             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1828             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1829             {
   1830                 //Interleaving 0,4 row in 0 , 1 Rishab
   1831                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
   1832                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
   1833                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
   1834 
   1835                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
   1836                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
   1837 
   1838                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1839                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1840 
   1841 
   1842                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1843                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1844             }
   1845 
   1846 
   1847             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   1848             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   1849             {
   1850 
   1851                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
   1852                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
   1853 
   1854                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
   1855                 //Interleaving 2,6 row in 4, 5 Rishab
   1856                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1857                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
   1858 
   1859                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
   1860                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   1861 
   1862                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
   1863                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   1864 
   1865 
   1866 
   1867                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
   1868 
   1869                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   1870                 //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]);
   1871 
   1872                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   1873                 //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]);
   1874 
   1875             }
   1876 
   1877             /* e */
   1878             {
   1879                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1880                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1881                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1882                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1883                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1884                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1885 
   1886                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1887                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1888 
   1889                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   1890                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   1891 
   1892                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   1893                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   1894 
   1895             }
   1896 
   1897             /* o */
   1898             {
   1899 
   1900                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1901                 {
   1902 
   1903                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1904                     m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
   1905                     //  m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75,m_temp_reg_77);
   1906                     //   m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75,m_temp_reg_77);
   1907                     //o0:1B*89+3B*75,1T*89+3T*75
   1908                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1909                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   1910                     //m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   1911                     //m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
   1912 
   1913 
   1914                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1915                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1916 
   1917                     //m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1918                     //m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   1919                 }
   1920 
   1921                 /* Column 0 of destination computed here */
   1922                 /* It is stored in m_temp_reg_50 */
   1923                 /* Column 7 of destination computed here */
   1924                 /* It is stored in m_temp_reg_57 */
   1925                 {
   1926 
   1927 
   1928                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1929                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1930 
   1931                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   1932                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   1933 
   1934                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1935                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1936                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1937                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1938 
   1939                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1940                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1941                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1942                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1943 
   1944                     //o1:1B*75-3B*18,1T*75-3T*18
   1945                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1946                     //m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   1947                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   1948                     //m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
   1949 
   1950                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1951                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1952 
   1953                     /* Loading coeff for computing o2  in the next block */
   1954 
   1955                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   1956 
   1957                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   1958                     //m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
   1959                     //m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
   1960                 }
   1961 
   1962                 /* Column 1 of destination computed here */
   1963                 /* It is stored in m_temp_reg_51 */
   1964                 /* Column 6 of destination computed here */
   1965                 /* It is stored in m_temp_reg_56 */
   1966                 {
   1967                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   1968                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   1969 
   1970                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   1971                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   1972 
   1973                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1974                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1975                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1976                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1977 
   1978                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1979                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1980                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1981                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1982 
   1983                     //o2:1B*50-3B*89,1T*50-3T*89
   1984                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1985                     m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   1986 
   1987                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1988                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1989 
   1990 
   1991                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   1992 
   1993 
   1994                     /* Loading coeff for computing o3  in the next block */
   1995 
   1996                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   1997 
   1998                     //m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1999                     //m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   2000                 }
   2001 
   2002                 /* Column 2 of destination computed here */
   2003                 /* It is stored in m_temp_reg_52 */
   2004                 /* Column 5 of destination computed here */
   2005                 /* It is stored in m_temp_reg_55 */
   2006                 {
   2007                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   2008                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   2009 
   2010                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   2011                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   2012 
   2013                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   2014                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   2015                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   2016                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   2017 
   2018                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   2019                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   2020                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   2021                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   2022 
   2023                     //o3:1B*18-3B*50,1T*18-3T*50
   2024                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   2025                     m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   2026 
   2027                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   2028                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   2029 
   2030 
   2031 
   2032                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   2033 
   2034 
   2035                 }
   2036 
   2037                 /* Column 3 of destination computed here */
   2038                 /* It is stored in m_temp_reg_53 */
   2039                 /* Column 4 of destination computed here */
   2040                 /* It is stored in m_temp_reg_54 */
   2041                 {
   2042                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   2043                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   2044 
   2045                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   2046                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   2047 
   2048                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   2049                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   2050                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   2051                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   2052 
   2053                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   2054                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   2055                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   2056                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   2057 
   2058                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   2059                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   2060                 }
   2061             }
   2062 
   2063             /* Transpose of the destination 8x8 matrix done here */
   2064             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   2065             /* respectively */
   2066             {
   2067 
   2068 
   2069                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   2070                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   2071                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   2072                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   2073                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   2074                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   2075                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   2076                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   2077 
   2078                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   2079                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   2080                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   2081                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   2082                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   2083                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   2084                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   2085                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   2086 
   2087                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   2088                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   2089                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   2090                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   2091 
   2092                 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   2093                 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   2094                 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   2095                 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   2096             }
   2097         }
   2098         else
   2099         {
   2100 
   2101             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   2102             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   2103             {
   2104                 //Interleaving 0,4 row in 0 , 1 Rishab
   2105                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
   2106                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
   2107                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
   2108 
   2109                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
   2110                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
   2111 
   2112                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2113                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   2114 
   2115 
   2116                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2117                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   2118             }
   2119 
   2120 
   2121             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   2122             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   2123             {
   2124 
   2125                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
   2126                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
   2127 
   2128                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
   2129                 //Interleaving 2,6 row in 4, 5 Rishab
   2130                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   2131                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
   2132 
   2133                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
   2134                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   2135 
   2136                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
   2137                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   2138 
   2139 
   2140 
   2141                 /* Loading coeff for computing o0 and o1 in the next block */
   2142 
   2143                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   2144                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
   2145 
   2146                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   2147                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
   2148 
   2149             }
   2150 
   2151             /* e */
   2152             {
   2153                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   2154                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   2155                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   2156                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   2157                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   2158                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   2159 
   2160                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   2161                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   2162 
   2163                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   2164                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   2165 
   2166                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   2167                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   2168 
   2169             }
   2170 
   2171             /* o */
   2172             {
   2173 
   2174                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   2175                 {
   2176 
   2177                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   2178                     m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
   2179                     m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   2180                     m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
   2181                     //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
   2182                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   2183                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   2184                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   2185                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
   2186 
   2187 
   2188                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2189                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   2190 
   2191                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   2192                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   2193                 }
   2194 
   2195                 /* Column 0 of destination computed here */
   2196                 /* It is stored in m_temp_reg_50 */
   2197                 /* Column 7 of destination computed here */
   2198                 /* It is stored in m_temp_reg_57 */
   2199                 {
   2200 
   2201 
   2202                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   2203                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   2204 
   2205                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   2206                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   2207 
   2208                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   2209                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   2210                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   2211                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   2212 
   2213                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   2214                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   2215                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   2216                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   2217 
   2218                     //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
   2219                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   2220                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   2221                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   2222                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
   2223 
   2224                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   2225                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   2226 
   2227                     /* Loading coeff for computing o2  in the next block */
   2228 
   2229                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   2230                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
   2231 
   2232                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   2233                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
   2234                     m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
   2235                 }
   2236 
   2237                 /* Column 1 of destination computed here */
   2238                 /* It is stored in m_temp_reg_51 */
   2239                 /* Column 6 of destination computed here */
   2240                 /* It is stored in m_temp_reg_56 */
   2241                 {
   2242                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   2243                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   2244 
   2245                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   2246                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   2247 
   2248                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   2249                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   2250                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   2251                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   2252 
   2253                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   2254                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   2255                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   2256                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   2257 
   2258                     //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
   2259                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   2260                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   2261                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   2262                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
   2263 
   2264                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   2265                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   2266 
   2267 
   2268                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   2269 
   2270 
   2271                     /* Loading coeff for computing o3  in the next block */
   2272 
   2273                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   2274                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
   2275 
   2276                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   2277                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   2278                 }
   2279 
   2280                 /* Column 2 of destination computed here */
   2281                 /* It is stored in m_temp_reg_52 */
   2282                 /* Column 5 of destination computed here */
   2283                 /* It is stored in m_temp_reg_55 */
   2284                 {
   2285                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   2286                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   2287 
   2288                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   2289                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   2290 
   2291                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   2292                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   2293                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   2294                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   2295 
   2296                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   2297                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   2298                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   2299                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   2300 
   2301                     //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
   2302                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   2303                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   2304                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   2305                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
   2306 
   2307                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   2308                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   2309 
   2310 
   2311 
   2312                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   2313 
   2314 
   2315                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
   2316                     m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
   2317                 }
   2318 
   2319                 /* Column 3 of destination computed here */
   2320                 /* It is stored in m_temp_reg_53 */
   2321                 /* Column 4 of destination computed here */
   2322                 /* It is stored in m_temp_reg_54 */
   2323                 {
   2324                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   2325                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   2326 
   2327                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   2328                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   2329 
   2330                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   2331                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   2332                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   2333                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   2334 
   2335                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   2336                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   2337                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   2338                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   2339 
   2340                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   2341                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   2342                 }
   2343             }
   2344 
   2345             /* Transpose of the destination 8x8 matrix done here */
   2346             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   2347             /* respectively */
   2348             {
   2349 
   2350 
   2351                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   2352                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   2353                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   2354                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   2355                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   2356                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   2357                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   2358                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   2359 
   2360                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   2361                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   2362                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   2363                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   2364                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   2365                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   2366                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   2367                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   2368 
   2369                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   2370                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   2371                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   2372                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   2373 
   2374                 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   2375                 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   2376                 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   2377                 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   2378             }
   2379         }
   2380         /* Stage 2 */
   2381 
   2382         i4_shift = IT_SHIFT_STAGE_2;
   2383 
   2384         {
   2385 
   2386             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   2387             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   2388             {
   2389                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
   2390                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
   2391 
   2392                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
   2393                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
   2394 
   2395                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2396                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   2397                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2398                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   2399 
   2400 
   2401                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
   2402                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
   2403             }
   2404 
   2405 
   2406             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   2407             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   2408             {
   2409                 //m_temp_reg_66 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1);
   2410                 //m_temp_reg_64 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2);
   2411                 //m_temp_reg_62 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2);
   2412                 //m_temp_reg_60 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1);
   2413                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
   2414                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
   2415 
   2416 
   2417                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2418                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   2419                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2420                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   2421 
   2422                 //m_temp_reg_16 = _mm_sub_epi32(m_temp_reg_64, m_temp_reg_66);
   2423                 //m_temp_reg_14 = _mm_add_epi32(m_temp_reg_60, m_temp_reg_62);
   2424 
   2425                 /* Loading coeff for computing o0 in the next block */
   2426                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
   2427                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
   2428 
   2429 
   2430                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
   2431                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
   2432                 /*m_temp_reg_3 = _mm_srli_si128(m_temp_reg_53, 8);
   2433                 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
   2434                 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
   2435                 */
   2436 
   2437             }
   2438 
   2439             /* e */
   2440             {
   2441                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   2442                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   2443                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   2444                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   2445                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   2446                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   2447 
   2448                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   2449                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   2450 
   2451                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   2452                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   2453 
   2454                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   2455                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   2456 
   2457             }
   2458 
   2459             /* o */
   2460             {
   2461                 /*  m_temp_reg_4 = _mm_cvtepi16_epi32(m_temp_reg_55);
   2462                     m_temp_reg_5 = _mm_srli_si128(m_temp_reg_55, 8);
   2463                     m_temp_reg_6 = _mm_cvtepi16_epi32(m_temp_reg_57);
   2464                     m_temp_reg_7 = _mm_srli_si128(m_temp_reg_57, 8);
   2465                     m_temp_reg_5 = _mm_cvtepi16_epi32(m_temp_reg_5);
   2466                     m_temp_reg_7 = _mm_cvtepi16_epi32(m_temp_reg_7);
   2467                     */
   2468                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
   2469                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
   2470 
   2471                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   2472                 {
   2473                     //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
   2474                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2475                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2476                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   2477                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   2478 
   2479                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2480                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   2481                     /* Loading coeff for computing o1 in the next block */
   2482                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
   2483                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
   2484 
   2485                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   2486                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   2487                 }
   2488 
   2489                 /* Column 0 of destination computed here */
   2490                 /* It is stored in m_temp_reg_50 */
   2491                 /* Column 7 of destination computed here */
   2492                 /* It is stored in m_temp_reg_57 */
   2493                 {
   2494                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   2495                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   2496 
   2497                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   2498                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   2499 
   2500                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   2501                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   2502                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   2503                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   2504 
   2505                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   2506                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   2507                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   2508                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   2509 
   2510                     //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
   2511                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   2512                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
   2513                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   2514                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
   2515 
   2516                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   2517                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   2518 
   2519 
   2520                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   2521 
   2522 
   2523                     /* Loading coeff for computing o2  in the next block */
   2524                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
   2525                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
   2526 
   2527                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
   2528                     m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
   2529                 }
   2530 
   2531                 /* Column 1 of destination computed here */
   2532                 /* It is stored in m_temp_reg_51 */
   2533                 /* Column 6 of destination computed here */
   2534                 /* It is stored in m_temp_reg_56 */
   2535                 {
   2536                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   2537                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   2538 
   2539                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   2540                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   2541 
   2542                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   2543                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   2544                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   2545                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   2546 
   2547                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   2548                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   2549                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   2550                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   2551 
   2552                     //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
   2553                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2554                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   2555                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2556                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   2557 
   2558                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   2559                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   2560 
   2561 
   2562                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   2563 
   2564                     /* Loading coeff for computing o3  in the next block */
   2565 
   2566                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
   2567                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
   2568 
   2569                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   2570                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   2571                 }
   2572 
   2573                 /* Column 2 of destination computed here */
   2574                 /* It is stored in m_temp_reg_52 */
   2575                 /* Column 5 of destination computed here */
   2576                 /* It is stored in m_temp_reg_55 */
   2577                 {
   2578                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   2579                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   2580 
   2581                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   2582                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   2583 
   2584                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   2585                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   2586                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   2587                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   2588 
   2589                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   2590                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   2591                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   2592                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   2593 
   2594                     //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
   2595                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   2596                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
   2597                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   2598                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
   2599 
   2600                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   2601                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   2602 
   2603 
   2604 
   2605                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   2606 
   2607 
   2608                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
   2609                     m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
   2610                 }
   2611 
   2612                 /* Column 3 of destination computed here */
   2613                 /* It is stored in m_temp_reg_53 */
   2614                 /* Column 4 of destination computed here */
   2615                 /* It is stored in m_temp_reg_54 */
   2616                 {
   2617                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   2618                     m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   2619 
   2620                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   2621                     m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   2622 
   2623                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
   2624                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
   2625                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
   2626                     m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
   2627 
   2628                     m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
   2629                     m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
   2630                     m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
   2631                     m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
   2632 
   2633                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
   2634                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
   2635                 }
   2636             }
   2637 
   2638             /* Transpose of the destination 8x8 matrix done here */
   2639             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   2640             /* respectively */
   2641             {
   2642                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   2643                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   2644                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   2645                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   2646                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   2647                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   2648                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   2649                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   2650 
   2651                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   2652                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   2653                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   2654                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   2655                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   2656                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   2657                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   2658                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   2659                 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   2660                 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   2661                 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   2662                 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   2663 
   2664                 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   2665                 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   2666                 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   2667                 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   2668             }
   2669 
   2670             /* Recon and store */
   2671             {
   2672                 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2673                 pu1_pred += pred_strd;
   2674                 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2675                 pu1_pred += pred_strd;
   2676                 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2677                 pu1_pred += pred_strd;
   2678                 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2679                 pu1_pred += pred_strd;
   2680                 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2681                 pu1_pred += pred_strd;
   2682                 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2683                 pu1_pred += pred_strd;
   2684                 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2685                 pu1_pred += pred_strd;
   2686                 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
   2687 
   2688 
   2689                 m_temp_reg_50 = _mm_setzero_si128();
   2690                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
   2691                 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
   2692                 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
   2693                 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
   2694                 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
   2695                 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
   2696                 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
   2697                 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
   2698 
   2699                 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
   2700                 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
   2701                 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
   2702                 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
   2703                 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
   2704                 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
   2705                 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
   2706                 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
   2707 
   2708                 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
   2709                 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
   2710                 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
   2711                 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
   2712                 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
   2713                 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
   2714                 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
   2715                 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
   2716 
   2717                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
   2718                 pu1_dst += dst_strd;
   2719                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
   2720                 pu1_dst += dst_strd;
   2721                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
   2722                 pu1_dst += dst_strd;
   2723                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
   2724                 pu1_dst += dst_strd;
   2725                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
   2726                 pu1_dst += dst_strd;
   2727                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
   2728                 pu1_dst += dst_strd;
   2729                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
   2730                 pu1_dst += dst_strd;
   2731                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
   2732                 pu1_dst += dst_strd;
   2733 
   2734             }
   2735 
   2736 
   2737         }
   2738 
   2739 
   2740     }
   2741 }
   2742 
   2743