Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /**
     22  *******************************************************************************
     23  * @file
     24  *  impeg2_itrans_recon_x86_intr.c
     25  *
     26  * @brief
     27  *  Contains function definitions for inverse  quantization, inverse
     28  * transform and reconstruction
     29  *
     30  * @author
     31  *  100470
     32  *  100592 (edited by)
     33  *
     34  * @par List of Functions:
     35  *  - impeg2_idct_recon_sse42()
     36  *
     37  * @remarks
     38  *  None
     39  *
     40  *******************************************************************************
     41  */
     42 #include <stdio.h>
     43 #include <string.h>
     44 #include "iv_datatypedef.h"
     45 #include "impeg2_macros.h"
     46 #include "impeg2_defs.h"
     47 #include "impeg2_globals.h"
     48 
     49 #include <immintrin.h>
     50 #include <emmintrin.h>
     51 #include <smmintrin.h>
     52 #include <tmmintrin.h>
     53 
     54 
     55 /**
     56  *******************************************************************************
     57  *
     58  * @brief
     59  *  This function performs inverse quantization, inverse transform and
     60  *  reconstruction for an 8x8 input block
     61  *
     62  * @par Description:
     63  *  Performs inverse quantization, inverse transform, adds the
     64  *  prediction data and clips the output to 8 bits
     65  *
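     66  * @param[in] pi2_src
     67  *  Input 8x8 coefficients
     68  *
     69  * @param[in] pi2_tmp
     70  *  Temporary 8x8 buffer; not used by this implementation
     71  *  (it is only passed to UNUSED())
     72  *
     73  * @param[in] pu1_pred
     74  *  Prediction 8x8 block
     75  *
     76  * @param[out] pu1_dst
     77  *  Output 8x8 block
     78  *
     79  * @param[in] src_strd
     80  *  Input stride, in WORD16 elements
     81  *
     82  * @param[in] pred_strd
     83  *  Prediction stride
     84  *
     85  * @param[in] dst_strd
     86  *  Output Stride
     87  *
     88  * @param[in] zero_cols
     89  *  Zero columns in pi2_src. When (zero_cols & 0xF0) == 0xF0 the
     90  *  first stage uses only the low four coefficients of each row
     91  *  and the remaining four are treated as zero
     92  *
     93  * @param[in] zero_rows
     94  *  Zero rows in pi2_src. When (zero_rows & 0xF0) == 0xF0, on the
     95  *  reduced path selected by zero_cols above, rows 5 and 7 are
     96  *  left out of the first stage odd part
     97  *  NOTE(review): the exact bit to row/column mapping of both masks
     98  *  is set by the caller and should be confirmed there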
     99  *
    100  * @returns  Void
    101  *
    102  * @remarks
    103  *  None
    104  *
    105  *******************************************************************************
    106  */
    107 
    108 
    109 void impeg2_idct_recon_sse42(WORD16 *pi2_src,
    110                                   WORD16 *pi2_tmp,
    111                                   UWORD8 *pu1_pred,
    112                                   UWORD8 *pu1_dst,
    113                                   WORD32 src_strd,
    114                                   WORD32 pred_strd,
    115                                   WORD32 dst_strd,
    116                                   WORD32 zero_cols,
    117                                   WORD32 zero_rows)
    118 {
    119     __m128i m_temp_reg_0;
    120     __m128i m_temp_reg_1;
    121     __m128i m_temp_reg_2;
    122     __m128i m_temp_reg_3;
    123     __m128i m_temp_reg_5;
    124     __m128i m_temp_reg_6;
    125     __m128i m_temp_reg_7;
    126     __m128i m_temp_reg_4;
    127     __m128i m_temp_reg_10;
    128     __m128i m_temp_reg_11;
    129     __m128i m_temp_reg_12;
    130     __m128i m_temp_reg_13;
    131     __m128i m_temp_reg_14;
    132     __m128i m_temp_reg_15;
    133     __m128i m_temp_reg_16;
    134     __m128i m_temp_reg_17;
    135     __m128i m_temp_reg_20;
    136     __m128i m_temp_reg_21;
    137     __m128i m_temp_reg_22;
    138     __m128i m_temp_reg_23;
    139     __m128i m_temp_reg_24;
    140     __m128i m_temp_reg_25;
    141     __m128i m_temp_reg_26;
    142     __m128i m_temp_reg_27;
    143     __m128i m_temp_reg_30;
    144     __m128i m_temp_reg_31;
    145     __m128i m_temp_reg_32;
    146     __m128i m_temp_reg_33;
    147     __m128i m_temp_reg_34;
    148     __m128i m_temp_reg_35;
    149     __m128i m_temp_reg_36;
    150     __m128i m_temp_reg_37;
    151     __m128i m_temp_reg_40;
    152     __m128i m_temp_reg_41;
    153     __m128i m_temp_reg_42;
    154     __m128i m_temp_reg_43;
    155     __m128i m_temp_reg_44;
    156     __m128i m_temp_reg_45;
    157     __m128i m_temp_reg_46;
    158     __m128i m_temp_reg_47;
    159     __m128i m_temp_reg_50;
    160     __m128i m_temp_reg_51;
    161     __m128i m_temp_reg_52;
    162     __m128i m_temp_reg_53;
    163     __m128i m_temp_reg_54;
    164     __m128i m_temp_reg_55;
    165     __m128i m_temp_reg_56;
    166     __m128i m_temp_reg_57;
    167     __m128i m_temp_reg_60;
    168     __m128i m_temp_reg_61;
    169     __m128i m_temp_reg_62;
    170     __m128i m_temp_reg_63;
    171     __m128i m_temp_reg_64;
    172     __m128i m_temp_reg_65;
    173     __m128i m_temp_reg_66;
    174     __m128i m_temp_reg_67;
    175     __m128i m_temp_reg_70;
    176     __m128i m_temp_reg_71;
    177     __m128i m_temp_reg_72;
    178     __m128i m_temp_reg_73;
    179     __m128i m_temp_reg_74;
    180     __m128i m_temp_reg_75;
    181     __m128i m_temp_reg_76;
    182     __m128i m_temp_reg_77;
    183     __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
    184 
    185     WORD32 check_row_stage_1;   /* Lokesh */
    186     WORD32 check_row_stage_2;   /* Lokesh */
    187 
    188     __m128i m_rdng_factor;
    189     WORD32 i4_shift = IDCT_STG1_SHIFT;
    190     UNUSED(pi2_tmp);
    191     check_row_stage_1   = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
    192     check_row_stage_2   = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
    193 
    194     m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
    195     pi2_src += src_strd;
    196     m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
    197     pi2_src += src_strd;
    198     m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
    199     pi2_src += src_strd;
    200     m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
    201     pi2_src += src_strd;
    202 
    203     m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
    204     pi2_src += src_strd;
    205     m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
    206     pi2_src += src_strd;
    207     m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
    208     pi2_src += src_strd;
    209     m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);
    210 
    211     if(!check_row_stage_2)
    212     {
    213         if(!check_row_stage_1)
    214         {
    215             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
    216             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
    217             {
    218                 //Interleaving 0,4 row in 0 , 1 Rishab
    219                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
    220                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
    221                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
    222 
    223                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
    224 
    225                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    226                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
    227 
    228             }
    229 
    230 
    231             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
    232             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
    233             /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
    234             {
    235 
    236                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
    237                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
    238 
    239                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
    240                 //Interleaving 2,6 row in 4, 5 Rishab
    241                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
    242 
    243                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
    244                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
    245 
    246 
    247                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
    248 
    249                 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
    250                 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
    251 
    252                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
    253                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
    254 
    255 
    256 
    257                 /* e */
    258 
    259                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
    260                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
    261                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
    262                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
    263                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
    264                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
    265 
    266                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
    267                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
    268 
    269             }
    270 
    271             /* o */
    272             {
    273 
    274                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
    275                 {
    276 
    277                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    278                     //o0:1B*89+3B*75,5B*50+7B*18
    279                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
    280 
    281                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    282                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
    283 
    284 
    285 
    286                     /* Column 0 of destination computed here */
    287                     /* It is stored in m_temp_reg_50 */
    288                     /* Column 7 of destination computed here */
    289                     /* It is stored in m_temp_reg_57 */
    290                     /* Upper 8 bytes of both registers are zero due to zero_cols*/
    291 
    292 
    293 
    294                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    295                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    296 
    297                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
    298                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
    299 
    300                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
    301                     m_temp_reg_63 = _mm_setzero_si128();
    302                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
    303 
    304                     //o1:1B*75-3B*18,5B*89+7B*50
    305                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
    306 
    307                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
    308                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
    309 
    310                     /* Loading coeff for computing o2  in the next block */
    311 
    312                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
    313                     m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
    314 
    315                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
    316 
    317 
    318 
    319                     /* Column 1 of destination computed here */
    320                     /* It is stored in m_temp_reg_51 */
    321                     /* Column 6 of destination computed here */
    322                     /* It is stored in m_temp_reg_56 */
    323 
    324                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
    325                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
    326 
    327                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
    328                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
    329 
    330                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
    331                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
    332 
    333                     //o2:1B*50-3B*89,5B*18+7B*75
    334                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
    335 
    336                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
    337                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
    338 
    339 
    340                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
    341 
    342                     /* Loading coeff for computing o3  in the next block */
    343 
    344                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
    345                     m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
    346 
    347 
    348 
    349                     /* Column 2 of destination computed here */
    350                     /* It is stored in m_temp_reg_52 */
    351                     /* Column 5 of destination computed here */
    352                     /* It is stored in m_temp_reg_55 */
    353 
    354                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
    355                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
    356 
    357                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
    358                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
    359 
    360                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
    361                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
    362 
    363                     //o3:1B*18-3B*50,5B*75-7B*89
    364                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
    365 
    366                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
    367                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
    368 
    369 
    370 
    371                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
    372 
    373 
    374 
    375                     /* Column 3 of destination computed here */
    376                     /* It is stored in m_temp_reg_53 */
    377                     /* Column 4 of destination computed here */
    378                     /* It is stored in m_temp_reg_54 */
    379 
    380                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
    381                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
    382 
    383                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
    384                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
    385 
    386                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
    387                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
    388 
    389 
    390                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
    391                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
    392                 }
    393             }
    394 
    395             /* Transpose of the destination 8x8 matrix done here */
    396             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
    397             /* respectively */
    398             {
    399                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
    400                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
    401                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
    402                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
    403 
    404                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
    405                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
    406 
    407                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
    408                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
    409 
    410                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
    411                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
    412                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
    413                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
    414 
    415                 m_temp_reg_54 = _mm_setzero_si128();
    416                 m_temp_reg_55 = _mm_setzero_si128();
    417                 m_temp_reg_56 = _mm_setzero_si128();
    418                 m_temp_reg_57 = _mm_setzero_si128();
    419             }
    420         }
    421         else
    422         {
    423             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
    424             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
    425             {
    426                 //Interleaving 0,4 row in 0 , 1 Rishab
    427                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
    428                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
    429                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
    430 
    431                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
    432 
    433                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    434                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
    435 
    436             }
    437 
    438 
    439             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
    440             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
    441             /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
    442             {
    443 
    444                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
    445                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
    446 
    447                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
    448                 //Interleaving 2,6 row in 4, 5 Rishab
    449                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
    450 
    451                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
    452                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
    453 
    454 
    455                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
    456 
    457                 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
    458                 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
    459 
    460                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
    461                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
    462 
    463 
    464 
    465                 /* e */
    466 
    467                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
    468                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
    469                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
    470                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
    471                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
    472                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
    473 
    474                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
    475                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
    476 
    477             }
    478 
    479             /* o */
    480             {
    481 
    482                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
    483                 {
    484 
    485                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    486                     m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
    487                     //o0:1B*89+3B*75,5B*50+7B*18
    488                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
    489                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
    490 
    491                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    492                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
    493 
    494                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
    495 
    496 
    497 
    498                     /* Column 0 of destination computed here */
    499                     /* It is stored in m_temp_reg_50 */
    500                     /* Column 7 of destination computed here */
    501                     /* It is stored in m_temp_reg_57 */
    502                     /* Upper 8 bytes of both registers are zero due to zero_cols*/
    503 
    504 
    505 
    506                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    507                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    508 
    509                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
    510                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
    511 
    512                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
    513                     m_temp_reg_63 = _mm_setzero_si128();
    514                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
    515 
    516                     //o1:1B*75-3B*18,5B*89+7B*50
    517                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
    518                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
    519 
    520                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
    521                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
    522 
    523                     /* Loading coeff for computing o2  in the next block */
    524 
    525                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
    526                     m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
    527 
    528                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
    529                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
    530 
    531 
    532 
    533                     /* Column 1 of destination computed here */
    534                     /* It is stored in m_temp_reg_51 */
    535                     /* Column 6 of destination computed here */
    536                     /* It is stored in m_temp_reg_56 */
    537 
    538                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
    539                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
    540 
    541                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
    542                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
    543 
    544                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
    545                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
    546 
    547                     //o2:1B*50-3B*89,5B*18+7B*75
    548                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
    549                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
    550 
    551                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
    552                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
    553 
    554 
    555                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
    556 
    557                     /* Loading coeff for computing o3  in the next block */
    558 
    559                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
    560                     m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
    561 
    562                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
    563 
    564 
    565                     /* Column 2 of destination computed here */
    566                     /* It is stored in m_temp_reg_52 */
    567                     /* Column 5 of destination computed here */
    568                     /* It is stored in m_temp_reg_55 */
    569 
    570                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
    571                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
    572 
    573                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
    574                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
    575 
    576                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
    577                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
    578 
    579                     //o3:1B*18-3B*50,5B*75-7B*89
    580                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
    581                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
    582 
    583                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
    584                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
    585 
    586 
    587 
    588                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
    589 
    590                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
    591 
    592 
    593                     /* Column 3 of destination computed here */
    594                     /* It is stored in m_temp_reg_53 */
    595                     /* Column 4 of destination computed here */
    596                     /* It is stored in m_temp_reg_54 */
    597 
    598                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
    599                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
    600 
    601                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
    602                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
    603 
    604                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
    605                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
    606 
    607 
    608                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
    609                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
    610                 }
    611             }
    612 
    613             /* Transpose of the destination 8x8 matrix done here */
    614             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
    615             /* respectively */
    616             {
    617                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
    618                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
    619                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
    620                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
    621 
    622                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
    623                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
    624                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
    625                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
    626 
    627                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
    628                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
    629                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
    630                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
    631 
    632                 m_temp_reg_54 = _mm_setzero_si128();
    633                 m_temp_reg_55 = _mm_setzero_si128();
    634                 m_temp_reg_56 = _mm_setzero_si128();
    635                 m_temp_reg_57 = _mm_setzero_si128();
    636             }
    637         }
    638 
    639         /* Stage 2 */
    640         i4_shift = IDCT_STG2_SHIFT;
    641         {
    642             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
    643             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
    644             {
    645                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
    646                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub
    647 
    648                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
    649                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
    650 
    651                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    652                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
    653                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
    654                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
    655 
    656 
    657                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
    658                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
    659             }
    660 
    661 
    662             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
    663             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
    664             {
    665 
    666                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
    667                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
    668 
    669 
    670                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    671                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
    672                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
    673                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
    674 
    675                 /* Loading coeff for computing o0 in the next block */
    676                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
    677 
    678 
    679                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
    680                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
    681 
    682 
    683 
    684                 /* e */
    685 
    686                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
    687                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
    688                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
    689                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
    690                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
    691                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
    692 
    693                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
    694                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
    695 
    696                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
    697                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
    698 
    699                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
    700                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
    701 
    702             }
    703 
    704             /* o */
    705             {
    706 
    707                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
    708                 {
    709                     //o0:1B*89+3B*75,1T*89+3T*75
    710                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    711                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
    712 
    713                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    714                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
    715                     /* Loading coeff for computing o1 in the next block */
    716                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
    717 
    718 
    719 
    720                     /* Column 0 of destination computed here */
    721                     /* It is stored in m_temp_reg_50 */
    722                     /* Column 7 of destination computed here */
    723                     /* It is stored in m_temp_reg_57 */
    724 
    725                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    726                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    727 
    728                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
    729                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
    730 
    731                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
    732                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
    733                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
    734                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
    735 
    736                     //o1:1B*75-3B*18,1T*75-3T*18
    737                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
    738                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    739 
    740                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
    741                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
    742                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
    743                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
    744 
    745                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
    746                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
    747 
    748 
    749                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
    750 
    751 
    752                     /* Loading coeff for computing o2  in the next block */
    753                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
    754 
    755 
    756 
    757                     /* Column 1 of destination computed here */
    758                     /* It is stored in m_temp_reg_51 */
    759                     /* Column 6 of destination computed here */
    760                     /* It is stored in m_temp_reg_56 */
    761 
    762                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
    763                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
    764 
    765                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
    766                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
    767 
    768                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
    769                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
    770                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
    771                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
    772 
    773                     //o2:1B*50-3B*89,1T*50-3T*89
    774                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    775                     m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
    776 
    777                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
    778                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
    779                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
    780                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
    781 
    782                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
    783                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
    784 
    785 
    786                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
    787 
    788                     /* Loading coeff for computing o3  in the next block */
    789 
    790                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
    791 
    792 
    793                     /* Column 2 of destination computed here */
    794                     /* It is stored in m_temp_reg_52 */
    795                     /* Column 5 of destination computed here */
    796                     /* It is stored in m_temp_reg_55 */
    797 
    798                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
    799                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
    800 
    801                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
    802                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
    803 
    804                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
    805                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
    806                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
    807                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
    808 
    809                     //o3:1B*18-3B*50,1T*18-3T*50
    810                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
    811                     m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    812 
    813                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
    814                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
    815                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
    816                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
    817 
    818 
    819                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
    820                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
    821 
    822 
    823 
    824                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
    825 
    826 
    827                     /* Column 3 of destination computed here */
    828                     /* It is stored in m_temp_reg_53 */
    829                     /* Column 4 of destination computed here */
    830                     /* It is stored in m_temp_reg_54 */
    831 
    832                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
    833                     m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
    834 
    835                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
    836                     m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
    837 
    838                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
    839                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
    840                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
    841                     m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
    842 
    843                     m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
    844                     m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
    845                     m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
    846                     m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
    847 
    848                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
    849                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
    850                 }
    851             }
    852 
    853             /* Transpose of the destination 8x8 matrix done here */
    854             /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
    855             /* respectively */
    856             {
    857                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
    858                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
    859                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
    860                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
    861                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
    862                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
    863                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
    864                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
    865 
    866                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
    867                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
    868                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
    869                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
    870                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
    871                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
    872                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
    873                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
    874                 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
    875                 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
    876                 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
    877                 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
    878 
    879                 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
    880                 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
    881                 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
    882                 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
    883             }
    884 
    885             /* Recon and store */
    886             {
    887                 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
    888                 pu1_pred += pred_strd;
    889                 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
    890                 pu1_pred += pred_strd;
    891                 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
    892                 pu1_pred += pred_strd;
    893                 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
    894                 pu1_pred += pred_strd;
    895                 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
    896                 pu1_pred += pred_strd;
    897                 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
    898                 pu1_pred += pred_strd;
    899                 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
    900                 pu1_pred += pred_strd;
    901                 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
    902 
    903                 m_temp_reg_50 = _mm_setzero_si128();
    904                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
    905                 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
    906                 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
    907                 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
    908                 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
    909                 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
    910                 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
    911                 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
    912 
    913                 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
    914                 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
    915                 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
    916                 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
    917                 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
    918                 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
    919                 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
    920                 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
    921 
    922                 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
    923                 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
    924                 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
    925                 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
    926                 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
    927                 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
    928                 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
    929                 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
    930 
    931                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
    932                 pu1_dst += dst_strd;
    933                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
    934                 pu1_dst += dst_strd;
    935                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
    936                 pu1_dst += dst_strd;
    937                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
    938                 pu1_dst += dst_strd;
    939                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
    940                 pu1_dst += dst_strd;
    941                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
    942                 pu1_dst += dst_strd;
    943                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
    944                 pu1_dst += dst_strd;
    945                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
    946                 pu1_dst += dst_strd;
    947             }
    948         }
    949     }
    950     else
    951 
    952     {
    953 
    954         /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
    955         /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
    956         if(!check_row_stage_1)
    957         {
    958             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
    959             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
    960             {
    961                 //Interleaving 0,4 row in 0 , 1 Rishab
    962                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
    963                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
    964                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
    965 
    966                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
    967                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
    968 
    969                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    970                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
    971 
    972 
    973                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
    974                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
    975             }
    976 
    977 
    978             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
    979             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
    980             {
    981 
    982                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
    983                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
    984 
    985                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
    986                 //Interleaving 2,6 row in 4, 5 Rishab
    987                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
    988                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
    989 
    990                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
    991                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
    992 
    993                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
    994                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
    995 
    996 
    997 
    998                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
    999 
   1000                 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
   1001                 //m_coeff4 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[3][0]);
   1002 
   1003                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
   1004                 //m_coeff2 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[1][0]);
   1005 
   1006             }
   1007 
   1008             /* e */
   1009             {
   1010                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1011                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1012                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1013                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1014                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1015                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1016 
   1017                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1018                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1019 
   1020                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   1021                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   1022 
   1023                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   1024                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   1025 
   1026             }
   1027 
   1028             /* o */
   1029             {
   1030 
   1031                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1032                 {
   1033 
   1034                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1035                     m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
   1036                     //o0:1B*89+3B*75,1T*89+3T*75
   1037                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1038                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   1039 
   1040                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1041                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1042 
   1043                 }
   1044 
   1045                 /* Column 0 of destination computed here */
   1046                 /* It is stored in m_temp_reg_50 */
   1047                 /* Column 7 of destination computed here */
   1048                 /* It is stored in m_temp_reg_57 */
   1049                 {
   1050 
   1051 
   1052                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1053                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1054 
   1055                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   1056                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   1057 
   1058                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1059                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1060                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1061                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1062 
   1063                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1064                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1065                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1066                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1067 
   1068                     //o1:1B*75-3B*18,1T*75-3T*18
   1069                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1070                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   1071 
   1072                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1073                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1074 
   1075                     /* Loading coeff for computing o2  in the next block */
   1076 
   1077                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
   1078 
   1079                 }
   1080 
   1081                 /* Column 1 of destination computed here */
   1082                 /* It is stored in m_temp_reg_51 */
   1083                 /* Column 6 of destination computed here */
   1084                 /* It is stored in m_temp_reg_56 */
   1085                 {
   1086                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   1087                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   1088 
   1089                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   1090                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   1091 
   1092                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1093                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1094                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1095                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1096 
   1097                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1098                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1099                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1100                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1101 
   1102                     //o2:1B*50-3B*89,1T*50-3T*89
   1103                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1104                     m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   1105 
   1106                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1107                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1108 
   1109 
   1110                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   1111 
   1112 
   1113                     /* Loading coeff for computing o3  in the next block */
   1114 
   1115                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
   1116 
   1117                 }
   1118 
   1119                 /* Column 2 of destination computed here */
   1120                 /* It is stored in m_temp_reg_52 */
   1121                 /* Column 5 of destination computed here */
   1122                 /* It is stored in m_temp_reg_55 */
   1123                 {
   1124                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   1125                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   1126 
   1127                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   1128                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   1129 
   1130                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1131                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1132                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1133                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1134 
   1135                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1136                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1137                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1138                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1139 
   1140                     //o3:1B*18-3B*50,1T*18-3T*50
   1141                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1142                     m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   1143 
   1144                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1145                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1146 
   1147 
   1148 
   1149                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   1150 
   1151 
   1152                 }
   1153 
   1154                 /* Column 3 of destination computed here */
   1155                 /* It is stored in m_temp_reg_53 */
   1156                 /* Column 4 of destination computed here */
   1157                 /* It is stored in m_temp_reg_54 */
   1158                 {
   1159                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   1160                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   1161 
   1162                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   1163                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   1164 
   1165                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1166                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1167                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1168                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1169 
   1170                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1171                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1172                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1173                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1174 
   1175                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1176                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1177                 }
   1178             }
   1179 
   1180             /* Transpose of the destination 8x8 matrix done here */
   1181             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   1182             /* respectively */
   1183             {
   1184 
   1185 
   1186                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   1187                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   1188                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   1189                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   1190                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   1191                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   1192                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   1193                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   1194 
   1195                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   1196                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   1197                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   1198                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   1199                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   1200                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   1201                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   1202                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   1203 
   1204                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   1205                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   1206                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   1207                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   1208 
   1209                 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   1210                 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   1211                 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   1212                 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   1213             }
   1214         }
   1215         else
   1216         {
   1217 
   1218             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1219             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1220             {
   1221                 //Interleaving rows 0 and 4 into m_temp_reg_0 and m_temp_reg_1 (Rishab)
   1222                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
   1223                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
   1224                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
   1225 
   1226                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
   1227                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
   1228 
   1229                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1230                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1231 
   1232 
   1233                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1234                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1235             }
   1236 
   1237 
   1238             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   1239             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   1240             {
   1241 
   1242                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
   1243                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
   1244 
   1245                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
   1246                 //Interleaving rows 2 and 6 into m_temp_reg_4 and m_temp_reg_5 (Rishab)
   1247                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1248                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
   1249 
   1250                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
   1251                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   1252 
   1253                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
   1254                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   1255 
   1256 
   1257 
   1258                 /* Loading coeff for computing o0 and o1 in the next block */
   1259 
   1260                 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
   1261                 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
   1262 
   1263                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
   1264                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
   1265 
   1266             }
   1267 
   1268             /* e */
   1269             {
   1270                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1271                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1272                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1273                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1274                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1275                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1276 
   1277                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1278                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1279 
   1280                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   1281                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   1282 
   1283                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   1284                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   1285 
   1286             }
   1287 
   1288             /* o */
   1289             {
   1290 
   1291                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1292                 {
   1293 
   1294                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1295                     m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
   1296                     m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   1297                     m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
   1298                     //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
   1299                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1300                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   1301                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   1302                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
   1303 
   1304 
   1305                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1306                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1307 
   1308                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1309                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   1310                 }
   1311 
   1312                 /* Column 0 of destination computed here */
   1313                 /* It is stored in m_temp_reg_50 */
   1314                 /* Column 7 of destination computed here */
   1315                 /* It is stored in m_temp_reg_57 */
   1316                 {
   1317 
   1318 
   1319                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1320                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1321 
   1322                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   1323                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   1324 
   1325                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1326                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1327                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1328                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1329 
   1330                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1331                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1332                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1333                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1334 
   1335                     //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
   1336                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1337                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   1338                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   1339                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
   1340 
   1341                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1342                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1343 
   1344                     /* Loading coeff for computing o2  in the next block */
   1345 
   1346                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
   1347                     m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
   1348 
   1349                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   1350                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
   1351                     m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
   1352                 }
   1353 
   1354                 /* Column 1 of destination computed here */
   1355                 /* It is stored in m_temp_reg_51 */
   1356                 /* Column 6 of destination computed here */
   1357                 /* It is stored in m_temp_reg_56 */
   1358                 {
   1359                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   1360                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   1361 
   1362                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   1363                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   1364 
   1365                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1366                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1367                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1368                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1369 
   1370                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1371                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1372                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1373                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1374 
   1375                     //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
   1376                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
   1377                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
   1378                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
   1379                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
   1380 
   1381                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1382                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1383 
   1384 
   1385                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   1386 
   1387 
   1388                     /* Loading coeff for computing o3  in the next block */
   1389 
   1390                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
   1391                     m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
   1392 
   1393                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1394                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   1395                 }
   1396 
   1397                 /* Column 2 of destination computed here */
   1398                 /* It is stored in m_temp_reg_52 */
   1399                 /* Column 5 of destination computed here */
   1400                 /* It is stored in m_temp_reg_55 */
   1401                 {
   1402                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   1403                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   1404 
   1405                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   1406                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   1407 
   1408                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1409                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1410                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1411                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1412 
   1413                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1414                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1415                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1416                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1417 
   1418                     //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
   1419                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
   1420                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
   1421                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
   1422                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
   1423 
   1424                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1425                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1426 
   1427 
   1428 
   1429                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   1430 
   1431 
   1432                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
   1433                     m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
   1434                 }
   1435 
   1436                 /* Column 3 of destination computed here */
   1437                 /* It is stored in m_temp_reg_53 */
   1438                 /* Column 4 of destination computed here */
   1439                 /* It is stored in m_temp_reg_54 */
   1440                 {
   1441                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   1442                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   1443 
   1444                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   1445                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   1446 
   1447                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
   1448                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
   1449                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
   1450                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
   1451 
   1452                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
   1453                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
   1454                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
   1455                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
   1456 
   1457                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
   1458                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
   1459                 }
   1460             }
   1461 
   1462             /* Transpose of the destination 8x8 matrix done here */
   1463             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
   1464             /* respectively */
   1465             {
   1466 
   1467 
   1468                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   1469                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   1470                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   1471                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   1472                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   1473                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   1474                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   1475                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   1476 
   1477                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   1478                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   1479                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   1480                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   1481                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   1482                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   1483                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   1484                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   1485 
   1486                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   1487                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   1488                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   1489                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   1490 
   1491                 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   1492                 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   1493                 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   1494                 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   1495             }
   1496         }
   1497         /* Stage 2 */
   1498 
   1499         i4_shift = IDCT_STG2_SHIFT;
   1500 
   1501         {
   1502 
   1503             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
   1504             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
   1505             {
   1506                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
   1507                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub
   1508 
   1509                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
   1510                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
   1511 
   1512                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1513                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1514                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1515                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1516 
   1517 
   1518                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
   1519                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
   1520             }
   1521 
   1522 
   1523             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
   1524             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
   1525             {
   1526                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
   1527                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
   1528 
   1529 
   1530                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1531                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1532                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1533                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1534 
   1535                 /* Loading coeff for computing o0 in the next block */
   1536                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
   1537                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[1][0]);
   1538 
   1539 
   1540                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
   1541                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
   1542             }
   1543 
   1544             /* e */
   1545             {
   1546                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
   1547                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
   1548                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
   1549                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
   1550                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
   1551                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
   1552 
   1553                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
   1554                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
   1555 
   1556                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
   1557                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
   1558 
   1559                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
   1560                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
   1561 
   1562             }
   1563 
   1564             /* o */
   1565             {
   1566                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
   1567                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
   1568 
   1569                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
   1570                 {
   1571                     //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
   1572                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1573                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1574                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   1575                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   1576 
   1577                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1578                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
   1579                     /* Loading coeff for computing o1 in the next block */
   1580                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
   1581                     m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[3][0]);
   1582 
   1583                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1584                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   1585                 }
   1586 
   1587                 /* Column 0 of destination computed here */
   1588                 /* It is stored in m_temp_reg_50 */
   1589                 /* Column 7 of destination computed here */
   1590                 /* It is stored in m_temp_reg_57 */
   1591                 {
   1592                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1593                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1594 
   1595                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   1596                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   1597 
   1598                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   1599                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   1600                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   1601                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   1602 
   1603                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   1604                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   1605                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   1606                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   1607 
   1608                     //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
   1609                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   1610                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
   1611                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   1612                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
   1613 
   1614                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   1615                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   1616 
   1617 
   1618                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
   1619 
   1620 
   1621                     /* Loading coeff for computing o2  in the next block */
   1622                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
   1623                     m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[5][0]);
   1624 
   1625                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
   1626                     m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
   1627                 }
   1628 
   1629                 /* Column 1 of destination computed here */
   1630                 /* It is stored in m_temp_reg_51 */
   1631                 /* Column 6 of destination computed here */
   1632                 /* It is stored in m_temp_reg_56 */
   1633                 {
   1634                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
   1635                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
   1636 
   1637                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
   1638                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
   1639 
   1640                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   1641                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   1642                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   1643                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   1644 
   1645                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   1646                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   1647                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   1648                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   1649 
   1650                     //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
   1651                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1652                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
   1653                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1654                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
   1655 
   1656                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   1657                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   1658 
   1659 
   1660                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
   1661 
   1662                     /* Loading coeff for computing o3  in the next block */
   1663 
   1664                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
   1665                     m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[7][0]);
   1666 
   1667                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
   1668                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
   1669                 }
   1670 
   1671                 /* Column 2 of destination computed here */
   1672                 /* It is stored in m_temp_reg_52 */
   1673                 /* Column 5 of destination computed here */
   1674                 /* It is stored in m_temp_reg_55 */
   1675                 {
   1676                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
   1677                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
   1678 
   1679                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
   1680                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
   1681 
   1682                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
   1683                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
   1684                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
   1685                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
   1686 
   1687                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
   1688                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
   1689                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
   1690                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
   1691 
   1692                     //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
   1693                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   1694                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
   1695                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   1696                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
   1697 
   1698                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
   1699                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
   1700 
   1701 
   1702 
   1703                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
   1704 
   1705 
   1706                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
   1707                     m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
   1708                 }
   1709 
   1710                 /* Column 3 of destination computed here */
   1711                 /* It is stored in m_temp_reg_53 */
   1712                 /* Column 4 of destination computed here */
   1713                 /* It is stored in m_temp_reg_54 */
   1714                 {
   1715                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
   1716                     m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
   1717 
   1718                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
   1719                     m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
   1720 
   1721                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
   1722                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
   1723                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
   1724                     m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
   1725 
   1726                     m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
   1727                     m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
   1728                     m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
   1729                     m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
   1730 
   1731                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
   1732                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
   1733                 }
   1734             }
   1735 
   1736             /* Transpose of the destination 8x8 matrix done here */
   1737             /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
   1738             /* respectively */
   1739             {
   1740                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
   1741                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
   1742                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
   1743                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
   1744                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
   1745                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
   1746                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
   1747                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
   1748 
   1749                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
   1750                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
   1751                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
   1752                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
   1753                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
   1754                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
   1755                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
   1756                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
   1757                 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
   1758                 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
   1759                 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
   1760                 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
   1761 
   1762                 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
   1763                 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
   1764                 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
   1765                 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
   1766             }
   1767 
   1768             /* Recon and store */
   1769             {
   1770                 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1771                 pu1_pred += pred_strd;
   1772                 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1773                 pu1_pred += pred_strd;
   1774                 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1775                 pu1_pred += pred_strd;
   1776                 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1777                 pu1_pred += pred_strd;
   1778                 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1779                 pu1_pred += pred_strd;
   1780                 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1781                 pu1_pred += pred_strd;
   1782                 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1783                 pu1_pred += pred_strd;
   1784                 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
   1785 
   1786 
   1787                 m_temp_reg_50 = _mm_setzero_si128();
   1788                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
   1789                 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
   1790                 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
   1791                 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
   1792                 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
   1793                 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
   1794                 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
   1795                 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
   1796 
   1797                 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
   1798                 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
   1799                 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
   1800                 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
   1801                 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
   1802                 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
   1803                 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
   1804                 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
   1805 
   1806                 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
   1807                 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
   1808                 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
   1809                 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
   1810                 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
   1811                 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
   1812                 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
   1813                 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
   1814 
   1815                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
   1816                 pu1_dst += dst_strd;
   1817                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
   1818                 pu1_dst += dst_strd;
   1819                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
   1820                 pu1_dst += dst_strd;
   1821                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
   1822                 pu1_dst += dst_strd;
   1823                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
   1824                 pu1_dst += dst_strd;
   1825                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
   1826                 pu1_dst += dst_strd;
   1827                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
   1828                 pu1_dst += dst_strd;
   1829                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
   1830                 pu1_dst += dst_strd;
   1831 
   1832             }
   1833 
   1834 
   1835         }
   1836 
   1837 
   1838     }
   1839 }
   1840 
   1841 void impeg2_idct_recon_dc_mismatch_sse42(WORD16 *pi2_src,
   1842                             WORD16 *pi2_tmp,
   1843                             UWORD8 *pu1_pred,
   1844                             UWORD8 *pu1_dst,
   1845                             WORD32 src_strd,
   1846                             WORD32 pred_strd,
   1847                             WORD32 dst_strd,
   1848                             WORD32 zero_cols,
   1849                             WORD32 zero_rows)
   1850 {
   1851     WORD32 val;
   1852     __m128i value_4x32b, mismatch_stg2_additive;
   1853     __m128i pred_r, pred_half0, pred_half1;
   1854     __m128i temp0, temp1;
   1855     __m128i round_stg2 = _mm_set1_epi32(IDCT_STG2_ROUND);
   1856 
   1857     UNUSED(pi2_tmp);
   1858     UNUSED(src_strd);
   1859     UNUSED(zero_cols);
   1860     UNUSED(zero_rows);
   1861 
   1862     val = pi2_src[0] * gai2_impeg2_idct_q15[0];
   1863     val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
   1864     val *= gai2_impeg2_idct_q11[0];
   1865     value_4x32b = _mm_set1_epi32(val);
   1866 
   1867     // Row 0 processing
   1868     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) gai2_impeg2_mismatch_stg2_additive);
   1869     pred_r = _mm_loadl_epi64((__m128i *) pu1_pred);
   1870     pred_r =  _mm_cvtepu8_epi16(pred_r);
   1871     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   1872     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
   1873     pred_half0 = _mm_cvtepu16_epi32(pred_r);
   1874     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   1875 
   1876     pred_r = _mm_srli_si128(pred_r, 8);
   1877 
   1878     temp0 = _mm_add_epi32(temp0, value_4x32b);
   1879     temp1 = _mm_add_epi32(temp1, value_4x32b);
   1880     temp0 = _mm_add_epi32(temp0, round_stg2);
   1881     temp1 = _mm_add_epi32(temp1, round_stg2);
   1882     pred_half1 = _mm_cvtepu16_epi32(pred_r);
   1883     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
   1884     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
   1885     temp0 = _mm_add_epi32(temp0, pred_half0);
   1886     temp1 = _mm_add_epi32(temp1, pred_half1);
   1887 
   1888     temp0 = _mm_packus_epi32(temp0, temp1);
   1889     temp0 = _mm_packus_epi16(temp0, temp1);
   1890 
   1891     _mm_storel_epi64((__m128i *)pu1_dst, temp0);
   1892 
   1893     // Row 1 processing
   1894     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 8));
   1895     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
   1896     pred_r =  _mm_cvtepu8_epi16(pred_r);
   1897     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   1898     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
   1899     pred_half0 = _mm_cvtepu16_epi32(pred_r);
   1900     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   1901 
   1902     pred_r = _mm_srli_si128(pred_r, 8);
   1903 
   1904     temp0 = _mm_add_epi32(temp0, value_4x32b);
   1905     temp1 = _mm_add_epi32(temp1, value_4x32b);
   1906     temp0 = _mm_add_epi32(temp0, round_stg2);
   1907     temp1 = _mm_add_epi32(temp1, round_stg2);
   1908     pred_half1 = _mm_cvtepu16_epi32(pred_r);
   1909     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
   1910     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
   1911     temp0 = _mm_add_epi32(temp0, pred_half0);
   1912     temp1 = _mm_add_epi32(temp1, pred_half1);
   1913 
   1914     temp0 = _mm_packus_epi32(temp0, temp1);
   1915     temp0 = _mm_packus_epi16(temp0, temp1);
   1916 
   1917     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp0);
   1918 
   1919     // Row 2 processing
   1920     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 16));
   1921     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 2 * pred_strd));
   1922     pred_r =  _mm_cvtepu8_epi16(pred_r);
   1923     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   1924     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
   1925     pred_half0 = _mm_cvtepu16_epi32(pred_r);
   1926     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   1927 
   1928     pred_r = _mm_srli_si128(pred_r, 8);
   1929 
   1930     temp0 = _mm_add_epi32(temp0, value_4x32b);
   1931     temp1 = _mm_add_epi32(temp1, value_4x32b);
   1932     temp0 = _mm_add_epi32(temp0, round_stg2);
   1933     temp1 = _mm_add_epi32(temp1, round_stg2);
   1934     pred_half1 = _mm_cvtepu16_epi32(pred_r);
   1935     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
   1936     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
   1937     temp0 = _mm_add_epi32(temp0, pred_half0);
   1938     temp1 = _mm_add_epi32(temp1, pred_half1);
   1939 
   1940     temp0 = _mm_packus_epi32(temp0, temp1);
   1941     temp0 = _mm_packus_epi16(temp0, temp1);
   1942 
   1943     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), temp0);
   1944 
   1945     // Row 3 processing
   1946     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 24));
   1947     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 3 * pred_strd));
   1948     pred_r =  _mm_cvtepu8_epi16(pred_r);
   1949     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   1950     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
   1951     pred_half0 = _mm_cvtepu16_epi32(pred_r);
   1952     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   1953 
   1954     pred_r = _mm_srli_si128(pred_r, 8);
   1955 
   1956     temp0 = _mm_add_epi32(temp0, value_4x32b);
   1957     temp1 = _mm_add_epi32(temp1, value_4x32b);
   1958     temp0 = _mm_add_epi32(temp0, round_stg2);
   1959     temp1 = _mm_add_epi32(temp1, round_stg2);
   1960     pred_half1 = _mm_cvtepu16_epi32(pred_r);
   1961     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
   1962     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
   1963     temp0 = _mm_add_epi32(temp0, pred_half0);
   1964     temp1 = _mm_add_epi32(temp1, pred_half1);
   1965 
   1966     temp0 = _mm_packus_epi32(temp0, temp1);
   1967     temp0 = _mm_packus_epi16(temp0, temp1);
   1968 
   1969     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), temp0);
   1970 
   1971     // Row 4 processing
   1972     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 32));
   1973     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 4 * pred_strd));
   1974     pred_r =  _mm_cvtepu8_epi16(pred_r);
   1975     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   1976     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
   1977     pred_half0 = _mm_cvtepu16_epi32(pred_r);
   1978     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   1979 
   1980     pred_r = _mm_srli_si128(pred_r, 8);
   1981 
   1982     temp0 = _mm_add_epi32(temp0, value_4x32b);
   1983     temp1 = _mm_add_epi32(temp1, value_4x32b);
   1984     temp0 = _mm_add_epi32(temp0, round_stg2);
   1985     temp1 = _mm_add_epi32(temp1, round_stg2);
   1986     pred_half1 = _mm_cvtepu16_epi32(pred_r);
   1987     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
   1988     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
   1989     temp0 = _mm_add_epi32(temp0, pred_half0);
   1990     temp1 = _mm_add_epi32(temp1, pred_half1);
   1991 
   1992     temp0 = _mm_packus_epi32(temp0, temp1);
   1993     temp0 = _mm_packus_epi16(temp0, temp1);
   1994 
   1995     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), temp0);
   1996 
   1997     // Row 5 processing
   1998     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 40));
   1999     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 5 * pred_strd));
   2000     pred_r =  _mm_cvtepu8_epi16(pred_r);
   2001     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   2002     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
   2003     pred_half0 = _mm_cvtepu16_epi32(pred_r);
   2004     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   2005 
   2006     pred_r = _mm_srli_si128(pred_r, 8);
   2007 
   2008     temp0 = _mm_add_epi32(temp0, value_4x32b);
   2009     temp1 = _mm_add_epi32(temp1, value_4x32b);
   2010     temp0 = _mm_add_epi32(temp0, round_stg2);
   2011     temp1 = _mm_add_epi32(temp1, round_stg2);
   2012     pred_half1 = _mm_cvtepu16_epi32(pred_r);
   2013     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
   2014     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
   2015     temp0 = _mm_add_epi32(temp0, pred_half0);
   2016     temp1 = _mm_add_epi32(temp1, pred_half1);
   2017 
   2018     temp0 = _mm_packus_epi32(temp0, temp1);
   2019     temp0 = _mm_packus_epi16(temp0, temp1);
   2020 
   2021     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), temp0);
   2022 
   2023     // Row 6 processing
   2024     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 48));
   2025     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 6 * pred_strd));
   2026     pred_r =  _mm_cvtepu8_epi16(pred_r);
   2027     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   2028     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
   2029     pred_half0 = _mm_cvtepu16_epi32(pred_r);
   2030     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   2031 
   2032     pred_r = _mm_srli_si128(pred_r, 8);
   2033 
   2034     temp0 = _mm_add_epi32(temp0, value_4x32b);
   2035     temp1 = _mm_add_epi32(temp1, value_4x32b);
   2036     temp0 = _mm_add_epi32(temp0, round_stg2);
   2037     temp1 = _mm_add_epi32(temp1, round_stg2);
   2038     pred_half1 = _mm_cvtepu16_epi32(pred_r);
   2039     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
   2040     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
   2041     temp0 = _mm_add_epi32(temp0, pred_half0);
   2042     temp1 = _mm_add_epi32(temp1, pred_half1);
   2043 
   2044     temp0 = _mm_packus_epi32(temp0, temp1);
   2045     temp0 = _mm_packus_epi16(temp0, temp1);
   2046 
   2047     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), temp0);
   2048 
   2049     // Row 7 processing
   2050     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 56));
   2051     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 7 * pred_strd));
   2052     pred_r =  _mm_cvtepu8_epi16(pred_r);
   2053     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   2054     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
   2055     pred_half0 = _mm_cvtepu16_epi32(pred_r);
   2056     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
   2057 
   2058     pred_r = _mm_srli_si128(pred_r, 8);
   2059 
   2060     temp0 = _mm_add_epi32(temp0, value_4x32b);
   2061     temp1 = _mm_add_epi32(temp1, value_4x32b);
   2062     temp0 = _mm_add_epi32(temp0, round_stg2);
   2063     temp1 = _mm_add_epi32(temp1, round_stg2);
   2064     pred_half1 = _mm_cvtepu16_epi32(pred_r);
   2065     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
   2066     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
   2067     temp0 = _mm_add_epi32(temp0, pred_half0);
   2068     temp1 = _mm_add_epi32(temp1, pred_half1);
   2069 
   2070     temp0 = _mm_packus_epi32(temp0, temp1);
   2071     temp0 = _mm_packus_epi16(temp0, temp1);
   2072 
   2073     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), temp0);
   2074 }
   2075 
   2076 void impeg2_idct_recon_dc_sse42(WORD16 *pi2_src,
   2077                             WORD16 *pi2_tmp,
   2078                             UWORD8 *pu1_pred,
   2079                             UWORD8 *pu1_dst,
   2080                             WORD32 src_strd,
   2081                             WORD32 pred_strd,
   2082                             WORD32 dst_strd,
   2083                             WORD32 zero_cols,
   2084                             WORD32 zero_rows)
   2085 {
   2086     WORD32 val;
   2087     __m128i value_4x32b, pred_r0, pred_r1, temp0, temp1, temp2, temp3;
   2088 
   2089     UNUSED(pi2_tmp);
   2090     UNUSED(src_strd);
   2091     UNUSED(zero_cols);
   2092     UNUSED(zero_rows);
   2093 
   2094     val = pi2_src[0] * gai2_impeg2_idct_q15[0];
   2095     val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
   2096     val = val * gai2_impeg2_idct_q11[0];
   2097     val = ((val + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT);
   2098 
   2099     value_4x32b = _mm_set1_epi32(val);
   2100 
   2101     //Row 0-1 processing
   2102     pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
   2103     pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
   2104     pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
   2105     pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
   2106 
   2107     temp0 = _mm_cvtepu16_epi32(pred_r0);
   2108     pred_r0 = _mm_srli_si128(pred_r0, 8);
   2109     temp2 = _mm_cvtepu16_epi32(pred_r1);
   2110     pred_r1 = _mm_srli_si128(pred_r1, 8);
   2111     temp1 = _mm_cvtepu16_epi32(pred_r0);
   2112     temp3 = _mm_cvtepu16_epi32(pred_r1);
   2113 
   2114     temp0 = _mm_add_epi32(temp0, value_4x32b);
   2115     temp2 = _mm_add_epi32(temp2, value_4x32b);
   2116     temp1 = _mm_add_epi32(temp1, value_4x32b);
   2117     temp3 = _mm_add_epi32(temp3, value_4x32b);
   2118     temp0 = _mm_packus_epi32(temp0, temp1);
   2119     temp2 = _mm_packus_epi32(temp2, temp3);
   2120     temp0 = _mm_packus_epi16(temp0, temp1);
   2121     temp2 = _mm_packus_epi16(temp2, temp3);
   2122     _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
   2123     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
   2124 
   2125     //Row 2-3 processing
   2126     pu1_pred += 2 * pred_strd;
   2127     pu1_dst += 2 * dst_strd;
   2128 
   2129     pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
   2130     pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
   2131     pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
   2132     pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
   2133 
   2134     temp0 = _mm_cvtepu16_epi32(pred_r0);
   2135     pred_r0 = _mm_srli_si128(pred_r0, 8);
   2136     temp2 = _mm_cvtepu16_epi32(pred_r1);
   2137     pred_r1 = _mm_srli_si128(pred_r1, 8);
   2138     temp1 = _mm_cvtepu16_epi32(pred_r0);
   2139     temp3 = _mm_cvtepu16_epi32(pred_r1);
   2140 
   2141     temp0 = _mm_add_epi32(temp0, value_4x32b);
   2142     temp2 = _mm_add_epi32(temp2, value_4x32b);
   2143     temp1 = _mm_add_epi32(temp1, value_4x32b);
   2144     temp3 = _mm_add_epi32(temp3, value_4x32b);
   2145     temp0 = _mm_packus_epi32(temp0, temp1);
   2146     temp2 = _mm_packus_epi32(temp2, temp3);
   2147     temp0 = _mm_packus_epi16(temp0, temp1);
   2148     temp2 = _mm_packus_epi16(temp2, temp3);
   2149     _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
   2150     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
   2151 
   2152     //Row 4-5 processing
   2153     pu1_pred += 2 * pred_strd;
   2154     pu1_dst += 2 * dst_strd;
   2155 
   2156     pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
   2157     pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
   2158     pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
   2159     pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
   2160 
   2161     temp0 = _mm_cvtepu16_epi32(pred_r0);
   2162     pred_r0 = _mm_srli_si128(pred_r0, 8);
   2163     temp2 = _mm_cvtepu16_epi32(pred_r1);
   2164     pred_r1 = _mm_srli_si128(pred_r1, 8);
   2165     temp1 = _mm_cvtepu16_epi32(pred_r0);
   2166     temp3 = _mm_cvtepu16_epi32(pred_r1);
   2167 
   2168     temp0 = _mm_add_epi32(temp0, value_4x32b);
   2169     temp2 = _mm_add_epi32(temp2, value_4x32b);
   2170     temp1 = _mm_add_epi32(temp1, value_4x32b);
   2171     temp3 = _mm_add_epi32(temp3, value_4x32b);
   2172     temp0 = _mm_packus_epi32(temp0, temp1);
   2173     temp2 = _mm_packus_epi32(temp2, temp3);
   2174     temp0 = _mm_packus_epi16(temp0, temp1);
   2175     temp2 = _mm_packus_epi16(temp2, temp3);
   2176     _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
   2177     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
   2178 
   2179     //Row 6-7 processing
   2180     pu1_pred += 2 * pred_strd;
   2181     pu1_dst += 2 * dst_strd;
   2182 
   2183     pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
   2184     pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
   2185     pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
   2186     pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
   2187 
   2188     temp0 = _mm_cvtepu16_epi32(pred_r0);
   2189     pred_r0 = _mm_srli_si128(pred_r0, 8);
   2190     temp2 = _mm_cvtepu16_epi32(pred_r1);
   2191     pred_r1 = _mm_srli_si128(pred_r1, 8);
   2192     temp1 = _mm_cvtepu16_epi32(pred_r0);
   2193     temp3 = _mm_cvtepu16_epi32(pred_r1);
   2194 
   2195     temp0 = _mm_add_epi32(temp0, value_4x32b);
   2196     temp2 = _mm_add_epi32(temp2, value_4x32b);
   2197     temp1 = _mm_add_epi32(temp1, value_4x32b);
   2198     temp3 = _mm_add_epi32(temp3, value_4x32b);
   2199     temp0 = _mm_packus_epi32(temp0, temp1);
   2200     temp2 = _mm_packus_epi32(temp2, temp3);
   2201     temp0 = _mm_packus_epi16(temp0, temp1);
   2202     temp2 = _mm_packus_epi16(temp2, temp3);
   2203     _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
   2204     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
   2205 }
   2206