Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19  *******************************************************************************
     20  * @file
     21  *  ihevc_itrans_recon_32x32_atom_intr.c
     22  *
     23  * @brief
     24  *  Contains function definitions for inverse  quantization, inverse
     25  * transform and reconstruction
     26  *
     27  * @author
     28  *  100470
     29  *
     30  * @par List of Functions:
     31  *  - ihevc_itrans_recon_32x32_ssse3()
     32  *
     33  * @remarks
     34  *  None
     35  *
     36  *******************************************************************************
     37  */
     38 #include <stdio.h>
     39 #include <string.h>
     40 #include "ihevc_typedefs.h"
     41 #include "ihevc_platform_macros.h"
     42 #include "ihevc_macros.h"
     43 #include "ihevc_defs.h"
     44 #include "ihevc_trans_tables.h"
     45 #include "ihevc_iquant_itrans_recon.h"
     46 #include "ihevc_func_selector.h"
     47 #include "ihevc_trans_macros.h"
     48 
     49 
     50 
     51 
     52 #include <immintrin.h>
     53 #include <emmintrin.h>
     54 
     55 #include <tmmintrin.h>
     56 
     57 
     58 
     59 /**
     60  *******************************************************************************
     61  *
     62  * @brief
     63  *  This function performs inverse transform and reconstruction for a
     64  * 32x32 input block
     65  *
     66  * @par Description:
     67  *  Performs the inverse transform, adds the prediction data and
     68  * clips the output to 8 bit
     69  *
     70  * @param[in] pi2_src
     71  *  Input 32x32 coefficients
     72  *
     73  * @param[in] pi2_tmp
     74  *  Temporary buffer for storing inverse transform 1st stage output;
     75  *  the code offsets into it by 1024 and 2048 WORD16 entries, so it must hold more than one 32x32 block
     76  *
     77  * @param[in] pu1_pred
     78  *  Prediction 32x32 block
     79  *
     80  * @param[in] pi2_dequant_coeff
     81  *  Dequant Coeffs (NOTE: not in this function's parameter list)
     82  *
     83  * @param[out] pu1_dst
     84  *  Output 32x32 block
     85  *
     86  * @param[in] qp_div
     87  *  Quantization parameter / 6 (NOTE: not in this function's parameter list)
     88  *
     89  * @param[in] qp_rem
     90  *  Quantization parameter % 6 (NOTE: not in this function's parameter list)
     91  *
     92  * @param[in] src_strd
     93  *  Input stride
     94  *
     95  * @param[in] pred_strd
     96  *  Prediction stride
     97  *
     98  * @param[in] dst_strd
     99  *  Output Stride
    100  *
    101  * @param[in] zero_cols, zero_rows
    102  *  Bit masks of zero columns / zero rows in pi2_src (a set bit marks an all-zero column / row)
    103  *
    104  * @returns  Void
    105  *
    106  * @remarks
    107  *  None
    108  *
    109  *******************************************************************************
    110  */
    111 /**/
    112 
    113 void ihevc_itrans_recon_32x32_ssse3(WORD16 *pi2_src,
    114                                     WORD16 *pi2_tmp,
    115                                     UWORD8 *pu1_pred,
    116                                     UWORD8 *pu1_dst,
    117                                     WORD32 src_strd,
    118                                     WORD32 pred_strd,
    119                                     WORD32 dst_strd,
    120                                     WORD32 zero_cols,
    121                                     WORD32 zero_rows)
    122 {
    123     /* Inverse Transform */
    124 
    125     WORD32 j;
    126 
    127 
    128     WORD16 *pi2_tmp_orig;
    129 
    130 
    131     /*MEM_ALIGN16  WORD32 temp_array[1024];
    132     MEM_ALIGN16  WORD16 temp1_array[1024];*/
    133     WORD16 *o_temp_ptr;
    134     WORD16 *temp_ptr;
    135 
    136     __m128i m_temp_reg_0;
    137     __m128i m_temp_reg_1;
    138     __m128i m_temp_reg_2;
    139     __m128i m_temp_reg_3;
    140     __m128i m_temp_reg_4;
    141     __m128i m_temp_reg_5;
    142     __m128i m_temp_reg_6;
    143     __m128i m_temp_reg_7;
    144     __m128i m_temp_reg_10;
    145     __m128i m_temp_reg_11;
    146     __m128i m_temp_reg_12;
    147     __m128i m_temp_reg_13;
    148     __m128i m_temp_reg_14;
    149     __m128i m_temp_reg_15;
    150     __m128i m_temp_reg_16;
    151     __m128i m_temp_reg_17;
    152     __m128i m_temp_reg_18;
    153     __m128i m_temp_reg_19;
    154     __m128i m_temp_reg_20;
    155     __m128i m_temp_reg_21;
    156     __m128i m_temp_reg_22;
    157     __m128i m_temp_reg_23;
    158     __m128i m_temp_reg_30;
    159     __m128i m_temp_reg_31;
    160     __m128i m_temp_reg_32;
    161     __m128i m_temp_reg_33;
    162     __m128i m_temp_reg_34;
    163     __m128i m_temp_reg_35;
    164     __m128i m_temp_reg_36;
    165     __m128i m_temp_reg_37;
    166     __m128i m_temp_reg_40;
    167     __m128i m_temp_reg_41;
    168     __m128i m_temp_reg_42;
    169     __m128i m_temp_reg_43;
    170     __m128i m_temp_reg_44;
    171     __m128i m_temp_reg_45;
    172     __m128i m_temp_reg_46;
    173     __m128i m_temp_reg_47;
    174 
    175     __m128i m_temp_reg_70;
    176     __m128i m_temp_reg_71;
    177     __m128i m_temp_reg_72;
    178     __m128i m_temp_reg_73;
    179     __m128i m_temp_reg_74;
    180     __m128i m_temp_reg_75;
    181     __m128i m_temp_reg_76;
    182     __m128i m_temp_reg_77;
    183 
    184     __m128i m_temp_reg_80;
    185     __m128i m_temp_reg_81;
    186     __m128i m_temp_reg_82;
    187     __m128i m_temp_reg_83;
    188     __m128i m_temp_reg_84;
    189     __m128i m_temp_reg_85;
    190     __m128i m_temp_reg_86;
    191     __m128i m_temp_reg_87;
    192 
    193     __m128i m_temp_reg_90;
    194     __m128i m_temp_reg_91;
    195     __m128i m_temp_reg_92;
    196     __m128i m_temp_reg_93;
    197     __m128i m_temp_reg_94;
    198     __m128i m_temp_reg_95;
    199     __m128i m_temp_reg_96;
    200     __m128i m_temp_reg_97;
    201 
    202     __m128i m_rdng_factor;
    203     __m128i m_count;
    204     __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
    205     __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
    206 
    207     __m128i temp1, temp2, temp3, temp4;
    208     __m128i temp5, temp6, temp7, temp8;
    209 
    210     __m128i all_zero_reg;
    211     WORD32 i;
    212 
    213     /*Lokesh*/
    214     WORD32  zero_last24_cols_stg1;
    215     WORD32  zero_last24_rows_stg1;
    216     WORD32  zero_last28_rows_stg1;
    217 
    218     WORD32  zero_last28_rows_stg2;
    219     WORD32  zero_last24_rows_stg2;
    220 
    221     WORD32  trans_size_stg1;
    222 
    223     WORD32 i4_shift = IT_SHIFT_STAGE_1;
    224     WORD32 trans_size = TRANS_SIZE_32;
    225 
    226 
    227     /* Trailing all-zero cols/rows (last 24 or last 28) of the 32x32 block are skipped based on the below flags : Lokesh */
    228     zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
    229     zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
    230     zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
    231 
    232     zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
    233     zero_last24_rows_stg2 = zero_last24_cols_stg1;
    234 
    235     if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
    236     {
    237         trans_size_stg1 = 8;
    238 
    239     }
    240     else
    241     {
    242         trans_size_stg1 = 32;
    243     }
    244 
    245     all_zero_reg = _mm_setzero_si128();
    246 
    247     o_temp_ptr  = pi2_tmp;
    248     temp_ptr = (pi2_tmp + 1024);
    249 
    250     pi2_tmp += 2048;
    251     pi2_tmp_orig = pi2_tmp;
    252 
    253     for(i = 0; i < trans_size_stg1; i += 8)
    254     {
    255 
    256 
    257         {
    258             WORD16 *pi2_tmp_src = pi2_src;
    259 
    260             m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
    261             pi2_tmp_src += (src_strd << 1);
    262             m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
    263             pi2_tmp_src += (src_strd << 1);
    264             m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
    265             pi2_tmp_src += (src_strd << 1);
    266             m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
    267             pi2_tmp_src += (src_strd << 1);
    268             m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
    269             pi2_tmp_src += (src_strd << 1);
    270             m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
    271             pi2_tmp_src += (src_strd << 1);
    272             m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
    273             pi2_tmp_src += (src_strd << 1);
    274             m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
    275             pi2_tmp_src += (src_strd << 1);
    276 
    277             m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
    278             pi2_tmp_src += (src_strd << 1);
    279             m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
    280             pi2_tmp_src += (src_strd << 1);
    281             m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
    282             pi2_tmp_src += (src_strd << 1);
    283             m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
    284             pi2_tmp_src += (src_strd << 1);
    285             m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
    286             pi2_tmp_src += (src_strd << 1);
    287             m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
    288             pi2_tmp_src += (src_strd << 1);
    289             m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
    290             pi2_tmp_src += (src_strd << 1);
    291             m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
    292         }
    293 
    294         if(zero_last28_rows_stg1)
    295         {
    296             /* eeo */
    297             /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
    298             /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
    299             {
    300                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
    301 
    302                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
    303 
    304                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    305 
    306 /* eeeo[0]= m_temp_reg_20  */
    307 /* eeeo[1]= m_temp_reg_21  */
    308 /* eeee[0]= m_temp_reg_22  */
    309 /* eeee[1]= m_temp_reg_23  */
    310 
    311                 /* eee[0] = eeee[0] + eeeo[0]; */
    312                 m_temp_reg_40 = m_temp_reg_14;
    313 
    314                 /* eee[3] = eeee[0] - eeeo[0]; */
    315                 m_temp_reg_43 = m_temp_reg_14;
    316 
    317                 /* eee[2] = eeee[1] - eeeo[1]; */
    318                 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
    319 
    320                 /* eee[1] = eeee[1] + eeeo[1];*/
    321                 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
    322 
    323                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
    324 
    325                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
    326 
    327                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    328 
    329 /* eeeo[0]= m_temp_reg_20  */
    330 /* eeeo[1]= m_temp_reg_21  */
    331 /* eeee[0]= m_temp_reg_22  */
    332 /* eeee[1]= m_temp_reg_23  */
    333 
    334                 /* eee[0] = eeee[0] + eeeo[0]; */
    335                 m_temp_reg_44 = m_temp_reg_14;
    336 
    337                 /* eee[3] = eeee[0] - eeeo[0]; */
    338                 m_temp_reg_47 = m_temp_reg_14;
    339 
    340                 /* eee[2] = eeee[1] - eeeo[1]; */
    341                 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
    342 
    343                 /* eee[1] = eeee[1] + eeeo[1];*/
    344                 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
    345 
    346 
    347             }
    348             /* eo */
    349             {
    350                 WORD16 *pi2_scratch = o_temp_ptr;
    351 
    352                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
    353                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
    354                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
    355                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
    356                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
    357                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
    358                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
    359                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
    360 
    361                 //m_temp_reg_10 = _mm_cvtepi16_epi32(m_temp_reg_71);
    362                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
    363 
    364                 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
    365 
    366                 /* eo0[0-3] */
    367                 {
    368                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    369 
    370                     //m_temp_reg_14 = _mm_cvtepi16_epi32(m_temp_reg_71);
    371                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
    372 
    373                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    374                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    375 
    376                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    377                     pi2_scratch += 8;
    378                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    379                     pi2_scratch += 8;
    380 
    381                 }
    382 
    383                 /* eo0[4-7] */
    384                 {
    385                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    386 
    387                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
    388                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
    389 
    390                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    391                     pi2_scratch += 8;
    392                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    393                     pi2_scratch += 8;
    394 
    395                 }
    396                 /* eo1[0-3] */
    397                 {
    398                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
    399 
    400                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
    401                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
    402 
    403                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    404                     pi2_scratch += 8;
    405                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    406                     pi2_scratch += 8;
    407 
    408                 }
    409 
    410                 /* eo1[4-7] */
    411                 {
    412                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
    413 
    414                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
    415                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
    416 
    417                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    418                     pi2_scratch += 8;
    419                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    420                     pi2_scratch += 8;
    421 
    422                 }
    423 
    424                 /* eo2[0-3] */
    425                 {
    426                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    427 
    428                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
    429                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
    430 
    431                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    432                     pi2_scratch += 8;
    433                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    434                     pi2_scratch += 8;
    435 
    436                 }
    437 
    438                 /* eo2[4-7] */
    439                 {
    440                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
    441 
    442                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
    443                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
    444 
    445                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    446                     pi2_scratch += 8;
    447                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    448                     pi2_scratch += 8;
    449 
    450                 }
    451 
    452                 /**************************************************************************/
    453 
    454 
    455                 /* eo3[0-3] */
    456                 {
    457                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
    458 
    459                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
    460                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
    461 
    462                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    463                     pi2_scratch += 8;
    464                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    465                     pi2_scratch += 8;
    466 
    467                 }
    468 
    469                 /* eo3[4-7] */
    470                 {
    471                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
    472 
    473                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
    474                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
    475 
    476                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    477                     pi2_scratch += 8;
    478                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    479                     pi2_scratch += 8;
    480 
    481                 }
    482 
    483 
    484                 /* eo4[0-3] */
    485                 {
    486                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
    487 
    488                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
    489                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
    490 
    491                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    492                     pi2_scratch += 8;
    493                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    494                     pi2_scratch += 8;
    495 
    496                 }
    497                 /* eo4[4-7] */
    498                 {
    499                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
    500 
    501                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
    502                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
    503 
    504                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    505                     pi2_scratch += 8;
    506                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    507                     pi2_scratch += 8;
    508 
    509                 }
    510 
    511                 /***********************************************************************/
    512 
    513                 /* eo5[0-3] */
    514                 {
    515                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
    516 
    517                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
    518                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
    519 
    520                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    521                     pi2_scratch += 8;
    522                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    523                     pi2_scratch += 8;
    524 
    525                 }
    526 
    527 
    528                 /* eo5[4-7] */
    529                 {
    530                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
    531 
    532                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
    533                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
    534 
    535                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    536                     pi2_scratch += 8;
    537                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    538                     pi2_scratch += 8;
    539 
    540                 }
    541 
    542                 /* eo6[0-3] */
    543                 {
    544                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
    545 
    546                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
    547                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
    548 
    549                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    550                     pi2_scratch += 8;
    551                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    552                     pi2_scratch += 8;
    553 
    554                 }
    555 
    556 
    557                 /* eo6[4-7] */
    558                 {
    559                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
    560 
    561                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
    562                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
    563 
    564                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    565                     pi2_scratch += 8;
    566                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    567                     pi2_scratch += 8;
    568 
    569                 }
    570 
    571 
    572                 /* eo7[0-3] */
    573                 {
    574                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
    575 
    576                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    577                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    578 
    579                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    580                     pi2_scratch += 8;
    581                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    582                     pi2_scratch += 8;
    583 
    584                 }
    585 
    586 
    587                 /* eo7[4-7] */
    588                 {
    589                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
    590 
    591                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
    592                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
    593 
    594                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    595                     pi2_scratch += 8;
    596                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    597                     pi2_scratch += 8;
    598 
    599                 }
    600 
    601             }
    602         }
    603         else if(zero_last24_rows_stg1)
    604         {
    605             {
    606                 /* eeo */
    607                 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
    608                 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
    609 
    610                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
    611                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
    612 
    613                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
    614 
    615                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
    616 
    617                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    618 
    619                 /* eeeo[0]= m_temp_reg_20  */
    620                 /* eeeo[1]= m_temp_reg_21  */
    621                 /* eeee[0]= m_temp_reg_22  */
    622                 /* eeee[1]= m_temp_reg_23  */
    623 
    624                 /* eee[0] = eeee[0] + eeeo[0]; */
    625                 m_temp_reg_40 = m_temp_reg_14;
    626 
    627                 /* eee[3] = eeee[0] - eeeo[0]; */
    628                 m_temp_reg_43 = m_temp_reg_14;
    629 
    630                 /* eee[2] = eeee[1] - eeeo[1]; */
    631                 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
    632 
    633                 /* eee[1] = eeee[1] + eeeo[1];*/
    634                 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
    635 
    636                 /* for row 4 to 7 */
    637 
    638                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
    639 
    640                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
    641 
    642                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    643 
    644                 /* eeeo[0]= m_temp_reg_20  */
    645                 /* eeeo[1]= m_temp_reg_21  */
    646                 /* eeee[0]= m_temp_reg_22  */
    647                 /* eeee[1]= m_temp_reg_23  */
    648 
    649                 /* eee[0] = eeee[0] + eeeo[0]; */
    650                 m_temp_reg_44 = m_temp_reg_14;
    651 
    652                 /* eee[3] = eeee[0] - eeeo[0]; */
    653                 m_temp_reg_47 = m_temp_reg_14;
    654 
    655                 /* eee[2] = eeee[1] - eeeo[1]; */
    656                 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
    657 
    658                 /* eee[1] = eeee[1] + eeeo[1];*/
    659                 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
    660 
    661 
    662                 /* eeo[] */
    663                 /* for(k = 0; k < 4; k++) */
    664 
    665                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
    666                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
    667                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
    668                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
    669 
    670                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
    671 
    672                 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
    673 
    674                 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
    675 
    676                 m_temp_reg_33 = _mm_setzero_si128();
    677 
    678                 /* eeo */
    679                 {
    680                     /* eeo0[0-3] */
    681                     {
    682                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    683 
    684                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    685                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    686 
    687                         m_temp_reg_90 = m_temp_reg_34;
    688                         m_temp_reg_97 = m_temp_reg_35;
    689                     }
    690                     /* eeo0[4-7] */
    691                     {
    692                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    693 
    694                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
    695                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
    696 
    697                         m_temp_reg_91 = m_temp_reg_34;
    698                         m_temp_reg_96 = m_temp_reg_35;
    699 
    700                     }
    701 
    702                     /* eeo1[0-3] */
    703                     {
    704                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
    705 
    706                         /* e[1][0-3] stored in pi2_tmp[2][0-7] */
    707                         /* e[6][0-3] stored in pi2_tmp[2][8-15] */
    708                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
    709                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
    710 
    711                         m_temp_reg_92 = m_temp_reg_34;
    712                         m_temp_reg_95 = m_temp_reg_35;
    713 
    714                     }
    715 
    716                     /* eo1[4-7] */
    717                     {
    718                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
    719 
    720                         /* e[1][4-7] stored in pi2_tmp[3][0-7] */
    721                         /* e[6][4-7] stored in pi2_tmp[3][8-15] */
    722                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
    723                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
    724 
    725                         m_temp_reg_93 = m_temp_reg_34;
    726                         m_temp_reg_94 = m_temp_reg_35;
    727 
    728 
    729                     }
    730 
    731                     /* eo2[0-3] */
    732                     {
    733                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
    734 
    735                         /* e[2][0-3] stored in pi2_tmp[4][0-7] */
    736                         /* e[5][0-3] stored in pi2_tmp[4][8-15] */
    737                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
    738                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
    739 
    740                         temp1 = m_temp_reg_34;
    741                         temp7 = m_temp_reg_35;
    742 
    743                     }
    744 
    745                     /* eo2[4-7] */
    746                     {
    747                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
    748 
    749                         /* e[2][4-7] stored in pi2_tmp[5][0-7] */
    750                         /* e[5][4-7] stored in pi2_tmp[5][8-15] */
    751                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
    752                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
    753 
    754                         temp2 = m_temp_reg_34;
    755                         temp6 = m_temp_reg_35;
    756 
    757                     }
    758 
    759                     /* eo3[0-3] */
    760                     {
    761                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    762 
    763                         /* e[3][0-3] held in temp3 */
    764                         /* e[4][0-3] held in temp5 */
    765                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
    766                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
    767 
    768                         temp3 = m_temp_reg_34;
    769                         temp5 = m_temp_reg_35;
    770 
    771                     }
    772 
    773 
    774                     /* eo3[4-7] */
    775                     {
    776                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
    777 
    778                         /* e[3][4-7] held in temp4 */
    779                         /* e[4][4-7] held in temp8 */
    780                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
    781                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
    782 
    783                         temp4 = m_temp_reg_34;
    784                         temp8 = m_temp_reg_35;
    785 
    786 
    787                     }
    788                     /* All values of ee[] array in pi2_temp */
    789 
    790                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
    791                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
    792                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
    793                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
    794 
    795                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    796 
    797                     m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
    798                     m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
    799 
    800                 }
    801             }
    802             /* eo */
    803             {
    804                 WORD16 *pi2_scratch = o_temp_ptr;
    805 
    806                 /* eo0[0-3] */
    807                 {
    808                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    809 
    810                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
    811                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
    812 
    813                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    814                     pi2_scratch += 8;
    815                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    816                     pi2_scratch += 8;
    817 
    818                 }
    819 
    820 
    821                 /* eo0[4-7] */
    822                 {
    823                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    824 
    825                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    826 
    827                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
    828                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
    829 
    830                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    831                     pi2_scratch += 8;
    832                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    833                     pi2_scratch += 8;
    834 
    835                 }
    836 
    837                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
    838 
    839                 /* eo1[0-3] */
    840                 {
    841                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    842 
    843                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
    844                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
    845 
    846                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    847                     pi2_scratch += 8;
    848                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    849                     pi2_scratch += 8;
    850 
    851                 }
    852 
    853 
    854                 /* eo1[4-7] */
    855                 {
    856                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    857 
    858                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
    859                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
    860 
    861                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    862                     pi2_scratch += 8;
    863                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    864                     pi2_scratch += 8;
    865 
    866                 }
    867 
    868                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
    869 
    870                 /* eo2[0-3] */
    871                 {
    872 
    873                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    874 
    875                     m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
    876                     m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
    877 
    878                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    879                     pi2_scratch += 8;
    880                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    881                     pi2_scratch += 8;
    882 
    883                 }
    884 
    885                 /* eo2[4-7] */
    886                 {
    887 
    888                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    889 
    890                     m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
    891                     m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
    892 
    893                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    894                     pi2_scratch += 8;
    895                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    896                     pi2_scratch += 8;
    897 
    898                 }
    899 
    900                 /**************************************************************************/
    901 
    902 
    903 
    904                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
    905 
    906                 /* eo3[0-3] */
    907                 {
    908 
    909                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    910 
    911                     m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
    912                     m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
    913 
    914                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    915                     pi2_scratch += 8;
    916                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    917                     pi2_scratch += 8;
    918 
    919                 }
    920 
    921 
    922                 /* eo3[4-7] */
    923                 {
    924 
    925                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    926 
    927                     m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
    928                     m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
    929 
    930                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    931                     pi2_scratch += 8;
    932                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    933                     pi2_scratch += 8;
    934 
    935                 }
    936 
    937                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
    938 
    939                 /* eo4[0-3] */
    940                 {
    941                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    942 
    943                     m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
    944                     m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
    945 
    946                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    947                     pi2_scratch += 8;
    948                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    949                     pi2_scratch += 8;
    950 
    951                 }
    952                 /* eo4[4-7] */
    953                 {
    954                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    955 
    956                     m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
    957                     m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
    958 
    959                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    960                     pi2_scratch += 8;
    961                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    962                     pi2_scratch += 8;
    963 
    964                 }
    965 
    966                 /***********************************************************************/
    967 
    968                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
    969 
    970                 /* eo5[0-3] */
    971                 {
    972 
    973                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    974 
    975                     m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
    976                     m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
    977 
    978                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    979                     pi2_scratch += 8;
    980                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    981                     pi2_scratch += 8;
    982 
    983                 }
    984 
    985 
    986                 /* eo5[4-7] */
    987                 {
    988                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    989 
    990                     m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
    991                     m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
    992 
    993                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    994                     pi2_scratch += 8;
    995                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    996                     pi2_scratch += 8;
    997 
    998                 }
    999 
   1000                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
   1001 
   1002                 /* eo6[0-3] */
   1003                 {
   1004                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1005 
   1006                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
   1007                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
   1008 
   1009                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1010                     pi2_scratch += 8;
   1011                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1012                     pi2_scratch += 8;
   1013 
   1014                 }
   1015 
   1016 
   1017                 /* eo6[4-7] */
   1018                 {
   1019 
   1020                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1021 
   1022                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
   1023                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
   1024 
   1025                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1026                     pi2_scratch += 8;
   1027                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1028                     pi2_scratch += 8;
   1029 
   1030                 }
   1031 
   1032                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
   1033 
   1034                 /* eo7[0-3] */
   1035                 {
   1036 
   1037                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1038 
   1039                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
   1040                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
   1041 
   1042                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1043                     pi2_scratch += 8;
   1044                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1045                     pi2_scratch += 8;
   1046 
   1047                 }
   1048 
   1049 
   1050                 /* eo7[4-7] */
   1051                 {
   1052                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1053 
   1054                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
   1055                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
   1056 
   1057                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1058                     pi2_scratch += 8;
   1059                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1060                     pi2_scratch += 8;
   1061 
   1062                 }
   1063 
   1064             }
   1065 
   1066         }
   1067         else
   1068         {
   1069 
   1070             {
   1071                 /* eeo */
   1072                 /* eeeo[0] stored in m_temp_reg_20, eeee[0] in m_temp_reg_21 */
   1073                 /* eeeo[1] stored in m_temp_reg_22, eeee[1] in m_temp_reg_23 */
   1074 
   1075                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
   1076                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
   1077 
   1078                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
   1079                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
   1080 
   1081                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
   1082 
   1083                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
   1084 
   1085                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
   1086                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
   1087 
   1088                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
   1089                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
   1090 
   1091 
   1092                 /* eeeo[0]= m_temp_reg_20  */
   1093                 /* eeeo[1]= m_temp_reg_22  */
   1094                 /* eeee[0]= m_temp_reg_21  */
   1095                 /* eeee[1]= m_temp_reg_23  */
   1096 
   1097                 /* eee[0] = eeee[0] + eeeo[0]; */
   1098                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
   1099 
   1100                 /* eee[3] = eeee[0] - eeeo[0]; */
   1101                 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
   1102 
   1103                 /* eee[2] = eeee[1] - eeeo[1]; */
   1104                 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
   1105 
   1106                 /* eee[1] = eeee[1] + eeeo[1];*/
   1107                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
   1108 
   1109                 /* for row 4 to 7 */
   1110 
   1111                 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
   1112                 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
   1113 
   1114                 /* Interleaving row 8 and row 24*/
   1115                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
   1116 
   1117                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
   1118                 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
   1119 
   1120                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
   1121 
   1122                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
   1123                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
   1124 
   1125                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
   1126                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
   1127 
   1128 
   1129                 /* eeeo[0]= m_temp_reg_20  */
   1130                 /* eeeo[1]= m_temp_reg_22  */
   1131                 /* eeee[0]= m_temp_reg_21  */
   1132                 /* eeee[1]= m_temp_reg_23  */
   1133 
   1134                 /* eee[0] = eeee[0] + eeeo[0]; */
   1135                 m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
   1136 
   1137                 /* eee[3] = eeee[0] - eeeo[0]; */
   1138                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
   1139 
   1140                 /* eee[2] = eeee[1] - eeeo[1]; */
   1141                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
   1142 
   1143                 /* eee[1] = eeee[1] + eeeo[1];*/
   1144                 m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
   1145 
   1146 
   1147                 // eeo[]
   1148                 /* for(k = 0; k < 4; k++) */
   1149 
   1150 
   1151                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
   1152                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
   1153 
   1154                 /* eeo */
   1155                 {
   1156                     /* eeo0[0-3] */
   1157                     {
   1158                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1159                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
   1160 
   1161                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1162                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1163 
   1164                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1165 
   1166                         m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1167                         m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1168 
   1169                     }
   1170 
   1171                     m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
   1172                     m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
   1173                     m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
   1174                     m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
   1175 
   1176                     /* eeo0[4-7] */
   1177                     {
   1178                         m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1179                         m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
   1180 
   1181                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1182                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1183 
   1184                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1185 
   1186                         m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
   1187                         m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
   1188 
   1189                     }
   1190 
   1191 
   1192                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
   1193                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
   1194 
   1195                     /* eeo1[0-3] */
   1196                     {
   1197                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   1198                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   1199 
   1200                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
   1201                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
   1202 
   1203                         m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
   1204                         m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
   1205 
   1206                     }
   1207 
   1208                     /* eeo1[4-7] */
   1209                     {
   1210 
   1211                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   1212                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
   1213 
   1214                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
   1215                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
   1216 
   1217                         m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
   1218                         m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
   1219 
   1220 
   1221                     }
   1222 
   1223                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
   1224                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
   1225 
   1226                     /* eeo2[0-3] */
   1227                     {
   1228 
   1229                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   1230                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   1231 
   1232                         /* e[2][0-3] held in temp1 */
   1233                         /* e[5][0-3] held in temp7 */
   1234 
   1235                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
   1236                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
   1237 
   1238                         temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
   1239                         temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
   1240 
   1241                     }
   1242 
   1243                     /* eeo2[4-7] */
   1244                     {
   1245 
   1246                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   1247                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
   1248 
   1249                         /* e[2][4-7] held in temp2 */
   1250                         /* e[5][4-7] held in temp6 */
   1251 
   1252                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
   1253                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
   1254 
   1255                         temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
   1256                         temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
   1257 
   1258                     }
   1259 
   1260                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
   1261                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
   1262 
   1263                     /* eeo3[0-3] */
   1264                     {
   1265 
   1266                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   1267                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   1268 
   1269                         /* e[3][0-3] held in temp3 */
   1270                         /* e[4][0-3] held in temp5 */
   1271 
   1272                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
   1273                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
   1274 
   1275                         temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
   1276                         temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
   1277 
   1278 
   1279                     }
   1280 
   1281                     /* eeo3[4-7] */
   1282                     {
   1283 
   1284                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   1285                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
   1286 
   1287                         /* e[3][4-7] held in temp4 */
   1288                         /* e[4][4-7] held in temp8 */
   1289 
   1290                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
   1291                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
   1292                         temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
   1293                         temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
   1294 
   1295                     }
   1296 
   1297 
   1298                     /* All values of ee[] array in pi2_temp */
   1299 
   1300                     /* for(k = 0; k < 8; k++) */
   1301                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
   1302                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
   1303                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
   1304                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
   1305                 }
   1306             }
   1307             /* eo */
   1308             {
   1309                 WORD16 *pi2_scratch = o_temp_ptr;
   1310 
   1311                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1312                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   1313                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
   1314                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
   1315 
   1316                 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
   1317                 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
   1318                 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
   1319                 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
   1320 
   1321                 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
   1322                 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
   1323                 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
   1324                 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
   1325 
   1326                 /* eo0[0-3] */
   1327                 {
   1328                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1329                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1330 
   1331                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1332 
   1333                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1334                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1335 
   1336                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1337 
   1338                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1339 
   1340                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
   1341                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
   1342 
   1343                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1344                     pi2_scratch += 8;
   1345                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1346                     pi2_scratch += 8;
   1347 
   1348                 }
   1349                 /* eo0[4-7] */
   1350                 {
   1351                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1352                     m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   1353                     m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
   1354                     m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
   1355 
   1356                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1357                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1358 
   1359                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1360 
   1361                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1362                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1363 
   1364                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1365 
   1366                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1367 
   1368                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
   1369                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
   1370 
   1371                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1372                     pi2_scratch += 8;
   1373                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1374                     pi2_scratch += 8;
   1375 
   1376                 }
   1377 
   1378                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
   1379                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
   1380                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
   1381                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
   1382 
   1383                 /* eo1[0-3] */
   1384                 {
   1385 
   1386                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1387                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1388 
   1389                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1390 
   1391                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1392                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1393 
   1394                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1395 
   1396                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
   1397 
   1398                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
   1399                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
   1400 
   1401                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1402                     pi2_scratch += 8;
   1403                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1404                     pi2_scratch += 8;
   1405 
   1406                 }
   1407 
   1408                 /* eo1[4-7] */
   1409                 {
   1410                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1411                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1412 
   1413                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1414 
   1415                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1416                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1417 
   1418                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1419 
   1420                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
   1421 
   1422                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
   1423                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
   1424 
   1425                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1426                     pi2_scratch += 8;
   1427                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1428                     pi2_scratch += 8;
   1429 
   1430                 }
   1431 
   1432                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
   1433                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
   1434                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
   1435                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
   1436 
   1437                 /* eo2[0-3] */
   1438                 {
   1439                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1440                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1441 
   1442                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
   1443 
   1444                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1445                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1446 
   1447                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1448 
   1449                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1450 
   1451                     m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
   1452                     m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
   1453 
   1454                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1455                     pi2_scratch += 8;
   1456                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1457                     pi2_scratch += 8;
   1458 
   1459                 }
   1460 
   1461 
   1462                 /* eo2[4-7] */
   1463                 {
   1464 
   1465                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1466                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1467 
   1468                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
   1469 
   1470                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1471                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1472 
   1473                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1474 
   1475                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1476 
   1477                     m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
   1478                     m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
   1479 
   1480                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1481                     pi2_scratch += 8;
   1482                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1483                     pi2_scratch += 8;
   1484 
   1485                 }
   1486                 /**************************************************************************/
   1487 
   1488                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
   1489                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
   1490                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
   1491                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
   1492 
   1493                 /* eo3[0-3] */
   1494                 {
   1495                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1496                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1497 
   1498                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1499 
   1500                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1501                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1502 
   1503                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
   1504 
   1505                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1506 
   1507                     m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
   1508                     m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
   1509 
   1510                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1511                     pi2_scratch += 8;
   1512                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1513                     pi2_scratch += 8;
   1514 
   1515                 }
   1516 
   1517 
   1518                 /* eo3[4-7] */
   1519                 {
   1520                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1521                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1522 
   1523                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1524 
   1525                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1526                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1527 
   1528                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
   1529 
   1530                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1531 
   1532                     m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
   1533                     m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
   1534 
   1535                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1536                     pi2_scratch += 8;
   1537                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1538                     pi2_scratch += 8;
   1539 
   1540                 }
   1541 
   1542                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
   1543                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
   1544                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
   1545                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
   1546 
   1547                 /* eo4[0-3] */
   1548                 {
   1549 
   1550                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1551                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1552 
   1553                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1554 
   1555                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1556                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1557 
   1558                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
   1559 
   1560                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1561 
   1562                     m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
   1563                     m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
   1564 
   1565                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1566                     pi2_scratch += 8;
   1567                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1568                     pi2_scratch += 8;
   1569 
   1570                 }
   1571 
   1572 
   1573                 /* eo4[4-7] */
   1574                 {
   1575                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1576                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1577 
   1578                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1579 
   1580                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1581                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1582 
   1583                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
   1584 
   1585                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1586 
   1587                     m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
   1588                     m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
   1589 
   1590                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1591                     pi2_scratch += 8;
   1592                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1593                     pi2_scratch += 8;
   1594 
   1595                 }
   1596 
   1597                 /***********************************************************************/
   1598 
   1599                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
   1600                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
   1601                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
   1602                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
   1603 
   1604                 /* eo5[0-3] */
   1605                 {
   1606                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1607                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1608 
   1609                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1610 
   1611                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1612                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1613 
   1614                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1615 
   1616                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1617 
   1618                     m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
   1619                     m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
   1620 
   1621                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1622                     pi2_scratch += 8;
   1623                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1624                     pi2_scratch += 8;
   1625 
   1626                 }
   1627 
   1628 
   1629                 /* eo5[4-7] */
   1630                 {
   1631                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1632                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1633 
   1634                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1635 
   1636                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1637                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1638 
   1639                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1640 
   1641                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1642 
   1643                     m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
   1644                     m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
   1645 
   1646                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1647                     pi2_scratch += 8;
   1648                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1649                     pi2_scratch += 8;
   1650 
   1651                 }
   1652 
   1653                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
   1654                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
   1655                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
   1656                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
   1657 
   1658                 /* eo6[0-3] */
   1659                 {
   1660 
   1661                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1662                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1663 
   1664                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1665 
   1666                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1667                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1668 
   1669                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1670 
   1671                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1672 
   1673                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
   1674                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
   1675 
   1676                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1677                     pi2_scratch += 8;
   1678                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1679                     pi2_scratch += 8;
   1680 
   1681                 }
   1682 
   1683 
   1684                 /* eo6[4-7] */
   1685                 {
   1686                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1687                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1688 
   1689                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1690 
   1691                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1692                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1693 
   1694                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1695 
   1696                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1697 
   1698                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
   1699                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
   1700 
   1701                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1702                     pi2_scratch += 8;
   1703                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1704                     pi2_scratch += 8;
   1705 
   1706                 }
   1707 
   1708                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
   1709                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
   1710                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
   1711                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
   1712 
   1713                 /* eo7[0-3] */
   1714                 {
   1715 
   1716                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1717                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1718 
   1719                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1720 
   1721                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1722                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1723 
   1724                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1725 
   1726                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1727 
   1728                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
   1729                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
   1730 
   1731                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1732                     pi2_scratch += 8;
   1733                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1734                     pi2_scratch += 8;
   1735 
   1736                 }
   1737 
   1738 
   1739                 /* eo7[4-7] */
   1740                 {
   1741 
   1742                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1743                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1744 
   1745                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1746 
   1747                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1748                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1749 
   1750                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1751 
   1752                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1753 
   1754                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
   1755                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
   1756 
   1757                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1758                     pi2_scratch += 8;
   1759                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1760                     pi2_scratch += 8;
   1761 
   1762                 }
   1763 
   1764             }
   1765 
   1766         }
   1767         /*  All e[] are done */
   1768         /****************************/
   1769 
   1770 
   1771         {
   1772 
   1773             WORD16 *pi2_tmp_src = pi2_src + src_strd;
   1774 
   1775             m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1776             pi2_tmp_src += (src_strd << 1);
   1777             m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1778             pi2_tmp_src += (src_strd << 1);
   1779             m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1780             pi2_tmp_src += (src_strd << 1);
   1781             m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1782             pi2_tmp_src += (src_strd << 1);
   1783             m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1784             pi2_tmp_src += (src_strd << 1);
   1785             m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1786             pi2_tmp_src += (src_strd << 1);
   1787             m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1788             pi2_tmp_src += (src_strd << 1);
   1789             m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1790             pi2_tmp_src += (src_strd << 1);
   1791 
   1792             m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1793             pi2_tmp_src += (src_strd << 1);
   1794             m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1795             pi2_tmp_src += (src_strd << 1);
   1796             m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1797             pi2_tmp_src += (src_strd << 1);
   1798             m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1799             pi2_tmp_src += (src_strd << 1);
   1800             m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1801             pi2_tmp_src += (src_strd << 1);
   1802             m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1803             pi2_tmp_src += (src_strd << 1);
   1804             m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1805             pi2_tmp_src += (src_strd << 1);
   1806             m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
   1807         }
   1808 
   1809         if(zero_last28_rows_stg1)
   1810         {
   1811             /* o & stage 1 out */
   1812             {
   1813                 WORD32 j;
   1814                 WORD16 *pi2_src_scratch = o_temp_ptr;
   1815                 WORD16 *pi2_dst_scratch = temp_ptr;
   1816                 WORD32 out_stride = (trans_size << 1);
   1817                 WORD32 in_stride = trans_size;
   1818 
   1819                 for(j = 0; j < 2; j++)
   1820                 {
   1821                     if(j)
   1822                     {
   1823                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
   1824                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
   1825                     }
   1826 
   1827                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
   1828 
   1829                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   1830 
   1831                     /* o0[0-3] */
   1832                     {
   1833                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1834 
   1835                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1836                         pi2_src_scratch += in_stride;
   1837 
   1838                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1839                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1840 
   1841                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1842                         m_count = _mm_cvtsi32_si128(i4_shift);
   1843                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1844                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1845 
   1846                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1847                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1848                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1849                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1850 
   1851                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1852 
   1853                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1854                         pi2_dst_scratch += out_stride;
   1855 
   1856                     }
   1857 
   1858                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   1859 
   1860                     /* o1[0-3] */
   1861                     {
   1862 
   1863                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1864 
   1865                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1866                         pi2_src_scratch += in_stride;
   1867 
   1868                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1869                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1870 
   1871                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1872                         m_count = _mm_cvtsi32_si128(i4_shift);
   1873                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1874                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1875 
   1876                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1877                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1878                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1879                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1880 
   1881                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1882 
   1883                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1884                         pi2_dst_scratch += out_stride;
   1885 
   1886                     }
   1887 
   1888                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   1889 
   1890                     /* o2[0-3] */
   1891                     {
   1892 
   1893                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1894 
   1895                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1896                         pi2_src_scratch += in_stride;
   1897 
   1898                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1899                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1900 
   1901                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1902                         m_count = _mm_cvtsi32_si128(i4_shift);
   1903                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1904                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1905 
   1906                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1907                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1908                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1909                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1910 
   1911                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1912 
   1913                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1914                         pi2_dst_scratch += out_stride;
   1915 
   1916                     }
   1917 
   1918                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   1919 
   1920                     /* o3[0-3] */
   1921                     {
   1922                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1923 
   1924                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1925                         pi2_src_scratch += in_stride;
   1926 
   1927                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1928                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1929 
   1930                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1931                         m_count = _mm_cvtsi32_si128(i4_shift);
   1932                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1933                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1934 
   1935                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1936                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1937                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1938                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1939 
   1940                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1941 
   1942                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1943                         pi2_dst_scratch += out_stride;
   1944 
   1945                     }
   1946 
   1947                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   1948 
   1949                     /* o4[0-3] */
   1950                     {
   1951                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1952 
   1953                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1954                         pi2_src_scratch += in_stride;
   1955 
   1956                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1957                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1958 
   1959                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1960                         m_count = _mm_cvtsi32_si128(i4_shift);
   1961                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1962                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1963 
   1964                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1965                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1966                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1967                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1968 
   1969                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1970 
   1971                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1972                         pi2_dst_scratch += out_stride;
   1973 
   1974                     }
   1975 
   1976                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); /* row 40 of the packed odd-coefficient table */
   1977 
   1978                     /* o5[0-3]: m_temp_reg_20 = madd(m_temp_reg_10, m_coeff1) is added to and
                             * subtracted from the four 32-bit values loaded from pi2_src_scratch.
                             * Both results get the rounding constant added, are shifted right
                             * arithmetically by i4_shift, packed to saturated 16-bit values and
                             * written with one aligned 128-bit store to pi2_dst_scratch. */
   1979                     {
   1980 
   1981                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   1982 
   1983                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   1984                         pi2_src_scratch += in_stride; /* step source forward by in_stride */
   1985 
   1986                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   1987                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   1988 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   1989                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1990                         m_count = _mm_cvtsi32_si128(i4_shift);
   1991                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1992                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1993 
   1994                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1995                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1996                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   1997                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1998 
   1999                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2000 
   2001                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2002                         pi2_dst_scratch += out_stride; /* step destination forward by out_stride */
   2003 
   2004                     }
   2005 
   2006                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); /* row 48 of the packed odd-coefficient table */
   2007 
   2008                     /* o6[0-3]: m_temp_reg_20 = madd(m_temp_reg_10, m_coeff1) is added to and
                             * subtracted from the four 32-bit values loaded from pi2_src_scratch.
                             * Both results are rounded, shifted right arithmetically by i4_shift,
                             * packed to saturated 16-bit values and stored to pi2_dst_scratch. */
   2009                     {
   2010                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2011 
   2012                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2013                         pi2_src_scratch += in_stride; /* step source forward by in_stride */
   2014 
   2015                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2016                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2017 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2018                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2019                         m_count = _mm_cvtsi32_si128(i4_shift);
   2020                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2021                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2022 
   2023                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2024                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2025                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2026                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2027 
   2028                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2029 
   2030                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2031                         pi2_dst_scratch += out_stride; /* step destination forward by out_stride */
   2032 
   2033                     }
   2034 
   2035                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); /* row 56 of the packed odd-coefficient table */
   2036 
   2037                     /* o7[0-3]: same sum/difference, round, shift, pack and store sequence as
                             * the neighbouring steps, but both scratch pointers advance by 8
                             * elements here instead of by a full stride; the steps that follow
                             * (o8 onwards) then move them backwards with -= stride.
                             * NOTE(review): presumably the +8 selects the adjacent group of
                             * values in the same scratch row before the backward walk - confirm
                             * against the scratch buffer layout. */
   2038                     {
   2039 
   2040                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2041 
   2042                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2043                         pi2_src_scratch += 8; /* +8 elements, not += in_stride */
   2044 
   2045                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2046                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2047 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2048                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2049                         m_count = _mm_cvtsi32_si128(i4_shift);
   2050                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2051                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2052 
   2053                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2054                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2055                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2056                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2057 
   2058                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2059 
   2060                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2061                         pi2_dst_scratch += 8; /* +8 elements, not += out_stride */
   2062 
   2063                     }
   2064 
   2065                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); /* row 64 of the packed odd-coefficient table */
   2066 
   2067                     /* o8[0-3]: m_temp_reg_20 = madd(m_temp_reg_10, m_coeff1) is added to and
                             * subtracted from the four 32-bit values loaded from pi2_src_scratch;
                             * results are rounded, shifted by i4_shift, packed to saturated
                             * 16-bit values and stored. Both scratch pointers move backwards
                             * by one stride after use in this step. */
   2068                     {
   2069                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2070 
   2071                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2072                         pi2_src_scratch -= in_stride; /* step source back by in_stride */
   2073 
   2074                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2075                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2076 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2077                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2078                         m_count = _mm_cvtsi32_si128(i4_shift);
   2079                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2080                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2081 
   2082                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2083                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2084                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2085                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2086 
   2087                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2088 
   2089                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2090                         pi2_dst_scratch -= out_stride; /* step destination back by out_stride */
   2091                     }
   2092 
   2093                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); /* row 72 of the packed odd-coefficient table */
   2094 
   2095                     /* o9[0-3]: m_temp_reg_20 = madd(m_temp_reg_10, m_coeff1) is added to and
                             * subtracted from the four 32-bit values loaded from pi2_src_scratch;
                             * results are rounded, shifted by i4_shift, packed to saturated
                             * 16-bit values and stored. Both scratch pointers move backwards
                             * by one stride after use in this step. */
   2096                     {
   2097                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2098 
   2099                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2100                         pi2_src_scratch -= in_stride; /* step source back by in_stride */
   2101 
   2102                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2103                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2104 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2105                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2106                         m_count = _mm_cvtsi32_si128(i4_shift);
   2107                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2108                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2109 
   2110                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2111                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2112                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2113                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2114 
   2115                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2116 
   2117                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2118                         pi2_dst_scratch -= out_stride; /* step destination back by out_stride */
   2119                     }
   2120 
   2121                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); /* row 80 of the packed odd-coefficient table */
   2122 
   2123                     /* o10[0-3]: m_temp_reg_20 = madd(m_temp_reg_10, m_coeff1) is added to and
                             * subtracted from the four 32-bit values loaded from pi2_src_scratch;
                             * results are rounded, shifted by i4_shift, packed to saturated
                             * 16-bit values and stored. Both scratch pointers move backwards
                             * by one stride after use in this step. */
   2124                     {
   2125                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2126 
   2127                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2128                         pi2_src_scratch -= in_stride; /* step source back by in_stride */
   2129 
   2130                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2131                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2132 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2133                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2134                         m_count = _mm_cvtsi32_si128(i4_shift);
   2135                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2136                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2137 
   2138                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2139                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2140                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2141                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2142 
   2143                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2144 
   2145                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2146                         pi2_dst_scratch -= out_stride; /* step destination back by out_stride */
   2147                     }
   2148 
   2149                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); /* row 88 of the packed odd-coefficient table */
   2150 
   2151                     /* o11[0-3]: m_temp_reg_20 = madd(m_temp_reg_10, m_coeff1) is added to and
                             * subtracted from the four 32-bit values loaded from pi2_src_scratch;
                             * results are rounded, shifted by i4_shift, packed to saturated
                             * 16-bit values and stored. Both scratch pointers move backwards
                             * by one stride after use in this step. */
   2152                     {
   2153                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2154 
   2155                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2156                         pi2_src_scratch -= in_stride; /* step source back by in_stride */
   2157 
   2158                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2159                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2160 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2161                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2162                         m_count = _mm_cvtsi32_si128(i4_shift);
   2163                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2164                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2165 
   2166                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2167                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2168                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2169                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2170 
   2171                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2172 
   2173                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2174                         pi2_dst_scratch -= out_stride; /* step destination back by out_stride */
   2175 
   2176                     }
   2177 
   2178                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); /* row 96 of the packed odd-coefficient table */
   2179 
   2180                     /* o12[0-3]: m_temp_reg_20 = madd(m_temp_reg_10, m_coeff1) is added to and
                             * subtracted from the four 32-bit values loaded from pi2_src_scratch;
                             * results are rounded, shifted by i4_shift, packed to saturated
                             * 16-bit values and stored. Both scratch pointers move backwards
                             * by one stride after use in this step. */
   2181                     {
   2182                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2183 
   2184                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2185                         pi2_src_scratch -= in_stride; /* step source back by in_stride */
   2186 
   2187                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2188                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2189 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2190                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2191                         m_count = _mm_cvtsi32_si128(i4_shift);
   2192                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2193                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2194 
   2195                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2196                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2197                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2198                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2199 
   2200                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2201 
   2202                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2203                         pi2_dst_scratch -= out_stride; /* step destination back by out_stride */
   2204 
   2205                     }
   2206 
   2207                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); /* row 104 of the packed odd-coefficient table */
   2208 
   2209                     /* o13[0-3]: m_temp_reg_20 = madd(m_temp_reg_10, m_coeff1) is added to and
                             * subtracted from the four 32-bit values loaded from pi2_src_scratch;
                             * results are rounded, shifted by i4_shift, packed to saturated
                             * 16-bit values and stored. Both scratch pointers move backwards
                             * by one stride after use in this step. */
   2210                     {
   2211                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2212 
   2213                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2214                         pi2_src_scratch -= in_stride; /* step source back by in_stride */
   2215 
   2216                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2217                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2218 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2219                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2220                         m_count = _mm_cvtsi32_si128(i4_shift);
   2221                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2222                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2223 
   2224                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2225                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2226                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2227                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2228 
   2229                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2230 
   2231                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2232                         pi2_dst_scratch -= out_stride; /* step destination back by out_stride */
   2233                     }
   2234 
   2235                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); /* row 112 of the packed odd-coefficient table */
   2236 
   2237                     /* o14[0-3]: m_temp_reg_20 = madd(m_temp_reg_10, m_coeff1) is added to and
                             * subtracted from the four 32-bit values loaded from pi2_src_scratch;
                             * results are rounded, shifted by i4_shift, packed to saturated
                             * 16-bit values and stored. Both scratch pointers move backwards
                             * by one stride after use in this step. */
   2238                     {
   2239                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2240 
   2241                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2242                         pi2_src_scratch -= in_stride; /* step source back by in_stride */
   2243 
   2244                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2245                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2246 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2247                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2248                         m_count = _mm_cvtsi32_si128(i4_shift);
   2249                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2250                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2251 
   2252                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2253                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2254                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2255                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2256 
   2257                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2258 
   2259                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2260                         pi2_dst_scratch -= out_stride; /* step destination back by out_stride */
   2261 
   2262                     }
   2263 
   2264                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); /* row 120 of the packed odd-coefficient table */
   2265 
   2266                     /* o15[0-3]: last step before the enclosing block closes. Same
                             * sum/difference, round, shift, pack and store sequence as the
                             * preceding steps, but both scratch pointers advance by 8 elements
                             * instead of stepping back by a stride.
                             * NOTE(review): presumably this positions the pointers for the next
                             * pass of the enclosing loop - confirm against the loop header,
                             * which is not visible from this step. */
   2267                     {
   2268                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2269 
   2270                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2271                         pi2_src_scratch += 8; /* +8 elements, not -= in_stride */
   2272 
   2273                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2274                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2275 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2276                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2277                         m_count = _mm_cvtsi32_si128(i4_shift);
   2278                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2279                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2280 
   2281                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2282                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2283                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2284                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2285 
   2286                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2287 
   2288                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2289                         pi2_dst_scratch += 8; /* +8 elements, not -= out_stride */
   2290                     }
   2291 
   2292                 }
   2293             }
   2294         }
   2295         else if(zero_last24_rows_stg1)
   2296         {
   2297             /* o & stage 1 out */
   2298             {
   2299                 WORD32 j;
   2300                 WORD16 *pi2_src_scratch = o_temp_ptr;
   2301                 WORD16 *pi2_dst_scratch = temp_ptr;
   2302                 WORD32 out_stride = (trans_size << 1);
   2303                 WORD32 in_stride = trans_size;
   2304 
   2305                 for(j = 0; j < 2; j++)
   2306                 {
   2307                     if(j) /* every pass after the first (j != 0) */
   2308                     {
                                /* Shift each input register right by 8 bytes so that its upper
                                 * four 16-bit values land in the low half, which is the only part
                                 * the unpacklo calls below read. The registers are overwritten in
                                 * place. */
   2309                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
   2310                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
   2311                         m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
   2312                         m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
   2313                     }
   2314 
   2315                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); // row 1 and row 3 interleaved (low four 16-bit values of each register)
   2316                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); // row 5 and row 7 interleaved (low four 16-bit values of each register)
   2317 
   2318                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); /* table row 0, applied to m_temp_reg_10 */
   2319                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); /* table row 1, applied to m_temp_reg_11 */
   2320 
   2321                     /* o0[0-3]: the two madd results are summed into one term, which is added
                             * to and subtracted from the four 32-bit values loaded from
                             * pi2_src_scratch. Both results are rounded, shifted right by
                             * i4_shift, packed to saturated 16-bit values and stored.
                             * NOTE(review): here m_temp_reg_30 holds the sum and m_temp_reg_31
                             * the difference, the reverse of steps o2..o6 below - presumably
                             * tied to the sign convention of the packed table; confirm. */
   2322                     {
   2323 
   2324                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2325                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2326 
   2327                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); /* combined term */
   2328 
   2329                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2330                         pi2_src_scratch += in_stride; /* step source forward by in_stride */
   2331 
   2332                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2333                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2334 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2335                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2336                         m_count = _mm_cvtsi32_si128(i4_shift);
   2337                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2338                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2339 
   2340                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2341                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2342                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2343                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2344 
   2345                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = sum, high 4 words = difference, signed saturation */
   2346 
   2347                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2348                         pi2_dst_scratch += out_stride; /* step destination forward by out_stride */
   2349 
   2350                     }
   2351 
   2352                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); /* table row 8, applied to m_temp_reg_10 */
   2353                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); /* table row 9, applied to m_temp_reg_11 */
   2354 
   2355                     /* o1[0-3]: the two madd results are summed into one term, which is added
                             * to and subtracted from the four 32-bit values loaded from
                             * pi2_src_scratch. Both results are rounded, shifted right by
                             * i4_shift, packed to saturated 16-bit values and stored.
                             * NOTE(review): here m_temp_reg_30 holds the sum and m_temp_reg_31
                             * the difference, the reverse of steps o2..o6 below - presumably
                             * tied to the sign convention of the packed table; confirm. */
   2356                     {
   2357                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2358                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2359 
   2360                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); /* combined term */
   2361 
   2362                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2363                         pi2_src_scratch += in_stride; /* step source forward by in_stride */
   2364 
   2365                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2366                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2367 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2368                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2369                         m_count = _mm_cvtsi32_si128(i4_shift);
   2370                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2371                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2372 
   2373                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2374                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2375                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2376                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2377 
   2378                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = sum, high 4 words = difference, signed saturation */
   2379 
   2380                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2381                         pi2_dst_scratch += out_stride; /* step destination forward by out_stride */
   2382 
   2383                     }
   2384 
   2385                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); /* table row 16, applied to m_temp_reg_10 */
   2386                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); /* table row 17, applied to m_temp_reg_11 */
   2387 
   2388                     /* o2[0-3]: the term is formed as (m_temp_reg_11 madd) minus
                             * (m_temp_reg_10 madd), then added to and subtracted from the four
                             * 32-bit values loaded from pi2_src_scratch. Both results are
                             * rounded, shifted right by i4_shift, packed to saturated 16-bit
                             * values and stored.
                             * NOTE(review): the subtraction (instead of the addition used in
                             * o0/o1) presumably accounts for how signs are stored in the packed
                             * table - confirm against the table definition. */
   2389                     {
   2390                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2391                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2392 
   2393                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* term = reg_21 - reg_20 */
   2394 
   2395                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2396                         pi2_src_scratch += in_stride; /* step source forward by in_stride */
   2397 
   2398                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2399                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2400 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2401                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2402                         m_count = _mm_cvtsi32_si128(i4_shift);
   2403                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2404                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2405 
   2406                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2407                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2408                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2409                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2410 
   2411                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2412 
   2413                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2414                         pi2_dst_scratch += out_stride; /* step destination forward by out_stride */
   2415 
   2416                     }
   2417 
   2418                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); /* table row 24, applied to m_temp_reg_10 */
   2419                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); /* table row 25, applied to m_temp_reg_11 */
   2420 
   2421                     /* o3[0-3]: the term is formed as (m_temp_reg_11 madd) minus
                             * (m_temp_reg_10 madd), then added to and subtracted from the four
                             * 32-bit values loaded from pi2_src_scratch. Both results are
                             * rounded, shifted right by i4_shift, packed to saturated 16-bit
                             * values and stored.
                             * NOTE(review): the subtraction (instead of the addition used in
                             * o0/o1) presumably accounts for how signs are stored in the packed
                             * table - confirm against the table definition. */
   2422                     {
   2423                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2424                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2425 
   2426                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* term = reg_21 - reg_20 */
   2427 
   2428                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2429                         pi2_src_scratch += in_stride; /* step source forward by in_stride */
   2430 
   2431                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2432                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2433 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2434                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2435                         m_count = _mm_cvtsi32_si128(i4_shift);
   2436                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2437                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2438 
   2439                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2440                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2441                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2442                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2443 
   2444                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2445 
   2446                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2447                         pi2_dst_scratch += out_stride; /* step destination forward by out_stride */
   2448 
   2449                     }
   2450 
   2451                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); /* table row 32, applied to m_temp_reg_10 */
   2452                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); /* table row 33, applied to m_temp_reg_11 */
   2453 
   2454                     /* o4[0-3]: the two madd results are summed into one term, which is added
                             * to and subtracted from the four 32-bit values loaded from
                             * pi2_src_scratch. Both results are rounded, shifted right by
                             * i4_shift, packed to saturated 16-bit values and stored. */
   2455                     {
   2456                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2457                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2458 
   2459                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); /* combined term */
   2460 
   2461                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2462                         pi2_src_scratch += in_stride; /* step source forward by in_stride */
   2463 
   2464                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2465                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2466 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2467                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2468                         m_count = _mm_cvtsi32_si128(i4_shift);
   2469                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2470                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2471 
   2472                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2473                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2474                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2475                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2476 
   2477                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2478 
   2479                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2480                         pi2_dst_scratch += out_stride; /* step destination forward by out_stride */
   2481 
   2482                     }
   2483 
   2484                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); /* table row 40, applied to m_temp_reg_10 */
   2485                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); /* table row 41, applied to m_temp_reg_11 */
   2486 
   2487                     /* o5[0-3]: the two madd results are summed into one term, which is added
                             * to and subtracted from the four 32-bit values loaded from
                             * pi2_src_scratch. Both results are rounded, shifted right by
                             * i4_shift, packed to saturated 16-bit values and stored. */
   2488                     {
   2489                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2490                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2491 
   2492                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); /* combined term */
   2493 
   2494                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2495                         pi2_src_scratch += in_stride; /* step source forward by in_stride */
   2496 
   2497                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2498                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2499 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2500                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2501                         m_count = _mm_cvtsi32_si128(i4_shift);
   2502                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2503                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2504 
   2505                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2506                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2507                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2508                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2509 
   2510                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2511 
   2512                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2513                         pi2_dst_scratch += out_stride; /* step destination forward by out_stride */
   2514 
   2515                     }
   2516 
   2517                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); /* table row 48, applied to m_temp_reg_10 */
   2518                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); /* table row 49, applied to m_temp_reg_11 */
   2519 
   2520                     /* o6[0-3]: the two madd results are summed into one term, which is added
                             * to and subtracted from the four 32-bit values loaded from
                             * pi2_src_scratch. Both results are rounded, shifted right by
                             * i4_shift, packed to saturated 16-bit values and stored. */
   2521                     {
   2522                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit products, adjacent pairs summed into 32-bit lanes */
   2523                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2524 
   2525                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); /* combined term */
   2526 
   2527                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); /* aligned load; used as four 32-bit lanes below */
   2528                         pi2_src_scratch += in_stride; /* step source forward by in_stride */
   2529 
   2530                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* sum */
   2531                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* difference */
   2532 
                                /* Rounding constant 1 << (i4_shift - 1) replicated into all four
                                 * 32-bit lanes; m_count carries the shift amount. Both depend only
                                 * on i4_shift. */
   2533                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2534                         m_count = _mm_cvtsi32_si128(i4_shift);
   2535                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2536                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2537 
   2538                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2539                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2540                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic shift keeps the sign */
   2541                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2542 
   2543                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* low 4 words = difference, high 4 words = sum, signed saturation */
   2544 
   2545                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2546                         pi2_dst_scratch += out_stride; /* step destination forward by out_stride */
   2547 
   2548                     }
   2549 
   2550                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   2551                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
   2552 
   2553                     /* o7[0-3] */
   2554                     {
   2555                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2556                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2557 
   2558                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2559 
   2560                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2561                         pi2_src_scratch += 8;
   2562 
   2563                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2564                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2565 
   2566                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2567                         m_count = _mm_cvtsi32_si128(i4_shift);
   2568                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2569                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2570 
   2571                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2572                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2573                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2574                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2575 
   2576                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2577 
   2578                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2579                         pi2_dst_scratch += 8;
   2580 
   2581                     }
   2582 
   2583                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   2584                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
   2585 
   2586                     /* o8[0-3] */
   2587                     {
   2588                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2589                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2590 
   2591                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2592 
   2593                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2594                         pi2_src_scratch -= in_stride;
   2595 
   2596                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2597                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2598 
   2599                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2600                         m_count = _mm_cvtsi32_si128(i4_shift);
   2601                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2602                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2603 
   2604                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2605                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2606                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2607                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2608 
   2609                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2610 
   2611                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2612                         pi2_dst_scratch -= out_stride;
   2613                     }
   2614 
   2615                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   2616                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
   2617 
   2618                     /* o9[0-3] */
   2619                     {
   2620                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2621                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2622 
   2623                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2624 
   2625                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2626                         pi2_src_scratch -= in_stride;
   2627 
   2628                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2629                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2630 
   2631                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2632                         m_count = _mm_cvtsi32_si128(i4_shift);
   2633                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2634                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2635 
   2636                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2637                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2638                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2639                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2640 
   2641                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2642 
   2643                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2644                         pi2_dst_scratch -= out_stride;
   2645                     }
   2646 
   2647                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   2648                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
   2649 
   2650                     /* o10[0-3] */
   2651                     {
   2652                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2653                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2654 
   2655                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2656 
   2657                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2658                         pi2_src_scratch -= in_stride;
   2659 
   2660                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2661                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2662 
   2663                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2664                         m_count = _mm_cvtsi32_si128(i4_shift);
   2665                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2666                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2667 
   2668                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2669                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2670                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2671                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2672 
   2673                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2674 
   2675                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2676                         pi2_dst_scratch -= out_stride;
   2677                     }
   2678 
   2679                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   2680                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
   2681 
   2682                     /* o11[0-3] */
   2683                     {
   2684 
   2685                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2686                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2687 
   2688                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2689 
   2690                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2691                         pi2_src_scratch -= in_stride;
   2692 
   2693                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2694                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2695 
   2696                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2697                         m_count = _mm_cvtsi32_si128(i4_shift);
   2698                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2699                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2700 
   2701                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2702                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2703                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2704                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2705 
   2706                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2707 
   2708                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2709                         pi2_dst_scratch -= out_stride;
   2710 
   2711                     }
   2712 
   2713                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   2714                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
   2715 
   2716                     /* o12[0-3] */
   2717                     {
   2718                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2719                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2720 
   2721                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2722 
   2723                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2724                         pi2_src_scratch -= in_stride;
   2725 
   2726                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2727                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2728 
   2729                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2730                         m_count = _mm_cvtsi32_si128(i4_shift);
   2731                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2732                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2733 
   2734                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2735                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2736                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2737                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2738 
   2739                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2740 
   2741                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2742                         pi2_dst_scratch -= out_stride;
   2743 
   2744                     }
   2745 
   2746                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   2747                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
   2748 
   2749                     /* o13[0-3] */
   2750                     {
   2751                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2752                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2753 
   2754                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2755 
   2756                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2757                         pi2_src_scratch -= in_stride;
   2758 
   2759                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2760                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2761 
   2762                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2763                         m_count = _mm_cvtsi32_si128(i4_shift);
   2764                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2765                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2766 
   2767                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2768                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2769                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2770                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2771 
   2772                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2773 
   2774                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2775                         pi2_dst_scratch -= out_stride;
   2776                     }
   2777 
   2778                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   2779                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
   2780 
   2781                     /* o14[0-3] */
   2782                     {
   2783                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2784                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2785 
   2786                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2787 
   2788                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2789                         pi2_src_scratch -= in_stride;
   2790 
   2791                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2792                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2793 
   2794                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2795                         m_count = _mm_cvtsi32_si128(i4_shift);
   2796                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2797                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2798 
   2799                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2800                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2801                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2802                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2803 
   2804                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2805 
   2806                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2807                         pi2_dst_scratch -= out_stride;
   2808 
   2809                     }
   2810 
   2811                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   2812                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
   2813 
   2814                     /* o15[0-3] */
   2815                     {
   2816                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2817                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2818 
   2819                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2820 
   2821                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2822                         pi2_src_scratch += 8;
   2823 
   2824                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2825                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2826 
   2827                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2828                         m_count = _mm_cvtsi32_si128(i4_shift);
   2829                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2830                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2831 
   2832                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2833                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2834                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2835                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2836 
   2837                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2838 
   2839                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2840                         pi2_dst_scratch += 8;
   2841                     }
   2842 
   2843                 }
   2844             }
   2845         }
   2846         else
   2847         {
   2848             /* o & stage 1 out */
   2849             {
   2850                 WORD32 j;
   2851                 WORD16 *pi2_src_scratch = o_temp_ptr;
   2852                 WORD16 *pi2_dst_scratch = temp_ptr;
   2853                 WORD32 out_stride = (trans_size << 1);
   2854                 WORD32 in_stride = trans_size;
   2855 
   2856 
   2857                 for(j = 0; j < 2; j++)
   2858                 {
   2859                     if(j)
   2860                     {
   2861                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
   2862                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
   2863                         m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
   2864                         m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
   2865                         m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
   2866                         m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
   2867                         m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
   2868                         m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
   2869 
   2870                         m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
   2871                         m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
   2872                         m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
   2873                         m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
   2874                         m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
   2875                         m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
   2876                         m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
   2877                         m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
   2878                     }
   2879 
   2880                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   2881                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
   2882                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
   2883                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
   2884                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
   2885                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
   2886                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
   2887                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
   2888 
   2889                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
   2890                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
   2891                     m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
   2892                     m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
   2893                     temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
   2894                     temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
   2895                     temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
   2896                     temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
   2897 
   2898 
   2899                     /* o0[0-3] */
   2900                     {
   2901                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2902                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2903                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   2904                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   2905 
   2906                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2907                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   2908 
   2909                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   2910 
   2911                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   2912                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   2913                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   2914                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   2915 
   2916                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   2917                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   2918 
   2919                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   2920 
   2921                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   2922 
   2923                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2924                         pi2_src_scratch += in_stride;
   2925 
   2926                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2927                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2928 
   2929                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2930                         m_count = _mm_cvtsi32_si128(i4_shift);
   2931                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2932                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2933 
   2934                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2935                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2936                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2937                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2938 
   2939                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2940 
   2941                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2942                         pi2_dst_scratch += out_stride;
   2943 
   2944                     }
   2945 
   2946                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   2947                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
   2948                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
   2949                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
   2950                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
   2951                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
   2952                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
   2953                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
   2954 
   2955 
   2956                     /* o1[0-3] */
   2957                     {
   2958                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2959                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2960                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   2961                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   2962 
   2963                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2964                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   2965 
   2966                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
   2967 
   2968                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   2969                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   2970                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   2971                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   2972 
   2973                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   2974                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   2975 
   2976                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   2977 
   2978                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   2979 
   2980                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2981                         pi2_src_scratch += in_stride;
   2982 
   2983                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2984                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2985 
   2986                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2987                         m_count = _mm_cvtsi32_si128(i4_shift);
   2988                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2989                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2990 
   2991                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2992                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2993                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2994                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2995 
   2996                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2997 
   2998                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2999                         pi2_dst_scratch += out_stride;
   3000 
   3001                     }
   3002 
   3003                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   3004                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
   3005                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
   3006                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
   3007                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
   3008                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
   3009                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
   3010                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
   3011 
   3012                     /* o2[0-3] */
   3013                     {
   3014                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3015                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3016                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3017                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3018 
   3019                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   3020                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3021 
   3022                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3023 
   3024                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3025                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3026                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3027                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3028 
   3029                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
   3030                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3031 
   3032                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
   3033 
   3034                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3035 
   3036                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3037                         pi2_src_scratch += in_stride;
   3038 
   3039                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3040                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3041 
   3042                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3043                         m_count = _mm_cvtsi32_si128(i4_shift);
   3044                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3045                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3046 
   3047                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3048                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3049                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3050                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3051 
   3052                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3053 
   3054                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3055                         pi2_dst_scratch += out_stride;
   3056 
   3057                     }
   3058 
   3059 
   3060                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   3061                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
   3062                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
   3063                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
   3064                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
   3065                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
   3066                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
   3067                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
   3068 
   3069                     /* o3[0-3] */
   3070                     {
   3071                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3072                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3073                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3074                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3075 
   3076                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   3077                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3078 
   3079                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3080 
   3081                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3082                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3083                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3084                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3085 
   3086                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
   3087                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3088 
   3089                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3090 
   3091                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3092 
   3093                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3094                         pi2_src_scratch += in_stride;
   3095 
   3096                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3097                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3098 
   3099                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3100                         m_count = _mm_cvtsi32_si128(i4_shift);
   3101                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3102                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3103 
   3104                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3105                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3106                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3107                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3108 
   3109                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3110 
   3111                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3112                         pi2_dst_scratch += out_stride;
   3113 
   3114                     }
   3115 
   3116                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   3117                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
   3118                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
   3119                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
   3120                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
   3121                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
   3122                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
   3123                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
   3124 
   3125                     /* o4[0-3] */
   3126                     {
   3127                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3128                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3129                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3130                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3131 
   3132                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3133                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3134 
   3135                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3136 
   3137                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3138                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3139                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3140                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3141 
   3142                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3143                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3144 
   3145                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3146 
   3147                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3148 
   3149                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3150                         pi2_src_scratch += in_stride;
   3151 
   3152                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3153                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3154 
   3155                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3156                         m_count = _mm_cvtsi32_si128(i4_shift);
   3157                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3158                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3159 
   3160                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3161                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3162                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3163                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3164 
   3165                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3166 
   3167                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3168                         pi2_dst_scratch += out_stride;
   3169 
   3170                     }
   3171 
   3172 
   3173                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
   3174                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
   3175                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
   3176                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
   3177                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
   3178                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
   3179                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
   3180                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
   3181 
   3182                     /* o5[0-3] */
   3183                     {
   3184                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3185                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3186                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3187                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3188 
   3189                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3190                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3191 
   3192                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3193 
   3194                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3195                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3196                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3197                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3198 
   3199                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3200                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3201 
   3202                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3203 
   3204                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3205 
   3206                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3207                         pi2_src_scratch += in_stride;
   3208 
   3209                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3210                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3211 
   3212                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3213                         m_count = _mm_cvtsi32_si128(i4_shift);
   3214                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3215                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3216 
   3217                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3218                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3219                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3220                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3221 
   3222                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3223 
   3224                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3225                         pi2_dst_scratch += out_stride;
   3226 
   3227                     }
   3228 
   3229                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
   3230                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
   3231                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
   3232                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
   3233                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
   3234                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
   3235                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
   3236                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
   3237 
   3238 
   3239                     /* o6[0-3] */
   3240                     {
   3241                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3242                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3243                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3244                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3245 
   3246                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3247                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3248 
   3249                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3250 
   3251                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3252                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3253                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3254                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3255 
   3256                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3257                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3258 
   3259                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3260 
   3261                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3262 
   3263                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3264                         pi2_src_scratch += in_stride;
   3265 
   3266                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3267                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3268 
   3269                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3270                         m_count = _mm_cvtsi32_si128(i4_shift);
   3271                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3272                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3273 
   3274                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3275                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3276                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3277                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3278 
   3279                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3280 
   3281                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3282                         pi2_dst_scratch += out_stride;
   3283 
   3284                     }
   3285 
   3286                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   3287                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
   3288                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
   3289                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
   3290                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
   3291                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
   3292                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
   3293                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
   3294 
   3295                     /* o7[0-3] */
   3296                     {
   3297                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3298                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3299                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3300                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3301 
   3302                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3303                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3304 
   3305                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3306 
   3307                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3308                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3309                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3310                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3311 
   3312                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3313                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3314 
   3315                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3316 
   3317                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3318 
   3319                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3320                         pi2_src_scratch += 8;
   3321 
   3322                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3323                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3324 
   3325                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3326                         m_count = _mm_cvtsi32_si128(i4_shift);
   3327                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3328                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3329 
   3330                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3331                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3332                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3333                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3334 
   3335                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3336 
   3337                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3338                         pi2_dst_scratch += 8;
   3339 
   3340                     }
   3341 
   3342                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   3343                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
   3344                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
   3345                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
   3346                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
   3347                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
   3348                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
   3349                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
   3350 
   3351 
   3352                     /* o8[0-3] */
   3353                     {
   3354 
   3355                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3356                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3357                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3358                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3359 
   3360                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3361                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3362 
   3363                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3364 
   3365                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3366                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3367                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3368                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3369 
   3370                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3371                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3372 
   3373                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3374 
   3375                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3376 
   3377                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3378                         pi2_src_scratch -= in_stride;
   3379 
   3380                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3381                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3382 
   3383                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3384                         m_count = _mm_cvtsi32_si128(i4_shift);
   3385                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3386                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3387 
   3388                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3389                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3390                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3391                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3392 
   3393                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3394 
   3395                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3396                         pi2_dst_scratch -= out_stride;
   3397                     }
   3398 
   3399                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   3400                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
   3401                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
   3402                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
   3403                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
   3404                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
   3405                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
   3406                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
   3407 
   3408 
   3409                     /* o9[0-3] */
   3410                     {
   3411                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3412                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3413                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3414                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3415 
   3416                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3417                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3418 
   3419                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3420 
   3421                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3422                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3423                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3424                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3425 
   3426                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3427                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3428 
   3429                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3430 
   3431                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3432 
   3433                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3434                         pi2_src_scratch -= in_stride;
   3435 
   3436                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3437                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3438 
   3439                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3440                         m_count = _mm_cvtsi32_si128(i4_shift);
   3441                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3442                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3443 
   3444                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3445                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3446                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3447                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3448 
   3449                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3450 
   3451                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3452                         pi2_dst_scratch -= out_stride;
   3453                     }
   3454 
   3455                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   3456                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
   3457                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
   3458                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
   3459                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
   3460                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
   3461                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
   3462                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
   3463 
   3464                     /* o10[0-3] */
   3465                     {
   3466                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3467                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3468                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3469                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3470 
   3471                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3472                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3473 
   3474                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3475 
   3476                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3477                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3478                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3479                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3480 
   3481                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3482                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3483 
   3484                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3485 
   3486                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3487 
   3488                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3489                         pi2_src_scratch -= in_stride;
   3490 
   3491                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3492                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3493 
   3494                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3495                         m_count = _mm_cvtsi32_si128(i4_shift);
   3496                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3497                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3498 
   3499                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3500                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3501                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3502                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3503 
   3504                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3505 
   3506                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3507                         pi2_dst_scratch -= out_stride;
   3508                     }
   3509 
   3510                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   3511                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
   3512                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
   3513                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
   3514                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
   3515                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
   3516                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
   3517                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
   3518 
   3519                     /* o11[0-3] */
   3520                     {
   3521                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3522                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3523                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3524                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3525 
   3526                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3527                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3528 
   3529                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3530 
   3531                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3532                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3533                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3534                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3535 
   3536                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3537                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3538 
   3539                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3540 
   3541                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3542 
   3543                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3544                         pi2_src_scratch -= in_stride;
   3545 
   3546                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3547                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3548 
   3549                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3550                         m_count = _mm_cvtsi32_si128(i4_shift);
   3551                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3552                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3553 
   3554                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3555                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3556                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3557                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3558 
   3559                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3560 
   3561                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3562                         pi2_dst_scratch -= out_stride;
   3563 
   3564                     }
   3565 
   3566                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   3567                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
   3568                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
   3569                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
   3570                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
   3571                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
   3572                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
   3573                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
   3574 
   3575 
   3576                     /* o12[0-3] */
   3577                     {
   3578                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3579                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3580                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3581                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3582 
   3583                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3584                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3585 
   3586                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3587 
   3588                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3589                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3590                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3591                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3592 
   3593                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3594                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3595 
   3596                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3597 
   3598                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3599 
   3600                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3601                         pi2_src_scratch -= in_stride;
   3602 
   3603                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3604                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3605 
   3606                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3607                         m_count = _mm_cvtsi32_si128(i4_shift);
   3608                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3609                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3610 
   3611                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3612                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3613                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3614                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3615 
   3616                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3617 
   3618                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3619                         pi2_dst_scratch -= out_stride;
   3620 
   3621                     }
   3622 
   3623                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   3624                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
   3625                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
   3626                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
   3627                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
   3628                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
   3629                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
   3630                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
   3631 
   3632 
   3633                     /* o13[0-3] */
   3634                     {
   3635                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3636                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3637                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3638                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3639 
   3640                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3641                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3642 
   3643                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3644 
   3645                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3646                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3647                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3648                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3649 
   3650                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3651                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3652 
   3653                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3654 
   3655                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3656 
   3657                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3658                         pi2_src_scratch -= in_stride;
   3659 
   3660                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3661                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3662 
   3663                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3664                         m_count = _mm_cvtsi32_si128(i4_shift);
   3665                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3666                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3667 
   3668                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3669                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3670                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3671                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3672 
   3673                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3674 
   3675                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3676                         pi2_dst_scratch -= out_stride;
   3677                     }
   3678 
   3679                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   3680                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
   3681                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
   3682                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
   3683                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
   3684                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
   3685                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
   3686                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
   3687 
   3688 
   3689                     /* o14[0-3] */
   3690                     {
   3691                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3692                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3693                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3694                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3695 
   3696                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3697                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3698 
   3699                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3700 
   3701                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3702                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3703                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3704                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3705 
   3706                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3707                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3708 
   3709                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3710 
   3711                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3712 
   3713                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3714                         pi2_src_scratch -= in_stride;
   3715 
   3716                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3717                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3718 
   3719                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3720                         m_count = _mm_cvtsi32_si128(i4_shift);
   3721                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3722                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3723 
   3724                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3725                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3726                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3727                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3728 
   3729                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3730 
   3731                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3732                         pi2_dst_scratch -= out_stride;
   3733 
   3734                     }
   3735 
   3736                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   3737                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
   3738                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
   3739                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
   3740                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
   3741                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
   3742                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
   3743                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
   3744 
   3745                     /* o15[0-3] */
   3746                     {
   3747                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3748                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3749                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3750                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3751 
   3752                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3753                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3754 
   3755                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3756 
   3757                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3758                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3759                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3760                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3761 
   3762                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3763                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3764 
   3765                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3766 
   3767                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3768 
   3769                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3770                         pi2_src_scratch += 8;
   3771 
   3772                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3773                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3774 
   3775                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3776                         m_count = _mm_cvtsi32_si128(i4_shift);
   3777                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3778                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3779 
   3780                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3781                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3782                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3783                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3784 
   3785                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3786 
   3787                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3788                         pi2_dst_scratch += 8;
   3789                     }
   3790 
   3791                 }
   3792             }
   3793         }
   3794         /* Transpose */
   3795         {
   3796             WORD16 *pi2_src_scratch = temp_ptr;
   3797             WORD16 *pi2_dst_scratch = pi2_tmp;
   3798             WORD32 in_stride = (trans_size << 1);
   3799 
   3800             for(j = 0; j < 2; j++)
   3801             {
   3802                 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3803                 pi2_src_scratch += in_stride;
   3804                 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3805                 pi2_src_scratch += in_stride;
   3806                 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3807                 pi2_src_scratch += in_stride;
   3808                 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3809                 pi2_src_scratch += in_stride;
   3810                 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3811                 pi2_src_scratch += in_stride;
   3812                 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3813                 pi2_src_scratch += in_stride;
   3814                 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3815                 pi2_src_scratch += in_stride;
   3816                 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3817                 pi2_src_scratch += 8;
   3818 
   3819                 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3820                 pi2_src_scratch -= in_stride;
   3821                 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3822                 pi2_src_scratch -= in_stride;
   3823                 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3824                 pi2_src_scratch -= in_stride;
   3825                 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3826                 pi2_src_scratch -= in_stride;
   3827                 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3828                 pi2_src_scratch -= in_stride;
   3829                 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3830                 pi2_src_scratch -= in_stride;
   3831                 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3832                 pi2_src_scratch -= in_stride;
   3833                 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3834                 pi2_src_scratch += 8;
   3835 
   3836 
   3837                 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
   3838                 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
   3839 
   3840                 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
   3841                 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
   3842 
   3843                 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
   3844                 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
   3845 
   3846                 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
   3847                 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
   3848 
   3849                 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
   3850                 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
   3851 
   3852                 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
   3853                 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
   3854 
   3855                 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
   3856                 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
   3857 
   3858                 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
   3859                 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
   3860 
   3861                 /****************/
   3862 
   3863                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
   3864                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
   3865 
   3866                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
   3867                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
   3868 
   3869                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
   3870                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
   3871 
   3872                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
   3873                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
   3874 
   3875                 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
   3876                 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
   3877 
   3878                 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
   3879                 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
   3880 
   3881                 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
   3882                 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
   3883 
   3884                 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
   3885                 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
   3886 
   3887                 /******************/
   3888 
   3889                 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
   3890                 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
   3891 
   3892                 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
   3893                 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
   3894 
   3895                 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
   3896                 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
   3897 
   3898                 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
   3899                 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
   3900 
   3901                 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
   3902                 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
   3903 
   3904                 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
   3905                 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
   3906 
   3907                 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
   3908                 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
   3909 
   3910                 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
   3911                 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
   3912 
   3913                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
   3914                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
   3915                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
   3916                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
   3917 
   3918                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
   3919                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
   3920                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
   3921                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
   3922 
   3923                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
   3924                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
   3925                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
   3926                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
   3927 
   3928                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
   3929                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
   3930                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
   3931                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
   3932 
   3933                 pi2_dst_scratch += 4 * trans_size;
   3934             }
   3935         }
   3936         pi2_src += 8;
   3937 //      pi2_dequant_coeff +=8;
   3938         pi2_tmp += 8 * trans_size;
   3939         zero_cols = zero_cols >> 1;
   3940     }
   3941 
   3942     if(trans_size_stg1 != TRANS_SIZE_32)
   3943     {
   3944         m_temp_reg_10 = _mm_setzero_si128();
   3945 
   3946         for(i = trans_size_stg1; i < 32; i += 8)
   3947         {
   3948             WORD16 *pi2_dst_scratch = pi2_tmp;
   3949 
   3950             _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
   3951             _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
   3952             _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
   3953             _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
   3954 
   3955             _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
   3956             _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
   3957             _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
   3958             _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
   3959 
   3960             _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
   3961             _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
   3962             _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
   3963             _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
   3964 
   3965             _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
   3966             _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
   3967             _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
   3968             _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
   3969 
   3970             _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
   3971             _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
   3972             _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
   3973             _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
   3974 
   3975             _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
   3976             _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
   3977             _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
   3978             _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
   3979 
   3980             _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
   3981             _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
   3982             _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
   3983             _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
   3984 
   3985             _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
   3986             _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
   3987             _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
   3988             _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
   3989 
   3990             pi2_tmp += 8 * trans_size;
   3991         }
   3992     }
   3993 
   3994     pi2_tmp = pi2_tmp_orig;
   3995 
   3996     /* Inverse Transform 2nd stage */
   3997 
   3998     for(j = 0; j < trans_size; j += 4)
   3999     {
   4000         i4_shift = IT_SHIFT_STAGE_2;
   4001 
   4002         /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
   4003         if(zero_last28_rows_stg2)
   4004         {
   4005             {
   4006 
   4007                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
   4008                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
   4009                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
   4010                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
   4011                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
   4012                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
   4013                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
   4014                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
   4015 
   4016                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
   4017 
   4018                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
   4019 
   4020                 /* eo0[0-3] */
   4021                 {
   4022                     m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4023 
   4024                 }
   4025                 /* eo1[0-3] */
   4026                 {
   4027                     m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
   4028 
   4029                 }
   4030                 /* eo2[0-3] */
   4031                 {
   4032                     m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   4033                 }
   4034 
   4035                 /* eo3[0-3] */
   4036                 {
   4037                     m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
   4038                 }
   4039                 /* eo4[0-3] */
   4040                 {
   4041                     m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   4042                 }
   4043 
   4044                 /* eo5[0-3] */
   4045                 {
   4046                     m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
   4047                 }
   4048 
   4049                 /* eo6[0-3] */
   4050                 {
   4051                     m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
   4052                 }
   4053                 /* eo7[0-3] */
   4054                 {
   4055                     m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
   4056                 }
   4057             }
   4058 
   4059             m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
   4060 
   4061             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
   4062 
   4063             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
   4064 
   4065             m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   4066 
   4067             m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   4068 
   4069             /* e[]*/
   4070 
   4071             temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[0] */
   4072             temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[15] */
   4073 
   4074             temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[1] */
   4075             temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[14] */
   4076 
   4077             temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[2] */
   4078             temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[13] */
   4079 
   4080             temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[3] */
   4081             temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[12] */
   4082 
   4083             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[4] */
   4084             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[11] */
   4085 
   4086             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[5] */
   4087             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[10] */
   4088 
   4089             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[6] */
   4090             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[9] */
   4091 
   4092             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[7] */
   4093             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[8] */
   4094 
   4095             /*o[k]*/
   4096             {
   4097 
   4098                 WORD16 *pi2_dst_scratch = temp_ptr;
   4099                 WORD32 out_stride = 8;
   4100 
   4101                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   4102 
   4103                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
   4104                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
   4105 
   4106                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
   4107 
   4108 
   4109                 /* o0[0-3] */
   4110                 {
   4111                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4112 
   4113                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
   4114                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
   4115 
   4116                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4117                     m_count = _mm_cvtsi32_si128(i4_shift);
   4118                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4119                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4120 
   4121                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4122                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4123                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4124                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4125 
   4126                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4127 
   4128                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4129                     pi2_dst_scratch += out_stride;
   4130 
   4131                 }
   4132 
   4133                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   4134 
   4135                 /* o1[0-3] */
   4136                 {
   4137                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4138 
   4139                     m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
   4140                     m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
   4141 
   4142                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4143                     m_count = _mm_cvtsi32_si128(i4_shift);
   4144                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4145                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4146 
   4147                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4148                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4149                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4150                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4151 
   4152                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4153 
   4154                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4155                     pi2_dst_scratch += out_stride;
   4156 
   4157                 }
   4158 
   4159                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   4160 
   4161                 /* o2[0-3] */
   4162                 {
   4163                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4164 
   4165                     m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
   4166                     m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
   4167 
   4168                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4169                     m_count = _mm_cvtsi32_si128(i4_shift);
   4170                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4171                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4172 
   4173                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4174                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4175                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4176                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4177 
   4178                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4179 
   4180                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4181                     pi2_dst_scratch += out_stride;
   4182 
   4183                 }
   4184 
   4185                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   4186 
   4187                 /* o3[0-3] */
   4188                 {
   4189                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4190 
   4191                     m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
   4192                     m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
   4193 
   4194                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4195                     m_count = _mm_cvtsi32_si128(i4_shift);
   4196                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4197                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4198 
   4199                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4200                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4201                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4202                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4203 
   4204                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4205 
   4206                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4207                     pi2_dst_scratch += out_stride;
   4208 
   4209                 }
   4210 
   4211                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   4212 
   4213                 /* o4[0-3] */
   4214                 {
   4215                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4216 
   4217                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
   4218                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
   4219 
   4220                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4221                     m_count = _mm_cvtsi32_si128(i4_shift);
   4222                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4223                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4224 
   4225                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4226                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4227                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4228                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4229 
   4230                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4231 
   4232                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4233                     pi2_dst_scratch += out_stride;
   4234 
   4235                 }
   4236 
   4237                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
   4238 
   4239                 /* o5[0-3] */
   4240                 {
   4241                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4242 
   4243                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
   4244                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
   4245 
   4246                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4247                     m_count = _mm_cvtsi32_si128(i4_shift);
   4248                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4249                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4250 
   4251                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4252                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4253                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4254                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4255 
   4256                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4257 
   4258                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4259                     pi2_dst_scratch += out_stride;
   4260 
   4261                 }
   4262 
   4263                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
   4264 
   4265                 /* o6[0-3] */
   4266                 {
   4267                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4268 
   4269                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
   4270                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
   4271 
   4272                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4273                     m_count = _mm_cvtsi32_si128(i4_shift);
   4274                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4275                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4276 
   4277                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4278                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4279                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4280                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4281 
   4282                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4283 
   4284                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4285                     pi2_dst_scratch += out_stride;
   4286 
   4287                 }
   4288 
   4289                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   4290 
   4291                 /* o7[0-3] */
   4292                 {
   4293                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4294 
   4295                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
   4296                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
   4297 
   4298                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4299                     m_count = _mm_cvtsi32_si128(i4_shift);
   4300                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4301                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4302 
   4303                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4304                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4305                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4306                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4307 
   4308                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4309 
   4310                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4311                     pi2_dst_scratch += 8;
   4312 
   4313                 }
   4314 
   4315                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   4316 
   4317                 /* o8[0-3] */
   4318                 {
   4319                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4320 
   4321                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
   4322                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
   4323 
   4324                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4325                     m_count = _mm_cvtsi32_si128(i4_shift);
   4326                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4327                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4328 
   4329                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4330                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4331                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4332                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4333 
   4334                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4335 
   4336                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4337                     pi2_dst_scratch += out_stride;
   4338                 }
   4339 
   4340                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   4341 
   4342                 /* o9[0-3] */
   4343                 {
   4344                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4345 
   4346                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
   4347                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
   4348 
   4349                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4350                     m_count = _mm_cvtsi32_si128(i4_shift);
   4351                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4352                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4353 
   4354                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4355                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4356                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4357                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4358 
   4359                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4360 
   4361                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4362                     pi2_dst_scratch += out_stride;
   4363 
   4364                 }
   4365 
   4366                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   4367 
   4368                 /* o10[0-3] */
   4369                 {
   4370                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4371 
   4372                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
   4373                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
   4374 
   4375                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4376                     m_count = _mm_cvtsi32_si128(i4_shift);
   4377                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4378                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4379 
   4380                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4381                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4382                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4383                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4384 
   4385                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4386 
   4387                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4388                     pi2_dst_scratch += out_stride;
   4389                 }
   4390 
   4391                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   4392 
   4393                 /* o11[0-3] */
   4394                 {
   4395                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4396 
   4397                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
   4398                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
   4399 
   4400                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4401                     m_count = _mm_cvtsi32_si128(i4_shift);
   4402                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4403                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4404 
   4405                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4406                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4407                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4408                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4409 
   4410                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4411 
   4412                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4413                     pi2_dst_scratch += out_stride;
   4414 
   4415                 }
   4416 
   4417                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   4418 
   4419                 /* o12[0-3] */
   4420                 {
   4421                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4422 
   4423                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
   4424                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
   4425 
   4426                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4427                     m_count = _mm_cvtsi32_si128(i4_shift);
   4428                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4429                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4430 
   4431                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4432                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4433                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4434                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4435 
   4436                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4437 
   4438                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4439                     pi2_dst_scratch += out_stride;
   4440 
   4441                 }
   4442 
   4443                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   4444 
   4445                 /* o13[0-3] */
   4446                 {
   4447                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4448 
   4449                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
   4450                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
   4451 
   4452                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4453                     m_count = _mm_cvtsi32_si128(i4_shift);
   4454                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4455                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4456 
   4457                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4458                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4459                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4460                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4461 
   4462                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4463 
   4464                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4465                     pi2_dst_scratch += out_stride;
   4466                 }
   4467 
   4468                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   4469 
   4470                 /* o14[0-3] */
   4471                 {
   4472                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4473 
   4474                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
   4475                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
   4476 
   4477                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4478                     m_count = _mm_cvtsi32_si128(i4_shift);
   4479                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4480                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4481 
   4482                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4483                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4484                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4485                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4486 
   4487                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4488 
   4489                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4490                     pi2_dst_scratch += out_stride;
   4491 
   4492                 }
   4493 
   4494                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   4495 
   4496                 /* o15[0-3] */
   4497                 {
   4498                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4499 
   4500                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
   4501                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
   4502 
   4503                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4504                     m_count = _mm_cvtsi32_si128(i4_shift);
   4505                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4506                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4507 
   4508                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4509                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4510                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4511                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4512 
   4513                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4514 
   4515                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4516                     pi2_dst_scratch += 8;
   4517                 }
   4518 
   4519             }
   4520 
   4521         }
   4522         else if(zero_last24_rows_stg2)
   4523         {
   4524             /* eo */
   4525             {
   4526                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
   4527 
   4528                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
   4529                 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
   4530 
   4531                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
   4532 
   4533 
   4534                 /* eo0[0-3] */
   4535                 {
   4536                     m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4537 
   4538                 }
   4539 
   4540                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
   4541 
   4542                 /* eo1[0-3] */
   4543                 {
   4544                     m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4545 
   4546                 }
   4547                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
   4548 
   4549                 /* eo2[0-3] */
   4550                 {
   4551                     m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4552 
   4553                 }
   4554 
   4555                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
   4556 
   4557                 /* eo3[0-3] */
   4558                 {
   4559 
   4560                     m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4561 
   4562                 }
   4563 
   4564                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
   4565 
   4566                 /* eo4[0-3] */
   4567                 {
   4568                     m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4569 
   4570                 }
   4571 
   4572                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
   4573 
   4574                 /* eo5[0-3] */
   4575                 {
   4576                     m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4577                 }
   4578 
   4579                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
   4580                 /* eo6[0-3] */
   4581                 {
   4582                     m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4583                 }
   4584 
   4585                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
   4586                 /* eo7[0-3] */
   4587                 {
   4588                     m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4589 
   4590                 }
   4591 
   4592             }
   4593 
   4594             /* eeo */
   4595             {
   4596 
   4597                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
   4598                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
   4599                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
   4600                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
   4601 
   4602                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
   4603 
   4604                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
   4605 
   4606                 /* eeo0[0-3] */
   4607                 {
   4608                     temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4609 
   4610                 }
   4611 
   4612                 /* eeo1[0-3] */
   4613                 {
   4614                     temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
   4615 
   4616                 }
   4617 
   4618                 /* eeo2[0-3] */
   4619                 {
   4620                     temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
   4621 
   4622                 }
   4623 
   4624 
   4625                 /* eeo3[0-3] */
   4626                 {
   4627                     temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   4628 
   4629                 }
   4630 
   4631             }
   4632 
   4633             m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
   4634             m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
   4635             m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
   4636 
   4637             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
   4638 
   4639             //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
   4640             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
   4641 
   4642             m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   4643             m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   4644 
   4645             m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1);  /* ee[0] */
   4646             m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1);  /* ee[7] */
   4647 
   4648             m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2);  /* ee[1] */
   4649             m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2);  /* ee[6] */
   4650 
   4651             m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3);  /* ee[2] */
   4652             m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3);  /* ee[5] */
   4653 
   4654             m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4);  /* ee[3] */
   4655             m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4);  /* ee[4] */
   4656 
   4657             /* e[]*/
   4658 
   4659             temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[0] */
   4660             temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[15] */
   4661 
   4662             temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[1] */
   4663             temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[14] */
   4664 
   4665             temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[2] */
   4666             temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[13] */
   4667 
   4668             temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[3] */
   4669             temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[12] */
   4670 
   4671             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[4] */
   4672             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[11] */
   4673 
   4674             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[5] */
   4675             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[10] */
   4676 
   4677             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[6] */
   4678             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[9] */
   4679 
   4680             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[7] */
   4681             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[8] */
   4682 
   4683             /*o[k] */
   4684             {
   4685 
   4686                 WORD16 *pi2_dst_scratch = temp_ptr;
   4687                 WORD32 out_stride = 8;
   4688 
   4689                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   4690                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
   4691 
   4692                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
   4693                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
   4694                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
   4695                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
   4696 
   4697                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
   4698                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
   4699 
   4700                 /* o0[0-3] */
   4701                 {
   4702                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4703                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4704 
   4705                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   4706 
   4707                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
   4708                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
   4709 
   4710                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4711                     m_count = _mm_cvtsi32_si128(i4_shift);
   4712                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4713                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4714 
   4715                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4716                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4717                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4718                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4719 
   4720                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4721 
   4722                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4723                     pi2_dst_scratch += out_stride;
   4724 
   4725                 }
   4726 
   4727 
   4728                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   4729                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
   4730 
   4731                 /* o1[0-3] */
   4732                 {
   4733                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4734                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4735 
   4736                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   4737 
   4738                     m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
   4739                     m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
   4740 
   4741                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4742                     m_count = _mm_cvtsi32_si128(i4_shift);
   4743                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4744                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4745 
   4746                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4747                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4748                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4749                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4750 
   4751                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4752 
   4753                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4754                     pi2_dst_scratch += out_stride;
   4755 
   4756                 }
   4757 
   4758                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   4759                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
   4760 
   4761                 /* o2[0-3] */
   4762                 {
   4763                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4764                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4765 
   4766                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   4767 
   4768                     m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
   4769                     m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
   4770 
   4771                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4772                     m_count = _mm_cvtsi32_si128(i4_shift);
   4773                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4774                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4775 
   4776                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4777                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4778                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4779                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4780 
   4781                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4782 
   4783                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4784                     pi2_dst_scratch += out_stride;
   4785 
   4786                 }
   4787 
   4788                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   4789                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
   4790 
   4791                 /* o3[0-3] */
   4792                 {
   4793                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4794                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4795 
   4796                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   4797 
   4798                     m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
   4799                     m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
   4800 
   4801                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4802                     m_count = _mm_cvtsi32_si128(i4_shift);
   4803                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4804                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4805 
   4806                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4807                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4808                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4809                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4810 
   4811                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4812 
   4813                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4814                     pi2_dst_scratch += out_stride;
   4815 
   4816                 }
   4817 
   4818                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   4819                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
   4820 
   4821                 /* o4[0-3] */
   4822                 {
   4823                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4824                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4825 
   4826                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4827 
   4828                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
   4829                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
   4830 
   4831                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4832                     m_count = _mm_cvtsi32_si128(i4_shift);
   4833                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4834                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4835 
   4836                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4837                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4838                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4839                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4840 
   4841                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4842 
   4843                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4844                     pi2_dst_scratch += out_stride;
   4845 
   4846                 }
   4847 
   4848                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
   4849                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
   4850 
   4851                 /* o5[0-3] */
   4852                 {
   4853                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4854                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4855 
   4856                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4857 
   4858                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
   4859                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
   4860 
   4861                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4862                     m_count = _mm_cvtsi32_si128(i4_shift);
   4863                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4864                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4865 
   4866                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4867                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4868                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4869                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4870 
   4871                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4872 
   4873                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4874                     pi2_dst_scratch += out_stride;
   4875 
   4876                 }
   4877 
   4878                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
   4879                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
   4880 
   4881                 /* o6[0-3] */
   4882                 {
   4883                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4884                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4885 
   4886                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4887 
   4888                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
   4889                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
   4890 
   4891                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4892                     m_count = _mm_cvtsi32_si128(i4_shift);
   4893                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4894                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4895 
   4896                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4897                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4898                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4899                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4900 
   4901                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4902 
   4903                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4904                     pi2_dst_scratch += out_stride;
   4905 
   4906                 }
   4907 
   4908                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   4909                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
   4910 
   4911                 /* o7[0-3] */
   4912                 {
   4913                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4914                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4915 
   4916                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4917 
   4918                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
   4919                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
   4920 
   4921                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4922                     m_count = _mm_cvtsi32_si128(i4_shift);
   4923                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4924                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4925 
   4926                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4927                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4928                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4929                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4930 
   4931                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4932 
   4933                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4934                     pi2_dst_scratch += 8;
   4935 
   4936                 }
   4937 
   4938                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   4939                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
   4940 
   4941                 /* o8[0-3] */
   4942                 {
   4943                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4944                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4945 
   4946                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4947 
   4948                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
   4949                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
   4950 
   4951                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4952                     m_count = _mm_cvtsi32_si128(i4_shift);
   4953                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4954                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4955 
   4956                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4957                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4958                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4959                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4960 
   4961                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4962 
   4963                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4964                     pi2_dst_scratch += out_stride;
   4965                 }
   4966 
   4967                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   4968                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
   4969 
   4970                 /* o9[0-3] */
   4971                 {
   4972                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4973                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4974 
   4975                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4976 
   4977                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
   4978                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
   4979 
   4980                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4981                     m_count = _mm_cvtsi32_si128(i4_shift);
   4982                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4983                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4984 
   4985                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4986                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4987                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4988                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4989 
   4990                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4991 
   4992                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4993                     pi2_dst_scratch += out_stride;
   4994                 }
   4995 
   4996                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   4997                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
   4998 
   4999                 /* o10[0-3] */
   5000                 {
   5001                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5002                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5003 
   5004                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5005 
   5006                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
   5007                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
   5008 
   5009                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5010                     m_count = _mm_cvtsi32_si128(i4_shift);
   5011                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5012                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5013 
   5014                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5015                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5016                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5017                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5018 
   5019                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5020 
   5021                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5022                     pi2_dst_scratch += out_stride;
   5023                 }
   5024 
   5025                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   5026                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
   5027 
   5028                 /* o11[0-3] */
   5029                 {
   5030                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5031                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5032 
   5033                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5034 
   5035                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
   5036                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
   5037 
   5038                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5039                     m_count = _mm_cvtsi32_si128(i4_shift);
   5040                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5041                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5042 
   5043                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5044                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5045                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5046                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5047 
   5048                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5049 
   5050                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5051                     pi2_dst_scratch += out_stride;
   5052 
   5053                 }
   5054 
   5055                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   5056                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
   5057 
   5058                 /* o12[0-3] */
   5059                 {
   5060                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5061                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5062 
   5063                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5064 
   5065                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
   5066                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
   5067 
   5068                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5069                     m_count = _mm_cvtsi32_si128(i4_shift);
   5070                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5071                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5072 
   5073                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5074                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5075                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5076                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5077 
   5078                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5079 
   5080                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5081                     pi2_dst_scratch += out_stride;
   5082 
   5083                 }
   5084 
   5085                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   5086                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
   5087 
   5088                 /* o13[0-3] */
   5089                 {
   5090                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5091                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5092 
   5093                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5094 
   5095                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
   5096                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
   5097 
   5098                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5099                     m_count = _mm_cvtsi32_si128(i4_shift);
   5100                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5101                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5102 
   5103                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5104                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5105                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5106                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5107 
   5108                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5109 
   5110                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5111                     pi2_dst_scratch += out_stride;
   5112                 }
   5113 
   5114                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   5115                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
   5116 
   5117                 /* o14[0-3] */
   5118                 {
   5119                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5120                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5121 
   5122                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5123 
   5124                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
   5125                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
   5126 
   5127                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5128                     m_count = _mm_cvtsi32_si128(i4_shift);
   5129                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5130                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5131 
   5132                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5133                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5134                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5135                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5136 
   5137                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5138 
   5139                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5140                     pi2_dst_scratch += out_stride;
   5141                 }
   5142 
   5143                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   5144                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
   5145 
   5146                 /* o15[0-3] */
   5147                 {
   5148                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5149                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5150 
   5151                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5152 
   5153                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
   5154                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
   5155 
   5156                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5157                     m_count = _mm_cvtsi32_si128(i4_shift);
   5158                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5159                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5160 
   5161                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5162                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5163                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5164                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5165 
   5166                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5167 
   5168                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5169                     pi2_dst_scratch += 8;
   5170                 }
   5171 
   5172             }
   5173         }
   5174         else
   5175         {
   5176             /* eo */
   5177             {
   5178 
   5179                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
   5180                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
   5181                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
   5182                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
   5183 
   5184 
   5185                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
   5186                 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
   5187                 m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
   5188                 m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
   5189                 m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
   5190                 m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
   5191                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
   5192                 m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
   5193 
   5194                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
   5195                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
   5196                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
   5197                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
   5198 
   5199                 /* eo0[0-3] */
   5200                 {
   5201                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5202                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5203 
   5204                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5205 
   5206                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5207                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5208 
   5209                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5210 
   5211                     m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5212 
   5213                 }
   5214 
   5215                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
   5216                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
   5217                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
   5218                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
   5219 
   5220                 /* eo1[0-3] */
   5221                 {
   5222                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5223                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5224 
   5225                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5226 
   5227                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5228                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5229 
   5230                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5231 
   5232                     m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
   5233 
   5234                 }
   5235 
   5236                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
   5237                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
   5238                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
   5239                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
   5240 
   5241                 /* eo2[0-3] */
   5242                 {
   5243                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5244                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5245 
   5246                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
   5247 
   5248                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5249                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5250 
   5251                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5252 
   5253                     m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5254 
   5255                 }
   5256 
   5257                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
   5258                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
   5259                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
   5260                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
   5261 
   5262                 /* eo3[0-3] */
   5263                 {
   5264                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5265                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5266 
   5267                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5268 
   5269                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5270                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5271 
   5272                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
   5273 
   5274                     m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5275 
   5276                 }
   5277 
   5278                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
   5279                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
   5280                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
   5281                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
   5282 
   5283 
   5284                 /* eo4[0-3] */
   5285                 {
   5286                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5287                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5288 
   5289                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5290 
   5291                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5292                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5293 
   5294                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
   5295 
   5296                     m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5297 
   5298                 }
   5299 
   5300                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
   5301                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
   5302                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
   5303                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
   5304 
   5305                 /* eo5[0-3] */
   5306                 {
   5307                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5308                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5309 
   5310                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5311 
   5312                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5313                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5314 
   5315                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5316 
   5317                     m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5318                 }
   5319 
   5320                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
   5321                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
   5322                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
   5323                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
   5324 
   5325                 /* eo6[0-3] */
   5326                 {
   5327                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5328                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5329 
   5330                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5331 
   5332                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5333                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5334 
   5335                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5336 
   5337                     m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5338 
   5339                 }
   5340 
   5341                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
   5342                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
   5343                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
   5344                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
   5345 
   5346                 /* eo7[0-3] */
   5347                 {
   5348                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5349                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5350 
   5351                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5352 
   5353                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5354                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5355 
   5356                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5357 
   5358                     m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5359 
   5360 
   5361                 }
   5362 
   5363             }
   5364 
   5365             /* eeo */
   5366             {
   5367                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
   5368                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
   5369 
   5370                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
   5371                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
   5372                 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
   5373                 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
   5374 
   5375                 /* eeo0[0-3] */
   5376                 {
   5377 
   5378                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   5379                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
   5380 
   5381                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5382                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5383 
   5384                     temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5385 
   5386                 }
   5387 
   5388                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
   5389                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
   5390 
   5391                 /* eeo1[0-3] */
   5392                 {
   5393                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   5394                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   5395 
   5396                     temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
   5397 
   5398                 }
   5399 
   5400                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
   5401                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
   5402 
   5403                 /* eeo2[0-3] */
   5404                 {
   5405                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   5406                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   5407 
   5408                     temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5409 
   5410                 }
   5411 
   5412                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
   5413                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
   5414 
   5415                 /* eeo3[0-3] */
   5416                 {
   5417                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   5418                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   5419 
   5420                     temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5421 
   5422                 }
   5423 
   5424 
   5425             }
   5426 
   5427             m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
   5428             m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
   5429 
   5430             m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
   5431             m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
   5432 
   5433             m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
   5434             m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
   5435 
   5436             m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
   5437 
   5438             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
   5439             m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
   5440 
   5441             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
   5442 
   5443             m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
   5444             m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
   5445 
   5446             m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
   5447             m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
   5448 
   5449 /* eeeo[0]= m_temp_reg_20  */
   5450 /* eeeo[1]= m_temp_reg_22  */
   5451 /* eeee[0]= m_temp_reg_21  */
   5452 /* eeee[1]= m_temp_reg_23  */
   5453 
   5454             /* eee[0] = eeee[0] + eeeo[0]; */
   5455             m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
   5456 
   5457             /* eee[3] = eeee[0] - eeeo[0]; */
   5458             m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
   5459 
   5460             /* eee[2] = eeee[1] - eeeo[1]; */
   5461             m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
   5462 
   5463             /* eee[1] = eeee[1] + eeeo[1];*/
   5464             m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
   5465 
   5466             m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1);  /* ee[0] */
   5467             m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1);  /* ee[7] */
   5468 
   5469             m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2);  /* ee[1] */
   5470             m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2);  /* ee[6] */
   5471 
   5472             m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3);  /* ee[2] */
   5473             m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3);  /* ee[5] */
   5474 
   5475             m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4);  /* ee[3] */
   5476             m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4);  /* ee[4] */
   5477 
   5478 /* e[]*/
   5479 
   5480             temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[0] */
   5481             temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[15] */
   5482 
   5483             temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[1] */
   5484             temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[14] */
   5485 
   5486             temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[2] */
   5487             temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[13] */
   5488 
   5489             temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[3] */
   5490             temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[12] */
   5491 
   5492             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[4] */
   5493             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[11] */
   5494 
   5495             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[5] */
   5496             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[10] */
   5497 
   5498             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[6] */
   5499             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[9] */
   5500 
   5501             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[7] */
   5502             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[8] */
   5503 
   5504 /*o[k] */
   5505             {
   5506 
   5507                 WORD16 *pi2_dst_scratch = temp_ptr;
   5508                 WORD32 out_stride = 8;
   5509 
   5510                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   5511                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
   5512                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
   5513                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
   5514                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
   5515                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
   5516                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
   5517                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
   5518 
   5519 
   5520                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
   5521                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
   5522                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
   5523                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
   5524                 m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
   5525                 m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
   5526                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
   5527                 m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
   5528 
   5529                 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
   5530                 m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
   5531                 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
   5532                 m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
   5533                 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
   5534                 m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
   5535                 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
   5536                 m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
   5537 
   5538                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
   5539                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
   5540                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
   5541                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
   5542                 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
   5543                 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
   5544                 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
   5545                 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
   5546 
   5547                 /* o0[0-3] */
   5548                 {
   5549                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5550                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5551                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5552                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5553 
   5554                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5555                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5556 
   5557                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5558 
   5559                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5560                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5561                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5562                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5563 
   5564                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5565                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5566 
   5567                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5568 
   5569                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5570 
   5571                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
   5572                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
   5573 
   5574                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5575                     m_count = _mm_cvtsi32_si128(i4_shift);
   5576                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5577                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5578 
   5579                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5580                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5581                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5582                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5583 
   5584                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5585 
   5586                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5587                     pi2_dst_scratch += out_stride;
   5588 
   5589                 }
   5590 
   5591                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   5592                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
   5593                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
   5594                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
   5595                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
   5596                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
   5597                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
   5598                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
   5599 
   5600                 /* o1[0-3] */
   5601                 {
   5602                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5603                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5604                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5605                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5606 
   5607                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5608                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5609 
   5610                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
   5611 
   5612                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5613                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5614                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5615                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5616 
   5617                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5618                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5619 
   5620                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5621 
   5622                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5623 
   5624                     m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
   5625                     m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
   5626 
   5627                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5628                     m_count = _mm_cvtsi32_si128(i4_shift);
   5629                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5630                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5631 
   5632                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5633                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5634                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5635                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5636 
   5637                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5638 
   5639                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5640                     pi2_dst_scratch += out_stride;
   5641 
   5642                 }
   5643 
   5644                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   5645                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
   5646                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
   5647                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
   5648                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
   5649                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
   5650                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
   5651                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
   5652 
   5653                 /* o2[0-3] */
   5654                 {
   5655                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5656                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5657                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5658                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5659 
   5660                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   5661                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5662 
   5663                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5664 
   5665                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5666                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5667                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5668                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5669 
   5670                     m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
   5671                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5672 
   5673                     m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
   5674 
   5675                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5676 
   5677                     m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
   5678                     m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
   5679 
   5680                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5681                     m_count = _mm_cvtsi32_si128(i4_shift);
   5682                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5683                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5684 
   5685                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5686                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5687                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5688                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5689 
   5690                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5691 
   5692                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5693                     pi2_dst_scratch += out_stride;
   5694 
   5695                 }
   5696 
   5697                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   5698                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
   5699                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
   5700                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
   5701                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
   5702                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
   5703                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
   5704                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
   5705 
   5706                 /* o3[0-3] */
   5707                 {
   5708                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5709                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5710                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5711                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5712 
   5713                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   5714                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5715 
   5716                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5717 
   5718                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5719                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5720                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5721                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5722 
   5723                     m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
   5724                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5725 
   5726                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5727 
   5728                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5729 
   5730                     m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
   5731                     m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
   5732 
   5733                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5734                     m_count = _mm_cvtsi32_si128(i4_shift);
   5735                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5736                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5737 
   5738                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5739                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5740                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5741                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5742 
   5743                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5744 
   5745                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5746                     pi2_dst_scratch += out_stride;
   5747 
   5748                 }
   5749 
   5750                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   5751                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
   5752                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
   5753                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
   5754                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
   5755                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
   5756                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
   5757                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
   5758 
   5759                 /* o4[0-3] */
   5760                 {
   5761                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5762                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5763                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5764                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5765 
   5766                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5767                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5768 
   5769                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5770 
   5771                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5772                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5773                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5774                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5775 
   5776                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5777                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5778 
   5779                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5780 
   5781                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5782 
   5783                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
   5784                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
   5785                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5786                     m_count = _mm_cvtsi32_si128(i4_shift);
   5787                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5788                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5789 
   5790                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5791                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5792                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5793                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5794 
   5795                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5796 
   5797                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5798                     pi2_dst_scratch += out_stride;
   5799 
   5800                 }
   5801 
   5802                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
   5803                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
   5804                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
   5805                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
   5806                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
   5807                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
   5808                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
   5809                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
   5810 
   5811                 /* o5[0-3] */
   5812                 {
   5813                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5814                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5815                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5816                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5817 
   5818                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5819                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5820 
   5821                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5822 
   5823                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5824                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5825                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5826                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5827 
   5828                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5829                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5830 
   5831                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5832 
   5833                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5834 
   5835                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
   5836                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
   5837 
   5838                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5839                     m_count = _mm_cvtsi32_si128(i4_shift);
   5840                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5841                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5842 
   5843                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5844                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5845                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5846                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5847 
   5848                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5849 
   5850                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5851                     pi2_dst_scratch += out_stride;
   5852 
   5853                 }
   5854 
   5855                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
   5856                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
   5857                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
   5858                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
   5859                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
   5860                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
   5861                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
   5862                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
   5863 
   5864                 /* o6[0-3] */
   5865                 {
   5866                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5867                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5868                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5869                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5870 
   5871                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5872                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5873 
   5874                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5875 
   5876                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5877                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5878                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5879                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5880 
   5881                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5882                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5883 
   5884                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5885 
   5886                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5887 
   5888                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
   5889                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
   5890 
   5891                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5892                     m_count = _mm_cvtsi32_si128(i4_shift);
   5893                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5894                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5895 
   5896                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5897                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5898                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5899                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5900 
   5901                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5902 
   5903                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5904                     pi2_dst_scratch += out_stride;
   5905 
   5906                 }
   5907 
   5908                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   5909                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
   5910                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
   5911                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
   5912                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
   5913                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
   5914                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
   5915                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
   5916 
   5917                 /* o7[0-3] */
   5918                 {
   5919                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5920                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5921                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5922                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5923 
   5924                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5925                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5926 
   5927                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5928 
   5929                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5930                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5931                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5932                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5933 
   5934                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5935                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5936 
   5937                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5938 
   5939                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5940 
   5941                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
   5942                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
   5943 
   5944                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5945                     m_count = _mm_cvtsi32_si128(i4_shift);
   5946                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5947                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5948 
   5949                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5950                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5951                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5952                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5953 
   5954                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5955 
   5956                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5957                     pi2_dst_scratch += 8;
   5958 
   5959                 }
   5960 
   5961                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   5962                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
   5963                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
   5964                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
   5965                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
   5966                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
   5967                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
   5968                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
   5969 
   5970                 /* o8[0-3] */
   5971                 {
   5972                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5973                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5974                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5975                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5976 
   5977                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5978                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5979 
   5980                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5981 
   5982                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5983                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5984                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5985                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5986 
   5987                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5988                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5989 
   5990                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5991 
   5992                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5993 
   5994                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
   5995                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
   5996 
   5997                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5998                     m_count = _mm_cvtsi32_si128(i4_shift);
   5999                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6000                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6001 
   6002                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6003                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6004                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6005                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6006 
   6007                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6008 
   6009                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6010                     pi2_dst_scratch += out_stride;
   6011                 }
   6012 
   6013                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   6014                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
   6015                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
   6016                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
   6017                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
   6018                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
   6019                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
   6020                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
   6021 
   6022                 /* o9[0-3] */
   6023                 {
   6024                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6025                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6026                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6027                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6028 
   6029                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6030                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6031 
   6032                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6033 
   6034                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6035                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6036                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6037                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6038 
   6039                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6040                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6041 
   6042                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6043 
   6044                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6045 
   6046                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
   6047                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
   6048 
   6049                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6050                     m_count = _mm_cvtsi32_si128(i4_shift);
   6051                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6052                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6053 
   6054                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6055                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6056                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6057                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6058 
   6059                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6060 
   6061                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6062                     pi2_dst_scratch += out_stride;
   6063                 }
   6064 
   6065                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   6066                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
   6067                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
   6068                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
   6069                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
   6070                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
   6071                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
   6072                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
   6073 
   6074                 /* o10[0-3] */
   6075                 {
   6076                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6077                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6078                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6079                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6080 
   6081                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6082                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6083 
   6084                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6085 
   6086                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6087                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6088                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6089                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6090 
   6091                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6092                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6093 
   6094                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6095 
   6096                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6097 
   6098                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
   6099                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
   6100 
   6101                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6102                     m_count = _mm_cvtsi32_si128(i4_shift);
   6103                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6104                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6105 
   6106                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6107                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6108                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6109                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6110 
   6111                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6112 
   6113                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6114                     pi2_dst_scratch += out_stride;
   6115                 }
   6116 
   6117 
   6118                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   6119                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
   6120                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
   6121                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
   6122                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
   6123                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
   6124                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
   6125                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
   6126 
   6127                 /* o11[0-3] */
   6128                 {
   6129                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6130                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6131                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6132                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6133 
   6134                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6135                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6136 
   6137                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6138 
   6139                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6140                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6141                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6142                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6143 
   6144                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6145                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6146 
   6147                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6148 
   6149                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6150 
   6151                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
   6152                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
   6153 
   6154                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6155                     m_count = _mm_cvtsi32_si128(i4_shift);
   6156                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6157                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6158 
   6159                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6160                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6161                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6162                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6163 
   6164                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6165 
   6166                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6167                     pi2_dst_scratch += out_stride;
   6168 
   6169                 }
   6170 
   6171                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   6172                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
   6173                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
   6174                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
   6175                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
   6176                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
   6177                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
   6178                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
   6179 
   6180                 /* o12[0-3] */
   6181                 {
   6182                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6183                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6184                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6185                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6186 
   6187                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6188                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6189 
   6190                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6191 
   6192                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6193                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6194                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6195                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6196 
   6197                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6198                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6199 
   6200                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6201 
   6202                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6203 
   6204                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
   6205                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
   6206 
   6207                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6208                     m_count = _mm_cvtsi32_si128(i4_shift);
   6209                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6210                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6211 
   6212                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6213                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6214                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6215                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6216 
   6217                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6218 
   6219                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6220                     pi2_dst_scratch += out_stride;
   6221 
   6222                 }
   6223 
   6224                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   6225                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
   6226                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
   6227                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
   6228                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
   6229                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
   6230                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
   6231                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
   6232 
   6233                 /* o13[0-3] */
   6234                 {
   6235                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6236                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6237                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6238                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6239 
   6240                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6241                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6242 
   6243                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6244 
   6245                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6246                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6247                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6248                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6249 
   6250                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6251                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6252 
   6253                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6254 
   6255                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6256 
   6257                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
   6258                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
   6259 
   6260                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6261                     m_count = _mm_cvtsi32_si128(i4_shift);
   6262                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6263                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6264 
   6265                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6266                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6267                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6268                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6269 
   6270                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6271 
   6272                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6273                     pi2_dst_scratch += out_stride;
   6274                 }
   6275 
   6276                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   6277                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
   6278                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
   6279                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
   6280                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
   6281                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
   6282                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
   6283                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
   6284 
   6285                 /* o14[0-3] */
   6286                 {
   6287                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6288                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6289                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6290                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6291 
   6292                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6293                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6294 
   6295                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6296 
   6297                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6298                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6299                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6300                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6301 
   6302                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6303                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6304 
   6305                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6306 
   6307                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6308 
   6309                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
   6310                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
   6311 
   6312                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6313                     m_count = _mm_cvtsi32_si128(i4_shift);
   6314                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6315                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6316 
   6317                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6318                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6319                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6320                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6321 
   6322                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6323 
   6324                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6325                     pi2_dst_scratch += out_stride;
   6326 
   6327                 }
   6328 
   6329                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   6330                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
   6331                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
   6332                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
   6333                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
   6334                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
   6335                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
   6336                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
   6337 
   6338                 /* o15[0-3] */
   6339                 {
   6340                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6341                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6342                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6343                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6344 
   6345                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6346                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6347 
   6348                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6349 
   6350                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6351                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6352                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6353                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6354 
   6355                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6356                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6357 
   6358                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6359 
   6360                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6361 
   6362                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
   6363                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
   6364 
   6365                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6366                     m_count = _mm_cvtsi32_si128(i4_shift);
   6367                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6368                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6369 
   6370                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6371                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6372                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6373                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6374 
   6375                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6376 
   6377                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6378                     pi2_dst_scratch += 8;
   6379                 }
   6380 
   6381             }
   6382 
   6383         }
   6384 
   6385         /* Transpose */
   6386         {
   6387 
   6388             WORD16 *pi2_src_scratch = temp_ptr;
   6389             WORD32 out_stride = dst_strd;
   6390             WORD32 in_stride = 8;
   6391 
   6392             m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6393             pi2_src_scratch += in_stride;
   6394             m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6395             pi2_src_scratch += in_stride;
   6396             m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6397             pi2_src_scratch += in_stride;
   6398             m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6399             pi2_src_scratch += in_stride;
   6400             m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6401             pi2_src_scratch += in_stride;
   6402             m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6403             pi2_src_scratch += in_stride;
   6404             m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6405             pi2_src_scratch += in_stride;
   6406             m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6407             pi2_src_scratch += 8;
   6408 
   6409             m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6410             pi2_src_scratch += in_stride;
   6411             m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6412             pi2_src_scratch += in_stride;
   6413             m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6414             pi2_src_scratch += in_stride;
   6415             m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6416             pi2_src_scratch += in_stride;
   6417             m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6418             pi2_src_scratch += in_stride;
   6419             m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6420             pi2_src_scratch += in_stride;
   6421             m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6422             pi2_src_scratch += in_stride;
   6423             m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6424             pi2_src_scratch += 8;
   6425 
   6426 
   6427             m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
   6428             m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
   6429 
   6430             m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
   6431             m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
   6432 
   6433             m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
   6434             m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
   6435 
   6436             m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
   6437             m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
   6438 
   6439             m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
   6440             m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
   6441 
   6442             m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
   6443             m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
   6444 
   6445             m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
   6446             m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
   6447 
   6448             m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
   6449             m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
   6450 
   6451 
   6452             m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
   6453             m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
   6454 
   6455             m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
   6456             m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
   6457 
   6458             m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
   6459             m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
   6460 
   6461             m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
   6462             m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
   6463 
   6464             m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
   6465             m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
   6466 
   6467             m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
   6468             m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
   6469 
   6470             m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
   6471             m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
   6472 
   6473             m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
   6474             m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
   6475 
   6476 
   6477             m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);       // row0 = 0-7
   6478             m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);       // row1 = 0-7
   6479 
   6480             m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);     // row0=24-31
   6481             m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);     // row1=24-31
   6482 
   6483             m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);       // row0=8-15
   6484             m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);       // row1=8-15
   6485 
   6486             m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);     // row0=16-23
   6487             m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);     // row1=16-23
   6488 
   6489             m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);      // row2 =0-7
   6490             m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);      // row3 =0-7
   6491 
   6492             m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);    // row2=24-31
   6493             m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);    // row3=24-31
   6494 
   6495             m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);      // row2=8-15
   6496             m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);      // row3=8-15
   6497 
   6498             m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);    // row2=16-23
   6499             m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);    // row3=16-23
   6500 
   6501             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
   6502 
   6503             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6504 
   6505             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
   6506             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6507 
   6508             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6509 
   6510             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
   6511             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6512 
   6513             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   6514 
   6515             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
   6516 
   6517             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6518 
   6519             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
   6520             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6521 
   6522             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6523 
   6524             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
   6525             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6526 
   6527             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
   6528             pu1_dst += out_stride;
   6529             pu1_pred += pred_strd;
   6530 
   6531 
   6532             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
   6533 
   6534             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6535 
   6536             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
   6537             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6538 
   6539             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6540 
   6541             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
   6542             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6543 
   6544             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   6545 
   6546             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
   6547 
   6548             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6549 
   6550             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
   6551             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6552 
   6553             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6554 
   6555             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
   6556             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6557 
   6558             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
   6559             pu1_dst += out_stride;
   6560             pu1_pred += pred_strd;
   6561 
   6562             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
   6563 
   6564             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6565 
   6566             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
   6567             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6568 
   6569             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6570 
   6571             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
   6572             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6573 
   6574             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   6575 
   6576             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
   6577 
   6578             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6579 
   6580             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
   6581             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6582 
   6583             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6584 
   6585             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
   6586             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6587 
   6588             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
   6589             pu1_dst += out_stride;
   6590             pu1_pred += pred_strd;
   6591 
   6592 
   6593             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
   6594 
   6595             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6596 
   6597             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
   6598             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6599 
   6600             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6601 
   6602             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
   6603             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6604 
   6605             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   6606 
   6607             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
   6608 
   6609             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6610 
   6611             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
   6612             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6613 
   6614             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6615 
   6616             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
   6617             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6618 
   6619             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
   6620             pu1_dst += out_stride;
   6621             pu1_pred += pred_strd;
   6622 
   6623         }
   6624         pi2_tmp += 4;
   6625     }
   6626 }
   6627 
   6628 
   6629