Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19  *******************************************************************************
     20  * @file
     21  *  ihevc_32x32_itrans_recon_x86_intr.c
     22  *
     23  * @brief
     24  *  Contains function definitions for inverse  quantization, inverse
     25  * transform and reconstruction
     26  *
     27  * @author
     28  *  100470
     29  *
     30  * @par List of Functions:
     31  *  - ihevc_itrans_recon_32x32_sse42()
     32  *
     33  * @remarks
     34  *  None
     35  *
     36  *******************************************************************************
     37  */
     38 #include <stdio.h>
     39 #include <string.h>
     40 #include "ihevc_typedefs.h"
     41 #include "ihevc_platform_macros.h"
     42 #include "ihevc_macros.h"
     43 #include "ihevc_defs.h"
     44 #include "ihevc_trans_tables.h"
     45 #include "ihevc_iquant_itrans_recon.h"
     46 #include "ihevc_func_selector.h"
     47 #include "ihevc_trans_macros.h"
     48 
     49 #include <emmintrin.h>
     50 #include <smmintrin.h>
     51 #include <tmmintrin.h>
     52 
     53 /**
     54  *******************************************************************************
     55  *
     56  * @brief
     57  *  This function performs inverse transform and reconstruction for a
     58  * 32x32 input block
     59  *
     60  * @par Description:
     61  *  Performs the inverse transform, adds the prediction data and clips
     62  * the output to 8 bit
     63  *
     64  * @param[in] pi2_src
     65  *  Input 32x32 coefficients
     66  *
     67  * @param[in] pi2_tmp
     68  *  Temporary buffer for scratch data and inverse transform 1st stage
     69  *  output (the code offsets into it by 1024 and 2048 WORD16 elements)
     70  *
     71  * @param[in] pu1_pred
     72  *  Prediction 32x32 block
     73  *
     74  * @param[in] pi2_dequant_coeff
     75  *  Dequant Coeffs (NOTE: not present in this function's parameter list)
     76  *
     77  * @param[out] pu1_dst
     78  *  Output 32x32 block
     79  *
     80  * @param[in] qp_div
     81  *  Quantization parameter / 6 (NOTE: not present in this function's parameter list)
     82  *
     83  * @param[in] qp_rem
     84  *  Quantization parameter % 6 (NOTE: not present in this function's parameter list)
     85  *
     86  * @param[in] src_strd
     87  *  Input stride
     88  *
     89  * @param[in] pred_strd
     90  *  Prediction stride
     91  *
     92  * @param[in] dst_strd
     93  *  Output Stride
     94  *
     95  * @param[in] zero_cols
     96  *  Bitmask of zero columns in pi2_src; zero_rows (last parameter) is the matching bitmask for rows
     97  *
     98  * @returns  Void
     99  *
    100  * @remarks
    101  *  None
    102  *
    103  *******************************************************************************
    104  */
    105 /**/
    106 
    107 void ihevc_itrans_recon_32x32_sse42(WORD16 *pi2_src,
    108                                     WORD16 *pi2_tmp,
    109                                     UWORD8 *pu1_pred,
    110                                     UWORD8 *pu1_dst,
    111                                     WORD32 src_strd,
    112                                     WORD32 pred_strd,
    113                                     WORD32 dst_strd,
    114                                     WORD32 zero_cols,
    115                                     WORD32 zero_rows)
    116 {
    117     /* Inverse Transform */
    118 
    119     WORD32 j;
    120 
    121 
    122     WORD16 *pi2_tmp_orig;
    123 
    124 
    125     WORD16 *o_temp_ptr;
    126     WORD16 *temp_ptr;
    127 
    128     __m128i m_temp_reg_0;
    129     __m128i m_temp_reg_1;
    130     __m128i m_temp_reg_2;
    131     __m128i m_temp_reg_3;
    132     __m128i m_temp_reg_4;
    133     __m128i m_temp_reg_5;
    134     __m128i m_temp_reg_6;
    135     __m128i m_temp_reg_7;
    136     __m128i m_temp_reg_10;
    137     __m128i m_temp_reg_11;
    138     __m128i m_temp_reg_12;
    139     __m128i m_temp_reg_13;
    140     __m128i m_temp_reg_14;
    141     __m128i m_temp_reg_15;
    142     __m128i m_temp_reg_16;
    143     __m128i m_temp_reg_17;
    144     __m128i m_temp_reg_18;
    145     __m128i m_temp_reg_19;
    146     __m128i m_temp_reg_20;
    147     __m128i m_temp_reg_21;
    148     __m128i m_temp_reg_22;
    149     __m128i m_temp_reg_23;
    150     __m128i m_temp_reg_30;
    151     __m128i m_temp_reg_31;
    152     __m128i m_temp_reg_32;
    153     __m128i m_temp_reg_33;
    154     __m128i m_temp_reg_34;
    155     __m128i m_temp_reg_35;
    156     __m128i m_temp_reg_36;
    157     __m128i m_temp_reg_37;
    158     __m128i m_temp_reg_40;
    159     __m128i m_temp_reg_41;
    160     __m128i m_temp_reg_42;
    161     __m128i m_temp_reg_43;
    162     __m128i m_temp_reg_44;
    163     __m128i m_temp_reg_45;
    164     __m128i m_temp_reg_46;
    165     __m128i m_temp_reg_47;
    166 
    167     __m128i m_temp_reg_70;
    168     __m128i m_temp_reg_71;
    169     __m128i m_temp_reg_72;
    170     __m128i m_temp_reg_73;
    171     __m128i m_temp_reg_74;
    172     __m128i m_temp_reg_75;
    173     __m128i m_temp_reg_76;
    174     __m128i m_temp_reg_77;
    175 
    176     __m128i m_temp_reg_80;
    177     __m128i m_temp_reg_81;
    178     __m128i m_temp_reg_82;
    179     __m128i m_temp_reg_83;
    180     __m128i m_temp_reg_84;
    181     __m128i m_temp_reg_85;
    182     __m128i m_temp_reg_86;
    183     __m128i m_temp_reg_87;
    184 
    185     __m128i m_temp_reg_90;
    186     __m128i m_temp_reg_91;
    187     __m128i m_temp_reg_92;
    188     __m128i m_temp_reg_93;
    189     __m128i m_temp_reg_94;
    190     __m128i m_temp_reg_95;
    191     __m128i m_temp_reg_96;
    192     __m128i m_temp_reg_97;
    193 
    194     __m128i m_rdng_factor;
    195     __m128i m_count;
    196     __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
    197     __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
    198 
    199     __m128i temp1, temp2, temp3, temp4;
    200     __m128i temp5, temp6, temp7, temp8;
    201 
    202     __m128i all_zero_reg;
    203     WORD32 i;
    204 
    205     /*Lokesh*/
    206     WORD32  zero_last24_cols_stg1;
    207     WORD32  zero_last24_rows_stg1;
    208     WORD32  zero_last28_rows_stg1;
    209 
    210     WORD32  zero_last28_rows_stg2;
    211     WORD32  zero_last24_rows_stg2;
    212 
    213     WORD32  trans_size_stg1;
    214 
    215     WORD32 i4_shift = IT_SHIFT_STAGE_1;
    216     WORD32 trans_size = TRANS_SIZE_32;
    217 
    218 
    219     /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
    220     zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
    221     zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
    222     zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
    223 
    224     zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
    225     zero_last24_rows_stg2 = zero_last24_cols_stg1;
    226 
    227     if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
    228     {
    229         trans_size_stg1 = 8;
    230 
    231     }
    232     else
    233     {
    234         trans_size_stg1 = 32;
    235     }
    236 
    237     all_zero_reg = _mm_setzero_si128();
    238 
    239     o_temp_ptr  = pi2_tmp;
    240     temp_ptr = (pi2_tmp + 1024);
    241 
    242     pi2_tmp += 2048;
    243     pi2_tmp_orig = pi2_tmp;
    244 
    245     for(i = 0; i < trans_size_stg1; i += 8)
    246     {
    247 
    248         {
    249             WORD16 *pi2_tmp_src = pi2_src;
    250 
    251             m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    252             pi2_tmp_src += (src_strd << 1);
    253             m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    254             pi2_tmp_src += (src_strd << 1);
    255             m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    256             pi2_tmp_src += (src_strd << 1);
    257             m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    258             pi2_tmp_src += (src_strd << 1);
    259             m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    260             pi2_tmp_src += (src_strd << 1);
    261             m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    262             pi2_tmp_src += (src_strd << 1);
    263             m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    264             pi2_tmp_src += (src_strd << 1);
    265             m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    266             pi2_tmp_src += (src_strd << 1);
    267 
    268             m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    269             pi2_tmp_src += (src_strd << 1);
    270             m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    271             pi2_tmp_src += (src_strd << 1);
    272             m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    273             pi2_tmp_src += (src_strd << 1);
    274             m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    275             pi2_tmp_src += (src_strd << 1);
    276             m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    277             pi2_tmp_src += (src_strd << 1);
    278             m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    279             pi2_tmp_src += (src_strd << 1);
    280             m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    281             pi2_tmp_src += (src_strd << 1);
    282             m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    283         }
    284 
    285         if(zero_last28_rows_stg1)
    286         {
    287             /* eeo */
    288             /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
    289             /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
    290             {
    291                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
    292 
    293                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
    294 
    295                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    296 
    297 /* eeeo[0]= m_temp_reg_20  */
    298 /* eeeo[1]= m_temp_reg_21  */
    299 /* eeee[0]= m_temp_reg_22  */
    300 /* eeee[1]= m_temp_reg_23  */
    301 
    302                 /* eee[0] = eeee[0] + eeeo[0]; */
    303                 m_temp_reg_40 = m_temp_reg_14;
    304 
    305                 /* eee[3] = eeee[0] - eeeo[0]; */
    306                 m_temp_reg_43 = m_temp_reg_14;
    307 
    308                 /* eee[2] = eeee[1] - eeeo[1]; */
    309                 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
    310 
    311                 /* eee[1] = eeee[1] + eeeo[1];*/
    312                 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
    313 
    314                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
    315 
    316                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
    317 
    318                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    319 
    320 /* eeeo[0]= m_temp_reg_20  */
    321 /* eeeo[1]= m_temp_reg_21  */
    322 /* eeee[0]= m_temp_reg_22  */
    323 /* eeee[1]= m_temp_reg_23  */
    324 
    325                 /* eee[0] = eeee[0] + eeeo[0]; */
    326                 m_temp_reg_44 = m_temp_reg_14;
    327 
    328                 /* eee[3] = eeee[0] - eeeo[0]; */
    329                 m_temp_reg_47 = m_temp_reg_14;
    330 
    331                 /* eee[2] = eeee[1] - eeeo[1]; */
    332                 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
    333 
    334                 /* eee[1] = eeee[1] + eeeo[1];*/
    335                 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
    336 
    337 
    338             }
    339             /* eo */
    340             {
    341                 WORD16 *pi2_scratch = o_temp_ptr;
    342 
    343                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
    344                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
    345                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
    346                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
    347                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
    348                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
    349                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
    350                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
    351 
    352                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
    353 
    354                 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
    355 
    356                 /* eo0[0-3] */
    357                 {
    358                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    359 
    360                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
    361 
    362                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    363                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    364 
    365                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    366                     pi2_scratch += 8;
    367                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    368                     pi2_scratch += 8;
    369 
    370                 }
    371 
    372                 /* eo0[4-7] */
    373                 {
    374                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    375 
    376                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
    377                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
    378 
    379                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    380                     pi2_scratch += 8;
    381                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    382                     pi2_scratch += 8;
    383 
    384                 }
    385                 /* eo1[0-3] */
    386                 {
    387                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
    388 
    389                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
    390                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
    391 
    392                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    393                     pi2_scratch += 8;
    394                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    395                     pi2_scratch += 8;
    396 
    397                 }
    398 
    399                 /* eo1[4-7] */
    400                 {
    401                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
    402 
    403                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
    404                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
    405 
    406                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    407                     pi2_scratch += 8;
    408                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    409                     pi2_scratch += 8;
    410 
    411                 }
    412 
    413                 /* eo2[0-3] */
    414                 {
    415                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    416 
    417                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
    418                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
    419 
    420                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    421                     pi2_scratch += 8;
    422                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    423                     pi2_scratch += 8;
    424 
    425                 }
    426 
    427                 /* eo2[4-7] */
    428                 {
    429                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
    430 
    431                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
    432                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
    433 
    434                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    435                     pi2_scratch += 8;
    436                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    437                     pi2_scratch += 8;
    438 
    439                 }
    440 
    441                 /**************************************************************************/
    442 
    443 
    444                 /* eo3[0-3] */
    445                 {
    446                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
    447 
    448                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
    449                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
    450 
    451                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    452                     pi2_scratch += 8;
    453                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    454                     pi2_scratch += 8;
    455 
    456                 }
    457 
    458                 /* eo3[4-7] */
    459                 {
    460                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
    461 
    462                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
    463                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
    464 
    465                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    466                     pi2_scratch += 8;
    467                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    468                     pi2_scratch += 8;
    469 
    470                 }
    471 
    472 
    473                 /* eo4[0-3] */
    474                 {
    475                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
    476 
    477                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
    478                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
    479 
    480                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    481                     pi2_scratch += 8;
    482                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    483                     pi2_scratch += 8;
    484 
    485                 }
    486                 /* eo4[4-7] */
    487                 {
    488                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
    489 
    490                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
    491                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
    492 
    493                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    494                     pi2_scratch += 8;
    495                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    496                     pi2_scratch += 8;
    497 
    498                 }
    499 
    500                 /***********************************************************************/
    501 
    502                 /* eo5[0-3] */
    503                 {
    504                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
    505 
    506                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
    507                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
    508 
    509                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    510                     pi2_scratch += 8;
    511                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    512                     pi2_scratch += 8;
    513 
    514                 }
    515 
    516 
    517                 /* eo5[4-7] */
    518                 {
    519                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
    520 
    521                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
    522                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
    523 
    524                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    525                     pi2_scratch += 8;
    526                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    527                     pi2_scratch += 8;
    528 
    529                 }
    530 
    531                 /* eo6[0-3] */
    532                 {
    533                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
    534 
    535                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
    536                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
    537 
    538                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    539                     pi2_scratch += 8;
    540                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    541                     pi2_scratch += 8;
    542 
    543                 }
    544 
    545 
    546                 /* eo6[4-7] */
    547                 {
    548                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
    549 
    550                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
    551                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
    552 
    553                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    554                     pi2_scratch += 8;
    555                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    556                     pi2_scratch += 8;
    557 
    558                 }
    559 
    560 
    561                 /* eo7[0-3] */
    562                 {
    563                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
    564 
    565                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    566                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    567 
    568                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    569                     pi2_scratch += 8;
    570                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    571                     pi2_scratch += 8;
    572 
    573                 }
    574 
    575 
    576                 /* eo7[4-7] */
    577                 {
    578                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
    579 
    580                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
    581                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
    582 
    583                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    584                     pi2_scratch += 8;
    585                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    586                     pi2_scratch += 8;
    587 
    588                 }
    589 
    590             }
    591         }
    592         else if(zero_last24_rows_stg1)
    593         {
    594             {
    595                 /* eeo */
    596                 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
    597                 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
    598 
    599                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
    600                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
    601 
    602                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
    603 
    604                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
    605 
    606                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    607 
    608                 /* eeeo[0]= m_temp_reg_20  */
    609                 /* eeeo[1]= m_temp_reg_21  */
    610                 /* eeee[0]= m_temp_reg_22  */
    611                 /* eeee[1]= m_temp_reg_23  */
    612 
    613                 /* eee[0] = eeee[0] + eeeo[0]; */
    614                 m_temp_reg_40 = m_temp_reg_14;
    615 
    616                 /* eee[3] = eeee[0] - eeeo[0]; */
    617                 m_temp_reg_43 = m_temp_reg_14;
    618 
    619                 /* eee[2] = eeee[1] - eeeo[1]; */
    620                 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
    621 
    622                 /* eee[1] = eeee[1] + eeeo[1];*/
    623                 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
    624 
    625                 /* for row 4 to 7 */
    626 
    627                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
    628 
    629                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
    630 
    631                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    632 
    633                 /* eeeo[0]= m_temp_reg_20  */
    634                 /* eeeo[1]= m_temp_reg_21  */
    635                 /* eeee[0]= m_temp_reg_22  */
    636                 /* eeee[1]= m_temp_reg_23  */
    637 
    638                 /* eee[0] = eeee[0] + eeeo[0]; */
    639                 m_temp_reg_44 = m_temp_reg_14;
    640 
    641                 /* eee[3] = eeee[0] - eeeo[0]; */
    642                 m_temp_reg_47 = m_temp_reg_14;
    643 
    644                 /* eee[2] = eeee[1] - eeeo[1]; */
    645                 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
    646 
    647                 /* eee[1] = eeee[1] + eeeo[1];*/
    648                 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
    649 
    650 
    651                 // eeo[]
    652                 /* for(k = 0; k < 4; k++) */
    653 
    654                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
    655                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
    656                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
    657                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
    658 
    659                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
    660 
    661                 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
    662 
    663                 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
    664 
    665                 m_temp_reg_33 = _mm_setzero_si128();
    666 
    667                 /* eeo */
    668                 {
    669                     /* eeo0[0-3] */
    670                     {
    671                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    672 
    673                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    674                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    675 
    676                         m_temp_reg_90 = m_temp_reg_34;
    677                         m_temp_reg_97 = m_temp_reg_35;
    678                     }
    679                     /* eeo0[4-7] */
    680                     {
    681                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    682 
    683                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
    684                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
    685 
    686                         m_temp_reg_91 = m_temp_reg_34;
    687                         m_temp_reg_96 = m_temp_reg_35;
    688 
    689                     }
    690 
    691                     /* eeo1[0-3] */
    692                     {
    693                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
    694 
    695                         /* e[1][0-3] stored in pi2_tmp[2][0-7] */
    696                         /* e[6][0-3] stored in pi2_tmp[2][8-15] */
    697                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
    698                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
    699 
    700                         m_temp_reg_92 = m_temp_reg_34;
    701                         m_temp_reg_95 = m_temp_reg_35;
    702 
    703                     }
    704 
    705                     /* eo1[4-7] */
    706                     {
    707                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
    708 
    709                         /* e[1][4-7] stored in pi2_tmp[3][0-7] */
    710                         /* e[6][4-7] stored in pi2_tmp[3][8-15] */
    711                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
    712                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
    713 
    714                         m_temp_reg_93 = m_temp_reg_34;
    715                         m_temp_reg_94 = m_temp_reg_35;
    716 
    717 
    718                     }
    719 
    720                     /* eo2[0-3] */
    721                     {
    722                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
    723 
    724                         /* e[2][0-3] stored in pi2_tmp[4][0-7] */
    725                         /* e[5][0-3] stored in pi2_tmp[4][8-15] */
    726                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
    727                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
    728 
    729                         temp1 = m_temp_reg_34;
    730                         temp7 = m_temp_reg_35;
    731 
    732                     }
    733 
    734                     /* eo2[4-7] */
    735                     {
    736                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
    737 
    738                         /* e[2][4-7] stored in pi2_tmp[5][0-7] */
    739                         /* e[5][4-7] stored in pi2_tmp[5][8-15] */
    740                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
    741                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
    742 
    743                         temp2 = m_temp_reg_34;
    744                         temp6 = m_temp_reg_35;
    745 
    746                     }
    747 
     748                     /* eeo3[0-3] */
    749                     {
    750                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    751 
    752                         /* e[3][0-3] stored in pi2_tmp[6][0-7] */
    753                         /* e[4][0-3] stored in pi2_tmp[6][8-15] */
    754                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
    755                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
    756 
    757                         temp3 = m_temp_reg_34;
    758                         temp5 = m_temp_reg_35;
    759 
    760                     }
    761 
    762 
     763                     /* eeo3[4-7] */
    764                     {
    765                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
    766 
    767                         /* e[3][4-7] stored in pi2_tmp[7][0-7] */
    768                         /* e[4][4-7] stored in pi2_tmp[7][8-15] */
    769                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
    770                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
    771 
    772                         temp4 = m_temp_reg_34;
    773                         temp8 = m_temp_reg_35;
    774 
    775 
    776                     }
    777                     /* All values of ee[] array in pi2_temp */
    778 
    779                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
    780                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
    781                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
    782                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
    783 
    784                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    785 
    786                     m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
    787                     m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
    788 
    789                 }
    790             }
    791             /* eo */
    792             {
    793 
    794                 WORD16 *pi2_scratch = o_temp_ptr;
    795 
    796                 /* eo0[0-3] */
    797                 {
    798                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    799 
    800                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
    801                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
    802 
    803                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    804                     pi2_scratch += 8;
    805                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    806                     pi2_scratch += 8;
    807 
    808                 }
    809 
    810 
    811                 /* eo0[4-7] */
    812                 {
    813                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    814 
    815                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    816 
    817                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
    818                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
    819 
    820                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    821                     pi2_scratch += 8;
    822                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    823                     pi2_scratch += 8;
    824 
    825                 }
    826 
    827                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
    828 
    829                 /* eo1[0-3] */
    830                 {
    831                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    832 
    833                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
    834                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
    835 
    836                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    837                     pi2_scratch += 8;
    838                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    839                     pi2_scratch += 8;
    840 
    841                 }
    842 
    843 
    844                 /* eo1[4-7] */
    845                 {
    846                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    847 
    848                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
    849                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
    850 
    851                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    852                     pi2_scratch += 8;
    853                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    854                     pi2_scratch += 8;
    855 
    856                 }
    857 
    858                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
    859 
    860                 /* eo2[0-3] */
    861                 {
    862 
    863                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    864 
    865                     m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
    866                     m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
    867 
    868                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    869                     pi2_scratch += 8;
    870                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    871                     pi2_scratch += 8;
    872 
    873                 }
    874 
    875                 /* eo2[4-7] */
    876                 {
    877 
    878                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    879 
    880                     m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
    881                     m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
    882 
    883                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    884                     pi2_scratch += 8;
    885                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    886                     pi2_scratch += 8;
    887 
    888                 }
    889 
    890                 /**************************************************************************/
    891 
    892 
    893 
    894                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
    895 
    896                 /* eo3[0-3] */
    897                 {
    898 
    899                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    900 
    901                     m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
    902                     m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
    903 
    904                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    905                     pi2_scratch += 8;
    906                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    907                     pi2_scratch += 8;
    908 
    909                 }
    910 
    911 
    912                 /* eo3[4-7] */
    913                 {
    914 
    915                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    916 
    917                     m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
    918                     m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
    919 
    920                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    921                     pi2_scratch += 8;
    922                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    923                     pi2_scratch += 8;
    924 
    925                 }
    926 
    927                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
    928 
    929                 /* eo4[0-3] */
    930                 {
    931                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    932 
    933                     m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
    934                     m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
    935 
    936                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    937                     pi2_scratch += 8;
    938                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    939                     pi2_scratch += 8;
    940 
    941                 }
    942                 /* eo4[4-7] */
    943                 {
    944                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    945 
    946                     m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
    947                     m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
    948 
    949                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    950                     pi2_scratch += 8;
    951                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    952                     pi2_scratch += 8;
    953 
    954                 }
    955 
    956                 /***********************************************************************/
    957 
    958                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
    959 
    960                 /* eo5[0-3] */
    961                 {
    962 
    963                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    964 
    965                     m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
    966                     m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
    967 
    968                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    969                     pi2_scratch += 8;
    970                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    971                     pi2_scratch += 8;
    972 
    973                 }
    974 
    975 
    976                 /* eo5[4-7] */
    977                 {
    978                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
    979 
    980                     m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
    981                     m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
    982 
    983                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    984                     pi2_scratch += 8;
    985                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    986                     pi2_scratch += 8;
    987 
    988                 }
    989 
    990                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
    991 
    992                 /* eo6[0-3] */
    993                 {
    994                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    995 
    996                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
    997                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
    998 
    999                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1000                     pi2_scratch += 8;
   1001                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1002                     pi2_scratch += 8;
   1003 
   1004                 }
   1005 
   1006 
   1007                 /* eo6[4-7] */
   1008                 {
   1009 
   1010                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1011 
   1012                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
   1013                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
   1014 
   1015                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1016                     pi2_scratch += 8;
   1017                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1018                     pi2_scratch += 8;
   1019 
   1020                 }
   1021 
   1022                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
   1023 
   1024                 /* eo7[0-3] */
   1025                 {
   1026 
   1027                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1028 
   1029                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
   1030                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
   1031 
   1032                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1033                     pi2_scratch += 8;
   1034                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1035                     pi2_scratch += 8;
   1036 
   1037                 }
   1038 
   1039 
   1040                 /* eo7[4-7] */
   1041                 {
   1042                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1043 
   1044                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
   1045                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
   1046 
   1047                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1048                     pi2_scratch += 8;
   1049                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1050                     pi2_scratch += 8;
   1051 
   1052                 }
   1053 
   1054             }
   1055 
   1056         }
   1057         else
   1058         {
   1059 
   1060             {
   1061                 /* eeo */
    1062                 /* eeeo[0] in m_temp_reg_20, eeee[0] in m_temp_reg_21 */
    1063                 /* eeeo[1] in m_temp_reg_22, eeee[1] in m_temp_reg_23 */
   1064 
   1065                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
   1066                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
   1067 
   1068                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
   1069                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
   1070 
   1071                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
   1072 
   1073                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
   1074 
   1075                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
   1076                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
   1077 
   1078                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
   1079                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
   1080 
   1081 
    1082                 /* eeeo[0]= m_temp_reg_20  */
    1083                 /* eeee[0]= m_temp_reg_21  */
    1084                 /* eeeo[1]= m_temp_reg_22  */
    1085                 /* eeee[1]= m_temp_reg_23  */
   1086 
   1087                 /* eee[0] = eeee[0] + eeeo[0]; */
   1088                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
   1089 
   1090                 /* eee[3] = eeee[0] - eeeo[0]; */
   1091                 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
   1092 
   1093                 /* eee[2] = eeee[1] - eeeo[1]; */
   1094                 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
   1095 
   1096                 /* eee[1] = eeee[1] + eeeo[1];*/
   1097                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
   1098 
   1099                 /* for row 4 to 7 */
   1100 
   1101                 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
   1102                 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
   1103 
   1104                 /* Interleaving row 8 and row 24*/
   1105                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
   1106 
   1107                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
   1108                 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
   1109 
   1110                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
   1111 
   1112                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
   1113                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
   1114 
   1115                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
   1116                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
   1117 
   1118 
    1119                 /* eeeo[0]= m_temp_reg_20  */
    1120                 /* eeee[0]= m_temp_reg_21  */
    1121                 /* eeeo[1]= m_temp_reg_22  */
    1122                 /* eeee[1]= m_temp_reg_23  */
   1123 
   1124                 /* eee[0] = eeee[0] + eeeo[0]; */
   1125                 m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
   1126 
   1127                 /* eee[3] = eeee[0] - eeeo[0]; */
   1128                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
   1129 
   1130                 /* eee[2] = eeee[1] - eeeo[1]; */
   1131                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
   1132 
   1133                 /* eee[1] = eeee[1] + eeeo[1];*/
   1134                 m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
   1135 
   1136 
   1137                 // eeo[]
   1138                 /* for(k = 0; k < 4; k++) */
   1139 
   1140                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
   1141                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
   1142 
   1143                 /* eeo */
   1144                 {
   1145                     /* eeo0[0-3] */
   1146                     {
   1147                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1148                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
   1149 
   1150                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1151                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1152 
   1153                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1154 
   1155                         m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1156                         m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1157 
   1158                     }
   1159 
   1160                     m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
   1161                     m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
   1162                     m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
   1163                     m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
   1164 
   1165                     /* eeo0[4-7] */
   1166                     {
   1167                         m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   1168                         m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
   1169 
   1170                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1171                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1172 
   1173                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1174 
   1175                         m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
   1176                         m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
   1177 
   1178                     }
   1179 
   1180 
   1181                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
   1182                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
   1183 
   1184                     /* eeo1[0-3] */
   1185                     {
   1186                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   1187                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   1188 
   1189                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
   1190                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
   1191 
   1192                         m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
   1193                         m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
   1194 
   1195                     }
   1196 
   1197                     /* eeo1[4-7] */
   1198                     {
   1199 
   1200                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   1201                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
   1202 
   1203                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
   1204                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
   1205 
   1206                         m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
   1207                         m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
   1208 
   1209 
   1210                     }
   1211 
   1212                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
   1213                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
   1214 
   1215                     /* eeo2[0-3] */
   1216                     {
   1217 
   1218                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   1219                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   1220 
   1221                         /* e[2][0-3] stored in pi2_tmp[4][0-7] */
   1222                         /* e[5][0-3] stored in pi2_tmp[4][8-15] */
   1223 
   1224                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
   1225                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
   1226 
   1227                         temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
   1228                         temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
   1229 
   1230                     }
   1231 
   1232                     /* eeo2[4-7] */
   1233                     {
   1234 
   1235                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   1236                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
   1237 
   1238                         /* e[2][4-7] stored in pi2_tmp[5][0-7] */
   1239                         /* e[5][4-7] stored in pi2_tmp[5][8-15] */
   1240 
   1241                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
   1242                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
   1243 
   1244                         temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
   1245                         temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
   1246 
   1247                     }
   1248 
   1249                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
   1250                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
   1251 
   1252                     /* eeo3[0-3] */
   1253                     {
   1254 
   1255                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   1256                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   1257 
   1258                         /* e[3][0-3] stored in pi2_tmp[6][0-7] */
   1259                         /* e[4][0-3] stored in pi2_tmp[6][8-15] */
   1260 
   1261                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
   1262                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
   1263 
   1264                         temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
   1265                         temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
   1266 
   1267 
   1268                     }
   1269 
   1270                     /* eeo3[4-7] */
   1271                     {
   1272 
   1273                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   1274                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
   1275 
   1276                         /* e[3][4-7] stored in pi2_tmp[7][0-7] */
   1277                         /* e[4][4-7] stored in pi2_tmp[7][8-15] */
   1278 
   1279                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
   1280                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
   1281                         temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
   1282                         temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
   1283 
   1284                     }
   1285 
   1286 
   1287                     /* All values of ee[] array in pi2_temp */
   1288 
   1289                     /* for(k = 0; k < 8; k++) */
   1290                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
   1291                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
   1292                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
   1293                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
   1294                 }
   1295             }
   1296             /* eo */
   1297             {
   1298 
   1299                 WORD16 *pi2_scratch = o_temp_ptr;
   1300 
   1301                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1302                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   1303                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
   1304                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
   1305 
   1306                 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
   1307                 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
   1308                 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
   1309                 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
   1310 
   1311                 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
   1312                 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
   1313                 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
   1314                 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
   1315 
   1316                 /* eo0[0-3] */
   1317                 {
   1318                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1319                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1320 
   1321                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1322 
   1323                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1324                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1325 
   1326                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1327 
   1328                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1329 
   1330                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
   1331                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
   1332 
   1333                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1334                     pi2_scratch += 8;
   1335                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1336                     pi2_scratch += 8;
   1337 
   1338                 }
   1339                 /* eo0[4-7] */
   1340                 {
   1341                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1342                     m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   1343                     m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
   1344                     m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
   1345 
   1346                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1347                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1348 
   1349                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1350 
   1351                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1352                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1353 
   1354                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1355 
   1356                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1357 
   1358                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
   1359                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
   1360 
   1361                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1362                     pi2_scratch += 8;
   1363                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1364                     pi2_scratch += 8;
   1365 
   1366                 }
   1367 
   1368                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
   1369                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
   1370                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
   1371                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
   1372 
   1373                 /* eo1[0-3] */
   1374                 {
   1375 
   1376                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1377                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1378 
   1379                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1380 
   1381                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1382                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1383 
   1384                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1385 
   1386                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
   1387 
   1388                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
   1389                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
   1390 
   1391                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1392                     pi2_scratch += 8;
   1393                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1394                     pi2_scratch += 8;
   1395 
   1396                 }
   1397 
   1398                 /* eo1[4-7] */
   1399                 {
   1400                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1401                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1402 
   1403                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1404 
   1405                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1406                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1407 
   1408                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1409 
   1410                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
   1411 
   1412                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
   1413                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
   1414 
   1415                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1416                     pi2_scratch += 8;
   1417                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1418                     pi2_scratch += 8;
   1419 
   1420                 }
   1421 
   1422                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
   1423                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
   1424                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
   1425                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
   1426 
   1427                 /* eo2[0-3] */
   1428                 {
   1429                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1430                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1431 
   1432                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
   1433 
   1434                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1435                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1436 
   1437                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1438 
   1439                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1440 
   1441                     m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
   1442                     m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
   1443 
   1444                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1445                     pi2_scratch += 8;
   1446                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1447                     pi2_scratch += 8;
   1448 
   1449                 }
   1450 
   1451 
   1452                 /* eo2[4-7] */
   1453                 {
   1454 
   1455                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1456                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1457 
   1458                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
   1459 
   1460                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1461                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1462 
   1463                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1464 
   1465                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1466 
   1467                     m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
   1468                     m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
   1469 
   1470                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1471                     pi2_scratch += 8;
   1472                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1473                     pi2_scratch += 8;
   1474 
   1475                 }
   1476                 /**************************************************************************/
   1477 
   1478                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
   1479                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
   1480                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
   1481                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
   1482 
   1483                 /* eo3[0-3] */
   1484                 {
   1485                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1486                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1487 
   1488                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1489 
   1490                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1491                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1492 
   1493                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
   1494 
   1495                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1496 
   1497                     m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
   1498                     m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
   1499 
   1500                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1501                     pi2_scratch += 8;
   1502                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1503                     pi2_scratch += 8;
   1504 
   1505                 }
   1506 
   1507 
   1508                 /* eo3[4-7] */
   1509                 {
   1510                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1511                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1512 
   1513                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1514 
   1515                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1516                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1517 
   1518                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
   1519 
   1520                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1521 
   1522                     m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
   1523                     m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
   1524 
   1525                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1526                     pi2_scratch += 8;
   1527                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1528                     pi2_scratch += 8;
   1529 
   1530                 }
   1531 
   1532                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
   1533                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
   1534                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
   1535                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
   1536 
   1537                 /* eo4[0-3] */
   1538                 {
   1539 
   1540                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1541                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1542 
   1543                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1544 
   1545                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1546                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1547 
   1548                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
   1549 
   1550                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1551 
   1552                     m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
   1553                     m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
   1554 
   1555                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1556                     pi2_scratch += 8;
   1557                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1558                     pi2_scratch += 8;
   1559 
   1560                 }
   1561 
   1562 
   1563                 /* eo4[4-7] */
   1564                 {
   1565                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1566                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1567 
   1568                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1569 
   1570                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1571                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1572 
   1573                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
   1574 
   1575                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1576 
   1577                     m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
   1578                     m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
   1579 
   1580                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1581                     pi2_scratch += 8;
   1582                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1583                     pi2_scratch += 8;
   1584 
   1585                 }
   1586 
   1587                 /***********************************************************************/
   1588 
   1589                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
   1590                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
   1591                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
   1592                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
   1593 
   1594                 /* eo5[0-3] */
   1595                 {
   1596                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1597                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1598 
   1599                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1600 
   1601                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1602                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1603 
   1604                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1605 
   1606                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1607 
   1608                     m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
   1609                     m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
   1610 
   1611                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1612                     pi2_scratch += 8;
   1613                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1614                     pi2_scratch += 8;
   1615 
   1616                 }
   1617 
   1618 
   1619                 /* eo5[4-7] */
   1620                 {
   1621                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1622                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1623 
   1624                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1625 
   1626                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1627                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1628 
   1629                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1630 
   1631                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1632 
   1633                     m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
   1634                     m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
   1635 
   1636                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1637                     pi2_scratch += 8;
   1638                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1639                     pi2_scratch += 8;
   1640 
   1641                 }
   1642 
   1643                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
   1644                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
   1645                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
   1646                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
   1647 
   1648                 /* eo6[0-3] */
   1649                 {
   1650 
   1651                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1652                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1653 
   1654                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1655 
   1656                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1657                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1658 
   1659                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1660 
   1661                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1662 
   1663                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
   1664                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
   1665 
   1666                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1667                     pi2_scratch += 8;
   1668                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1669                     pi2_scratch += 8;
   1670 
   1671                 }
   1672 
   1673 
   1674                 /* eo6[4-7] */
   1675                 {
   1676                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1677                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1678 
   1679                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1680 
   1681                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1682                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1683 
   1684                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1685 
   1686                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1687 
   1688                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
   1689                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
   1690 
   1691                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1692                     pi2_scratch += 8;
   1693                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1694                     pi2_scratch += 8;
   1695 
   1696                 }
   1697 
   1698                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
   1699                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
   1700                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
   1701                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
   1702 
   1703                 /* eo7[0-3] */
   1704                 {
   1705 
   1706                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1707                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1708 
   1709                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1710 
   1711                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1712                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1713 
   1714                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1715 
   1716                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1717 
   1718                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
   1719                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
   1720 
   1721                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1722                     pi2_scratch += 8;
   1723                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1724                     pi2_scratch += 8;
   1725 
   1726                 }
   1727 
   1728 
   1729                 /* eo7[4-7] */
   1730                 {
   1731 
   1732                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1733                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
   1734 
   1735                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   1736 
   1737                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
   1738                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
   1739 
   1740                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   1741 
   1742                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   1743 
   1744                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
   1745                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
   1746 
   1747                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1748                     pi2_scratch += 8;
   1749                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1750                     pi2_scratch += 8;
   1751 
   1752                 }
   1753 
   1754             }
   1755 
   1756         }
   1757         /*  All e[] are done */
   1758         /****************************/
   1759 
   1760         {
   1761 
   1762             WORD16 *pi2_tmp_src = pi2_src + src_strd;
   1763 
   1764             m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1765             pi2_tmp_src += (src_strd << 1);
   1766             m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1767             pi2_tmp_src += (src_strd << 1);
   1768             m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1769             pi2_tmp_src += (src_strd << 1);
   1770             m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1771             pi2_tmp_src += (src_strd << 1);
   1772             m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1773             pi2_tmp_src += (src_strd << 1);
   1774             m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1775             pi2_tmp_src += (src_strd << 1);
   1776             m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1777             pi2_tmp_src += (src_strd << 1);
   1778             m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1779             pi2_tmp_src += (src_strd << 1);
   1780 
   1781             m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1782             pi2_tmp_src += (src_strd << 1);
   1783             m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1784             pi2_tmp_src += (src_strd << 1);
   1785             m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1786             pi2_tmp_src += (src_strd << 1);
   1787             m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1788             pi2_tmp_src += (src_strd << 1);
   1789             m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1790             pi2_tmp_src += (src_strd << 1);
   1791             m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1792             pi2_tmp_src += (src_strd << 1);
   1793             m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1794             pi2_tmp_src += (src_strd << 1);
   1795             m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
   1796         }
   1797 
   1798         if(zero_last28_rows_stg1)
   1799         {
   1800             /* o & stage 1 out */
   1801             {
   1802                 WORD32 j;
   1803                 WORD16 *pi2_src_scratch = o_temp_ptr;
   1804                 WORD16 *pi2_dst_scratch = temp_ptr;
   1805                 WORD32 out_stride = (trans_size << 1);
   1806                 WORD32 in_stride = trans_size;
   1807 
   1808                 for(j = 0; j < 2; j++)
   1809                 {
   1810                     if(j)
   1811                     {
   1812                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
   1813                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
   1814                     }
   1815 
   1816                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
   1817 
   1818                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   1819 
   1820                     /* o0[0-3] */
   1821                     {
   1822                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1823 
   1824                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1825                         pi2_src_scratch += in_stride;
   1826 
   1827                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1828                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1829 
   1830                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1831                         m_count = _mm_cvtsi32_si128(i4_shift);
   1832                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1833                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1834 
   1835                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1836                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1837                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1838                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1839 
   1840                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1841 
   1842                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1843                         pi2_dst_scratch += out_stride;
   1844 
   1845                     }
   1846 
   1847                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   1848 
   1849                     /* o1[0-3] */
   1850                     {
   1851 
   1852                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1853 
   1854                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1855                         pi2_src_scratch += in_stride;
   1856 
   1857                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1858                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1859 
   1860                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1861                         m_count = _mm_cvtsi32_si128(i4_shift);
   1862                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1863                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1864 
   1865                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1866                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1867                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1868                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1869 
   1870                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1871 
   1872                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1873                         pi2_dst_scratch += out_stride;
   1874 
   1875                     }
   1876 
   1877                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   1878 
   1879                     /* o2[0-3] */
   1880                     {
   1881 
   1882                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1883 
   1884                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1885                         pi2_src_scratch += in_stride;
   1886 
   1887                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1888                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1889 
   1890                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1891                         m_count = _mm_cvtsi32_si128(i4_shift);
   1892                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1893                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1894 
   1895                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1896                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1897                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1898                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1899 
   1900                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1901 
   1902                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1903                         pi2_dst_scratch += out_stride;
   1904 
   1905                     }
   1906 
   1907                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   1908 
   1909                     /* o3[0-3] */
   1910                     {
   1911                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1912 
   1913                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1914                         pi2_src_scratch += in_stride;
   1915 
   1916                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1917                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1918 
   1919                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1920                         m_count = _mm_cvtsi32_si128(i4_shift);
   1921                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1922                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1923 
   1924                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1925                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1926                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1927                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1928 
   1929                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1930 
   1931                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1932                         pi2_dst_scratch += out_stride;
   1933 
   1934                     }
   1935 
   1936                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   1937 
   1938                     /* o4[0-3] */
   1939                     {
   1940                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1941 
   1942                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1943                         pi2_src_scratch += in_stride;
   1944 
   1945                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1946                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1947 
   1948                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1949                         m_count = _mm_cvtsi32_si128(i4_shift);
   1950                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1951                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1952 
   1953                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1954                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1955                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1956                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1957 
   1958                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1959 
   1960                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1961                         pi2_dst_scratch += out_stride;
   1962 
   1963                     }
   1964 
   1965                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
   1966 
   1967                     /* o5[0-3] */
   1968                     {
   1969 
   1970                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1971 
   1972                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1973                         pi2_src_scratch += in_stride;
   1974 
   1975                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1976                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1977 
   1978                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1979                         m_count = _mm_cvtsi32_si128(i4_shift);
   1980                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1981                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1982 
   1983                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1984                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1985                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1986                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1987 
   1988                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1989 
   1990                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1991                         pi2_dst_scratch += out_stride;
   1992 
   1993                     }
   1994 
   1995                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
   1996 
   1997                     /* o6[0-3] */
   1998                     {
   1999                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2000 
   2001                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2002                         pi2_src_scratch += in_stride;
   2003 
   2004                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2005                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2006 
   2007                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2008                         m_count = _mm_cvtsi32_si128(i4_shift);
   2009                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2010                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2011 
   2012                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2013                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2014                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2015                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2016 
   2017                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2018 
   2019                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2020                         pi2_dst_scratch += out_stride;
   2021 
   2022                     }
   2023 
   2024                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   2025 
   2026                     /* o7[0-3] */
   2027                     {
   2028 
   2029                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2030 
   2031                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2032                         pi2_src_scratch += 8;
   2033 
   2034                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2035                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2036 
   2037                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2038                         m_count = _mm_cvtsi32_si128(i4_shift);
   2039                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2040                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2041 
   2042                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2043                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2044                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2045                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2046 
   2047                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2048 
   2049                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2050                         pi2_dst_scratch += 8;
   2051 
   2052                     }
   2053 
   2054                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   2055 
   2056                     /* o8[0-3] */
   2057                     {
   2058                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2059 
   2060                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2061                         pi2_src_scratch -= in_stride;
   2062 
   2063                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2064                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2065 
   2066                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2067                         m_count = _mm_cvtsi32_si128(i4_shift);
   2068                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2069                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2070 
   2071                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2072                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2073                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2074                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2075 
   2076                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2077 
   2078                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2079                         pi2_dst_scratch -= out_stride;
   2080                     }
   2081 
   2082                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   2083 
   2084                     /* o9[0-3] */
   2085                     {
   2086                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2087 
   2088                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2089                         pi2_src_scratch -= in_stride;
   2090 
   2091                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2092                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2093 
   2094                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2095                         m_count = _mm_cvtsi32_si128(i4_shift);
   2096                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2097                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2098 
   2099                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2100                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2101                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2102                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2103 
   2104                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2105 
   2106                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2107                         pi2_dst_scratch -= out_stride;
   2108                     }
   2109 
   2110                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   2111 
   2112                     /* o10[0-3] */
   2113                     {
   2114                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2115 
   2116                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2117                         pi2_src_scratch -= in_stride;
   2118 
   2119                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2120                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2121 
   2122                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2123                         m_count = _mm_cvtsi32_si128(i4_shift);
   2124                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2125                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2126 
   2127                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2128                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2129                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2130                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2131 
   2132                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2133 
   2134                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2135                         pi2_dst_scratch -= out_stride;
   2136                     }
   2137 
   2138                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   2139 
   2140                     /* o11[0-3] */
   2141                     {
   2142                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2143 
   2144                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2145                         pi2_src_scratch -= in_stride;
   2146 
   2147                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2148                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2149 
   2150                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2151                         m_count = _mm_cvtsi32_si128(i4_shift);
   2152                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2153                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2154 
   2155                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2156                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2157                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2158                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2159 
   2160                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2161 
   2162                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2163                         pi2_dst_scratch -= out_stride;
   2164 
   2165                     }
   2166 
   2167                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   2168 
   2169                     /* o12[0-3] */
   2170                     {
   2171                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2172 
   2173                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2174                         pi2_src_scratch -= in_stride;
   2175 
   2176                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2177                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2178 
   2179                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2180                         m_count = _mm_cvtsi32_si128(i4_shift);
   2181                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2182                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2183 
   2184                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2185                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2186                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2187                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2188 
   2189                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2190 
   2191                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2192                         pi2_dst_scratch -= out_stride;
   2193 
   2194                     }
   2195 
   2196                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   2197 
   2198                     /* o13[0-3] */
   2199                     {
   2200                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2201 
   2202                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2203                         pi2_src_scratch -= in_stride;
   2204 
   2205                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2206                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2207 
   2208                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2209                         m_count = _mm_cvtsi32_si128(i4_shift);
   2210                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2211                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2212 
   2213                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2214                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2215                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2216                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2217 
   2218                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2219 
   2220                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2221                         pi2_dst_scratch -= out_stride;
   2222                     }
   2223 
   2224                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   2225 
   2226                     /* o14[0-3] */
   2227                     {
   2228                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2229 
   2230                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2231                         pi2_src_scratch -= in_stride;
   2232 
   2233                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2234                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2235 
   2236                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2237                         m_count = _mm_cvtsi32_si128(i4_shift);
   2238                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2239                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2240 
   2241                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2242                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2243                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2244                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2245 
   2246                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2247 
   2248                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2249                         pi2_dst_scratch -= out_stride;
   2250 
   2251                     }
   2252 
   2253                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   2254 
   2255                     /* o15[0-3] */
   2256                     {
   2257                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2258 
   2259                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2260                         pi2_src_scratch += 8;
   2261 
   2262                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2263                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2264 
   2265                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2266                         m_count = _mm_cvtsi32_si128(i4_shift);
   2267                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2268                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2269 
   2270                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2271                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2272                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2273                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2274 
   2275                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2276 
   2277                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2278                         pi2_dst_scratch += 8;
   2279                     }
   2280 
   2281                 }
   2282             }
   2283         }
   2284         else if(zero_last24_rows_stg1)
   2285         {
   2286             /* o & stage 1 out */
   2287             {
   2288                 WORD32 j;
   2289 
   2290                 WORD16 *pi2_src_scratch = o_temp_ptr;
   2291                 WORD16 *pi2_dst_scratch = temp_ptr;
   2292                 WORD32 out_stride = (trans_size << 1);
   2293 
   2294                 WORD32 in_stride = trans_size;
   2295 
   2296                 for(j = 0; j < 2; j++)
   2297                 {
   2298                     if(j)
   2299                     {
   2300                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
   2301                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
   2302                         m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
   2303                         m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
   2304                     }
   2305 
   2306                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
   2307                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
   2308 
   2309                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   2310                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
   2311 
   2312                     /* o0[0-3] */
   2313                     {
   2314 
   2315                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2316                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2317 
   2318                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2319 
   2320                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2321                         pi2_src_scratch += in_stride;
   2322 
   2323                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2324                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2325 
   2326                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2327                         m_count = _mm_cvtsi32_si128(i4_shift);
   2328                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2329                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2330 
   2331                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2332                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2333                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2334                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2335 
   2336                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2337 
   2338                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2339                         pi2_dst_scratch += out_stride;
   2340 
   2341                     }
   2342 
   2343                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   2344                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
   2345 
   2346                     /* o1[0-3] */
   2347                     {
   2348                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2349                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2350 
   2351                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2352 
   2353                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2354                         pi2_src_scratch += in_stride;
   2355 
   2356                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2357                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2358 
   2359                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2360                         m_count = _mm_cvtsi32_si128(i4_shift);
   2361                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2362                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2363 
   2364                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2365                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2366                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2367                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2368 
   2369                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2370 
   2371                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2372                         pi2_dst_scratch += out_stride;
   2373 
   2374                     }
   2375 
   2376                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   2377                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
   2378 
   2379                     /* o2[0-3] */
   2380                     {
   2381                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2382                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2383 
   2384                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   2385 
   2386                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2387                         pi2_src_scratch += in_stride;
   2388 
   2389                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2390                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2391 
   2392                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2393                         m_count = _mm_cvtsi32_si128(i4_shift);
   2394                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2395                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2396 
   2397                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2398                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2399                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2400                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2401 
   2402                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2403 
   2404                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2405                         pi2_dst_scratch += out_stride;
   2406 
   2407                     }
   2408 
   2409                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   2410                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
   2411 
   2412                     /* o3[0-3] */
   2413                     {
   2414                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2415                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2416 
   2417                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   2418 
   2419                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2420                         pi2_src_scratch += in_stride;
   2421 
   2422                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2423                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2424 
   2425                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2426                         m_count = _mm_cvtsi32_si128(i4_shift);
   2427                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2428                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2429 
   2430                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2431                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2432                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2433                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2434 
   2435                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2436 
   2437                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2438                         pi2_dst_scratch += out_stride;
   2439 
   2440                     }
   2441 
   2442                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   2443                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
   2444 
   2445                     /* o4[0-3] */
   2446                     {
   2447                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2448                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2449 
   2450                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2451 
   2452                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2453                         pi2_src_scratch += in_stride;
   2454 
   2455                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2456                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2457 
   2458                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2459                         m_count = _mm_cvtsi32_si128(i4_shift);
   2460                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2461                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2462 
   2463                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2464                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2465                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2466                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2467 
   2468                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2469 
   2470                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2471                         pi2_dst_scratch += out_stride;
   2472 
   2473                     }
   2474 
   2475                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
   2476                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
   2477 
   2478                     /* o5[0-3] */
   2479                     {
   2480                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2481                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2482 
   2483                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2484 
   2485                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2486                         pi2_src_scratch += in_stride;
   2487 
   2488                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2489                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2490 
   2491                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2492                         m_count = _mm_cvtsi32_si128(i4_shift);
   2493                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2494                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2495 
   2496                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2497                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2498                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2499                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2500 
   2501                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2502 
   2503                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2504                         pi2_dst_scratch += out_stride;
   2505 
   2506                     }
   2507 
   2508                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
   2509                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
   2510 
   2511                     /* o6[0-3] */
   2512                     {
   2513                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2514                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2515 
   2516                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2517 
   2518                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2519                         pi2_src_scratch += in_stride;
   2520 
   2521                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2522                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2523 
   2524                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2525                         m_count = _mm_cvtsi32_si128(i4_shift);
   2526                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2527                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2528 
   2529                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2530                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2531                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2532                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2533 
   2534                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2535 
   2536                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2537                         pi2_dst_scratch += out_stride;
   2538 
   2539                     }
   2540 
   2541                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   2542                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
   2543 
   2544                     /* o7[0-3] */
   2545                     {
   2546                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2547                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2548 
   2549                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2550 
   2551                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2552                         pi2_src_scratch += 8;
   2553 
   2554                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2555                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2556 
   2557                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2558                         m_count = _mm_cvtsi32_si128(i4_shift);
   2559                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2560                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2561 
   2562                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2563                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2564                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2565                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2566 
   2567                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2568 
   2569                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2570                         pi2_dst_scratch += 8;
   2571 
   2572                     }
   2573 
   2574                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   2575                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
   2576 
   2577                     /* o8[0-3] */
   2578                     {
   2579                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2580                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2581 
   2582                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2583 
   2584                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2585                         pi2_src_scratch -= in_stride;
   2586 
   2587                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2588                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2589 
   2590                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2591                         m_count = _mm_cvtsi32_si128(i4_shift);
   2592                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2593                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2594 
   2595                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2596                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2597                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2598                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2599 
   2600                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2601 
   2602                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2603                         pi2_dst_scratch -= out_stride;
   2604                     }
   2605 
   2606                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   2607                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
   2608 
   2609                     /* o9[0-3] */
   2610                     {
   2611                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2612                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2613 
   2614                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2615 
   2616                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2617                         pi2_src_scratch -= in_stride;
   2618 
   2619                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2620                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2621 
   2622                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2623                         m_count = _mm_cvtsi32_si128(i4_shift);
   2624                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2625                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2626 
   2627                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2628                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2629                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2630                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2631 
   2632                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2633 
   2634                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2635                         pi2_dst_scratch -= out_stride;
   2636                     }
   2637 
   2638                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   2639                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
   2640 
   2641                     /* o10[0-3] */
   2642                     {
   2643                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2644                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2645 
   2646                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2647 
   2648                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2649                         pi2_src_scratch -= in_stride;
   2650 
   2651                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2652                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2653 
   2654                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2655                         m_count = _mm_cvtsi32_si128(i4_shift);
   2656                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2657                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2658 
   2659                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2660                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2661                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2662                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2663 
   2664                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2665 
   2666                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2667                         pi2_dst_scratch -= out_stride;
   2668                     }
   2669 
   2670                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   2671                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
   2672 
   2673                     /* o11[0-3] */
   2674                     {
   2675 
   2676                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2677                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2678 
   2679                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2680 
   2681                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2682                         pi2_src_scratch -= in_stride;
   2683 
   2684                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2685                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2686 
   2687                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2688                         m_count = _mm_cvtsi32_si128(i4_shift);
   2689                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2690                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2691 
   2692                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2693                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2694                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2695                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2696 
   2697                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2698 
   2699                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2700                         pi2_dst_scratch -= out_stride;
   2701 
   2702                     }
   2703 
   2704                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   2705                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
   2706 
   2707                     /* o12[0-3] */
   2708                     {
   2709                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2710                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2711 
   2712                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2713 
   2714                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2715                         pi2_src_scratch -= in_stride;
   2716 
   2717                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2718                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2719 
   2720                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2721                         m_count = _mm_cvtsi32_si128(i4_shift);
   2722                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2723                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2724 
   2725                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2726                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2727                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2728                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2729 
   2730                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2731 
   2732                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2733                         pi2_dst_scratch -= out_stride;
   2734 
   2735                     }
   2736 
   2737                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   2738                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
   2739 
   2740                     /* o13[0-3] */
   2741                     {
   2742                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2743                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2744 
   2745                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2746 
   2747                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2748                         pi2_src_scratch -= in_stride;
   2749 
   2750                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2751                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2752 
   2753                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2754                         m_count = _mm_cvtsi32_si128(i4_shift);
   2755                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2756                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2757 
   2758                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2759                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2760                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2761                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2762 
   2763                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2764 
   2765                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2766                         pi2_dst_scratch -= out_stride;
   2767                     }
   2768 
   2769                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   2770                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
   2771 
   2772                     /* o14[0-3] */
   2773                     {
   2774                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2775                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2776 
   2777                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2778 
   2779                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2780                         pi2_src_scratch -= in_stride;
   2781 
   2782                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2783                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2784 
   2785                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2786                         m_count = _mm_cvtsi32_si128(i4_shift);
   2787                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2788                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2789 
   2790                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2791                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2792                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2793                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2794 
   2795                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2796 
   2797                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2798                         pi2_dst_scratch -= out_stride;
   2799 
   2800                     }
   2801 
   2802                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   2803                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
   2804 
   2805                     /* o15[0-3] */
   2806                     {
   2807                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2808                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2809 
   2810                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2811 
   2812                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2813                         pi2_src_scratch += 8;
   2814 
   2815                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2816                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2817 
   2818                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2819                         m_count = _mm_cvtsi32_si128(i4_shift);
   2820                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2821                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2822 
   2823                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2824                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2825                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2826                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2827 
   2828                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2829 
   2830                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2831                         pi2_dst_scratch += 8;
   2832                     }
   2833 
   2834                 }
   2835             }
   2836         }
   2837         else
   2838         {
   2839             /* o & stage 1 out */
   2840             {
   2841                 WORD32 j;
   2842 
   2843                 WORD16 *pi2_src_scratch = o_temp_ptr;
   2844                 WORD16 *pi2_dst_scratch = temp_ptr;
   2845                 WORD32 out_stride = (trans_size << 1);
   2846 
   2847                 WORD32 in_stride = trans_size;
   2848 
   2849 
   2850                 for(j = 0; j < 2; j++)
   2851                 {
   2852                     if(j)
   2853                     {
   2854                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
   2855                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
   2856                         m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
   2857                         m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
   2858                         m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
   2859                         m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
   2860                         m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
   2861                         m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
   2862 
   2863                         m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
   2864                         m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
   2865                         m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
   2866                         m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
   2867                         m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
   2868                         m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
   2869                         m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
   2870                         m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
   2871                     }
   2872 
   2873                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   2874                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
   2875                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
   2876                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
   2877                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
   2878                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
   2879                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
   2880                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
   2881 
   2882                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
   2883                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
   2884                     m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
   2885                     m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
   2886                     temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
   2887                     temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
   2888                     temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
   2889                     temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
   2890 
   2891 
   2892                     /* o0[0-3] */
   2893                     {
   2894                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2895                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2896                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   2897                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   2898 
   2899                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2900                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   2901 
   2902                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   2903 
   2904                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   2905                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   2906                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   2907                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   2908 
   2909                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   2910                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   2911 
   2912                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   2913 
   2914                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   2915 
   2916                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2917                         pi2_src_scratch += in_stride;
   2918 
   2919                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2920                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2921 
   2922                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2923                         m_count = _mm_cvtsi32_si128(i4_shift);
   2924                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2925                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2926 
   2927                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2928                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2929                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2930                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2931 
   2932                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2933 
   2934                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2935                         pi2_dst_scratch += out_stride;
   2936 
   2937                     }
   2938 
   2939                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   2940                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
   2941                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
   2942                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
   2943                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
   2944                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
   2945                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
   2946                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
   2947 
   2948 
   2949                     /* o1[0-3] */
   2950                     {
   2951                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2952                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2953                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   2954                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   2955 
   2956                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2957                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   2958 
   2959                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
   2960 
   2961                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   2962                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   2963                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   2964                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   2965 
   2966                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   2967                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   2968 
   2969                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   2970 
   2971                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   2972 
   2973                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2974                         pi2_src_scratch += in_stride;
   2975 
   2976                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2977                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2978 
   2979                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2980                         m_count = _mm_cvtsi32_si128(i4_shift);
   2981                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   2982                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   2983 
   2984                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2985                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2986                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2987                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2988 
   2989                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2990 
   2991                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2992                         pi2_dst_scratch += out_stride;
   2993 
   2994                     }
   2995 
   2996                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   2997                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
   2998                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
   2999                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
   3000                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
   3001                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
   3002                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
   3003                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
   3004 
   3005                     /* o2[0-3] */
   3006                     {
   3007                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3008                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3009                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3010                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3011 
   3012                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   3013                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3014 
   3015                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3016 
   3017                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3018                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3019                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3020                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3021 
   3022                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
   3023                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3024 
   3025                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
   3026 
   3027                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3028 
   3029                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3030                         pi2_src_scratch += in_stride;
   3031 
   3032                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3033                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3034 
   3035                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3036                         m_count = _mm_cvtsi32_si128(i4_shift);
   3037                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3038                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3039 
   3040                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3041                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3042                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3043                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3044 
   3045                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3046 
   3047                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3048                         pi2_dst_scratch += out_stride;
   3049 
   3050                     }
   3051 
   3052 
   3053                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   3054                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
   3055                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
   3056                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
   3057                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
   3058                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
   3059                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
   3060                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
   3061 
   3062                     /* o3[0-3] */
   3063                     {
   3064                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3065                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3066                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3067                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3068 
   3069                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   3070                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3071 
   3072                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3073 
   3074                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3075                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3076                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3077                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3078 
   3079                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
   3080                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3081 
   3082                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3083 
   3084                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3085 
   3086                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3087                         pi2_src_scratch += in_stride;
   3088 
   3089                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3090                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3091 
   3092                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3093                         m_count = _mm_cvtsi32_si128(i4_shift);
   3094                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3095                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3096 
   3097                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3098                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3099                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3100                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3101 
   3102                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3103 
   3104                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3105                         pi2_dst_scratch += out_stride;
   3106 
   3107                     }
   3108 
   3109                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   3110                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
   3111                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
   3112                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
   3113                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
   3114                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
   3115                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
   3116                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
   3117 
   3118                     /* o4[0-3] */
   3119                     {
   3120                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3121                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3122                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3123                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3124 
   3125                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3126                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3127 
   3128                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3129 
   3130                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3131                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3132                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3133                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3134 
   3135                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3136                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3137 
   3138                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3139 
   3140                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3141 
   3142                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3143                         pi2_src_scratch += in_stride;
   3144 
   3145                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3146                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3147 
   3148                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3149                         m_count = _mm_cvtsi32_si128(i4_shift);
   3150                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3151                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3152 
   3153                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3154                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3155                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3156                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3157 
   3158                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3159 
   3160                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3161                         pi2_dst_scratch += out_stride;
   3162 
   3163                     }
   3164 
   3165 
   3166                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
   3167                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
   3168                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
   3169                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
   3170                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
   3171                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
   3172                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
   3173                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
   3174 
   3175                     /* o5[0-3] */
   3176                     {
   3177                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3178                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3179                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3180                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3181 
   3182                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3183                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3184 
   3185                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3186 
   3187                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3188                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3189                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3190                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3191 
   3192                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3193                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3194 
   3195                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3196 
   3197                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3198 
   3199                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3200                         pi2_src_scratch += in_stride;
   3201 
   3202                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3203                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3204 
   3205                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3206                         m_count = _mm_cvtsi32_si128(i4_shift);
   3207                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3208                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3209 
   3210                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3211                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3212                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3213                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3214 
   3215                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3216 
   3217                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3218                         pi2_dst_scratch += out_stride;
   3219 
   3220                     }
   3221 
   3222                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
   3223                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
   3224                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
   3225                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
   3226                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
   3227                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
   3228                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
   3229                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
   3230 
   3231 
   3232                     /* o6[0-3] */
   3233                     {
   3234                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3235                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3236                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3237                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3238 
   3239                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3240                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3241 
   3242                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3243 
   3244                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3245                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3246                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3247                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3248 
   3249                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3250                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3251 
   3252                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3253 
   3254                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3255 
   3256                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3257                         pi2_src_scratch += in_stride;
   3258 
   3259                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3260                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3261 
   3262                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3263                         m_count = _mm_cvtsi32_si128(i4_shift);
   3264                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3265                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3266 
   3267                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3268                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3269                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3270                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3271 
   3272                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3273 
   3274                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3275                         pi2_dst_scratch += out_stride;
   3276 
   3277                     }
   3278 
   3279                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   3280                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
   3281                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
   3282                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
   3283                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
   3284                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
   3285                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
   3286                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
   3287 
   3288                     /* o7[0-3] */
   3289                     {
   3290                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3291                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3292                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3293                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3294 
   3295                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3296                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3297 
   3298                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3299 
   3300                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3301                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3302                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3303                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3304 
   3305                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3306                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3307 
   3308                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3309 
   3310                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3311 
   3312                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3313                         pi2_src_scratch += 8;
   3314 
   3315                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3316                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3317 
   3318                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3319                         m_count = _mm_cvtsi32_si128(i4_shift);
   3320                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3321                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3322 
   3323                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3324                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3325                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3326                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3327 
   3328                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3329 
   3330                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3331                         pi2_dst_scratch += 8;
   3332 
   3333                     }
   3334 
   3335                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   3336                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
   3337                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
   3338                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
   3339                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
   3340                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
   3341                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
   3342                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
   3343 
   3344 
   3345                     /* o8[0-3] */
   3346                     {
   3347 
   3348                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3349                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3350                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3351                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3352 
   3353                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3354                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3355 
   3356                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3357 
   3358                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3359                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3360                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3361                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3362 
   3363                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3364                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3365 
   3366                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3367 
   3368                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3369 
   3370                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3371                         pi2_src_scratch -= in_stride;
   3372 
   3373                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3374                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3375 
   3376                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3377                         m_count = _mm_cvtsi32_si128(i4_shift);
   3378                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3379                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3380 
   3381                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3382                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3383                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3384                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3385 
   3386                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3387 
   3388                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3389                         pi2_dst_scratch -= out_stride;
   3390                     }
   3391 
   3392                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   3393                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
   3394                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
   3395                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
   3396                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
   3397                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
   3398                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
   3399                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
   3400 
   3401 
   3402                     /* o9[0-3] */
   3403                     {
   3404                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3405                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3406                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3407                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3408 
   3409                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3410                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3411 
   3412                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3413 
   3414                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3415                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3416                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3417                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3418 
   3419                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3420                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3421 
   3422                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3423 
   3424                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3425 
   3426                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3427                         pi2_src_scratch -= in_stride;
   3428 
   3429                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3430                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3431 
   3432                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3433                         m_count = _mm_cvtsi32_si128(i4_shift);
   3434                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3435                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3436 
   3437                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3438                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3439                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3440                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3441 
   3442                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3443 
   3444                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3445                         pi2_dst_scratch -= out_stride;
   3446                     }
   3447 
   3448                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   3449                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
   3450                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
   3451                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
   3452                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
   3453                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
   3454                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
   3455                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
   3456 
   3457                     /* o10[0-3] */
   3458                     {
   3459                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3460                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3461                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3462                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3463 
   3464                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3465                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3466 
   3467                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3468 
   3469                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3470                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3471                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3472                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3473 
   3474                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3475                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3476 
   3477                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3478 
   3479                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3480 
   3481                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3482                         pi2_src_scratch -= in_stride;
   3483 
   3484                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3485                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3486 
   3487                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3488                         m_count = _mm_cvtsi32_si128(i4_shift);
   3489                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3490                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3491 
   3492                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3493                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3494                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3495                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3496 
   3497                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3498 
   3499                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3500                         pi2_dst_scratch -= out_stride;
   3501                     }
   3502 
   3503                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   3504                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
   3505                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
   3506                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
   3507                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
   3508                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
   3509                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
   3510                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
   3511 
   3512                     /* o11[0-3] */
   3513                     {
   3514                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3515                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3516                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3517                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3518 
   3519                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3520                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3521 
   3522                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3523 
   3524                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3525                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3526                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3527                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3528 
   3529                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3530                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3531 
   3532                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3533 
   3534                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3535 
   3536                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3537                         pi2_src_scratch -= in_stride;
   3538 
   3539                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3540                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3541 
   3542                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3543                         m_count = _mm_cvtsi32_si128(i4_shift);
   3544                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3545                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3546 
   3547                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3548                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3549                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3550                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3551 
   3552                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3553 
   3554                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3555                         pi2_dst_scratch -= out_stride;
   3556 
   3557                     }
   3558 
   3559                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   3560                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
   3561                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
   3562                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
   3563                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
   3564                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
   3565                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
   3566                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
   3567 
   3568 
   3569                     /* o12[0-3] */
   3570                     {
   3571                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3572                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3573                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3574                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3575 
   3576                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3577                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3578 
   3579                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3580 
   3581                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3582                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3583                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3584                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3585 
   3586                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3587                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3588 
   3589                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3590 
   3591                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3592 
   3593                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3594                         pi2_src_scratch -= in_stride;
   3595 
   3596                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3597                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3598 
   3599                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3600                         m_count = _mm_cvtsi32_si128(i4_shift);
   3601                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3602                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3603 
   3604                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3605                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3606                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3607                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3608 
   3609                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3610 
   3611                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3612                         pi2_dst_scratch -= out_stride;
   3613 
   3614                     }
   3615 
   3616                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   3617                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
   3618                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
   3619                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
   3620                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
   3621                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
   3622                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
   3623                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
   3624 
   3625 
   3626                     /* o13[0-3] */
   3627                     {
   3628                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3629                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3630                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3631                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3632 
   3633                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3634                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3635 
   3636                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3637 
   3638                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3639                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3640                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3641                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3642 
   3643                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3644                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3645 
   3646                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3647 
   3648                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3649 
   3650                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3651                         pi2_src_scratch -= in_stride;
   3652 
   3653                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3654                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3655 
   3656                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3657                         m_count = _mm_cvtsi32_si128(i4_shift);
   3658                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3659                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3660 
   3661                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3662                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3663                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3664                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3665 
   3666                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3667 
   3668                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3669                         pi2_dst_scratch -= out_stride;
   3670                     }
   3671 
   3672                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   3673                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
   3674                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
   3675                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
   3676                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
   3677                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
   3678                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
   3679                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
   3680 
   3681 
   3682                     /* o14[0-3] */
   3683                     {
   3684                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3685                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3686                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3687                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3688 
   3689                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3690                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3691 
   3692                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3693 
   3694                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3695                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3696                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3697                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3698 
   3699                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3700                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3701 
   3702                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3703 
   3704                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3705 
   3706                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3707                         pi2_src_scratch -= in_stride;
   3708 
   3709                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3710                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3711 
   3712                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3713                         m_count = _mm_cvtsi32_si128(i4_shift);
   3714                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3715                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3716 
   3717                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3718                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3719                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3720                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3721 
   3722                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3723 
   3724                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3725                         pi2_dst_scratch -= out_stride;
   3726 
   3727                     }
   3728 
   3729                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   3730                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
   3731                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
   3732                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
   3733                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
   3734                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
   3735                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
   3736                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
   3737 
   3738                     /* o15[0-3] */
   3739                     {
   3740                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3741                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3742                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3743                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3744 
   3745                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3746                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   3747 
   3748                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3749 
   3750                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
   3751                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
   3752                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
   3753                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
   3754 
   3755                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   3756                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   3757 
   3758                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   3759 
   3760                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   3761 
   3762                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3763                         pi2_src_scratch += 8;
   3764 
   3765                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3766                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3767 
   3768                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   3769                         m_count = _mm_cvtsi32_si128(i4_shift);
   3770                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   3771                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   3772 
   3773                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3774                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3775                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3776                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3777 
   3778                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3779 
   3780                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3781                         pi2_dst_scratch += 8;
   3782                     }
   3783 
   3784                 }
   3785             }
   3786         }
   3787         /* Transpose */
   3788         {
   3789             WORD16 *pi2_src_scratch = temp_ptr;
   3790             WORD16 *pi2_dst_scratch = pi2_tmp;
   3791             WORD32 in_stride = (trans_size << 1);
   3792 
   3793             for(j = 0; j < 2; j++)
   3794             {
   3795                 m_temp_reg_30 =  _mm_load_si128((__m128i *)pi2_src_scratch);
   3796                 pi2_src_scratch += in_stride;
   3797                 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3798                 pi2_src_scratch += in_stride;
   3799                 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3800                 pi2_src_scratch += in_stride;
   3801                 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3802                 pi2_src_scratch += in_stride;
   3803                 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3804                 pi2_src_scratch += in_stride;
   3805                 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3806                 pi2_src_scratch += in_stride;
   3807                 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3808                 pi2_src_scratch += in_stride;
   3809                 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3810                 pi2_src_scratch += 8;
   3811 
   3812                 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3813                 pi2_src_scratch -= in_stride;
   3814                 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3815                 pi2_src_scratch -= in_stride;
   3816                 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3817                 pi2_src_scratch -= in_stride;
   3818                 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3819                 pi2_src_scratch -= in_stride;
   3820                 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3821                 pi2_src_scratch -= in_stride;
   3822                 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3823                 pi2_src_scratch -= in_stride;
   3824                 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3825                 pi2_src_scratch -= in_stride;
   3826                 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3827                 pi2_src_scratch += 8;
   3828 
   3829 
   3830                 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
   3831                 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
   3832 
   3833                 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
   3834                 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
   3835 
   3836                 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
   3837                 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
   3838 
   3839                 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
   3840                 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
   3841 
   3842                 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
   3843                 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
   3844 
   3845                 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
   3846                 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
   3847 
   3848                 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
   3849                 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
   3850 
   3851                 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
   3852                 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
   3853 
   3854                 /****************/
   3855 
   3856                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
   3857                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
   3858 
   3859                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
   3860                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
   3861 
   3862                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
   3863                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
   3864 
   3865                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
   3866                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
   3867 
   3868                 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
   3869                 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
   3870 
   3871                 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
   3872                 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
   3873 
   3874                 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
   3875                 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
   3876 
   3877                 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
   3878                 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
   3879 
   3880                 /******************/
   3881 
   3882                 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
   3883                 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
   3884 
   3885                 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
   3886                 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
   3887 
   3888                 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
   3889                 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
   3890 
   3891                 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
   3892                 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
   3893 
   3894                 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
   3895                 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
   3896 
   3897                 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
   3898                 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
   3899 
   3900                 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
   3901                 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
   3902 
   3903                 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
   3904                 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
   3905 
   3906                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
   3907                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
   3908                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
   3909                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
   3910 
   3911                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
   3912                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
   3913                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
   3914                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
   3915 
   3916                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
   3917                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
   3918                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
   3919                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
   3920 
   3921                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
   3922                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
   3923                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
   3924                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
   3925 
   3926                 pi2_dst_scratch += 4 * trans_size;
   3927             }
   3928         }
   3929         pi2_src += 8;
   3930 //      pi2_dequant_coeff +=8;
   3931         pi2_tmp += 8 * trans_size;
   3932         zero_cols = zero_cols >> 1;
   3933     }
   3934 
   3935     if(trans_size_stg1 != TRANS_SIZE_32)
   3936     {
   3937         m_temp_reg_10 = _mm_setzero_si128();
   3938 
   3939         for(i = trans_size_stg1; i < 32; i += 8)
   3940         {
   3941             WORD16 *pi2_dst_scratch = pi2_tmp;
   3942 
   3943             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
   3944             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
   3945             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
   3946             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
   3947 
   3948             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
   3949             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
   3950             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
   3951             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
   3952 
   3953             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
   3954             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
   3955             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
   3956             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
   3957 
   3958             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
   3959             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
   3960             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
   3961             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
   3962 
   3963             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
   3964             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
   3965             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
   3966             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
   3967 
   3968             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
   3969             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
   3970             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
   3971             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
   3972 
   3973             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
   3974             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
   3975             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
   3976             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
   3977 
   3978             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
   3979             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
   3980             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
   3981             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
   3982 
   3983             pi2_tmp += 8 * trans_size;
   3984         }
   3985     }
   3986 
   3987     pi2_tmp = pi2_tmp_orig;
   3988 
   3989     /* Inverse Transform 2nd stage */
   3990 
   3991 
   3992     for(j = 0; j < trans_size; j += 4)
   3993     {
   3994         i4_shift = IT_SHIFT_STAGE_2;
   3995 
   3996         /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
   3997         if(zero_last28_rows_stg2)
   3998         {
   3999             {
   4000 
   4001                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
   4002                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
   4003                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
   4004                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
   4005                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
   4006                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
   4007                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
   4008                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
   4009 
   4010                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
   4011 
   4012                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
   4013 
   4014                 /* eo0[0-3] */
   4015                 {
   4016                     m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4017 
   4018                 }
   4019                 /* eo1[0-3] */
   4020                 {
   4021                     m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
   4022 
   4023                 }
   4024                 /* eo2[0-3] */
   4025                 {
   4026                     m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   4027                 }
   4028 
   4029                 /* eo3[0-3] */
   4030                 {
   4031                     m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
   4032                 }
   4033                 /* eo4[0-3] */
   4034                 {
   4035                     m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   4036                 }
   4037 
   4038                 /* eo5[0-3] */
   4039                 {
   4040                     m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
   4041                 }
   4042 
   4043                 /* eo6[0-3] */
   4044                 {
   4045                     m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
   4046                 }
   4047                 /* eo7[0-3] */
   4048                 {
   4049                     m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
   4050                 }
   4051             }
   4052 
   4053             m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
   4054 
   4055             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
   4056 
   4057             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
   4058 
   4059             m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   4060 
   4061             m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   4062 
   4063             /* e[]*/
   4064 
   4065             temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[0] */
   4066             temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[15] */
   4067 
   4068             temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[1] */
   4069             temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[14] */
   4070 
   4071             temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[2] */
   4072             temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[13] */
   4073 
   4074             temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[3] */
   4075             temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[12] */
   4076 
   4077             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[4] */
   4078             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[11] */
   4079 
   4080             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[5] */
   4081             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[10] */
   4082 
   4083             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[6] */
   4084             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[9] */
   4085 
   4086             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[7] */
   4087             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[8] */
   4088 
   4089             /*o[k]*/
   4090             {
   4091 
   4092                 WORD16 *pi2_dst_scratch = temp_ptr;
   4093                 WORD32 out_stride = 8;
   4094 
   4095                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   4096 
   4097                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
   4098                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
   4099 
   4100                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
   4101 
   4102 
   4103                 /* o0[0-3] */
   4104                 {
   4105                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4106 
   4107                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
   4108                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
   4109 
   4110                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4111                     m_count = _mm_cvtsi32_si128(i4_shift);
   4112                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4113                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4114 
   4115                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4116                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4117                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4118                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4119 
   4120                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4121 
   4122                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4123                     pi2_dst_scratch += out_stride;
   4124 
   4125                 }
   4126 
   4127                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   4128 
   4129                 /* o1[0-3] */
   4130                 {
   4131                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4132 
   4133                     m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
   4134                     m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
   4135 
   4136                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4137                     m_count = _mm_cvtsi32_si128(i4_shift);
   4138                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4139                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4140 
   4141                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4142                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4143                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4144                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4145 
   4146                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4147 
   4148                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4149                     pi2_dst_scratch += out_stride;
   4150 
   4151                 }
   4152 
   4153                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   4154 
   4155                 /* o2[0-3] */
   4156                 {
   4157                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4158 
   4159                     m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
   4160                     m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
   4161 
   4162                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4163                     m_count = _mm_cvtsi32_si128(i4_shift);
   4164                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4165                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4166 
   4167                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4168                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4169                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4170                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4171 
   4172                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4173 
   4174                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4175                     pi2_dst_scratch += out_stride;
   4176 
   4177                 }
   4178 
   4179                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   4180 
   4181                 /* o3[0-3] */
   4182                 {
   4183                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4184 
   4185                     m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
   4186                     m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
   4187 
   4188                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4189                     m_count = _mm_cvtsi32_si128(i4_shift);
   4190                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4191                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4192 
   4193                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4194                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4195                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4196                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4197 
   4198                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4199 
   4200                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4201                     pi2_dst_scratch += out_stride;
   4202 
   4203                 }
   4204 
   4205                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   4206 
   4207                 /* o4[0-3] */
   4208                 {
   4209                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4210 
   4211                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
   4212                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
   4213 
   4214                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4215                     m_count = _mm_cvtsi32_si128(i4_shift);
   4216                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4217                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4218 
   4219                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4220                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4221                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4222                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4223 
   4224                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4225 
   4226                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4227                     pi2_dst_scratch += out_stride;
   4228 
   4229                 }
   4230 
   4231                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
   4232 
   4233                 /* o5[0-3] */
   4234                 {
   4235                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4236 
   4237                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
   4238                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
   4239 
   4240                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4241                     m_count = _mm_cvtsi32_si128(i4_shift);
   4242                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4243                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4244 
   4245                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4246                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4247                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4248                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4249 
   4250                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4251 
   4252                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4253                     pi2_dst_scratch += out_stride;
   4254 
   4255                 }
   4256 
   4257                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
   4258 
   4259                 /* o6[0-3] */
   4260                 {
   4261                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4262 
   4263                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
   4264                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
   4265 
   4266                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4267                     m_count = _mm_cvtsi32_si128(i4_shift);
   4268                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4269                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4270 
   4271                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4272                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4273                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4274                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4275 
   4276                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4277 
   4278                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4279                     pi2_dst_scratch += out_stride;
   4280 
   4281                 }
   4282 
   4283                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   4284 
   4285                 /* o7[0-3] */
   4286                 {
   4287                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4288 
   4289                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
   4290                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
   4291 
   4292                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4293                     m_count = _mm_cvtsi32_si128(i4_shift);
   4294                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4295                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4296 
   4297                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4298                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4299                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4300                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4301 
   4302                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4303 
   4304                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4305                     pi2_dst_scratch += 8;
   4306 
   4307                 }
   4308 
   4309                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   4310 
   4311                 /* o8[0-3] */
   4312                 {
   4313                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4314 
   4315                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
   4316                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
   4317 
   4318                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4319                     m_count = _mm_cvtsi32_si128(i4_shift);
   4320                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4321                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4322 
   4323                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4324                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4325                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4326                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4327 
   4328                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4329 
   4330                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4331                     pi2_dst_scratch += out_stride;
   4332                 }
   4333 
   4334                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   4335 
   4336                 /* o9[0-3] */
   4337                 {
   4338                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4339 
   4340                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
   4341                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
   4342 
   4343                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4344                     m_count = _mm_cvtsi32_si128(i4_shift);
   4345                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4346                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4347 
   4348                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4349                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4350                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4351                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4352 
   4353                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4354 
   4355                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4356                     pi2_dst_scratch += out_stride;
   4357 
   4358                 }
   4359 
   4360                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   4361 
   4362                 /* o10[0-3] */
   4363                 {
   4364                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4365 
   4366                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
   4367                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
   4368 
   4369                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4370                     m_count = _mm_cvtsi32_si128(i4_shift);
   4371                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4372                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4373 
   4374                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4375                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4376                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4377                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4378 
   4379                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4380 
   4381                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4382                     pi2_dst_scratch += out_stride;
   4383                 }
   4384 
   4385                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   4386 
   4387                 /* o11[0-3] */
   4388                 {
   4389                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4390 
   4391                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
   4392                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
   4393 
   4394                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4395                     m_count = _mm_cvtsi32_si128(i4_shift);
   4396                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4397                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4398 
   4399                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4400                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4401                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4402                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4403 
   4404                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4405 
   4406                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4407                     pi2_dst_scratch += out_stride;
   4408 
   4409                 }
   4410 
   4411                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   4412 
   4413                 /* o12[0-3] */
   4414                 {
   4415                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4416 
   4417                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
   4418                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
   4419 
   4420                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4421                     m_count = _mm_cvtsi32_si128(i4_shift);
   4422                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4423                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4424 
   4425                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4426                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4427                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4428                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4429 
   4430                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4431 
   4432                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4433                     pi2_dst_scratch += out_stride;
   4434 
   4435                 }
   4436 
   4437                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   4438 
   4439                 /* o13[0-3] */
   4440                 {
   4441                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4442 
   4443                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
   4444                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
   4445 
   4446                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4447                     m_count = _mm_cvtsi32_si128(i4_shift);
   4448                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4449                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4450 
   4451                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4452                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4453                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4454                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4455 
   4456                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4457 
   4458                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4459                     pi2_dst_scratch += out_stride;
   4460                 }
   4461 
   4462                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   4463 
   4464                 /* o14[0-3] */
   4465                 {
   4466                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4467 
   4468                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
   4469                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
   4470 
   4471                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4472                     m_count = _mm_cvtsi32_si128(i4_shift);
   4473                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4474                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4475 
   4476                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4477                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4478                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4479                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4480 
   4481                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4482 
   4483                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4484                     pi2_dst_scratch += out_stride;
   4485 
   4486                 }
   4487 
   4488                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   4489 
   4490                 /* o15[0-3] */
   4491                 {
   4492                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4493 
   4494                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
   4495                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
   4496 
   4497                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4498                     m_count = _mm_cvtsi32_si128(i4_shift);
   4499                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4500                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4501 
   4502                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4503                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4504                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4505                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4506 
   4507                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4508 
   4509                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4510                     pi2_dst_scratch += 8;
   4511                 }
   4512 
   4513             }
   4514 
   4515         }
   4516         else if(zero_last24_rows_stg2)
   4517         {
   4518             /* eo */
   4519             {
   4520                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
   4521 
   4522                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
   4523                 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
   4524 
   4525                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
   4526 
   4527 
   4528                 /* eo0[0-3] */
   4529                 {
   4530                     m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4531 
   4532                 }
   4533 
   4534                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
   4535 
   4536                 /* eo1[0-3] */
   4537                 {
   4538                     m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4539 
   4540                 }
   4541                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
   4542 
   4543                 /* eo2[0-3] */
   4544                 {
   4545                     m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4546 
   4547                 }
   4548 
   4549                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
   4550 
   4551                 /* eo3[0-3] */
   4552                 {
   4553 
   4554                     m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4555 
   4556                 }
   4557 
   4558                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
   4559 
   4560                 /* eo4[0-3] */
   4561                 {
   4562                     m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4563 
   4564                 }
   4565 
   4566                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
   4567 
   4568                 /* eo5[0-3] */
   4569                 {
   4570                     m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4571                 }
   4572 
   4573                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
   4574                 /* eo6[0-3] */
   4575                 {
   4576                     m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4577                 }
   4578 
   4579                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
   4580                 /* eo7[0-3] */
   4581                 {
   4582                     m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4583 
   4584                 }
   4585 
   4586             }
   4587 
   4588             /* eeo */
   4589             {
   4590 
   4591                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
   4592                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
   4593                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
   4594                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
   4595 
   4596                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
   4597 
   4598                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
   4599 
   4600                 /* eeo0[0-3] */
   4601                 {
   4602                     temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4603 
   4604                 }
   4605 
   4606                 /* eeo1[0-3] */
   4607                 {
   4608                     temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
   4609 
   4610                 }
   4611 
   4612                 /* eeo2[0-3] */
   4613                 {
   4614                     temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
   4615 
   4616                 }
   4617 
   4618 
   4619                 /* eeo3[0-3] */
   4620                 {
   4621                     temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   4622 
   4623                 }
   4624 
   4625             }
   4626 
   4627             m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
   4628             m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
   4629             m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
   4630 
   4631             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
   4632 
   4633             //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
   4634             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
   4635 
   4636             m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   4637             m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   4638 
   4639             m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1);  /* ee[0] */
   4640             m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1);  /* ee[7] */
   4641 
   4642             m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2);  /* ee[1] */
   4643             m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2);  /* ee[6] */
   4644 
   4645             m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3);  /* ee[2] */
   4646             m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3);  /* ee[5] */
   4647 
   4648             m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4);  /* ee[3] */
   4649             m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4);  /* ee[4] */
   4650 
   4651             /* e[]*/
   4652 
   4653             temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[0] */
   4654             temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[15] */
   4655 
   4656             temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[1] */
   4657             temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[14] */
   4658 
   4659             temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[2] */
   4660             temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[13] */
   4661 
   4662             temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[3] */
   4663             temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[12] */
   4664 
   4665             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[4] */
   4666             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[11] */
   4667 
   4668             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[5] */
   4669             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[10] */
   4670 
   4671             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[6] */
   4672             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[9] */
   4673 
   4674             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[7] */
   4675             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[8] */
   4676 
   4677             /*o[k] */
   4678             {
   4679 
   4680                 WORD16 *pi2_dst_scratch = temp_ptr;
   4681                 WORD32 out_stride = 8;
   4682 
   4683                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   4684                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
   4685 
   4686                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
   4687                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
   4688                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
   4689                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
   4690 
   4691                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
   4692                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
   4693 
   4694                 /* o0[0-3] */
   4695                 {
   4696                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4697                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4698 
   4699                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   4700 
   4701                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
   4702                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
   4703 
   4704                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4705                     m_count = _mm_cvtsi32_si128(i4_shift);
   4706                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4707                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4708 
   4709                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4710                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4711                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4712                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4713 
   4714                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4715 
   4716                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4717                     pi2_dst_scratch += out_stride;
   4718 
   4719                 }
   4720 
   4721 
   4722                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   4723                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
   4724 
   4725                 /* o1[0-3] */
   4726                 {
   4727                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4728                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4729 
   4730                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   4731 
   4732                     m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
   4733                     m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
   4734 
   4735                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4736                     m_count = _mm_cvtsi32_si128(i4_shift);
   4737                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4738                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4739 
   4740                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4741                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4742                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4743                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4744 
   4745                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4746 
   4747                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4748                     pi2_dst_scratch += out_stride;
   4749 
   4750                 }
   4751 
   4752                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   4753                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
   4754 
   4755                 /* o2[0-3] */
   4756                 {
   4757                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4758                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4759 
   4760                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   4761 
   4762                     m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
   4763                     m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
   4764 
   4765                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4766                     m_count = _mm_cvtsi32_si128(i4_shift);
   4767                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4768                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4769 
   4770                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4771                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4772                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4773                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4774 
   4775                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4776 
   4777                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4778                     pi2_dst_scratch += out_stride;
   4779 
   4780                 }
   4781 
   4782                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   4783                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
   4784 
   4785                 /* o3[0-3] */
   4786                 {
   4787                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4788                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4789 
   4790                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   4791 
   4792                     m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
   4793                     m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
   4794 
   4795                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4796                     m_count = _mm_cvtsi32_si128(i4_shift);
   4797                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4798                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4799 
   4800                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4801                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4802                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4803                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4804 
   4805                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4806 
   4807                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4808                     pi2_dst_scratch += out_stride;
   4809 
   4810                 }
   4811 
   4812                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   4813                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
   4814 
   4815                 /* o4[0-3] */
   4816                 {
   4817                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4818                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4819 
   4820                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4821 
   4822                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
   4823                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
   4824 
   4825                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4826                     m_count = _mm_cvtsi32_si128(i4_shift);
   4827                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4828                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4829 
   4830                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4831                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4832                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4833                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4834 
   4835                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4836 
   4837                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4838                     pi2_dst_scratch += out_stride;
   4839 
   4840                 }
   4841 
   4842                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
   4843                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
   4844 
   4845                 /* o5[0-3] */
   4846                 {
   4847                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4848                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4849 
   4850                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4851 
   4852                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
   4853                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
   4854 
   4855                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4856                     m_count = _mm_cvtsi32_si128(i4_shift);
   4857                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4858                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4859 
   4860                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4861                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4862                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4863                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4864 
   4865                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4866 
   4867                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4868                     pi2_dst_scratch += out_stride;
   4869 
   4870                 }
   4871 
   4872                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
   4873                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
   4874 
   4875                 /* o6[0-3] */
   4876                 {
   4877                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4878                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4879 
   4880                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4881 
   4882                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
   4883                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
   4884 
   4885                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4886                     m_count = _mm_cvtsi32_si128(i4_shift);
   4887                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4888                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4889 
   4890                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4891                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4892                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4893                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4894 
   4895                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4896 
   4897                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4898                     pi2_dst_scratch += out_stride;
   4899 
   4900                 }
   4901 
   4902                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   4903                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
   4904 
   4905                 /* o7[0-3] */
   4906                 {
   4907                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4908                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4909 
   4910                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4911 
   4912                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
   4913                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
   4914 
   4915                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4916                     m_count = _mm_cvtsi32_si128(i4_shift);
   4917                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4918                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4919 
   4920                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4921                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4922                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4923                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4924 
   4925                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4926 
   4927                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4928                     pi2_dst_scratch += 8;
   4929 
   4930                 }
   4931 
   4932                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   4933                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
   4934 
   4935                 /* o8[0-3] */
   4936                 {
   4937                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4938                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4939 
   4940                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4941 
   4942                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
   4943                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
   4944 
   4945                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4946                     m_count = _mm_cvtsi32_si128(i4_shift);
   4947                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4948                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4949 
   4950                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4951                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4952                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4953                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4954 
   4955                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4956 
   4957                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4958                     pi2_dst_scratch += out_stride;
   4959                 }
   4960 
   4961                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   4962                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
   4963 
   4964                 /* o9[0-3] */
   4965                 {
   4966                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4967                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4968 
   4969                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4970 
   4971                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
   4972                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
   4973 
   4974                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   4975                     m_count = _mm_cvtsi32_si128(i4_shift);
   4976                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   4977                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   4978 
   4979                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   4980                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   4981                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   4982                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   4983 
   4984                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   4985 
   4986                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   4987                     pi2_dst_scratch += out_stride;
   4988                 }
   4989 
   4990                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   4991                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
   4992 
   4993                 /* o10[0-3] */
   4994                 {
   4995                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   4996                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   4997 
   4998                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   4999 
   5000                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
   5001                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
   5002 
   5003                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5004                     m_count = _mm_cvtsi32_si128(i4_shift);
   5005                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5006                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5007 
   5008                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5009                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5010                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5011                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5012 
   5013                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5014 
   5015                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5016                     pi2_dst_scratch += out_stride;
   5017                 }
   5018 
   5019                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   5020                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
   5021 
   5022                 /* o11[0-3] */
   5023                 {
   5024                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5025                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5026 
   5027                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5028 
   5029                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
   5030                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
   5031 
   5032                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5033                     m_count = _mm_cvtsi32_si128(i4_shift);
   5034                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5035                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5036 
   5037                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5038                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5039                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5040                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5041 
   5042                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5043 
   5044                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5045                     pi2_dst_scratch += out_stride;
   5046 
   5047                 }
   5048 
   5049                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   5050                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
   5051 
   5052                 /* o12[0-3] */
   5053                 {
   5054                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5055                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5056 
   5057                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5058 
   5059                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
   5060                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
   5061 
   5062                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5063                     m_count = _mm_cvtsi32_si128(i4_shift);
   5064                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5065                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5066 
   5067                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5068                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5069                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5070                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5071 
   5072                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5073 
   5074                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5075                     pi2_dst_scratch += out_stride;
   5076 
   5077                 }
   5078 
   5079                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   5080                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
   5081 
   5082                 /* o13[0-3] */
   5083                 {
   5084                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5085                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5086 
   5087                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5088 
   5089                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
   5090                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
   5091 
   5092                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5093                     m_count = _mm_cvtsi32_si128(i4_shift);
   5094                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5095                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5096 
   5097                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5098                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5099                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5100                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5101 
   5102                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5103 
   5104                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5105                     pi2_dst_scratch += out_stride;
   5106                 }
   5107 
   5108                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   5109                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
   5110 
   5111                 /* o14[0-3] */
   5112                 {
   5113                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5114                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5115 
   5116                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5117 
   5118                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
   5119                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
   5120 
   5121                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5122                     m_count = _mm_cvtsi32_si128(i4_shift);
   5123                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5124                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5125 
   5126                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5127                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5128                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5129                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5130 
   5131                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5132 
   5133                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5134                     pi2_dst_scratch += out_stride;
   5135                 }
   5136 
   5137                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   5138                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
   5139 
   5140                 /* o15[0-3] */
   5141                 {
   5142                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5143                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5144 
   5145                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
   5146 
   5147                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
   5148                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
   5149 
   5150                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5151                     m_count = _mm_cvtsi32_si128(i4_shift);
   5152                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5153                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5154 
   5155                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5156                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5157                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5158                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5159 
   5160                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5161 
   5162                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5163                     pi2_dst_scratch += 8;
   5164                 }
   5165 
   5166             }
   5167         }
   5168         else
   5169         {
   5170             /* eo */
   5171             {
   5172 
   5173                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
   5174                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
   5175                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
   5176                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
   5177 
   5178 
   5179                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
   5180                 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
   5181                 m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
   5182                 m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
   5183                 m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
   5184                 m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
   5185                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
   5186                 m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
   5187 
   5188                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
   5189                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
   5190                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
   5191                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
   5192 
   5193                 /* eo0[0-3] */
   5194                 {
   5195                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5196                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5197 
   5198                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5199 
   5200                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5201                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5202 
   5203                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5204 
   5205                     m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5206 
   5207                 }
   5208 
   5209                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
   5210                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
   5211                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
   5212                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
   5213 
   5214                 /* eo1[0-3] */
   5215                 {
   5216                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5217                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5218 
   5219                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5220 
   5221                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5222                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5223 
   5224                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5225 
   5226                     m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
   5227 
   5228                 }
   5229 
   5230                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
   5231                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
   5232                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
   5233                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
   5234 
   5235                 /* eo2[0-3] */
   5236                 {
   5237                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5238                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5239 
   5240                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
   5241 
   5242                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5243                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5244 
   5245                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5246 
   5247                     m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5248 
   5249                 }
   5250 
   5251                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
   5252                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
   5253                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
   5254                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
   5255 
   5256                 /* eo3[0-3] */
   5257                 {
   5258                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5259                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5260 
   5261                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5262 
   5263                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5264                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5265 
   5266                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
   5267 
   5268                     m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5269 
   5270                 }
   5271 
   5272                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
   5273                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
   5274                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
   5275                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
   5276 
   5277 
   5278                 /* eo4[0-3] */
   5279                 {
   5280                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5281                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5282 
   5283                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5284 
   5285                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5286                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5287 
   5288                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
   5289 
   5290                     m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5291 
   5292                 }
   5293 
   5294                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
   5295                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
   5296                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
   5297                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
   5298 
   5299                 /* eo5[0-3] */
   5300                 {
   5301                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5302                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5303 
   5304                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5305 
   5306                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5307                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5308 
   5309                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5310 
   5311                     m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5312                 }
   5313 
   5314                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
   5315                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
   5316                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
   5317                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
   5318 
   5319                 /* eo6[0-3] */
   5320                 {
   5321                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5322                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5323 
   5324                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5325 
   5326                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5327                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5328 
   5329                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5330 
   5331                     m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5332 
   5333                 }
   5334 
   5335                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
   5336                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
   5337                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
   5338                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
   5339 
   5340                 /* eo7[0-3] */
   5341                 {
   5342                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5343                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5344 
   5345                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5346 
   5347                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5348                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5349 
   5350                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
   5351 
   5352                     m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
   5353 
   5354 
   5355                 }
   5356 
   5357             }
   5358 
   5359             /* eeo */
   5360             {
   5361                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
   5362                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
   5363 
   5364                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
   5365                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
   5366                 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
   5367                 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
   5368 
   5369                 /* eeo0[0-3] */
   5370                 {
   5371 
   5372                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
   5373                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
   5374 
   5375                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5376                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5377 
   5378                     temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5379 
   5380                 }
   5381 
   5382                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
   5383                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
   5384 
   5385                 /* eeo1[0-3] */
   5386                 {
   5387                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   5388                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   5389 
   5390                     temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
   5391 
   5392                 }
   5393 
   5394                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
   5395                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
   5396 
   5397                 /* eeo2[0-3] */
   5398                 {
   5399                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   5400                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   5401 
   5402                     temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5403 
   5404                 }
   5405 
   5406                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
   5407                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
   5408 
   5409                 /* eeo3[0-3] */
   5410                 {
   5411                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   5412                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
   5413 
   5414                     temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
   5415 
   5416                 }
   5417 
   5418 
   5419             }
   5420 
   5421             m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
   5422             m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
   5423 
   5424             m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
   5425             m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
   5426 
   5427             m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
   5428             m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
   5429 
   5430             m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
   5431 
   5432             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
   5433             m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
   5434 
   5435             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
   5436 
   5437             m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
   5438             m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
   5439 
   5440             m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
   5441             m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
   5442 
   5443 /* eeeo[0]= m_temp_reg_20  */
   5444 /* eeeo[1]= m_temp_reg_22  */
   5445 /* eeee[0]= m_temp_reg_21  */
   5446 /* eeee[1]= m_temp_reg_23  */
   5447 
   5448             /* eee[0] = eeee[0] + eeeo[0]; */
   5449             m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
   5450 
   5451             /* eee[3] = eeee[0] - eeeo[0]; */
   5452             m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
   5453 
   5454             /* eee[2] = eeee[1] - eeeo[1]; */
   5455             m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
   5456 
   5457             /* eee[1] = eeee[1] + eeeo[1];*/
   5458             m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
   5459 
   5460             m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1);  /* ee[0] */
   5461             m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1);  /* ee[7] */
   5462 
   5463             m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2);  /* ee[1] */
   5464             m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2);  /* ee[6] */
   5465 
   5466             m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3);  /* ee[2] */
   5467             m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3);  /* ee[5] */
   5468 
   5469             m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4);  /* ee[3] */
   5470             m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4);  /* ee[4] */
   5471 
   5472 /* e[]*/
   5473 
   5474             temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[0] */
   5475             temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[15] */
   5476 
   5477             temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[1] */
   5478             temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[14] */
   5479 
   5480             temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[2] */
   5481             temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[13] */
   5482 
   5483             temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[3] */
   5484             temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[12] */
   5485 
   5486             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[4] */
   5487             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[11] */
   5488 
   5489             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[5] */
   5490             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[10] */
   5491 
   5492             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[6] */
   5493             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[9] */
   5494 
   5495             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[7] */
   5496             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[8] */
   5497 
   5498 /*o[k] */
   5499             {
   5500 
   5501                 WORD16 *pi2_dst_scratch = temp_ptr;
   5502                 WORD32 out_stride = 8;
   5503 
   5504                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
   5505                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
   5506                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
   5507                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
   5508                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
   5509                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
   5510                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
   5511                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
   5512 
   5513 
   5514                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
   5515                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
   5516                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
   5517                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
   5518                 m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
   5519                 m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
   5520                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
   5521                 m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
   5522 
   5523                 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
   5524                 m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
   5525                 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
   5526                 m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
   5527                 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
   5528                 m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
   5529                 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
   5530                 m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
   5531 
   5532                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
   5533                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
   5534                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
   5535                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
   5536                 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
   5537                 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
   5538                 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
   5539                 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
   5540 
   5541                 /* o0[0-3] */
   5542                 {
   5543                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5544                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5545                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5546                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5547 
   5548                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5549                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5550 
   5551                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5552 
   5553                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5554                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5555                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5556                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5557 
   5558                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5559                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5560 
   5561                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5562 
   5563                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5564 
   5565                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
   5566                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
   5567 
   5568                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5569                     m_count = _mm_cvtsi32_si128(i4_shift);
   5570                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5571                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5572 
   5573                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5574                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5575                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5576                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5577 
   5578                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5579 
   5580                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5581                     pi2_dst_scratch += out_stride;
   5582 
   5583                 }
   5584 
   5585                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
   5586                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
   5587                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
   5588                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
   5589                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
   5590                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
   5591                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
   5592                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
   5593 
   5594                 /* o1[0-3] */
   5595                 {
   5596                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5597                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5598                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5599                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5600 
   5601                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5602                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5603 
   5604                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
   5605 
   5606                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5607                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5608                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5609                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5610 
   5611                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5612                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5613 
   5614                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5615 
   5616                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5617 
   5618                     m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
   5619                     m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
   5620 
   5621                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5622                     m_count = _mm_cvtsi32_si128(i4_shift);
   5623                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5624                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5625 
   5626                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5627                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5628                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5629                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5630 
   5631                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5632 
   5633                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5634                     pi2_dst_scratch += out_stride;
   5635 
   5636                 }
   5637 
   5638                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
   5639                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
   5640                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
   5641                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
   5642                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
   5643                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
   5644                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
   5645                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
   5646 
   5647                 /* o2[0-3] */
   5648                 {
   5649                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5650                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5651                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5652                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5653 
   5654                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   5655                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5656 
   5657                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5658 
   5659                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5660                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5661                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5662                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5663 
   5664                     m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
   5665                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5666 
   5667                     m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
   5668 
   5669                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5670 
   5671                     m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
   5672                     m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
   5673 
   5674                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5675                     m_count = _mm_cvtsi32_si128(i4_shift);
   5676                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5677                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5678 
   5679                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5680                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5681                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5682                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5683 
   5684                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5685 
   5686                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5687                     pi2_dst_scratch += out_stride;
   5688 
   5689                 }
   5690 
   5691                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
   5692                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
   5693                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
   5694                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
   5695                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
   5696                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
   5697                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
   5698                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
   5699 
   5700                 /* o3[0-3] */
   5701                 {
   5702                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5703                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5704                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5705                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5706 
   5707                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
   5708                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5709 
   5710                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5711 
   5712                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5713                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5714                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5715                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5716 
   5717                     m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
   5718                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5719 
   5720                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5721 
   5722                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5723 
   5724                     m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
   5725                     m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
   5726 
   5727                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5728                     m_count = _mm_cvtsi32_si128(i4_shift);
   5729                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5730                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5731 
   5732                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5733                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5734                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5735                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5736 
   5737                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5738 
   5739                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5740                     pi2_dst_scratch += out_stride;
   5741 
   5742                 }
   5743 
   5744                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
   5745                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
   5746                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
   5747                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
   5748                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
   5749                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
   5750                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
   5751                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
   5752 
   5753                 /* o4[0-3] */
   5754                 {
   5755                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5756                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5757                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5758                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5759 
   5760                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5761                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5762 
   5763                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5764 
   5765                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5766                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5767                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5768                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5769 
   5770                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5771                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5772 
   5773                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5774 
   5775                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5776 
   5777                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
   5778                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
   5779                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5780                     m_count = _mm_cvtsi32_si128(i4_shift);
   5781                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5782                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5783 
   5784                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5785                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5786                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5787                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5788 
   5789                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5790 
   5791                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5792                     pi2_dst_scratch += out_stride;
   5793 
   5794                 }
   5795 
   5796                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
   5797                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
   5798                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
   5799                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
   5800                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
   5801                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
   5802                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
   5803                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
   5804 
   5805                 /* o5[0-3] */
   5806                 {
   5807                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5808                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5809                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5810                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5811 
   5812                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5813                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5814 
   5815                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5816 
   5817                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5818                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5819                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5820                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5821 
   5822                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5823                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5824 
   5825                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5826 
   5827                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5828 
   5829                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
   5830                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
   5831 
   5832                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5833                     m_count = _mm_cvtsi32_si128(i4_shift);
   5834                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5835                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5836 
   5837                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5838                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5839                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5840                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5841 
   5842                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5843 
   5844                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5845                     pi2_dst_scratch += out_stride;
   5846 
   5847                 }
   5848 
   5849                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
   5850                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
   5851                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
   5852                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
   5853                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
   5854                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
   5855                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
   5856                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
   5857 
   5858                 /* o6[0-3] */
   5859                 {
   5860                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5861                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5862                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5863                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5864 
   5865                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5866                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5867 
   5868                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5869 
   5870                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5871                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5872                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5873                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5874 
   5875                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5876                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5877 
   5878                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5879 
   5880                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5881 
   5882                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
   5883                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
   5884 
   5885                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5886                     m_count = _mm_cvtsi32_si128(i4_shift);
   5887                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5888                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5889 
   5890                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5891                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5892                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5893                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5894 
   5895                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5896 
   5897                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5898                     pi2_dst_scratch += out_stride;
   5899 
   5900                 }
   5901 
   5902                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
   5903                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
   5904                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
   5905                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
   5906                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
   5907                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
   5908                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
   5909                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
   5910 
   5911                 /* o7[0-3] */
   5912                 {
   5913                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5914                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5915                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5916                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5917 
   5918                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5919                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5920 
   5921                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5922 
   5923                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5924                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5925                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5926                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5927 
   5928                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5929                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5930 
   5931                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5932 
   5933                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5934 
   5935                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
   5936                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
   5937 
   5938                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5939                     m_count = _mm_cvtsi32_si128(i4_shift);
   5940                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5941                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5942 
   5943                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5944                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5945                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5946                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   5947 
   5948                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   5949 
   5950                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   5951                     pi2_dst_scratch += 8;
   5952 
   5953                 }
   5954 
   5955                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
   5956                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
   5957                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
   5958                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
   5959                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
   5960                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
   5961                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
   5962                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
   5963 
   5964                 /* o8[0-3] */
   5965                 {
   5966                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   5967                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   5968                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   5969                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   5970 
   5971                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   5972                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   5973 
   5974                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   5975 
   5976                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   5977                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   5978                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   5979                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   5980 
   5981                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   5982                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   5983 
   5984                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   5985 
   5986                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   5987 
   5988                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
   5989                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
   5990 
   5991                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   5992                     m_count = _mm_cvtsi32_si128(i4_shift);
   5993                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   5994                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   5995 
   5996                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   5997                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   5998                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   5999                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6000 
   6001                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6002 
   6003                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6004                     pi2_dst_scratch += out_stride;
   6005                 }
   6006 
   6007                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
   6008                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
   6009                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
   6010                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
   6011                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
   6012                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
   6013                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
   6014                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
   6015 
   6016                 /* o9[0-3] */
   6017                 {
   6018                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6019                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6020                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6021                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6022 
   6023                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6024                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6025 
   6026                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6027 
   6028                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6029                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6030                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6031                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6032 
   6033                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6034                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6035 
   6036                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6037 
   6038                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6039 
   6040                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
   6041                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
   6042 
   6043                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6044                     m_count = _mm_cvtsi32_si128(i4_shift);
   6045                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6046                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6047 
   6048                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6049                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6050                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6051                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6052 
   6053                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6054 
   6055                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6056                     pi2_dst_scratch += out_stride;
   6057                 }
   6058 
   6059                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
   6060                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
   6061                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
   6062                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
   6063                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
   6064                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
   6065                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
   6066                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
   6067 
   6068                 /* o10[0-3] */
   6069                 {
   6070                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6071                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6072                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6073                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6074 
   6075                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6076                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6077 
   6078                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6079 
   6080                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6081                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6082                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6083                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6084 
   6085                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6086                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6087 
   6088                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6089 
   6090                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6091 
   6092                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
   6093                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
   6094 
   6095                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6096                     m_count = _mm_cvtsi32_si128(i4_shift);
   6097                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6098                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6099 
   6100                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6101                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6102                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6103                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6104 
   6105                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6106 
   6107                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6108                     pi2_dst_scratch += out_stride;
   6109                 }
   6110 
   6111 
   6112                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
   6113                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
   6114                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
   6115                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
   6116                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
   6117                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
   6118                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
   6119                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
   6120 
   6121                 /* o11[0-3] */
   6122                 {
   6123                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6124                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6125                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6126                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6127 
   6128                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6129                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6130 
   6131                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6132 
   6133                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6134                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6135                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6136                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6137 
   6138                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6139                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6140 
   6141                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6142 
   6143                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6144 
   6145                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
   6146                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
   6147 
   6148                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6149                     m_count = _mm_cvtsi32_si128(i4_shift);
   6150                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6151                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6152 
   6153                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6154                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6155                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6156                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6157 
   6158                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6159 
   6160                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6161                     pi2_dst_scratch += out_stride;
   6162 
   6163                 }
   6164 
   6165                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
   6166                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
   6167                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
   6168                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
   6169                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
   6170                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
   6171                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
   6172                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
   6173 
   6174                 /* o12[0-3] */
   6175                 {
   6176                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6177                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6178                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6179                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6180 
   6181                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6182                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6183 
   6184                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6185 
   6186                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6187                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6188                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6189                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6190 
   6191                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6192                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6193 
   6194                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6195 
   6196                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6197 
   6198                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
   6199                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
   6200 
   6201                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6202                     m_count = _mm_cvtsi32_si128(i4_shift);
   6203                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6204                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6205 
   6206                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6207                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6208                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6209                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6210 
   6211                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6212 
   6213                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6214                     pi2_dst_scratch += out_stride;
   6215 
   6216                 }
   6217 
   6218                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
   6219                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
   6220                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
   6221                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
   6222                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
   6223                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
   6224                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
   6225                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
   6226 
   6227                 /* o13[0-3] */
   6228                 {
   6229                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6230                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6231                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6232                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6233 
   6234                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6235                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6236 
   6237                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6238 
   6239                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6240                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6241                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6242                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6243 
   6244                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6245                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6246 
   6247                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6248 
   6249                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6250 
   6251                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
   6252                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
   6253 
   6254                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6255                     m_count = _mm_cvtsi32_si128(i4_shift);
   6256                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6257                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6258 
   6259                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6260                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6261                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6262                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6263 
   6264                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6265 
   6266                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6267                     pi2_dst_scratch += out_stride;
   6268                 }
   6269 
   6270                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
   6271                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
   6272                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
   6273                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
   6274                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
   6275                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
   6276                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
   6277                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
   6278 
   6279                 /* o14[0-3] */
   6280                 {
   6281                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6282                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6283                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6284                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6285 
   6286                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6287                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6288 
   6289                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6290 
   6291                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6292                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6293                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6294                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6295 
   6296                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6297                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6298 
   6299                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6300 
   6301                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6302 
   6303                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
   6304                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
   6305 
   6306                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6307                     m_count = _mm_cvtsi32_si128(i4_shift);
   6308                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6309                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6310 
   6311                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6312                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6313                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6314                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6315 
   6316                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6317 
   6318                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6319                     pi2_dst_scratch += out_stride;
   6320 
   6321                 }
   6322 
   6323                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
   6324                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
   6325                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
   6326                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
   6327                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
   6328                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
   6329                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
   6330                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
   6331 
   6332                 /* o15[0-3] */
   6333                 {
   6334                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   6335                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   6336                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   6337                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   6338 
   6339                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   6340                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   6341 
   6342                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   6343 
   6344                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
   6345                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
   6346                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
   6347                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
   6348 
   6349                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
   6350                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
   6351 
   6352                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
   6353 
   6354                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
   6355 
   6356                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
   6357                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
   6358 
   6359                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   6360                     m_count = _mm_cvtsi32_si128(i4_shift);
   6361                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   6362                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   6363 
   6364                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   6365                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   6366                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   6367                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   6368 
   6369                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   6370 
   6371                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   6372                     pi2_dst_scratch += 8;
   6373                 }
   6374 
   6375             }
   6376         }
   6377 
   6378         /* Transpose */
   6379         {
   6380 
   6381             WORD16 *pi2_src_scratch = temp_ptr;
   6382             WORD32 out_stride = dst_strd;
   6383             WORD32 in_stride = 8;
   6384 
   6385             m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6386             pi2_src_scratch += in_stride;
   6387             m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6388             pi2_src_scratch += in_stride;
   6389             m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6390             pi2_src_scratch += in_stride;
   6391             m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6392             pi2_src_scratch += in_stride;
   6393             m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6394             pi2_src_scratch += in_stride;
   6395             m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6396             pi2_src_scratch += in_stride;
   6397             m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6398             pi2_src_scratch += in_stride;
   6399             m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6400             pi2_src_scratch += 8;
   6401 
   6402             m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6403             pi2_src_scratch += in_stride;
   6404             m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6405             pi2_src_scratch += in_stride;
   6406             m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6407             pi2_src_scratch += in_stride;
   6408             m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6409             pi2_src_scratch += in_stride;
   6410             m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6411             pi2_src_scratch += in_stride;
   6412             m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6413             pi2_src_scratch += in_stride;
   6414             m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6415             pi2_src_scratch += in_stride;
   6416             m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
   6417             pi2_src_scratch += 8;
   6418 
   6419 
   6420             m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
   6421             m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
   6422 
   6423             m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
   6424             m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
   6425 
   6426             m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
   6427             m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
   6428 
   6429             m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
   6430             m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
   6431 
   6432             m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
   6433             m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
   6434 
   6435             m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
   6436             m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
   6437 
   6438             m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
   6439             m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
   6440 
   6441             m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
   6442             m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
   6443 
   6444 
   6445             m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
   6446             m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
   6447 
   6448             m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
   6449             m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
   6450 
   6451             m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
   6452             m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
   6453 
   6454             m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
   6455             m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
   6456 
   6457             m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
   6458             m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
   6459 
   6460             m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
   6461             m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
   6462 
   6463             m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
   6464             m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
   6465 
   6466             m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
   6467             m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
   6468 
   6469 
   6470             m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);       // row0 = 0-7
   6471             m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);       // row1 = 0-7
   6472 
   6473             m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);     // row0=24-31
   6474             m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);     // row1=24-31
   6475 
   6476             m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);       // row0=8-15
   6477             m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);       // row1=8-15
   6478 
   6479             m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);     // row0=16-23
   6480             m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);     // row1=16-23
   6481 
   6482             m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);      // row2 =0-7
   6483             m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);      // row3 =0-7
   6484 
   6485             m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);    // row2=24-31
   6486             m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);    // row3=24-31
   6487 
   6488             m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);      // row2=8-15
   6489             m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);      // row3=8-15
   6490 
   6491             m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);    // row2=16-23
   6492             m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);    // row3=16-23
   6493 
   6494             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
   6495 
   6496             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
   6497             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6498 
   6499             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
   6500             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6501 
   6502             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
   6503             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6504 
   6505             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
   6506             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6507 
   6508             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   6509 
   6510             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
   6511 
   6512             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
   6513             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6514 
   6515             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
   6516             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6517 
   6518             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
   6519             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6520 
   6521             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
   6522             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6523 
   6524             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
   6525             pu1_dst += out_stride;
   6526             pu1_pred += pred_strd;
   6527 
   6528 
   6529             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
   6530 
   6531             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
   6532             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6533 
   6534             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
   6535             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6536 
   6537             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
   6538             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6539 
   6540             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
   6541             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6542 
   6543             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   6544 
   6545             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
   6546 
   6547             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
   6548             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6549 
   6550             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
   6551             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6552 
   6553             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
   6554             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6555 
   6556             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
   6557             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6558 
   6559             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
   6560             pu1_dst += out_stride;
   6561             pu1_pred += pred_strd;
   6562 
   6563             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
   6564 
   6565             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
   6566             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6567 
   6568             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
   6569             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6570 
   6571             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
   6572             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6573 
   6574             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
   6575             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6576 
   6577             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   6578 
   6579             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
   6580 
   6581             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
   6582             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6583 
   6584             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
   6585             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6586 
   6587             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
   6588             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6589 
   6590             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
   6591             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6592 
   6593             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
   6594             pu1_dst += out_stride;
   6595             pu1_pred += pred_strd;
   6596 
   6597 
   6598             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
   6599 
   6600             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
   6601             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6602 
   6603             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
   6604             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6605 
   6606             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
   6607             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6608 
   6609             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
   6610             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6611 
   6612             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   6613 
   6614             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
   6615 
   6616             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
   6617             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
   6618 
   6619             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
   6620             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
   6621 
   6622             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
   6623             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
   6624 
   6625             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
   6626             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   6627 
   6628             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
   6629             pu1_dst += out_stride;
   6630             pu1_pred += pred_strd;
   6631 
   6632         }
   6633         pi2_tmp += 4;
   6634     }
   6635 }
   6636 
   6637