Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19  *******************************************************************************
     20  * @file
     21  *  ihevc_16x16_itrans_recon_x86_intr.c
     22  *
     23  * @brief
     24  *  Contains function definitions for inverse
     25  * transform and reconstruction for 16x16.
     26  *
     27  * @author
     28  *  100470
     29  *  100592 (edited by)
     30  *
     31  * @par List of Functions:
     32  *  - ihevc_itrans_recon_16x16_sse42()
     33  *
     34  * @remarks
     35  *  None
     36  *
     37  *******************************************************************************
     38  */
     39 #include <stdio.h>
     40 #include <string.h>
     41 #include "ihevc_typedefs.h"
     42 #include "ihevc_macros.h"
     43 #include "ihevc_platform_macros.h"
     44 #include "ihevc_defs.h"
     45 #include "ihevc_trans_tables.h"
     46 #include "ihevc_itrans_recon.h"
     47 #include "ihevc_func_selector.h"
     48 #include "ihevc_trans_macros.h"
     49 
     50 #include <immintrin.h>
     51 #include <emmintrin.h>
     52 #include <smmintrin.h>
     53 #include <tmmintrin.h>
     54 
     55 /**
     56  *******************************************************************************
     57  *
     58  * @brief
     59  *  This function performs inverse transform and reconstruction
     60  * for a 16x16 input block
     61  *
     62  * @par Description:
     63  *  Performs the inverse transform, adds the prediction data and
     64  * clips the output to 8 bit
     65  *
     66  * @param[in] pi2_src
     67  *  Input 16x16 coefficients
     68  *
     69  * @param[in] pi2_tmp
     70  *  Temporary 16x16 buffer for storing inverse
     71  *  transform 1st stage output
     72  *
     73  * @param[in] pu1_pred
     74  *  Prediction 16x16 block
     75  *
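     76  * @param[out] pu1_dst
     77  *  Output 16x16 block
     78  *
     79  * @param[in] src_strd
     80  *  Input stride
     81  *
     82  * @param[in] pred_strd
     83  *  Prediction stride
     84  *
     85  * @param[in] dst_strd
     86  *  Output Stride
     87  *
     88  * @param[in] zero_cols
     89  *  Bit mask of zero columns in pi2_src. When all bits of 0xFF00
     90  *  are set, the upper 8 columns are skipped in stage 1; the mask
     91  *  0xFFF0 is tested as well to derive a stage 2 skip flag
     92  *  (zero_last12_rows_stg2).
     93  *
     94  * @param[in] zero_rows
     95  *  Bit mask of zero rows in pi2_src. The masks 0xFF00 and 0xFFF0
     96  *  are tested to detect that the last 8 or the last 12 rows are
     97  *  zero, in which case stage 1 takes a shorter computation path
     98  *  with fewer multiply-accumulate terms.
     99  *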
    100  * @returns  Void
    101  *
    102  * @remarks
    103  *  None
    104  *
    105  *******************************************************************************
    106  */
    107 
    108 void ihevc_itrans_recon_16x16_sse42(WORD16 *pi2_src,
    109                                     WORD16 *pi2_tmp,
    110                                     UWORD8 *pu1_pred,
    111                                     UWORD8 *pu1_dst,
    112                                     WORD32 src_strd,
    113                                     WORD32 pred_strd,
    114                                     WORD32 dst_strd,
    115                                     WORD32 zero_cols,
    116                                     WORD32 zero_rows)
    117 {
    118     __m128i m_temp_reg_0;
    119     __m128i m_temp_reg_1;
    120     __m128i m_temp_reg_10;
    121     __m128i m_temp_reg_11;
    122     __m128i m_temp_reg_12;
    123     __m128i m_temp_reg_13;
    124     __m128i m_temp_reg_14;
    125     __m128i m_temp_reg_20;
    126     __m128i m_temp_reg_21;
    127     __m128i m_temp_reg_22;
    128     __m128i m_temp_reg_23;
    129     __m128i m_temp_reg_24;
    130     __m128i m_temp_reg_25;
    131     __m128i m_temp_reg_26;
    132     __m128i m_temp_reg_27;
    133     __m128i m_temp_reg_30;
    134     __m128i m_temp_reg_31;
    135     __m128i m_temp_reg_32;
    136     __m128i m_temp_reg_33;
    137     __m128i m_temp_reg_34;
    138     __m128i m_temp_reg_35;
    139     __m128i m_temp_reg_36;
    140     __m128i m_temp_reg_37;
    141     __m128i m_temp_reg_40;
    142     __m128i m_temp_reg_41;
    143     __m128i m_temp_reg_42;
    144     __m128i m_temp_reg_43;
    145     __m128i m_temp_reg_44;
    146     __m128i m_temp_reg_45;
    147     __m128i m_temp_reg_46;
    148     __m128i m_temp_reg_47;
    149 
    150     __m128i m_temp_reg_70;
    151     __m128i m_temp_reg_71;
    152     __m128i m_temp_reg_72;
    153     __m128i m_temp_reg_73;
    154     __m128i m_temp_reg_74;
    155     __m128i m_temp_reg_75;
    156     __m128i m_temp_reg_76;
    157     __m128i m_temp_reg_77;
    158     __m128i m_rdng_factor;
    159     __m128i m_count;
    160     __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
    161     __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
    162 
    163     WORD32 i;
    164 
    165     WORD32  zero_last8_cols_stg1;
    166     WORD32  zero_last8_rows_stg1;
    167     WORD32  zero_last12_rows_stg1;
    168     WORD32  zero_last12_rows_stg2;
    169     WORD32  zero_last8_rows_stg2;
    170 
    171     WORD32  loop = 0;
    172 
    173     WORD32 i4_shift = IT_SHIFT_STAGE_1;
    174     WORD32 trans_size = TRANS_SIZE_16;
    175 
    176     /* The statements below derive the stage 1 / stage 2 skip flags */
    177     /* from the zero_cols and zero_rows bit masks */
    178 
    179     /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
    180 
    181     zero_last8_cols_stg1  = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
    182     zero_last8_rows_stg1  = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
    183     zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
    184 
    185     zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
    186     zero_last8_rows_stg2 = zero_last8_cols_stg1;
    187 
    188     if(zero_last8_cols_stg1)
    189     {
    190         loop = 1;
    191     }
    192     else
    193         loop = 2;
    194 
    195     /* i = 0 => lower 8 samples */
    196     /* i = 1 => higher 8 samples */
    197     for(i = 0; i < loop; i++)
    198     {
    199         {
    200             WORD32 sample_half_index = i << 3;
    201             WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
    202             WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
    203 
    204             m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    205             pi2_tmp_src += (src_strd << 1);
    206             m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    207             pi2_tmp_src += (src_strd << 1);
    208             m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    209             pi2_tmp_src += (src_strd << 1);
    210             m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    211             pi2_tmp_src += (src_strd << 1);
    212             m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    213             pi2_tmp_src += (src_strd << 1);
    214             m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    215             pi2_tmp_src += (src_strd << 1);
    216             m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    217             pi2_tmp_src += (src_strd << 1);
    218             m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    219             pi2_tmp_src += (src_strd << 1);
    220 
    221 
    222 
    223 
    224             /* If last 12 rows are zero : Rishab */
    225             if(zero_last12_rows_stg1)
    226             {
    227 
    228                 /* eee */
    229                 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
    230                 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
    231                 {
    232                     /* Loading coeff and src for use in next block */
    233 
    234                     m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign
    235                     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
    236 
    237                     m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
    238 
    239                     m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
    240 
    241                     m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
    242 
    243                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
    244 
    245                     m_temp_reg_26 = m_temp_reg_24;
    246                     m_temp_reg_27 = m_temp_reg_25;
    247                 }
    248 
    249                 /* eo */
    250 
    251                 /* eo0[0-3] */
    252                 {
    253                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    254                     m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
    255 
    256                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    257 
    258                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    259 
    260                     /* e[0][0-3] stored in pi2_tmp[0][0-7] */
    261                     /* e[7][0-3] stored in pi2_tmp[0][8-15] */
    262                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
    263                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
    264 
    265                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    266                     pi2_scratch += 8;
    267                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    268                     pi2_scratch += 8;
    269 
    270                 }
    271 
    272 
    273                 /* eo0[4-7] */
    274                 {
    275                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    276 
    277                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    278 
    279                     /* e[0][4-7] stored in pi2_tmp[1][0-7] */
    280                     /* e[7][4-7] stored in pi2_tmp[1][8-15] */
    281                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
    282                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
    283 
    284                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    285                     pi2_scratch += 8;
    286                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    287                     pi2_scratch += 8;
    288 
    289                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
    290                 }
    291 
    292                 /* eo1[0-3] */
    293                 {
    294                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    295 
    296                     /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
    297 
    298                     /* e[1][0-3] stored in pi2_tmp[2][0-7] */
    299                     /* e[6][0-3] stored in pi2_tmp[2][8-15] */
    300                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
    301                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
    302 
    303                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    304                     pi2_scratch += 8;
    305                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    306                     pi2_scratch += 8;
    307                 }
    308 
    309                 /* eo1[4-7] */
    310                 {
    311                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    312 
    313                     /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
    314 
    315                     /* e[1][4-7] stored in pi2_tmp[3][0-7] */
    316                     /* e[6][4-7] stored in pi2_tmp[3][8-15] */
    317                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
    318                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
    319 
    320                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    321                     pi2_scratch += 8;
    322                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    323                     pi2_scratch += 8;
    324 
    325                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
    326 
    327                 }
    328 
    329                 /* eo2[0-3] */
    330                 {
    331                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    332 
    333                     /* e[2][0-3] stored in pi2_tmp[4][0-7] */
    334                     /* e[5][0-3] stored in pi2_tmp[4][8-15] */
    335                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
    336                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
    337 
    338                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    339                     pi2_scratch += 8;
    340                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    341                     pi2_scratch += 8;
    342 
    343                 }
    344 
    345                 /* eo2[4-7] */
    346                 {
    347                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    348 
    349                     /* e[2][4-7] stored in pi2_tmp[5][0-7] */
    350                     /* e[5][4-7] stored in pi2_tmp[5][8-15] */
    351                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
    352                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
    353 
    354 
    355                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    356                     pi2_scratch += 8;
    357                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    358                     pi2_scratch += 8;
    359 
    360                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
    361                 }
    362 
    363                 /* eo3[0-3] */
    364                 {
    365                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    366 
    367                     /* e[3][0-3] stored in pi2_tmp[6][0-7] */
    368                     /* e[4][0-3] stored in pi2_tmp[6][8-15] */
    369                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
    370                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
    371 
    372                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    373                     pi2_scratch += 8;
    374                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    375                     pi2_scratch += 8;
    376                 }
    377 
    378                 /* eo3[4-7] */
    379                 {
    380                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    381 
    382                     /* e[3][4-7] stored in pi2_tmp[7][0-7] */
    383                     /* e[4][4-7] stored in pi2_tmp[7][8-15] */
    384                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
    385                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
    386 
    387                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    388                     pi2_scratch += 8;
    389                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    390                     pi2_scratch += 8;
    391                 }
    392             }
    393             /* If last 8 rows are zero : Rishab */
    394             else if(zero_last8_rows_stg1)
    395             {
    396                 /* eeo */
    397                 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
    398                 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
    399                 {
    400                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
    401                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
    402 
    403                     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
    404                     m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
    405 
    406                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    407                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
    408 
    409                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
    410                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
    411 
    412                 }
    413 
    414                 /* eee */
    415                 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
    416                 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
    417                 {
    418                     /* Loading coeff and src for use in next block */
    419                     m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to  get signs
    420                     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
    421 
    422                     m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
    423 
    424                     //m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
    425 
    426                     m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
    427 
    428                     m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
    429 
    430                     m_temp_reg_26 = m_temp_reg_24;
    431                     m_temp_reg_27 = m_temp_reg_25;
    432 
    433                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
    434                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
    435                 }
    436 
    437                 /* eo0[0-3] */
    438                 {
    439                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    440                     m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
    441 
    442                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    443 
    444                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    445                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
    446                     m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
    447 
    448                     /* e[0][0-3] stored in pi2_tmp[0][0-7] */
    449                     /* e[7][0-3] stored in pi2_tmp[0][8-15] */
    450                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    451                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    452 
    453                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    454                     pi2_scratch += 8;
    455                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    456                     pi2_scratch += 8;
    457 
    458                 }
    459 
    460                 /* eo0[4-7] */
    461                 {
    462                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    463 
    464                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    465                     m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
    466                     m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
    467 
    468                     /* e[0][4-7] stored in pi2_tmp[1][0-7] */
    469                     /* e[7][4-7] stored in pi2_tmp[1][8-15] */
    470                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
    471                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
    472 
    473                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    474                     pi2_scratch += 8;
    475                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    476                     pi2_scratch += 8;
    477 
    478                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
    479                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
    480 
    481                 }
    482 
    483                 /* eo1[0-3] */
    484                 {
    485                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    486 
    487                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
    488                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
    489                     m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
    490 
    491                     /* e[1][0-3] stored in pi2_tmp[2][0-7] */
    492                     /* e[6][0-3] stored in pi2_tmp[2][8-15] */
    493                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
    494                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
    495 
    496                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    497                     pi2_scratch += 8;
    498                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    499                     pi2_scratch += 8;
    500 
    501                 }
    502 
    503                 /* eo1[4-7] */
    504                 {
    505                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    506 
    507                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
    508                     m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
    509                     m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
    510 
    511                     /* e[1][4-7] stored in pi2_tmp[3][0-7] */
    512                     /* e[6][4-7] stored in pi2_tmp[3][8-15] */
    513                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
    514                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
    515 
    516                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    517                     pi2_scratch += 8;
    518                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    519                     pi2_scratch += 8;
    520 
    521                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
    522                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
    523 
    524                 }
    525 
    526                 /* eo2[0-3] */
    527                 {
    528                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    529 
    530                     /* e[2][0-3] stored in pi2_tmp[4][0-7] */
    531                     /* e[5][0-3] stored in pi2_tmp[4][8-15] */
    532                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
    533                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
    534 
    535                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    536                     pi2_scratch += 8;
    537                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    538                     pi2_scratch += 8;
    539 
    540                 }
    541 
    542                 /* eo2[4-7] */
    543                 {
    544                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    545 
    546                     /* e[2][4-7] stored in pi2_tmp[5][0-7] */
    547                     /* e[5][4-7] stored in pi2_tmp[5][8-15] */
    548                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
    549                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
    550 
    551                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    552                     pi2_scratch += 8;
    553                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    554                     pi2_scratch += 8;
    555 
    556                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
    557                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
    558 
    559                 }
    560 
    561                 /* eo3[0-3] */
    562                 {
    563                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    564 
    565                     /* e[3][0-3] stored in pi2_tmp[6][0-7] */
    566                     /* e[4][0-3] stored in pi2_tmp[6][8-15] */
    567                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
    568                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
    569 
    570                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    571                     pi2_scratch += 8;
    572                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    573                     pi2_scratch += 8;
    574                 }
    575 
    576                 /* eo3[4-7] */
    577                 {
    578                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    579 
    580                     /* e[3][4-7] stored in pi2_tmp[7][0-7] */
    581                     /* e[4][4-7] stored in pi2_tmp[7][8-15] */
    582                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
    583                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
    584 
    585                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    586                     pi2_scratch += 8;
    587                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    588                     pi2_scratch += 8;
    589                 }
    590             } /* If all the rows are non-zero : Rishab */
    591             else
    592             {
    593                 /* eeo */
    594                 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
    595                 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
    596 
    597                 {
    598                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
    599                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
    600 
    601                     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
    602                     m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
    603 
    604                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    605                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
    606 
    607                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
    608                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
    609                 }
    610 
    611                 /* eee */
    612                 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
    613                 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
    614                 {
    615                     /* Loading coeff and src for use in next block */
    616                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
    617                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
    618 
    619                     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
    620                     m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
    621 
    622                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
    623                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
    624 
    625                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    626                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
    627 
    628                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
    629                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
    630 
    631                 }
    632                 /* eo0[0-3] */
    633                 {
    634                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    635                     m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
    636                     m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
    637                     m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
    638 
    639                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    640                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
    641 
    642 
    643                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    644                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
    645                     m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
    646 
    647                     /* e[0][0-3] stored in pi2_tmp[0][0-7] */
    648                     /* e[7][0-3] stored in pi2_tmp[0][8-15] */
    649                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    650                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    651                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
    652                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
    653 
    654                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    655                     pi2_scratch += 8;
    656                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    657                     pi2_scratch += 8;
    658 
    659 
    660                 }
    661 
    662                 /* eo0[4-7] */
    663                 {
    664                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    665                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
    666 
    667                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    668                     m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
    669                     m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
    670 
    671                     /* e[0][4-7] stored in pi2_tmp[1][0-7] */
    672                     /* e[7][4-7] stored in pi2_tmp[1][8-15] */
    673                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
    674                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
    675                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
    676                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
    677 
    678                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    679                     pi2_scratch += 8;
    680                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    681                     pi2_scratch += 8;
    682 
    683                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
    684                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
    685 
    686                 }
    687 
    688                 /* eo1[0-3] */
    689                 {
    690                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    691                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
    692 
    693                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
    694                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
    695                     m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
    696 
    697                     /* e[1][0-3] stored in pi2_tmp[2][0-7] */
    698                     /* e[6][0-3] stored in pi2_tmp[2][8-15] */
    699                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
    700                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
    701                     m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
    702                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
    703 
    704                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    705                     pi2_scratch += 8;
    706                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    707                     pi2_scratch += 8;
    708 
    709                 }
    710 
    711                 /* eo1[4-7] */
    712                 {
    713                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    714                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
    715 
    716                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
    717                     m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
    718                     m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
    719 
    720                     /* e[1][4-7] stored in pi2_tmp[3][0-7] */
    721                     /* e[6][4-7] stored in pi2_tmp[3][8-15] */
    722                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
    723                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
    724                     m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
    725                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
    726 
    727                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    728                     pi2_scratch += 8;
    729                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    730                     pi2_scratch += 8;
    731                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
    732                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
    733                 }
    734 
    735                 /* eo2[0-3] */
    736                 {
    737                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    738                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
    739 
    740                     /* e[2][0-3] stored in pi2_tmp[4][0-7] */
    741                     /* e[5][0-3] stored in pi2_tmp[4][8-15] */
    742                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
    743                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
    744                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
    745                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
    746 
    747                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    748                     pi2_scratch += 8;
    749                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    750                     pi2_scratch += 8;
    751                 }
    752 
    753                 /* eo2[4-7] */
    754                 {
    755                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    756                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
    757 
    758                     /* e[2][4-7] stored in pi2_tmp[5][0-7] */
    759                     /* e[5][4-7] stored in pi2_tmp[5][8-15] */
    760                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
    761                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
    762                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
    763                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
    764 
    765                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    766                     pi2_scratch += 8;
    767                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    768                     pi2_scratch += 8;
    769 
    770                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
    771                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
    772 
    773                 }
    774 
    775                 /* eo3[0-3] */
    776                 {
    777                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    778                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
    779 
    780                     /* e[3][0-3] stored in pi2_tmp[6][0-7] */
    781                     /* e[4][0-3] stored in pi2_tmp[6][8-15] */
    782                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
    783                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
    784                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
    785                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
    786 
    787 
    788                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    789                     pi2_scratch += 8;
    790                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    791                     pi2_scratch += 8;
    792                 }
    793 
    794                 /* eo3[4-7] */
    795                 {
    796                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    797                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
    798 
    799                     /* e[3][4-7] stored in pi2_tmp[7][0-7] */
    800                     /* e[4][4-7] stored in pi2_tmp[7][8-15] */
    801                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
    802                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
    803                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
    804                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
    805 
    806                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    807                     pi2_scratch += 8;
    808                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    809                     pi2_scratch += 8;
    810                 }
    811 
    812             }
    813         }
    814 
    815         {
    816             WORD32 sample_half_index = i << 3;
    817             WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
    818 
    819             m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    820             pi2_tmp_src += (src_strd << 1);
    821             m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    822             pi2_tmp_src += (src_strd << 1);
    823             m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    824             pi2_tmp_src += (src_strd << 1);
    825             m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    826             pi2_tmp_src += (src_strd << 1);
    827             m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    828             pi2_tmp_src += (src_strd << 1);
    829             m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    830             pi2_tmp_src += (src_strd << 1);
    831             m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    832             pi2_tmp_src += (src_strd << 1);
    833             m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
    834             pi2_tmp_src += (src_strd << 1);
    835         }
    836 
    837         /* o & stage 1 out */
    838         {
    839             WORD32 j;
    840             WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
    841             WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
    842             WORD32 out_stride = (trans_size << 1);
    843             WORD32 in_stride = trans_size << 1;
    844 
    845             if(zero_last12_rows_stg1)
    846             {
    847                 for(j = 0; j < 2; j++)
    848                 {
    849                     if(j) //H8B= higher 8 bytes L8B lower 8 bytes
    850                     {
    851                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
    852                     }
    853                     else
    854                     {
    855                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
    856                     }
    857                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
    858 
    859 
    860                     /* o0[0-3] */
    861                     {
    862                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    863 
    864 
    865                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
    866                         pi2_src_scratch += in_stride;
    867 
    868                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
    869 
    870 
    871                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
    872                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
    873 
    874                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    875                         m_count = _mm_cvtsi32_si128(i4_shift);
    876                         m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
    877 
    878                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    879                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    880                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    881                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    882 
    883                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
    884 
    885                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
    886                         pi2_dst_scratch += out_stride;
    887                     }
    888 
    889                     /* o1[0-3] */
    890                     {
    891                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
    892 
    893 
    894                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
    895                         pi2_src_scratch += in_stride;
    896 
    897                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
    898 
    899                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
    900                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
    901 
    902                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    903                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    904                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    905                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    906 
    907                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
    908 
    909                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
    910                         pi2_dst_scratch += out_stride;
    911                     }
    912 
    913                     /* o2[0-3] */
    914                     {
    915                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    916 
    917                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
    918                         pi2_src_scratch += in_stride;
    919 
    920                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
    921 
    922                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
    923                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
    924 
    925                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    926                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    927                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    928                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    929 
    930                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
    931 
    932                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
    933                         pi2_dst_scratch += out_stride;
    934                     }
    935 
    936                     /* o3[0-3] */
    937                     {
    938                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
    939 
    940                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
    941                         pi2_src_scratch += 8;
    942 
    943                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
    944 
    945                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
    946                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
    947 
    948                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    949                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    950                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    951                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    952 
    953                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
    954 
    955                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
    956                         pi2_dst_scratch += 8;
    957                     }
    958 
    959                     /* o4[0-3] */
    960                     {
    961                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    962 
    963                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
    964                         pi2_src_scratch -= in_stride;
    965 
    966                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
    967 
    968                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
    969                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
    970 
    971                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    972                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    973                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    974                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    975 
    976                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
    977 
    978                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
    979                         pi2_dst_scratch -= out_stride;
    980                     }
    981 
    982                     /* o5[0-3] */
    983                     {
    984                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
    985 
    986 
    987                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
    988                         pi2_src_scratch -= in_stride;
    989 
    990                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
    991 
    992                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
    993                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
    994 
    995                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    996                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    997                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    998                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    999 
   1000                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1001 
   1002                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1003                         pi2_dst_scratch -= out_stride;
   1004                     }
   1005 
   1006                     /* o6[0-3] */
   1007                     {
   1008                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1009 
   1010                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1011                         pi2_src_scratch -= in_stride;
   1012 
   1013                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   1014 
   1015                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1016                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1017 
   1018                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1019                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1020                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1021                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1022 
   1023                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1024 
   1025                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1026                         pi2_dst_scratch -= out_stride;
   1027                     }
   1028 
   1029                     /* o7[0-3] */
   1030                     {
   1031                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1032 
   1033                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1034                         pi2_src_scratch += 8;
   1035 
   1036                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1037                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1038 
   1039                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1040                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1041                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1042                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1043 
   1044                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1045 
   1046                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1047                         pi2_dst_scratch += 8;
   1048                     }
   1049                 }
   1050             }
   1051             else if(zero_last8_rows_stg1)
   1052             {
   1053                 for(j = 0; j < 2; j++)
   1054                 {
   1055                     if(j)
   1056                     {
   1057                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
   1058                         m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
   1059                     }
   1060                     else
   1061                     {
   1062                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
   1063                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
   1064                     }
   1065                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
   1066                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
   1067 
   1068                     /* o0[0-3] */
   1069                     {
   1070                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1071                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1072 
   1073                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1074                         pi2_src_scratch += in_stride;
   1075 
   1076                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
   1077                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
   1078 
   1079                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   1080                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1081                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1082 
   1083 
   1084                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1085                         m_count = _mm_cvtsi32_si128(i4_shift);
   1086 
   1087                         m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
   1088 
   1089                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1090                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1091                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1092                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1093 
   1094                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1095 
   1096                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1097                         pi2_dst_scratch += out_stride;
   1098                     }
   1099 
   1100                     /* o1[0-3] */
   1101                     {
   1102                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1103                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1104 
   1105                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1106                         pi2_src_scratch += in_stride;
   1107 
   1108                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
   1109                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
   1110 
   1111                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1112                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1113                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1114 
   1115                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1116                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1117                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1118                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1119 
   1120                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1121 
   1122                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1123                         pi2_dst_scratch += out_stride;
   1124                     }
   1125 
   1126                     /* o2[0-3] */
   1127                     {
   1128                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1129                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1130 
   1131                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1132                         pi2_src_scratch += in_stride;
   1133 
   1134                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
   1135                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
   1136 
   1137                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   1138                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1139                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1140 
   1141                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1142                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1143                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1144                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1145 
   1146                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1147 
   1148                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1149                         pi2_dst_scratch += out_stride;
   1150                     }
   1151 
   1152                     /* o3[0-3] */
   1153                     {
   1154                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1155                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1156 
   1157                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1158                         pi2_src_scratch += 8;
   1159 
   1160                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
   1161                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
   1162 
   1163                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
   1164                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1165                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1166 
   1167                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1168                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1169                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1170                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1171 
   1172                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1173 
   1174                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1175                         pi2_dst_scratch += 8;
   1176                     }
   1177 
   1178                     /* o4[0-3] */
   1179                     {
   1180                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1181                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1182 
   1183                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1184                         pi2_src_scratch -= in_stride;
   1185 
   1186                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
   1187                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
   1188 
   1189                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   1190                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1191                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1192 
   1193 
   1194                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1195                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1196                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1197                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1198 
   1199                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1200 
   1201                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1202                         pi2_dst_scratch -= out_stride;
   1203                     }
   1204 
   1205                     /* o5[0-3] */
   1206                     {
   1207                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1208                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1209 
   1210                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1211                         pi2_src_scratch -= in_stride;
   1212 
   1213                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
   1214                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
   1215 
   1216                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1217                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1218                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1219 
   1220 
   1221                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1222                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1223                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1224                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1225 
   1226                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1227 
   1228                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1229                         pi2_dst_scratch -= out_stride;
   1230                     }
   1231 
   1232                     /* o6[0-3] */
   1233                     {
   1234                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1235                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1236 
   1237                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1238                         pi2_src_scratch -= in_stride;
   1239 
   1240                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   1241                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
   1242 
   1243                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   1244                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1245                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1246 
   1247                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1248                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1249                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1250                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1251 
   1252                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1253 
   1254                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1255                         pi2_dst_scratch -= out_stride;
   1256                     }
   1257 
   1258                     /* o7[0-3] */
   1259                     {
   1260                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1261                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1262 
   1263                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1264                         pi2_src_scratch += 8;
   1265 
   1266                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1267                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1268                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1269 
   1270                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1271                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1272                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1273                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1274 
   1275                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1276 
   1277                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1278                         pi2_dst_scratch += 8;
   1279                     }
   1280                 }
   1281 
   1282             }
   1283             else
   1284             {
   1285 
   1286 
   1287 
   1288                 for(j = 0; j < 2; j++)
   1289                 {
   1290                     if(j) //H8B= higher 8 bytes L8B lower 8 bytes
   1291                     {
   1292                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
   1293                         m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
   1294                         m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
   1295                         m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
   1296                     }
   1297                     else
   1298                     {
   1299                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
   1300                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
   1301                         m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
   1302                         m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
   1303                     }
   1304                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
   1305                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
   1306                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
   1307                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
   1308 
   1309 
   1310                     /* o0[0-3] */
   1311                     {
   1312                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1313                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1314                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1315                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1316 
   1317 
   1318                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1319                         pi2_src_scratch += in_stride;
   1320 
   1321                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
   1322                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
   1323                         m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
   1324                         m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
   1325 
   1326                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   1327                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   1328                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   1329                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1330                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1331 
   1332                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1333                         m_count = _mm_cvtsi32_si128(i4_shift);
   1334                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1335                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1336 
   1337                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1338                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1339                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1340                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1341 
   1342                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1343 
   1344                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1345                         pi2_dst_scratch += out_stride;
   1346                     }
   1347 
   1348                     /* o1[0-3] */
   1349                     {
   1350                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1351                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1352                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   1353                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   1354 
   1355 
   1356                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1357                         pi2_src_scratch += in_stride;
   1358 
   1359                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
   1360                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
   1361                         m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
   1362                         m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
   1363 
   1364                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1365                         m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
   1366                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
   1367                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1368                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1369 
   1370                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1371                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1372                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1373                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1374 
   1375                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1376 
   1377                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1378                         pi2_dst_scratch += out_stride;
   1379                     }
   1380 
   1381                     /* o2[0-3] */
   1382                     {
   1383                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1384                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1385                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1386                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1387 
   1388                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1389                         pi2_src_scratch += in_stride;
   1390 
   1391                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
   1392                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
   1393                         m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
   1394                         m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
   1395 
   1396                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   1397                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   1398                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
   1399                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1400                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1401 
   1402                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1403                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1404                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1405                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1406 
   1407                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1408 
   1409                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1410                         pi2_dst_scratch += out_stride;
   1411                     }
   1412 
   1413                     /* o3[0-3] */
   1414                     {
   1415                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1416                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1417                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   1418                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   1419 
   1420 
   1421                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1422                         pi2_src_scratch += 8;
   1423 
   1424                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
   1425                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
   1426                         m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
   1427                         m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
   1428 
   1429                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
   1430                         m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
   1431                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
   1432                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1433                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1434 
   1435                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1436                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1437                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1438                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1439 
   1440                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1441 
   1442                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1443                         pi2_dst_scratch += 8;
   1444                     }
   1445 
   1446                     /* o4[0-3] */
   1447                     {
   1448                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1449                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1450                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1451                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1452 
   1453                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1454                         pi2_src_scratch -= in_stride;
   1455 
   1456                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
   1457                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
   1458                         m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
   1459                         m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
   1460 
   1461                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   1462                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   1463                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
   1464                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1465                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1466 
   1467                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1468                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1469                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1470                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1471 
   1472                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1473 
   1474                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1475                         pi2_dst_scratch -= out_stride;
   1476                     }
   1477 
   1478                     /* o5[0-3] */
   1479                     {
   1480                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1481                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1482                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   1483                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   1484 
   1485 
   1486                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1487                         pi2_src_scratch -= in_stride;
   1488 
   1489                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
   1490                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
   1491                         m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
   1492                         m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
   1493 
   1494                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1495                         m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
   1496                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
   1497                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1498                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1499 
   1500                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1501                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1502                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1503                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1504 
   1505                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1506 
   1507                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1508                         pi2_dst_scratch -= out_stride;
   1509                     }
   1510 
   1511                     /* o6[0-3] */
   1512                     {
   1513                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1514                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1515                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1516                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1517 
   1518 
   1519                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1520                         pi2_src_scratch -= in_stride;
   1521 
   1522                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   1523                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
   1524                         m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
   1525                         m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
   1526 
   1527 
   1528                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   1529                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   1530                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   1531                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1532                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1533 
   1534                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1535                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1536                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1537                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1538 
   1539                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1540 
   1541                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1542                         pi2_dst_scratch -= out_stride;
   1543                     }
   1544 
   1545                     /* o7[0-3] */
   1546                     {
   1547                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1548                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1549                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   1550                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   1551 
   1552                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1553                         pi2_src_scratch += 8;
   1554 
   1555                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1556                         m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
   1557                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
   1558                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1559                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1560 
   1561 
   1562                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1563                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1564                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1565                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1566 
   1567                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1568 
   1569                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1570                         pi2_dst_scratch += 8;
   1571                     }
   1572                 }
   1573             }
   1574         }
   1575 
   1576         /* Transpose */
   1577         {
   1578             WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
   1579             WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp);
   1580             WORD32 out_stride = (trans_size << 1);
   1581             WORD32 in_stride = (trans_size << 1);
   1582             WORD32 j;
   1583 
   1584             for(j = 0; j < 2; j++)
   1585             {
   1586                 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
   1587                 pi2_src_scratch += in_stride;
   1588                 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
   1589                 pi2_src_scratch += in_stride;
   1590                 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
   1591                 pi2_src_scratch += in_stride;
   1592                 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
   1593                 pi2_src_scratch += 8;
   1594                 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
   1595                 pi2_src_scratch -= in_stride;
   1596                 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
   1597                 pi2_src_scratch -= in_stride;
   1598                 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
   1599                 pi2_src_scratch -= in_stride;
   1600                 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
   1601                 pi2_src_scratch += 8;
   1602 
   1603                 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
   1604                 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
   1605 
   1606                 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
   1607                 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
   1608 
   1609                 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
   1610                 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
   1611 
   1612                 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
   1613                 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
   1614 
   1615 
   1616                 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
   1617                 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
   1618 
   1619                 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
   1620                 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
   1621 
   1622                 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
   1623                 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
   1624 
   1625                 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
   1626                 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
   1627 
   1628 
   1629                 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
   1630                 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
   1631 
   1632                 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
   1633                 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
   1634 
   1635                 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
   1636                 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
   1637 
   1638                 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
   1639                 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
   1640 
   1641                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1642                 pi2_dst_scratch += out_stride;
   1643                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
   1644                 pi2_dst_scratch += out_stride;
   1645                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
   1646                 pi2_dst_scratch += out_stride;
   1647                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
   1648                 pi2_dst_scratch += 8;
   1649                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
   1650                 pi2_dst_scratch -= out_stride;
   1651                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
   1652                 pi2_dst_scratch -= out_stride;
   1653                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
   1654                 pi2_dst_scratch -= out_stride;
   1655                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
   1656                 pi2_dst_scratch += 8;
   1657             }
   1658         }
   1659     }
   1660 
   1661     if(zero_last8_cols_stg1)
   1662     {
   1663         WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
   1664         WORD32 out_stride = (trans_size << 1);
   1665         WORD32 j;
   1666 
   1667         m_temp_reg_40 = _mm_setzero_si128();
   1668         for(j = 0; j < 2; j++)
   1669         {
   1670             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1671             pi2_dst_scratch += out_stride;
   1672             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1673             pi2_dst_scratch += out_stride;
   1674             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1675             pi2_dst_scratch += out_stride;
   1676             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1677             pi2_dst_scratch += 8;
   1678             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1679             pi2_dst_scratch -= out_stride;
   1680             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1681             pi2_dst_scratch -= out_stride;
   1682             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1683             pi2_dst_scratch -= out_stride;
   1684             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1685             pi2_dst_scratch += 8;
   1686         }
   1687     }
   1688 
   1689 
   1690 
   1691 
   1692     /* Stage 2 */
   1693     for(i = 0; i < 2; i++)
   1694     {
   1695         //__m128i m_temp_reg_15,m_temp_reg_16;
   1696         WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp);
   1697         WORD32 stride = (trans_size);
   1698         WORD16 temp_array[256];
   1699 
   1700         i4_shift = IT_SHIFT_STAGE_2;
   1701 
   1702         if(zero_last12_rows_stg2)
   1703         {
   1704             /* eeo */
   1705             /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
   1706             /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
   1707             {
   1708                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
   1709 
   1710                 pi2_src_temp += (stride * 9);
   1711 
   1712                 if(!i)
   1713                 {
   1714                     pi2_src_temp += (stride * 6 + 8);
   1715                 }
   1716                 else
   1717                 {
   1718                     pi2_src_temp += (stride * 2 + 8);
   1719                 }
   1720 
   1721                 pi2_src_temp -= (stride * 9);
   1722 
   1723                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
   1724 
   1725                 m_temp_reg_20 = _mm_setzero_si128();
   1726                 m_temp_reg_22 = _mm_setzero_si128();
   1727 
   1728                 m_temp_reg_21 = _mm_setzero_si128();
   1729                 m_temp_reg_23 = _mm_setzero_si128();
   1730             }
   1731 
   1732             /* eee */
   1733             /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
   1734             /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
   1735             {
   1736                 /* Loading coeff and src for use in next block */
   1737 
   1738                 /* Loading coeff and src for use in next block */
   1739                 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70);
   1740 
   1741                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
   1742 
   1743                 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
   1744 
   1745                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
   1746                 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
   1747 
   1748                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
   1749                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
   1750 
   1751                 m_temp_reg_26 = m_temp_reg_24;
   1752                 m_temp_reg_27 = m_temp_reg_25;
   1753                 /*  */
   1754 
   1755                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
   1756                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
   1757             }
   1758 
   1759             /* eo */
   1760             {
   1761                 WORD16 *pi2_scratch = temp_array;
   1762                 WORD32 out_stride = 8;
   1763 
   1764 
   1765                 /* eo0[0-3] */
   1766                 {
   1767                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1768 
   1769                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   1770 
   1771                     /* e[0][0-3] stored in pu1_dst[0] */
   1772                     /* e[7][0-3] stored in pu1_dst[1] */
   1773                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
   1774                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
   1775 
   1776                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1777                     pi2_scratch += out_stride;
   1778                     _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
   1779                     pi2_scratch += out_stride;
   1780                 }
   1781 
   1782                 /* eo0[4-7] */
   1783                 {
   1784                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1785 
   1786                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   1787 
   1788                     /* e[0][4-7] stored in pu1_dst[2] */
   1789                     /* e[7][4-7] stored in pu1_dst[3] */
   1790 
   1791                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
   1792                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
   1793 
   1794                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1795                     pi2_scratch += out_stride;
   1796                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1797                     pi2_scratch += out_stride;
   1798 
   1799                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
   1800                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
   1801 
   1802                 }
   1803 
   1804                 /* eo1[0-3] */
   1805                 {
   1806                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   1807 
   1808                     /* eeo is zero in this path, so ee[1] and ee[2] both equal eee[1] (m_temp_reg_26) */
   1809 
   1810                     /* e[1][0-3] stored in pu1_dst[4] */
   1811                     /* e[6][0-3] stored in pu1_dst[5] */
   1812                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
   1813                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
   1814 
   1815                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1816                     pi2_scratch += out_stride;
   1817                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1818                     pi2_scratch += out_stride;
   1819                 }
   1820 
   1821                 /* eo1[4-7] */
   1822                 {
   1823                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   1824 
   1825                     /* eeo is zero in this path, so ee[1] and ee[2] both equal eee[1] (m_temp_reg_27) */
   1826 
   1827                     /* e[1][4-7] stored in pu1_dst[6]*/
   1828                     /* e[6][4-7] stored in pu1_dst[7] */
   1829                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
   1830                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
   1831 
   1832                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1833                     pi2_scratch += out_stride;
   1834                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1835                     pi2_scratch += out_stride;
   1836 
   1837                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
   1838 
   1839                 }
   1840 
   1841                 /* eo2[0-3] */
   1842                 {
   1843                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1844 
   1845                     /* e[2][0-3] stored in pu1_dst[8]*/
   1846                     /* e[5][0-3] stored in pu1_dst[9] */
   1847                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
   1848                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
   1849 
   1850                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1851                     pi2_scratch += out_stride;
   1852                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1853                     pi2_scratch += out_stride;
   1854                 }
   1855 
   1856                 /* eo2[4-7] */
   1857                 {
   1858                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1859 
   1860                     /* e[2][4-7] stored in pu1_dst[10]*/
   1861                     /* e[5][4-7] stored in pu1_dst[11] */
   1862                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
   1863                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
   1864 
   1865                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1866                     pi2_scratch += out_stride;
   1867                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1868                     pi2_scratch += out_stride;
   1869 
   1870                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
   1871                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
   1872 
   1873                 }
   1874 
   1875                 /* eo3[0-3] */
   1876                 {
   1877                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   1878 
   1879                     /* e[3][0-3] stored in pu1_dst[12]*/
   1880                     /* e[4][0-3] stored in pu1_dst[13] */
   1881                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
   1882                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
   1883 
   1884                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1885                     pi2_scratch += out_stride;
   1886                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1887                     pi2_scratch += out_stride;
   1888                 }
   1889 
   1890                 /* eo3[4-7] */
   1891                 {
   1892                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   1893 
   1894                     /* e[3][4-7] stored in pu1_dst[14]*/
   1895                     /* e[4][4-7] stored in pu1_dst[15] */
   1896                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
   1897                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
   1898 
   1899                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1900                     pi2_scratch += out_stride;
   1901                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1902                     pi2_scratch += out_stride;
   1903                 }
   1904 
   1905             }
   1906         }
   1907         else if(zero_last8_rows_stg2)
   1908         {
   1909             /* eeo */
   1910             /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
   1911             /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
   1912             {
   1913 
   1914                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83
   1915                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36
   1916 
   1917                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
   1918                 pi2_src_temp += (stride);
   1919                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
   1920                 pi2_src_temp += (stride * 8);
   1921 
   1922                 if(!i)
   1923                 {
   1924                     pi2_src_temp += (stride * 6 + 8);
   1925                 }
   1926                 else
   1927                 {
   1928                     pi2_src_temp += (stride * 2 + 8);
   1929                 }
   1930 
   1931                 pi2_src_temp -= (stride * 8);
   1932                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
   1933                 pi2_src_temp -= (stride);
   1934                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
   1935 
   1936 
   1937                 m_temp_reg_76 = _mm_setzero_si128();
   1938 
   1939 
   1940                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
   1941                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
   1942 
   1943                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
   1944                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
   1945 
   1946                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1947                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1948 
   1949                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1950                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1951             }
   1952 
   1953             /* eee */
   1954             /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
   1955             /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
   1956             {
   1957                 /* Loading coeff and src for use in next block */
   1958 
   1959 
   1960                 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
   1961 
   1962                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
   1963 
   1964                 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
   1965                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
   1966                 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
   1967 
   1968                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
   1969 
   1970                 m_temp_reg_26 = m_temp_reg_24;
   1971                 m_temp_reg_27 = m_temp_reg_25;
   1972 
   1973                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1974                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
   1975             }
   1976 
   1977             /* eo */
   1978             {
   1979                 WORD16 *pi2_scratch = temp_array;
   1980                 WORD32 out_stride = 8;
   1981 
   1982 
   1983                 /* eo0[0-3] */
   1984                 {
   1985                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1986 
   1987                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   1988                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
   1989                     m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
   1990 
   1991                     /* e[0][0-3] stored in pu1_dst[0] */
   1992                     /* e[7][0-3] stored in pu1_dst[1] */
   1993                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1994                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1995 
   1996                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1997                     pi2_scratch += out_stride;
   1998                     _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
   1999                     pi2_scratch += out_stride;
   2000                 }
   2001 
   2002                 /* eo0[4-7] */
   2003                 {
   2004                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   2005 
   2006                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   2007                     m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
   2008                     m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
   2009 
   2010                     /* e[0][4-7] stored in pu1_dst[2] */
   2011                     /* e[7][4-7] stored in pu1_dst[3] */
   2012 
   2013                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   2014                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   2015 
   2016                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2017                     pi2_scratch += out_stride;
   2018                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2019                     pi2_scratch += out_stride;
   2020 
   2021                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
   2022                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
   2023 
   2024                 }
   2025 
   2026                 /* eo1[0-3] */
   2027                 {
   2028                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   2029 
   2030                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
   2031                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
   2032                     m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
   2033 
   2034                     /* e[1][0-3] stored in pu1_dst[4] */
   2035                     /* e[6][0-3] stored in pu1_dst[5] */
   2036                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
   2037                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
   2038 
   2039                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2040                     pi2_scratch += out_stride;
   2041                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2042                     pi2_scratch += out_stride;
   2043                 }
   2044 
   2045                 /* eo1[4-7] */
   2046                 {
   2047                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   2048 
   2049                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
   2050                     m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
   2051                     m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
   2052 
   2053                     /* e[1][4-7] stored in pu1_dst[6]*/
   2054                     /* e[6][4-7] stored in pu1_dst[7] */
   2055                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
   2056                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
   2057 
   2058                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2059                     pi2_scratch += out_stride;
   2060                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2061                     pi2_scratch += out_stride;
   2062 
   2063                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
   2064 
   2065                 }
   2066 
   2067                 /* eo2[0-3] */
   2068                 {
   2069                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2070 
   2071                     /* e[2][0-3] stored in pu1_dst[8]*/
   2072                     /* e[5][0-3] stored in pu1_dst[9] */
   2073                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
   2074                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
   2075 
   2076                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2077                     pi2_scratch += out_stride;
   2078                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2079                     pi2_scratch += out_stride;
   2080                 }
   2081 
   2082                 /* eo2[4-7] */
   2083                 {
   2084                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   2085 
   2086                     /* e[2][4-7] stored in pu1_dst[10]*/
   2087                     /* e[5][4-7] stored in pu1_dst[11] */
   2088                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
   2089                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
   2090 
   2091                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2092                     pi2_scratch += out_stride;
   2093                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2094                     pi2_scratch += out_stride;
   2095 
   2096                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
   2097                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
   2098 
   2099                 }
   2100 
   2101                 /* eo3[0-3] */
   2102                 {
   2103                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   2104 
   2105                     /* e[3][0-3] stored in pu1_dst[12]*/
   2106                     /* e[4][0-3] stored in pu1_dst[13] */
   2107                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
   2108                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
   2109 
   2110                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2111                     pi2_scratch += out_stride;
   2112                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2113                     pi2_scratch += out_stride;
   2114                 }
   2115 
   2116                 /* eo3[4-7] */
   2117                 {
   2118                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   2119 
   2120                     /* e[3][4-7] stored in pu1_dst[14]*/
   2121                     /* e[4][4-7] stored in pu1_dst[15] */
   2122                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
   2123                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
   2124 
   2125                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2126                     pi2_scratch += out_stride;
   2127                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2128                     pi2_scratch += out_stride;
   2129                 }
   2130             }
   2131         }
   2132 
   2133         else
   2134         {
   2135             /* eeo */
   2136             /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
   2137             /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
   2138             {
   2139 
   2140 
   2141                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
   2142                 pi2_src_temp += (stride);
   2143                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
   2144                 pi2_src_temp += (stride * 7);
   2145                 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
   2146                 pi2_src_temp += (stride);
   2147                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
   2148                 if(!i)
   2149                 {
   2150                     pi2_src_temp += (stride * 6 + 8);
   2151                 }
   2152                 else
   2153                 {
   2154                     pi2_src_temp += (stride * 2 + 8);
   2155                 }
   2156                 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
   2157                 pi2_src_temp -= (stride);
   2158                 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
   2159                 pi2_src_temp -= (stride * 7);
   2160                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
   2161                 pi2_src_temp -= (stride);
   2162                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
   2163 
   2164                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
   2165                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
   2166 
   2167                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
   2168                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
   2169 
   2170                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2171                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   2172 
   2173                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2174                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   2175 
   2176 
   2177             }
   2178 
   2179             /* eee */
   2180             /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
   2181             /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
   2182             {
   2183                 /* Loading coeff and src for use in next block */
   2184                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
   2185                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
   2186 
   2187                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
   2188                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
   2189 
   2190                 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   2191                 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
   2192 
   2193                 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   2194                 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
   2195 
   2196                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
   2197                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
   2198 
   2199             }
   2200 
   2201             /* eo */
   2202             {
   2203                 WORD16 *pi2_scratch = temp_array;
   2204                 WORD32 out_stride = 8;
   2205 
   2206 
   2207 
   2208                 /* eo0[0-3] */
   2209                 {
   2210                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   2211                     m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
   2212                     m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   2213                     m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
   2214 
   2215 
   2216                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2217                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
   2218 
   2219 
   2220                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   2221                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
   2222                     m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
   2223 
   2224 
   2225                     /* e[0][0-3] stored in pi2_tmp[0][0-7] */
   2226                     /* e[7][0-3] stored in pi2_tmp[0][8-15] */
   2227                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   2228                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   2229                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
   2230                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
   2231 
   2232                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2233                     pi2_scratch += out_stride;
   2234                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2235                     pi2_scratch += out_stride;
   2236 
   2237 
   2238                 }
   2239 
   2240                 /* eo0[4-7] */
   2241                 {
   2242 
   2243                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
   2244                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
   2245 
   2246                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   2247                     m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
   2248                     m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
   2249 
   2250                     /* e[0][4-7] stored in pi2_tmp[1][0-7] */
   2251                     /* e[7][4-7] stored in pi2_tmp[1][8-15] */
   2252                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   2253                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   2254                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
   2255                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
   2256 
   2257                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2258                     pi2_scratch += out_stride;
   2259                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2260                     pi2_scratch += out_stride;
   2261 
   2262                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
   2263                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
   2264 
   2265                 }
   2266 
   2267                 /* eo1[0-3] */
   2268                 {
   2269                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   2270                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
   2271 
   2272                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
   2273                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
   2274                     m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
   2275 
   2276                     /* e[1][0-3] stored in pi2_tmp[2][0-7] */
   2277                     /* e[6][0-3] stored in pi2_tmp[2][8-15] */
   2278                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
   2279                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
   2280                     m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
   2281                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
   2282 
   2283                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2284                     pi2_scratch += out_stride;
   2285                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2286                     pi2_scratch += out_stride;
   2287 
   2288                 }
   2289 
   2290                 /* eo1[4-7] */
   2291                 {
   2292                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
   2293                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   2294 
   2295                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
   2296                     m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
   2297                     m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
   2298 
   2299                     /* e[1][4-7] stored in pi2_tmp[3][0-7] */
   2300                     /* e[6][4-7] stored in pi2_tmp[3][8-15] */
   2301                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
   2302                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
   2303                     m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
   2304                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
   2305 
   2306                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2307                     pi2_scratch += out_stride;
   2308                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2309                     pi2_scratch += out_stride;
   2310                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
   2311                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
   2312                 }
   2313 
   2314                 /* eo2[0-3] */
   2315                 {
   2316                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2317                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
   2318 
   2319                     /* e[2][0-3] stored in pi2_tmp[4][0-7] */
   2320                     /* e[5][0-3] stored in pi2_tmp[4][8-15] */
   2321                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
   2322                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
   2323                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
   2324                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
   2325 
   2326                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2327                     pi2_scratch += out_stride;
   2328                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2329                     pi2_scratch += out_stride;
   2330                 }
   2331 
   2332                 /* eo2[4-7] */
   2333                 {
   2334                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
   2335                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
   2336 
   2337                     /* e[2][4-7] stored in pi2_tmp[5][0-7] */
   2338                     /* e[5][4-7] stored in pi2_tmp[5][8-15] */
   2339                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
   2340                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
   2341                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
   2342                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
   2343 
   2344                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2345                     pi2_scratch += out_stride;
   2346                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2347                     pi2_scratch += out_stride;
   2348 
   2349                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
   2350                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
   2351 
   2352                 }
   2353 
   2354                 /* eo3[0-3] */
   2355                 {
   2356                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   2357                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
   2358 
   2359                     /* e[3][0-3] stored in pi2_tmp[6][0-7] */
   2360                     /* e[4][0-3] stored in pi2_tmp[6][8-15] */
   2361                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
   2362                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
   2363                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
   2364                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
   2365 
   2366 
   2367                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2368                     pi2_scratch += out_stride;
   2369                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2370                     pi2_scratch += out_stride;
   2371                 }
   2372 
   2373                 /* eo3[4-7] */
   2374                 {
   2375                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
   2376                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   2377 
   2378                     /* e[3][4-7] stored in pi2_tmp[7][0-7] */
   2379                     /* e[4][4-7] stored in pi2_tmp[7][8-15] */
   2380                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
   2381                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
   2382                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
   2383                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
   2384 
   2385                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2386                     pi2_scratch += out_stride;
   2387                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2388                     pi2_scratch += out_stride;
   2389                 }
   2390             }
   2391         }
   2392 
   2393         if(zero_last12_rows_stg2)
   2394         {
   2395             /* o & stage 2 pre-transposed out */
   2396             {
   2397                 WORD32 j;
   2398                 WORD16 *pi2_src_scratch = temp_array;
   2399                 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
   2400                 WORD32 out_stride = (trans_size);
   2401                 WORD32 in_stride = (8) * 4;
   2402 
   2403                 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
   2404 
   2405                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
   2406 
   2407                 pi2_src_temp += (stride * 9);
   2408 
   2409                 if(0 == i)
   2410                 {
   2411                     pi2_src_temp -= (stride * 2 - 8);
   2412                 }
   2413                 else
   2414                 {
   2415                     pi2_src_temp -= (stride * 6 - 8);
   2416                 }
   2417                 pi2_src_temp -= (stride * 9);
   2418 
   2419                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
   2420 
   2421 
   2422                 for(j = 0; j < 2; j++)
   2423                 {
   2424                     if(j)
   2425                     {
   2426                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
   2427                     }
   2428                     else
   2429                     {
   2430                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
   2431                     }
   2432                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
   2433 
   2434                     /* o0[0-3] */
   2435                     {
   2436                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2437 
   2438                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2439                         pi2_src_scratch += in_stride;
   2440 
   2441                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
   2442 
   2443                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2444                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2445 
   2446 
   2447                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2448                         m_count = _mm_cvtsi32_si128(i4_shift);
   2449                         m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
   2450 
   2451 
   2452                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2453                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2454                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2455                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2456 
   2457                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2458 
   2459                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2460                         pi2_dst_scratch += out_stride;
   2461                     }
   2462 
   2463                     /* o1[0-3] */
   2464                     {
   2465                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2466 
   2467                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2468                         pi2_src_scratch += in_stride;
   2469 
   2470                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
   2471 
   2472                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2473                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2474 
   2475                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2476                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2477                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2478                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2479 
   2480                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2481 
   2482                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2483                         pi2_dst_scratch += ((!i) * out_stride + 8);
   2484                     }
   2485 
   2486                     /* o2[0-3] */
   2487                     {
   2488                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2489 
   2490                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2491                         pi2_src_scratch += in_stride;
   2492 
   2493                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
   2494 
   2495                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2496                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2497 
   2498                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2499                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2500                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2501                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2502 
   2503                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2504 
   2505                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2506                         pi2_dst_scratch += out_stride;
   2507                     }
   2508 
   2509                     /* o3[0-3] */
   2510                     {
   2511                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2512 
   2513                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2514                         pi2_src_scratch += 8;
   2515 
   2516                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
   2517 
   2518                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2519                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2520 
   2521                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2522                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2523                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2524                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2525 
   2526                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2527 
   2528                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2529                         pi2_dst_scratch += (i * out_stride + 8);
   2530                     }
   2531 
   2532                     /* o4[0-3] */
   2533                     {
   2534                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2535 
   2536                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2537                         pi2_src_scratch -= in_stride;
   2538 
   2539                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
   2540 
   2541                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2542                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2543 
   2544 
   2545                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2546                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2547                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2548                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2549 
   2550                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2551 
   2552                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2553                         pi2_dst_scratch += out_stride;
   2554                     }
   2555 
   2556                     /* o5[0-3] */
   2557                     {
   2558                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2559 
   2560                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2561                         pi2_src_scratch -= in_stride;
   2562 
   2563                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
   2564 
   2565                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2566                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2567 
   2568 
   2569                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2570                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2571                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2572                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2573 
   2574                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2575 
   2576                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2577                         pi2_dst_scratch += ((!i) * out_stride + 8);
   2578                     }
   2579 
   2580                     /* o6[0-3] */
   2581                     {
   2582                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2583 
   2584                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2585                         pi2_src_scratch -= in_stride;
   2586 
   2587                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   2588 
   2589                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2590                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2591 
   2592                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2593                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2594                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2595                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2596 
   2597                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2598 
   2599                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2600                         pi2_dst_scratch += out_stride;
   2601                     }
   2602 
   2603                     /* o7[0-3] */
   2604                     {
   2605                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2606 
   2607                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2608                         pi2_src_scratch += 8;
   2609 
   2610                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2611                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2612 
   2613                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2614                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2615                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2616                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2617 
   2618                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2619 
   2620                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2621                         pi2_dst_scratch += (i * out_stride + 8);
   2622                     }
   2623 
   2624 
   2625                 }
   2626             }
   2627         }
   2628         else if(zero_last8_rows_stg2)
   2629         {
   2630             /* o & stage 2 pre-transposed out */
   2631             {
   2632                 WORD32 j;
   2633                 WORD16 *pi2_src_scratch = temp_array;
   2634                 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
   2635                 WORD32 out_stride = (trans_size);
   2636                 WORD32 in_stride = (8) * 4;
   2637 
   2638                 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
   2639 
   2640 
   2641                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
   2642                 pi2_src_temp += (stride);
   2643                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
   2644                 pi2_src_temp += (stride * 8);
   2645 
   2646                 if(0 == i)
   2647                 {
   2648                     pi2_src_temp -= (stride * 2 - 8);
   2649                 }
   2650                 else
   2651                 {
   2652                     pi2_src_temp -= (stride * 6 - 8);
   2653                 }
   2654 
   2655                 pi2_src_temp -= (stride * 8);
   2656                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
   2657                 pi2_src_temp -= (stride);
   2658                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
   2659 
   2660 
   2661                 for(j = 0; j < 2; j++)
   2662                 {
   2663                     if(j)
   2664                     {
   2665                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
   2666                         m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
   2667                     }
   2668                     else
   2669                     {
   2670                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
   2671                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
   2672                     }
   2673                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
   2674                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
   2675 
   2676                     /* o0[0-3] */
   2677                     {
   2678                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2679                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2680 
   2681                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2682                         pi2_src_scratch += in_stride;
   2683 
   2684                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
   2685                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
   2686 
   2687                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2688                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2689                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2690 
   2691 
   2692                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2693                         m_count = _mm_cvtsi32_si128(i4_shift);
   2694 
   2695                         m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
   2696 
   2697                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2698                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2699                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2700                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2701 
   2702                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2703 
   2704                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2705                         pi2_dst_scratch += out_stride;
   2706                     }
   2707 
   2708                     /* o1[0-3] */
   2709                     {
   2710                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2711                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   2712 
   2713                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2714                         pi2_src_scratch += in_stride;
   2715 
   2716                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
   2717                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
   2718 
   2719                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   2720                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2721                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2722 
   2723                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2724                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2725                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2726                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2727 
   2728                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2729 
   2730                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2731                         pi2_dst_scratch += ((!i) * out_stride + 8);
   2732                     }
   2733 
   2734                     /* o2[0-3] */
   2735                     {
   2736                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2737                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2738 
   2739                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2740                         pi2_src_scratch += in_stride;
   2741 
   2742                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
   2743                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
   2744 
   2745                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   2746                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2747                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2748 
   2749                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2750                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2751                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2752                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2753 
   2754                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2755 
   2756                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2757                         pi2_dst_scratch += out_stride;
   2758                     }
   2759 
   2760                     /* o3[0-3] */
   2761                     {
   2762                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2763                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   2764 
   2765                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2766                         pi2_src_scratch += 8;
   2767 
   2768                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
   2769                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
   2770 
   2771                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
   2772                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2773                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2774 
   2775                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2776                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2777                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2778                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2779 
   2780                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2781 
   2782                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2783                         pi2_dst_scratch += (i * out_stride + 8);
   2784                     }
   2785 
   2786                     /* o4[0-3] */
   2787                     {
   2788                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2789                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2790 
   2791                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2792                         pi2_src_scratch -= in_stride;
   2793 
   2794                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
   2795                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
   2796 
   2797                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   2798                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2799                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2800 
   2801 
   2802                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2803                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2804                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2805                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2806 
   2807                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2808 
   2809                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2810                         pi2_dst_scratch += out_stride;
   2811                     }
   2812 
   2813                     /* o5[0-3] */
   2814                     {
   2815                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2816                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   2817 
   2818                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2819                         pi2_src_scratch -= in_stride;
   2820 
   2821                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
   2822                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
   2823 
   2824                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   2825                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2826                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2827 
   2828 
   2829                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2830                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2831                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2832                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2833 
   2834                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2835 
   2836                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2837                         pi2_dst_scratch += ((!i) * out_stride + 8);
   2838                     }
   2839 
   2840                     /* o6[0-3] */
   2841                     {
   2842                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2843                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2844 
   2845                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2846                         pi2_src_scratch -= in_stride;
   2847 
   2848                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   2849                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
   2850 
   2851                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2852                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2853                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2854 
   2855                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2856                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2857                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2858                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2859 
   2860                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2861 
   2862                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2863                         pi2_dst_scratch += out_stride;
   2864                     }
   2865 
   2866                     /* o7[0-3] */
   2867                     {
   2868                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2869                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   2870 
   2871                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2872                         pi2_src_scratch += 8;
   2873 
   2874                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   2875                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2876                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2877 
   2878                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2879                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2880                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2881                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2882 
   2883                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2884 
   2885                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2886                         pi2_dst_scratch += (i * out_stride + 8);
   2887                     }
   2888                 }
   2889             }
   2890         }
   2891         else
   2892         {
   2893             /* o & stage 2 pre-transposed out */
   2894             {
   2895                 WORD32 j;
   2896                 WORD16 *pi2_src_scratch = temp_array;
   2897                 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
   2898                 WORD32 out_stride = (trans_size);
   2899                 WORD32 in_stride = (8) * 4;
   2900 
   2901                 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
   2902 
   2903 
   2904                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
   2905                 pi2_src_temp += (stride);
   2906                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
   2907                 pi2_src_temp += (stride * 7);
   2908                 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
   2909                 pi2_src_temp += (stride);
   2910                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
   2911                 if(0 == i)
   2912                 {
   2913                     pi2_src_temp -= (stride * 2 - 8);
   2914                 }
   2915                 else
   2916                 {
   2917                     pi2_src_temp -= (stride * 6 - 8);
   2918                 }
   2919                 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
   2920                 pi2_src_temp -= (stride);
   2921                 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
   2922                 pi2_src_temp -= (stride * 7);
   2923                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
   2924                 pi2_src_temp -= (stride);
   2925                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
   2926 
   2927 
   2928                 for(j = 0; j < 2; j++)
   2929                 {
   2930 
   2931                     if(j) //H8B= higher 8 bytes L8B lower 8 bytes
   2932                     {
   2933                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
   2934                         m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
   2935                         m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
   2936                         m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
   2937                     }
   2938                     else
   2939                     {
   2940                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
   2941                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
   2942                         m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
   2943                         m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
   2944                     }
   2945                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
   2946                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
   2947                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
   2948                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
   2949 
   2950 
   2951                     /* o0[0-3] */
   2952                     {
   2953                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2954                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2955                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   2956                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   2957 
   2958 
   2959                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2960                         pi2_src_scratch += in_stride;
   2961 
   2962                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
   2963                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
   2964                         m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
   2965                         m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
   2966 
   2967                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2968                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   2969                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   2970                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2971                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2972 
   2973                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2974                         m_count = _mm_cvtsi32_si128(i4_shift);
   2975                         m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
   2976 
   2977                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2978                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2979                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2980                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2981 
   2982                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2983 
   2984                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2985                         pi2_dst_scratch += out_stride;
   2986                     }
   2987 
   2988                     /* o1[0-3] */
   2989                     {
   2990                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2991                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   2992                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   2993                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   2994 
   2995 
   2996                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   2997                         pi2_src_scratch += in_stride;
   2998 
   2999                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
   3000                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
   3001                         m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
   3002                         m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
   3003 
   3004                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   3005                         m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
   3006                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
   3007                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   3008                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   3009 
   3010                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3011                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3012                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3013                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3014 
   3015                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3016 
   3017                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3018                         pi2_dst_scratch += ((!i) * out_stride + 8);
   3019                     }
   3020 
   3021                     /* o2[0-3] */
   3022                     {
   3023                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3024                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3025                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3026                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3027 
   3028                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3029                         pi2_src_scratch += in_stride;
   3030 
   3031                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
   3032                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
   3033                         m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
   3034                         m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
   3035 
   3036                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   3037                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   3038                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
   3039                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3040                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3041 
   3042                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3043                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3044                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3045                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3046 
   3047                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3048 
   3049                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3050                         pi2_dst_scratch += out_stride;
   3051                     }
   3052 
   3053                     /* o3[0-3] */
   3054                     {
   3055                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   3056                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   3057                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   3058                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   3059 
   3060 
   3061                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3062                         pi2_src_scratch += 8;
   3063 
   3064                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
   3065                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
   3066                         m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
   3067                         m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
   3068 
   3069                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
   3070                         m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
   3071                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
   3072                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   3073                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   3074 
   3075                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3076                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3077                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3078                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3079 
   3080                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3081 
   3082                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3083                         pi2_dst_scratch += (i * out_stride + 8);
   3084                     }
   3085 
   3086                     /* o4[0-3] */
   3087                     {
   3088                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3089                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3090                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3091                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3092 
   3093                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3094                         pi2_src_scratch -= in_stride;
   3095 
   3096                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
   3097                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
   3098                         m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
   3099                         m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
   3100 
   3101                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   3102                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   3103                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
   3104                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3105                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3106 
   3107                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3108                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3109                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3110                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3111 
   3112                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3113 
   3114                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3115                         pi2_dst_scratch += out_stride;
   3116                     }
   3117 
   3118                     /* o5[0-3] */
   3119                     {
   3120                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   3121                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   3122                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   3123                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   3124 
   3125 
   3126                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3127                         pi2_src_scratch -= in_stride;
   3128 
   3129                         m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
   3130                         m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
   3131                         m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
   3132                         m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
   3133 
   3134                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   3135                         m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
   3136                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
   3137                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   3138                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   3139 
   3140                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3141                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3142                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3143                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3144 
   3145                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3146 
   3147                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3148                         pi2_dst_scratch += ((!i) * out_stride + 8);
   3149                     }
   3150 
   3151                     /* o6[0-3] */
   3152                     {
   3153                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3154                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3155                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3156                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3157 
   3158 
   3159                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3160                         pi2_src_scratch -= in_stride;
   3161 
   3162                         m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   3163                         m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
   3164                         m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
   3165                         m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
   3166 
   3167 
   3168                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3169                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   3170                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3171                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3172                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3173 
   3174                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3175                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3176                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3177                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3178 
   3179                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3180 
   3181                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3182                         pi2_dst_scratch += out_stride;
   3183                     }
   3184 
   3185                     /* o7[0-3] */
   3186                     {
   3187                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   3188                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   3189                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   3190                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   3191 
   3192                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   3193                         pi2_src_scratch += 8;
   3194 
   3195                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   3196                         m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
   3197                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
   3198                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   3199                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   3200 
   3201 
   3202                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3203                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3204                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3205                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3206 
   3207                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3208 
   3209                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3210                         pi2_dst_scratch += (i * out_stride + 8);
   3211                     }
   3212 
   3213                 }
   3214             }
   3215         }
   3216     }
   3217 
   3218     /* Transpose */
   3219     {
   3220         WORD16 *pi2_src_scratch;
   3221         UWORD8 *pu1_pred_temp = pu1_pred;
   3222         WORD32 out_stride = dst_strd;
   3223         WORD32 in_stride = trans_size;
   3224         WORD32 j;
   3225         m_temp_reg_1 = _mm_setzero_si128();
   3226         for(i = 0; i < 2; i++)
   3227         {
   3228             pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
   3229 
   3230             for(j = 0; j < 2; j++)
   3231             {
   3232                 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
   3233                 pi2_src_scratch += in_stride;
   3234                 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
   3235                 pi2_src_scratch += ((!i) * in_stride + 8);
   3236                 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
   3237                 pi2_src_scratch += (in_stride);
   3238                 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
   3239                 pi2_src_scratch += (i * in_stride + 8);
   3240                 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
   3241                 pi2_src_scratch += in_stride;
   3242                 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
   3243                 pi2_src_scratch += ((!i) * in_stride + 8);
   3244                 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
   3245                 pi2_src_scratch += in_stride;
   3246                 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
   3247                 pi2_src_scratch += (i * in_stride + 8);
   3248 
   3249                 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
   3250                 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
   3251 
   3252                 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
   3253                 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
   3254 
   3255                 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
   3256                 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
   3257 
   3258                 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
   3259                 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
   3260 
   3261 
   3262                 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
   3263                 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
   3264 
   3265                 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
   3266                 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
   3267 
   3268                 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
   3269                 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
   3270 
   3271                 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
   3272                 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
   3273 
   3274 
   3275                 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
   3276                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
   3277 
   3278                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
   3279                 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
   3280 
   3281                 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
   3282                 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
   3283                 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
   3284 
   3285                 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   3286                 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   3287                 pu1_dst += out_stride;
   3288                 pu1_pred_temp += pred_strd;
   3289 
   3290                 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
   3291                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
   3292 
   3293                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
   3294                 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
   3295 
   3296                 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
   3297                 m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
   3298                 m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
   3299 
   3300                 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
   3301                 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   3302                 pu1_dst += out_stride;
   3303                 pu1_pred_temp += pred_strd;
   3304 
   3305                 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
   3306                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
   3307 
   3308                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
   3309                 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
   3310 
   3311                 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0
   3312                 m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
   3313                 m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
   3314 
   3315                 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
   3316                 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   3317                 pu1_dst += out_stride;
   3318                 pu1_pred_temp += pred_strd;
   3319 
   3320                 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
   3321                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
   3322 
   3323                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
   3324                 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
   3325 
   3326                 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0
   3327                 m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
   3328                 m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
   3329 
   3330                 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
   3331                 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   3332                 pu1_dst += out_stride;
   3333                 pu1_pred_temp += pred_strd;
   3334             }
   3335         }
   3336     }
   3337 }
   3338