Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19  *******************************************************************************
     20  * @file
     21  *  ihevc_iquant_itrans_recon_atom_intr.c
     22  *
     23  * @brief
     24  *  Contains function definitions for inverse  quantization, inverse
     25  * transform and reconstruction
     26  *
     27  * @author
     28  *  100470
     29  *  100592 (edited by)
     30  *
     31  * @par List of Functions:
     32  *  - ihevc_itrans_recon_16x16_ssse3()
     33  *
     34  * @remarks
     35  *  None
     36  *
     37  *******************************************************************************
     38  */
     39 #include <stdio.h>
     40 #include <string.h>
     41 #include "ihevc_typedefs.h"
     42 #include "ihevc_macros.h"
     43 #include "ihevc_platform_macros.h"
     44 #include "ihevc_defs.h"
     45 #include "ihevc_trans_tables.h"
     46 #include "ihevc_itrans_recon.h"
     47 #include "ihevc_func_selector.h"
     48 #include "ihevc_trans_macros.h"
     49 
     50 
     51 
     52 #include <immintrin.h>
     53 #include <emmintrin.h>
     54 
     55 #include <tmmintrin.h>
     56 
     57 
     58 /**
     59  *******************************************************************************
     60  *
     61  * @brief
     62  *  This function performs inverse transform and reconstruction for a
     63  * 16x16 input block
     64  *
     65  * @par Description:
     66  *  Performs the inverse transform, adds the prediction data and
     67  * clips output to 8 bit
     68  *
     69  * @param[in] pi2_src
     70  *  Input 16x16 coefficients
     71  *
     72  * @param[in] pi2_tmp
     73  *  Temporary 16x16 buffer for storing inverse
     74  *  transform 1st stage output
     75  *
     76  * @param[in] pu1_pred
     77  *  Prediction 16x16 block
     78  *
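     79  * @param[out] pu1_dst
     80  *  Output 16x16 block
     81  *
     82  * @param[in] src_strd
     83  *  Input stride
     84  *
     85  * @param[in] pred_strd
     86  *  Prediction stride
     87  *
     88  * @param[in] dst_strd
     89  *  Output stride
     90  *
     91  * @param[in] zero_cols
     92  *  Bitmask of zero columns in pi2_src. When bits 8-15 are all set,
     93  *  stage 1 runs only for the lower 8 samples of each row. The mask
     94  *  is also used to derive the last-8 / last-12 zero-row flags for
     95  *  stage 2
     96  *
     97  * @param[in] zero_rows
     98  *  Bitmask of zero rows in pi2_src. Compared against 0xFF00 and
     99  *  0xFFF0 to select the reduced stage 1 paths taken when the last
    100  *  8 or the last 12 rows are zero; otherwise the full path is
    101  *  used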
    102  *
    103  * @returns  Void
    104  *
    105  * @remarks
    106  *  None
    107  *
    108  *******************************************************************************
    109  */
    110 
    111 void ihevc_itrans_recon_16x16_ssse3(WORD16 *pi2_src,
    112                                     WORD16 *pi2_tmp,
    113                                     UWORD8 *pu1_pred,
    114                                     UWORD8 *pu1_dst,
    115                                     WORD32 src_strd,
    116                                     WORD32 pred_strd,
    117                                     WORD32 dst_strd,
    118                                     WORD32 zero_cols,
    119                                     WORD32 zero_rows)
    120 {
    121     __m128i m_temp_reg_0;
    122     __m128i m_temp_reg_1;
    123     __m128i m_temp_reg_10;
    124     __m128i m_temp_reg_11;
    125     __m128i m_temp_reg_12;
    126     __m128i m_temp_reg_13;
    127     __m128i m_temp_reg_14;
    128 
    129     __m128i m_temp_reg_20;
    130     __m128i m_temp_reg_21;
    131     __m128i m_temp_reg_22;
    132     __m128i m_temp_reg_23;
    133     __m128i m_temp_reg_24;
    134     __m128i m_temp_reg_25;
    135     __m128i m_temp_reg_26;
    136     __m128i m_temp_reg_27;
    137     __m128i m_temp_reg_30;
    138     __m128i m_temp_reg_31;
    139     __m128i m_temp_reg_32;
    140     __m128i m_temp_reg_33;
    141     __m128i m_temp_reg_34;
    142     __m128i m_temp_reg_35;
    143     __m128i m_temp_reg_36;
    144     __m128i m_temp_reg_37;
    145     __m128i m_temp_reg_40;
    146     __m128i m_temp_reg_41;
    147     __m128i m_temp_reg_42;
    148     __m128i m_temp_reg_43;
    149     __m128i m_temp_reg_44;
    150     __m128i m_temp_reg_45;
    151     __m128i m_temp_reg_46;
    152     __m128i m_temp_reg_47;
    153 
    154     __m128i m_temp_reg_70;
    155     __m128i m_temp_reg_71;
    156     __m128i m_temp_reg_72;
    157     __m128i m_temp_reg_73;
    158     __m128i m_temp_reg_74;
    159     __m128i m_temp_reg_75;
    160     __m128i m_temp_reg_76;
    161     __m128i m_temp_reg_77;
    162     __m128i m_rdng_factor;
    163     __m128i m_count;
    164     __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
    165     __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
    166 
    167     WORD32 i;
    168 /*Lokesh*/
    169     WORD32  zero_last8_cols_stg1;
    170     WORD32  zero_last8_rows_stg1;
    171     WORD32  zero_last12_rows_stg1;
    172     WORD32  zero_last12_rows_stg2;
    173     WORD32  zero_last8_rows_stg2;
    174 
    175     WORD32  loop = 0;
    176 
    177     WORD32 i4_shift = IT_SHIFT_STAGE_1;
    178     WORD32 trans_size = TRANS_SIZE_16;
    179 
    180 
    181 
    182 
    183     /* Following 3 instructions replicates the value in the */
    184     /* lower 16 bits of m_add_iq in the entire register */
    185 
    186     /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
    187 
    188     zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
    189     zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
    190     zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
    191 
    192     zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
    193     zero_last8_rows_stg2 = zero_last8_cols_stg1;
    194     if(zero_last8_cols_stg1)
    195     {
    196         loop = 1;
    197     }
    198     else
    199         loop = 2;
    200 
    201     /* i = 0 => lower 8 samples */
    202     /* i = 1 => higher 8 samples */
    203     for(i = 0; i < loop; i++)
    204     {
    205         {
    206             WORD32 sample_half_index = i << 3;
    207             WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
    208             WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
    209 
    210             m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
    211             pi2_tmp_src += (src_strd << 1);
    212             m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
    213             pi2_tmp_src += (src_strd << 1);
    214             m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
    215             pi2_tmp_src += (src_strd << 1);
    216             m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
    217             pi2_tmp_src += (src_strd << 1);
    218             m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
    219             pi2_tmp_src += (src_strd << 1);
    220             m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
    221             pi2_tmp_src += (src_strd << 1);
    222             m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
    223             pi2_tmp_src += (src_strd << 1);
    224             m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
    225             pi2_tmp_src += (src_strd << 1);
    226 
    227 
    228 
    229 
    230             /* If last 12 rows are zero : Rishab */
    231             if(zero_last12_rows_stg1)
    232             {
    233 
    234                 /* eee */
    235                 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
    236                 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
    237                 {
    238                     /* Loading coeff and src for use in next block */
    239 
    240                     m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign
    241                     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
    242 
    243                     m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
    244 
    245                     m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
    246 
    247                     m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
    248 
    249                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
    250 
    251                     m_temp_reg_26 = m_temp_reg_24;
    252                     m_temp_reg_27 = m_temp_reg_25;
    253                 }
    254 
    255                 /* eo */
    256 
    257                 /* eo0[0-3] */
    258                 {
    259                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    260                     m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
    261 
    262                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    263 
    264                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    265 
    266                     /* e[0][0-3] stored in pi2_tmp[0][0-7] */
    267                     /* e[7][0-3] stored in pi2_tmp[0][8-15] */
    268                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
    269                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
    270 
    271                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    272                     pi2_scratch += 8;
    273                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    274                     pi2_scratch += 8;
    275 
    276                 }
    277 
    278 
    279                 /* eo0[4-7] */
    280                 {
    281                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    282 
    283                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    284 
    285                     /* e[0][4-7] stored in pi2_tmp[1][0-7] */
    286                     /* e[7][4-7] stored in pi2_tmp[1][8-15] */
    287                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
    288                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
    289 
    290                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    291                     pi2_scratch += 8;
    292                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    293                     pi2_scratch += 8;
    294 
    295                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
    296                 }
    297 
    298                 /* eo1[0-3] */
    299                 {
    300                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    301 
    302                     /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
    303 
    304                     /* e[1][0-3] stored in pi2_tmp[2][0-7] */
    305                     /* e[6][0-3] stored in pi2_tmp[2][8-15] */
    306                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
    307                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
    308 
    309                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    310                     pi2_scratch += 8;
    311                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    312                     pi2_scratch += 8;
    313                 }
    314 
    315                 /* eo1[4-7] */
    316                 {
    317                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    318 
    319                     /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
    320 
    321                     /* e[1][4-7] stored in pi2_tmp[3][0-7] */
    322                     /* e[6][4-7] stored in pi2_tmp[3][8-15] */
    323                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
    324                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
    325 
    326                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    327                     pi2_scratch += 8;
    328                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    329                     pi2_scratch += 8;
    330 
    331                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
    332 
    333                 }
    334 
    335                 /* eo2[0-3] */
    336                 {
    337                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    338 
    339                     /* e[2][0-3] stored in pi2_tmp[4][0-7] */
    340                     /* e[5][0-3] stored in pi2_tmp[4][8-15] */
    341                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
    342                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
    343 
    344                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    345                     pi2_scratch += 8;
    346                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    347                     pi2_scratch += 8;
    348 
    349                 }
    350 
    351                 /* eo2[4-7] */
    352                 {
    353                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    354 
    355                     /* e[2][4-7] stored in pi2_tmp[5][0-7] */
    356                     /* e[5][4-7] stored in pi2_tmp[5][8-15] */
    357                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
    358                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
    359 
    360 
    361                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    362                     pi2_scratch += 8;
    363                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    364                     pi2_scratch += 8;
    365 
    366                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
    367                 }
    368 
    369                 /* eo3[0-3] */
    370                 {
    371                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    372 
    373                     /* e[3][0-3] stored in pi2_tmp[6][0-7] */
    374                     /* e[4][0-3] stored in pi2_tmp[6][8-15] */
    375                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
    376                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
    377 
    378                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    379                     pi2_scratch += 8;
    380                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    381                     pi2_scratch += 8;
    382                 }
    383 
    384                 /* eo3[4-7] */
    385                 {
    386                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    387 
    388                     /* e[3][4-7] stored in pi2_tmp[7][0-7] */
    389                     /* e[4][4-7] stored in pi2_tmp[7][8-15] */
    390                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
    391                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
    392 
    393                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    394                     pi2_scratch += 8;
    395                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    396                     pi2_scratch += 8;
    397                 }
    398             }
    399             /* If last 8 rows are zero : Rishab */
    400             else if(zero_last8_rows_stg1)
    401             {
    402                 /* eeo */
    403                 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
    404                 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
    405                 {
    406                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
    407                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
    408 
    409                     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
    410                     m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
    411 
    412                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    413                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
    414 
    415                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
    416                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
    417 
    418                 }
    419 
    420                 /* eee */
    421                 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
    422                 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
    423                 {
    424                     /* Loading coeff and src for use in next block */
    425                     m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to  get signs
    426                     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
    427 
    428                     m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
    429 
    430                     m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
    431 
    432                     m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
    433 
    434                     m_temp_reg_26 = m_temp_reg_24;
    435                     m_temp_reg_27 = m_temp_reg_25;
    436 
    437                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
    438                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
    439                 }
    440 
    441                 /* eo0[0-3] */
    442                 {
    443                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    444                     m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
    445 
    446                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    447 
    448                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    449                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
    450                     m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
    451 
    452                     /* e[0][0-3] stored in pi2_tmp[0][0-7] */
    453                     /* e[7][0-3] stored in pi2_tmp[0][8-15] */
    454                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    455                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    456 
    457                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    458                     pi2_scratch += 8;
    459                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    460                     pi2_scratch += 8;
    461 
    462                 }
    463 
    464                 /* eo0[4-7] */
    465                 {
    466                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    467 
    468                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    469                     m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
    470                     m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
    471 
    472                     /* e[0][4-7] stored in pi2_tmp[1][0-7] */
    473                     /* e[7][4-7] stored in pi2_tmp[1][8-15] */
    474                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
    475                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
    476 
    477                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    478                     pi2_scratch += 8;
    479                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    480                     pi2_scratch += 8;
    481 
    482                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
    483                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
    484 
    485                 }
    486 
    487                 /* eo1[0-3] */
    488                 {
    489                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    490 
    491                     /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
    492                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
    493                     m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
    494 
    495                     /* e[1][0-3] stored in pi2_tmp[2][0-7] */
    496                     /* e[6][0-3] stored in pi2_tmp[2][8-15] */
    497                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
    498                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
    499 
    500                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    501                     pi2_scratch += 8;
    502                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    503                     pi2_scratch += 8;
    504 
    505                 }
    506 
    507                 /* eo1[4-7] */
    508                 {
    509                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    510 
    511                     /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
    512                     m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
    513                     m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
    514 
    515                     /* e[1][4-7] stored in pi2_tmp[3][0-7] */
    516                     /* e[6][4-7] stored in pi2_tmp[3][8-15] */
    517                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
    518                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
    519 
    520                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    521                     pi2_scratch += 8;
    522                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    523                     pi2_scratch += 8;
    524 
    525                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
    526                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
    527 
    528                 }
    529 
    530                 /* eo2[0-3] */
    531                 {
    532                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    533 
    534                     /* e[2][0-3] stored in pi2_tmp[4][0-7] */
    535                     /* e[5][0-3] stored in pi2_tmp[4][8-15] */
    536                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
    537                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
    538 
    539                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    540                     pi2_scratch += 8;
    541                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    542                     pi2_scratch += 8;
    543 
    544                 }
    545 
    546                 /* eo2[4-7] */
    547                 {
    548                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    549 
    550                     /* e[2][4-7] stored in pi2_tmp[5][0-7] */
    551                     /* e[5][4-7] stored in pi2_tmp[5][8-15] */
    552                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
    553                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
    554 
    555                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    556                     pi2_scratch += 8;
    557                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    558                     pi2_scratch += 8;
    559 
    560                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
    561                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
    562 
    563                 }
    564 
    565                 /* eo3[0-3] */
    566                 {
    567                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    568 
    569                     /* e[3][0-3] stored in pi2_tmp[6][0-7] */
    570                     /* e[4][0-3] stored in pi2_tmp[6][8-15] */
    571                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
    572                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
    573 
    574                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    575                     pi2_scratch += 8;
    576                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    577                     pi2_scratch += 8;
    578                 }
    579 
    580                 /* eo3[4-7] */
    581                 {
    582                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    583 
    584                     /* e[3][4-7] stored in pi2_tmp[7][0-7] */
    585                     /* e[4][4-7] stored in pi2_tmp[7][8-15] */
    586                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
    587                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
    588 
    589                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    590                     pi2_scratch += 8;
    591                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    592                     pi2_scratch += 8;
    593                 }
    594             } /* If all the rows are non-zero : Rishab */
    595             else
    596             {
    597                 /* eeo */
    598                 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
    599                 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
    600 
    601                 {
    602                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
    603                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
    604 
    605                     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
    606                     m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
    607 
    608                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
    609                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
    610 
    611                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
    612                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
    613                 }
    614 
    615                 /* eee */
    616                 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
    617                 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
    618                 {
    619                     /* Loading coeff and src for use in next block */
    620                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
    621                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
    622 
    623                     m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
    624                     m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
    625 
    626                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
    627                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
    628 
    629                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
    630                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
    631 
    632                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
    633                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
    634 
    635                 }
    636                 /* eo0[0-3] */
    637                 {
    638                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
    639                     m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
    640                     m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
    641                     m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
    642 
    643                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    644                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
    645 
    646 
    647                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    648                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
    649                     m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
    650 
    651                     /* e[0][0-3] stored in pi2_tmp[0][0-7] */
    652                     /* e[7][0-3] stored in pi2_tmp[0][8-15] */
    653                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
    654                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
    655                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
    656                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
    657 
    658                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    659                     pi2_scratch += 8;
    660                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    661                     pi2_scratch += 8;
    662 
    663 
    664                 }
    665 
    666                 /* eo0[4-7] */
    667                 {
    668                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    669                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
    670 
    671                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
    672                     m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
    673                     m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
    674 
    675                     /* e[0][4-7] stored in pi2_tmp[1][0-7] */
    676                     /* e[7][4-7] stored in pi2_tmp[1][8-15] */
    677                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
    678                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
    679                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
    680                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
    681 
    682                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    683                     pi2_scratch += 8;
    684                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    685                     pi2_scratch += 8;
    686 
    687                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
    688                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
    689 
    690                 }
    691 
    692                 /* eo1[0-3] */
    693                 {
    694                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    695                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
    696 
     697                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
    698                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
    699                     m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
    700 
    701                     /* e[1][0-3] stored in pi2_tmp[2][0-7] */
    702                     /* e[6][0-3] stored in pi2_tmp[2][8-15] */
    703                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
    704                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
    705                     m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
    706                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
    707 
    708                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    709                     pi2_scratch += 8;
    710                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    711                     pi2_scratch += 8;
    712 
    713                 }
    714 
    715                 /* eo1[4-7] */
    716                 {
    717                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    718                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
    719 
     720                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
    721                     m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
    722                     m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
    723 
    724                     /* e[1][4-7] stored in pi2_tmp[3][0-7] */
    725                     /* e[6][4-7] stored in pi2_tmp[3][8-15] */
    726                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
    727                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
    728                     m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
    729                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
    730 
    731                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    732                     pi2_scratch += 8;
    733                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    734                     pi2_scratch += 8;
    735                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
    736                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
    737                 }
    738 
    739                 /* eo2[0-3] */
    740                 {
    741                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    742                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
    743 
    744                     /* e[2][0-3] stored in pi2_tmp[4][0-7] */
    745                     /* e[5][0-3] stored in pi2_tmp[4][8-15] */
    746                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
    747                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
    748                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
    749                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
    750 
    751                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    752                     pi2_scratch += 8;
    753                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    754                     pi2_scratch += 8;
    755                 }
    756 
    757                 /* eo2[4-7] */
    758                 {
    759                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
    760                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
    761 
    762                     /* e[2][4-7] stored in pi2_tmp[5][0-7] */
    763                     /* e[5][4-7] stored in pi2_tmp[5][8-15] */
    764                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
    765                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
    766                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
    767                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
    768 
    769                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    770                     pi2_scratch += 8;
    771                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    772                     pi2_scratch += 8;
    773 
    774                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
    775                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
    776 
    777                 }
    778 
    779                 /* eo3[0-3] */
    780                 {
    781                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
    782                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
    783 
    784                     /* e[3][0-3] stored in pi2_tmp[6][0-7] */
    785                     /* e[4][0-3] stored in pi2_tmp[6][8-15] */
    786                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
    787                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
    788                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
    789                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
    790 
    791 
    792                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    793                     pi2_scratch += 8;
    794                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    795                     pi2_scratch += 8;
    796                 }
    797 
    798                 /* eo3[4-7] */
    799                 {
    800                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
    801                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
    802 
    803                     /* e[3][4-7] stored in pi2_tmp[7][0-7] */
    804                     /* e[4][4-7] stored in pi2_tmp[7][8-15] */
    805                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
    806                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
    807                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
    808                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
    809 
    810                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
    811                     pi2_scratch += 8;
    812                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
    813                     pi2_scratch += 8;
    814                 }
    815 
    816             }
    817         }
    818 
    819         {
    820             WORD32 sample_half_index = i << 3;
    821             WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
    822 
    823             m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
    824             pi2_tmp_src += (src_strd << 1);
    825             m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
    826             pi2_tmp_src += (src_strd << 1);
    827             m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
    828             pi2_tmp_src += (src_strd << 1);
    829             m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
    830             pi2_tmp_src += (src_strd << 1);
    831             m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
    832             pi2_tmp_src += (src_strd << 1);
    833             m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
    834             pi2_tmp_src += (src_strd << 1);
    835             m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
    836             pi2_tmp_src += (src_strd << 1);
    837             m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
    838             pi2_tmp_src += (src_strd << 1);
    839         }
    840 
    841         /* o & stage 1 out */
    842         {
    843             WORD32 j;
    844             WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
    845             WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
    846             WORD32 out_stride = (trans_size << 1);
    847             WORD32 in_stride = trans_size << 1;
    848 
    849             if(zero_last12_rows_stg1)
    850             {
    851                 for(j = 0; j < 2; j++)
    852                 {
    853                     if(j) //H8B= higher 8 bytes L8B lower 8 bytes
    854                     {
    855                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
    856                     }
    857                     else
    858                     {
    859                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
    860                     }
    861                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
    862 
    863 
    864                     /* o0[0-3] */
    865                     {
    866                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    867 
    868 
    869                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
    870                         pi2_src_scratch += in_stride;
    871 
    872                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
    873 
    874 
    875                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
    876                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
    877 
    878                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
    879                         m_count = _mm_cvtsi32_si128(i4_shift);
    880                         m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
    881 
    882                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    883                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    884                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    885                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    886 
    887                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
    888 
    889                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
    890                         pi2_dst_scratch += out_stride;
    891                     }
    892 
    893                     /* o1[0-3] */
    894                     {
    895                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
    896 
    897 
    898                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
    899                         pi2_src_scratch += in_stride;
    900 
    901                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
    902 
    903                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
    904                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
    905 
    906                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    907                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    908                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    909                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    910 
    911                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
    912 
    913                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
    914                         pi2_dst_scratch += out_stride;
    915                     }
    916 
    917                     /* o2[0-3] */
    918                     {
    919                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    920 
    921                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
    922                         pi2_src_scratch += in_stride;
    923 
    924                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
    925 
    926                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
    927                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
    928 
    929                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    930                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    931                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    932                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    933 
    934                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
    935 
    936                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
    937                         pi2_dst_scratch += out_stride;
    938                     }
    939 
    940                     /* o3[0-3] */
    941                     {
    942                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
    943 
    944                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
    945                         pi2_src_scratch += 8;
    946 
    947                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
    948 
    949                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
    950                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
    951 
    952                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    953                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    954                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    955                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    956 
    957                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
    958 
    959                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
    960                         pi2_dst_scratch += 8;
    961                     }
    962 
    963                     /* o4[0-3] */
    964                     {
    965                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
    966 
    967                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
    968                         pi2_src_scratch -= in_stride;
    969 
    970                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
    971 
    972                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
    973                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
    974 
    975                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
    976                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
    977                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
    978                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
    979 
    980                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
    981 
    982                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
    983                         pi2_dst_scratch -= out_stride;
    984                     }
    985 
    986                     /* o5[0-3] */
    987                     {
    988                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
    989 
    990 
    991                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
    992                         pi2_src_scratch -= in_stride;
    993 
    994                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
    995 
    996                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
    997                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
    998 
    999                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1000                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1001                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1002                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1003 
   1004                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1005 
   1006                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1007                         pi2_dst_scratch -= out_stride;
   1008                     }
   1009 
   1010                     /* o6[0-3] */
   1011                     {
   1012                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1013 
   1014                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1015                         pi2_src_scratch -= in_stride;
   1016 
   1017                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   1018 
   1019                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1020                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1021 
   1022                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1023                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1024                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1025                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1026 
   1027                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1028 
   1029                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1030                         pi2_dst_scratch -= out_stride;
   1031                     }
   1032 
   1033                     /* o7[0-3] */
   1034                     {
   1035                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1036 
   1037                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1038                         pi2_src_scratch += 8;
   1039 
   1040                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1041                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1042 
   1043                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1044                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1045                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1046                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1047 
   1048                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1049 
   1050                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1051                         pi2_dst_scratch += 8;
   1052                     }
   1053                 }
   1054             }
   1055             else if(zero_last8_rows_stg1)
   1056             {
   1057                 for(j = 0; j < 2; j++)
   1058                 {
   1059                     if(j)
   1060                     {
   1061                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
   1062                         m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
   1063                     }
   1064                     else
   1065                     {
   1066                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
   1067                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
   1068                     }
   1069                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
   1070                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
   1071 
   1072                     /* o0[0-3] */
   1073                     {
   1074                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1075                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1076 
   1077                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1078                         pi2_src_scratch += in_stride;
   1079 
   1080                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
   1081                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
   1082 
   1083                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   1084                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1085                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1086 
   1087 
   1088                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1089                         m_count = _mm_cvtsi32_si128(i4_shift);
   1090 
   1091                         m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
   1092 
   1093                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1094                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1095                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1096                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1097 
   1098                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1099 
   1100                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1101                         pi2_dst_scratch += out_stride;
   1102                     }
   1103 
   1104                     /* o1[0-3] */
   1105                     {
   1106                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1107                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1108 
   1109                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1110                         pi2_src_scratch += in_stride;
   1111 
   1112                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
   1113                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
   1114 
   1115                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1116                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1117                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1118 
   1119                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1120                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1121                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1122                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1123 
   1124                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1125 
   1126                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1127                         pi2_dst_scratch += out_stride;
   1128                     }
   1129 
   1130                     /* o2[0-3] */
   1131                     {
   1132                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1133                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1134 
   1135                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1136                         pi2_src_scratch += in_stride;
   1137 
   1138                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
   1139                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
   1140 
   1141                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   1142                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1143                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1144 
   1145                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1146                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1147                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1148                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1149 
   1150                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1151 
   1152                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1153                         pi2_dst_scratch += out_stride;
   1154                     }
   1155 
   1156                     /* o3[0-3] */
   1157                     {
   1158                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1159                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1160 
   1161                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1162                         pi2_src_scratch += 8;
   1163 
   1164                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
   1165                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
   1166 
   1167                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
   1168                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1169                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1170 
   1171                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1172                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1173                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1174                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1175 
   1176                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1177 
   1178                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1179                         pi2_dst_scratch += 8;
   1180                     }
   1181 
   1182                     /* o4[0-3] */
   1183                     {
   1184                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1185                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1186 
   1187                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1188                         pi2_src_scratch -= in_stride;
   1189 
   1190                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
   1191                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
   1192 
   1193                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   1194                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1195                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1196 
   1197 
   1198                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1199                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1200                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1201                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1202 
   1203                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1204 
   1205                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1206                         pi2_dst_scratch -= out_stride;
   1207                     }
   1208 
   1209                     /* o5[0-3] */
   1210                     {
   1211                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1212                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1213 
   1214                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1215                         pi2_src_scratch -= in_stride;
   1216 
   1217                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
   1218                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
   1219 
   1220                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1221                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1222                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1223 
   1224 
   1225                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1226                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1227                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1228                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1229 
   1230                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1231 
   1232                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1233                         pi2_dst_scratch -= out_stride;
   1234                     }
   1235 
   1236                     /* o6[0-3] */
   1237                     {
   1238                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1239                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1240 
   1241                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1242                         pi2_src_scratch -= in_stride;
   1243 
   1244                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   1245                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
   1246 
   1247                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   1248                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1249                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1250 
   1251                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1252                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1253                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1254                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1255 
   1256                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1257 
   1258                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1259                         pi2_dst_scratch -= out_stride;
   1260                     }
   1261 
   1262                     /* o7[0-3] */
   1263                     {
   1264                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1265                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1266 
   1267                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1268                         pi2_src_scratch += 8;
   1269 
   1270                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1271                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1272                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1273 
   1274                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1275                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1276                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1277                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1278 
   1279                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1280 
   1281                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1282                         pi2_dst_scratch += 8;
   1283                     }
   1284                 }
   1285 
   1286             }
   1287             else
   1288             {
   1289 
   1290 
   1291 
   1292                 for(j = 0; j < 2; j++)
   1293                 {
   1294                     if(j) //H8B= higher 8 bytes L8B lower 8 bytes
   1295                     {
   1296                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
   1297                         m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
   1298                         m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
   1299                         m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
   1300                     }
   1301                     else
   1302                     {
   1303                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
   1304                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
   1305                         m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
   1306                         m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
   1307                     }
   1308                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
   1309                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
   1310                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
   1311                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
   1312 
   1313 
   1314                     /* o0[0-3] */
   1315                     {
   1316                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1317                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1318                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1319                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1320 
   1321 
   1322                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1323                         pi2_src_scratch += in_stride;
   1324 
   1325                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
   1326                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
   1327                         m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
   1328                         m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
   1329 
   1330                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   1331                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   1332                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   1333                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1334                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1335 
   1336                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   1337                         m_count = _mm_cvtsi32_si128(i4_shift);
   1338                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
   1339                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
   1340 
   1341                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1342                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1343                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1344                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1345 
   1346                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1347 
   1348                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1349                         pi2_dst_scratch += out_stride;
   1350                     }
   1351 
   1352                     /* o1[0-3] */
   1353                     {
   1354                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1355                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1356                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   1357                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   1358 
   1359 
   1360                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1361                         pi2_src_scratch += in_stride;
   1362 
   1363                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
   1364                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
   1365                         m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
   1366                         m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
   1367 
   1368                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1369                         m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
   1370                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
   1371                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1372                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1373 
   1374                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1375                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1376                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1377                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1378 
   1379                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1380 
   1381                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1382                         pi2_dst_scratch += out_stride;
   1383                     }
   1384 
   1385                     /* o2[0-3] */
   1386                     {
   1387                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1388                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1389                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1390                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1391 
   1392                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1393                         pi2_src_scratch += in_stride;
   1394 
   1395                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
   1396                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
   1397                         m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
   1398                         m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
   1399 
   1400                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   1401                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   1402                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
   1403                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1404                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1405 
   1406                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1407                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1408                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1409                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1410 
   1411                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1412 
   1413                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1414                         pi2_dst_scratch += out_stride;
   1415                     }
   1416 
   1417                     /* o3[0-3] */
   1418                     {
   1419                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1420                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1421                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   1422                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   1423 
   1424 
   1425                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   1426                         pi2_src_scratch += 8;
   1427 
   1428                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
   1429                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
   1430                         m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
   1431                         m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
   1432 
   1433                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
   1434                         m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
   1435                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
   1436                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1437                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1438 
   1439                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1440                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1441                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1442                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1443 
   1444                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1445 
   1446                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1447                         pi2_dst_scratch += 8;
   1448                     }
   1449 
   1450                     /* o4[0-3] */
   1451                     {
   1452                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1453                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1454                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1455                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1456 
   1457                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1458                         pi2_src_scratch -= in_stride;
   1459 
   1460                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
   1461                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
   1462                         m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
   1463                         m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
   1464 
   1465                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   1466                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   1467                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
   1468                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1469                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1470 
   1471                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1472                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1473                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1474                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1475 
   1476                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1477 
   1478                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1479                         pi2_dst_scratch -= out_stride;
   1480                     }
   1481 
   1482                     /* o5[0-3] */
   1483                     {
   1484                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1485                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1486                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   1487                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   1488 
   1489 
   1490                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1491                         pi2_src_scratch -= in_stride;
   1492 
   1493                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
   1494                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
   1495                         m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
   1496                         m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
   1497 
   1498                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1499                         m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
   1500                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
   1501                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1502                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1503 
   1504                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1505                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1506                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1507                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1508 
   1509                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1510 
   1511                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1512                         pi2_dst_scratch -= out_stride;
   1513                     }
   1514 
   1515                     /* o6[0-3] */
   1516                     {
   1517                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1518                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   1519                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   1520                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   1521 
   1522 
   1523                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1524                         pi2_src_scratch -= in_stride;
   1525 
   1526                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   1527                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
   1528                         m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
   1529                         m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
   1530 
   1531 
   1532                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   1533                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   1534                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   1535                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   1536                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   1537 
   1538                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1539                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1540                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1541                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1542 
   1543                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1544 
   1545                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1546                         pi2_dst_scratch -= out_stride;
   1547                     }
   1548 
   1549                     /* o7[0-3] */
   1550                     {
   1551                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   1552                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   1553                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   1554                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   1555 
   1556                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
   1557                         pi2_src_scratch += 8;
   1558 
   1559                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   1560                         m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
   1561                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
   1562                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   1563                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   1564 
   1565 
   1566                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   1567                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   1568                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   1569                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   1570 
   1571                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   1572 
   1573                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   1574                         pi2_dst_scratch += 8;
   1575                     }
   1576                 }
   1577             }
   1578         }
   1579 
   1580         /* Transpose */
   1581         {
   1582             WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
   1583             WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp);
   1584             WORD32 out_stride = (trans_size << 1);
   1585             WORD32 in_stride = (trans_size << 1);
   1586             WORD32 j;
   1587 
   1588             for(j = 0; j < 2; j++)
   1589             {
   1590                 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
   1591                 pi2_src_scratch += in_stride;
   1592                 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
   1593                 pi2_src_scratch += in_stride;
   1594                 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
   1595                 pi2_src_scratch += in_stride;
   1596                 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
   1597                 pi2_src_scratch += 8;
   1598                 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
   1599                 pi2_src_scratch -= in_stride;
   1600                 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
   1601                 pi2_src_scratch -= in_stride;
   1602                 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
   1603                 pi2_src_scratch -= in_stride;
   1604                 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
   1605                 pi2_src_scratch += 8;
   1606 
   1607                 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
   1608                 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
   1609 
   1610                 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
   1611                 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
   1612 
   1613                 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
   1614                 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
   1615 
   1616                 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
   1617                 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
   1618 
   1619 
   1620                 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
   1621                 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
   1622 
   1623                 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
   1624                 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
   1625 
   1626                 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
   1627                 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
   1628 
   1629                 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
   1630                 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
   1631 
   1632 
   1633                 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
   1634                 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
   1635 
   1636                 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
   1637                 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
   1638 
   1639                 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
   1640                 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
   1641 
   1642                 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
   1643                 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
   1644 
   1645                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1646                 pi2_dst_scratch += out_stride;
   1647                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
   1648                 pi2_dst_scratch += out_stride;
   1649                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
   1650                 pi2_dst_scratch += out_stride;
   1651                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
   1652                 pi2_dst_scratch += 8;
   1653                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
   1654                 pi2_dst_scratch -= out_stride;
   1655                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
   1656                 pi2_dst_scratch -= out_stride;
   1657                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
   1658                 pi2_dst_scratch -= out_stride;
   1659                 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
   1660                 pi2_dst_scratch += 8;
   1661             }
   1662         }
   1663     }
   1664 
   1665     if(zero_last8_cols_stg1)
   1666     {
   1667         WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
   1668         WORD32 out_stride = (trans_size << 1);
   1669         WORD32 j;
   1670 
   1671         m_temp_reg_40 = _mm_setzero_si128();
   1672         for(j = 0; j < 2; j++)
   1673         {
   1674             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1675             pi2_dst_scratch += out_stride;
   1676             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1677             pi2_dst_scratch += out_stride;
   1678             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1679             pi2_dst_scratch += out_stride;
   1680             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1681             pi2_dst_scratch += 8;
   1682             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1683             pi2_dst_scratch -= out_stride;
   1684             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1685             pi2_dst_scratch -= out_stride;
   1686             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1687             pi2_dst_scratch -= out_stride;
   1688             _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
   1689             pi2_dst_scratch += 8;
   1690         }
   1691     }
   1692 
   1693 
   1694 
   1695 
   1696     /* Stage 2 */
   1697     for(i = 0; i < 2; i++)
   1698     {
   1699         WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp);
   1700         WORD32 stride = (trans_size);
   1701         MEM_ALIGN16 WORD16 temp_array[256];
   1702 
   1703         i4_shift = IT_SHIFT_STAGE_2;
   1704 
   1705         if(zero_last12_rows_stg2)
   1706         {
   1707             /* eeo */
   1708             /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
   1709             /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
   1710             {
   1711                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
   1712 
   1713                 pi2_src_temp += (stride * 9);
   1714 
   1715                 if(!i)
   1716                 {
   1717                     pi2_src_temp += (stride * 6 + 8);
   1718                 }
   1719                 else
   1720                 {
   1721                     pi2_src_temp += (stride * 2 + 8);
   1722                 }
   1723 
   1724                 pi2_src_temp -= (stride * 9);
   1725 
   1726                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
   1727 
   1728                 m_temp_reg_20 = _mm_setzero_si128();
   1729                 m_temp_reg_22 = _mm_setzero_si128();
   1730 
   1731                 m_temp_reg_21 = _mm_setzero_si128();
   1732                 m_temp_reg_23 = _mm_setzero_si128();
   1733             }
   1734 
   1735             /* eee */
   1736             /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
   1737             /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
   1738             {
   1739                 /* Loading coeff and src for use in next block */
   1740 
   1741                 /* Loading coeff and src for use in next block */
   1742                 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70);
   1743 
   1744                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
   1745 
   1746                 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
   1747 
   1748                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
   1749                 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
   1750 
   1751                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
   1752                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
   1753 
   1754                 m_temp_reg_26 = m_temp_reg_24;
   1755                 m_temp_reg_27 = m_temp_reg_25;
   1756                 /*  */
   1757 
   1758                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
   1759                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
   1760             }
   1761 
   1762             /* eo */
   1763             {
   1764                 WORD16 *pi2_scratch = temp_array;
   1765                 WORD32 out_stride = 8;
   1766 
   1767 
   1768                 /* eo0[0-3] */
   1769                 {
   1770                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1771 
   1772                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   1773 
   1774                     /* e[0][0-3] stored in pu1_dst[0] */
   1775                     /* e[7][0-3] stored in pu1_dst[1] */
   1776                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
   1777                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
   1778 
   1779                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1780                     pi2_scratch += out_stride;
   1781                     _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
   1782                     pi2_scratch += out_stride;
   1783                 }
   1784 
   1785                 /* eo0[4-7] */
   1786                 {
   1787                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1788 
   1789                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   1790 
   1791                     /* e[0][4-7] stored in pu1_dst[2] */
   1792                     /* e[7][4-7] stored in pu1_dst[3] */
   1793 
   1794                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
   1795                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
   1796 
   1797                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1798                     pi2_scratch += out_stride;
   1799                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1800                     pi2_scratch += out_stride;
   1801 
   1802                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
   1803                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
   1804 
   1805                 }
   1806 
   1807                 /* eo1[0-3] */
   1808                 {
   1809                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   1810 
   1811                     /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
   1812 
   1813                     /* e[1][0-3] stored in pu1_dst[4] */
   1814                     /* e[6][0-3] stored in pu1_dst[5] */
   1815                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
   1816                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
   1817 
   1818                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1819                     pi2_scratch += out_stride;
   1820                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1821                     pi2_scratch += out_stride;
   1822                 }
   1823 
   1824                 /* eo1[4-7] */
   1825                 {
   1826                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   1827 
   1828                     /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
   1829 
   1830                     /* e[1][4-7] stored in pu1_dst[6]*/
   1831                     /* e[6][4-7] stored in pu1_dst[7] */
   1832                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
   1833                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
   1834 
   1835                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1836                     pi2_scratch += out_stride;
   1837                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1838                     pi2_scratch += out_stride;
   1839 
   1840                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
   1841 
   1842                 }
   1843 
   1844                 /* eo2[0-3] */
   1845                 {
   1846                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1847 
   1848                     /* e[2][0-3] stored in pu1_dst[8]*/
   1849                     /* e[5][0-3] stored in pu1_dst[9] */
   1850                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
   1851                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
   1852 
   1853                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1854                     pi2_scratch += out_stride;
   1855                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1856                     pi2_scratch += out_stride;
   1857                 }
   1858 
   1859                 /* eo2[4-7] */
   1860                 {
   1861                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   1862 
   1863                     /* e[2][4-7] stored in pu1_dst[10]*/
   1864                     /* e[5][4-7] stored in pu1_dst[11] */
   1865                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
   1866                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
   1867 
   1868                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1869                     pi2_scratch += out_stride;
   1870                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1871                     pi2_scratch += out_stride;
   1872 
   1873                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
   1874                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
   1875 
   1876                 }
   1877 
   1878                 /* eo3[0-3] */
   1879                 {
   1880                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   1881 
   1882                     /* e[3][0-3] stored in pu1_dst[12]*/
   1883                     /* e[4][0-3] stored in pu1_dst[13] */
   1884                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
   1885                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
   1886 
   1887                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1888                     pi2_scratch += out_stride;
   1889                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1890                     pi2_scratch += out_stride;
   1891                 }
   1892 
   1893                 /* eo3[4-7] */
   1894                 {
   1895                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   1896 
   1897                     /* e[3][4-7] stored in pu1_dst[14]*/
   1898                     /* e[4][4-7] stored in pu1_dst[15] */
   1899                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
   1900                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
   1901 
   1902                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   1903                     pi2_scratch += out_stride;
   1904                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   1905                     pi2_scratch += out_stride;
   1906                 }
   1907 
   1908             }
   1909         }
   1910         else if(zero_last8_rows_stg2)
   1911         {
   1912             /* eeo */
   1913             /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
   1914             /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
   1915             {
   1916 
   1917                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83
   1918                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36
   1919 
   1920                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
   1921                 pi2_src_temp += (stride);
   1922                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
   1923                 pi2_src_temp += (stride * 8);
   1924 
   1925                 if(!i)
   1926                 {
   1927                     pi2_src_temp += (stride * 6 + 8);
   1928                 }
   1929                 else
   1930                 {
   1931                     pi2_src_temp += (stride * 2 + 8);
   1932                 }
   1933 
   1934                 pi2_src_temp -= (stride * 8);
   1935                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
   1936                 pi2_src_temp -= (stride);
   1937                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
   1938 
   1939 
   1940                 m_temp_reg_76 = _mm_setzero_si128();
   1941 
   1942 
   1943                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
   1944                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
   1945 
   1946                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
   1947                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
   1948 
   1949                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   1950                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   1951 
   1952                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   1953                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   1954             }
   1955 
   1956             /* eee */
   1957             /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
   1958             /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
   1959             {
   1960                 /* Loading coeff and src for use in next block */
   1961 
   1962 
   1963                 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
   1964 
   1965                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
   1966 
   1967                 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
   1968                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
   1969                 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
   1970 
   1971                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
   1972 
   1973                 m_temp_reg_26 = m_temp_reg_24;
   1974                 m_temp_reg_27 = m_temp_reg_25;
   1975 
   1976                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   1977                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
   1978             }
   1979 
   1980             /* eo */
   1981             {
   1982                 WORD16 *pi2_scratch = temp_array;
   1983                 WORD32 out_stride = 8;
   1984 
   1985 
   1986                 /* eo0[0-3] */
   1987                 {
   1988                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   1989 
   1990                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   1991                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
   1992                     m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
   1993 
   1994                     /* e[0][0-3] stored in pu1_dst[0] */
   1995                     /* e[7][0-3] stored in pu1_dst[1] */
   1996                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   1997                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   1998 
   1999                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2000                     pi2_scratch += out_stride;
   2001                     _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
   2002                     pi2_scratch += out_stride;
   2003                 }
   2004 
   2005                 /* eo0[4-7] */
   2006                 {
   2007                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   2008 
   2009                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   2010                     m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
   2011                     m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
   2012 
   2013                     /* e[0][4-7] stored in pu1_dst[2] */
   2014                     /* e[7][4-7] stored in pu1_dst[3] */
   2015 
   2016                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   2017                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   2018 
   2019                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2020                     pi2_scratch += out_stride;
   2021                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2022                     pi2_scratch += out_stride;
   2023 
   2024                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
   2025                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
   2026 
   2027                 }
   2028 
   2029                 /* eo1[0-3] */
   2030                 {
   2031                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   2032 
   2033                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
   2034                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
   2035                     m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
   2036 
   2037                     /* e[1][0-3] stored in pu1_dst[4] */
   2038                     /* e[6][0-3] stored in pu1_dst[5] */
   2039                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
   2040                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
   2041 
   2042                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2043                     pi2_scratch += out_stride;
   2044                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2045                     pi2_scratch += out_stride;
   2046                 }
   2047 
   2048                 /* eo1[4-7] */
   2049                 {
   2050                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   2051 
   2052                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
   2053                     m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
   2054                     m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
   2055 
   2056                     /* e[1][4-7] stored in pu1_dst[6]*/
   2057                     /* e[6][4-7] stored in pu1_dst[7] */
   2058                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
   2059                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
   2060 
   2061                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2062                     pi2_scratch += out_stride;
   2063                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2064                     pi2_scratch += out_stride;
   2065 
   2066                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
   2067 
   2068                 }
   2069 
   2070                 /* eo2[0-3] */
   2071                 {
   2072                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2073 
   2074                     /* e[2][0-3] stored in pu1_dst[8]*/
   2075                     /* e[5][0-3] stored in pu1_dst[9] */
   2076                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
   2077                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
   2078 
   2079                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2080                     pi2_scratch += out_stride;
   2081                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2082                     pi2_scratch += out_stride;
   2083                 }
   2084 
   2085                 /* eo2[4-7] */
   2086                 {
   2087                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
   2088 
   2089                     /* e[2][4-7] stored in pu1_dst[10]*/
   2090                     /* e[5][4-7] stored in pu1_dst[11] */
   2091                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
   2092                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
   2093 
   2094                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2095                     pi2_scratch += out_stride;
   2096                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2097                     pi2_scratch += out_stride;
   2098 
   2099                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
   2100                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
   2101 
   2102                 }
   2103 
   2104                 /* eo3[0-3] */
   2105                 {
   2106                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   2107 
   2108                     /* e[3][0-3] stored in pu1_dst[12]*/
   2109                     /* e[4][0-3] stored in pu1_dst[13] */
   2110                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
   2111                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
   2112 
   2113                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2114                     pi2_scratch += out_stride;
   2115                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2116                     pi2_scratch += out_stride;
   2117                 }
   2118 
   2119                 /* eo3[4-7] */
   2120                 {
   2121                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
   2122 
   2123                     /* e[3][4-7] stored in pu1_dst[14]*/
   2124                     /* e[4][4-7] stored in pu1_dst[15] */
   2125                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
   2126                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
   2127 
   2128                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2129                     pi2_scratch += out_stride;
   2130                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2131                     pi2_scratch += out_stride;
   2132                 }
   2133             }
   2134         }
   2135 
   2136         else
   2137         {
   2138             /* eeo */
   2139             /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
   2140             /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
   2141             {
   2142 
   2143 
   2144                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
   2145                 pi2_src_temp += (stride);
   2146                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
   2147                 pi2_src_temp += (stride * 7);
   2148                 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
   2149                 pi2_src_temp += (stride);
   2150                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
   2151                 if(!i)
   2152                 {
   2153                     pi2_src_temp += (stride * 6 + 8);
   2154                 }
   2155                 else
   2156                 {
   2157                     pi2_src_temp += (stride * 2 + 8);
   2158                 }
   2159                 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
   2160                 pi2_src_temp -= (stride);
   2161                 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
   2162                 pi2_src_temp -= (stride * 7);
   2163                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
   2164                 pi2_src_temp -= (stride);
   2165                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
   2166 
   2167                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
   2168                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
   2169 
   2170                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
   2171                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
   2172 
   2173                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
   2174                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
   2175 
   2176                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
   2177                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
   2178 
   2179 
   2180             }
   2181 
   2182             /* eee */
   2183             /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
   2184             /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
   2185             {
   2186                 /* Loading coeff and src for use in next block */
   2187                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
   2188                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
   2189 
   2190                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
   2191                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
   2192 
   2193                 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
   2194                 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
   2195 
   2196                 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
   2197                 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
   2198 
   2199                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
   2200                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
   2201 
   2202             }
   2203 
   2204             /* eo */
   2205             {
   2206                 WORD16 *pi2_scratch = temp_array;
   2207                 WORD32 out_stride = 8;
   2208 
   2209 
   2210 
   2211                 /* eo0[0-3] */
   2212                 {
   2213                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
   2214                     m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
   2215                     m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
   2216                     m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
   2217 
   2218 
   2219                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2220                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
   2221 
   2222 
   2223                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   2224                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
   2225                     m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
   2226 
   2227 
   2228                     /* e[0][0-3] stored in pi2_tmp[0][0-7] */
   2229                     /* e[7][0-3] stored in pi2_tmp[0][8-15] */
   2230                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
   2231                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
   2232                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
   2233                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
   2234 
   2235                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2236                     pi2_scratch += out_stride;
   2237                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2238                     pi2_scratch += out_stride;
   2239 
   2240 
   2241                 }
   2242 
   2243                 /* eo0[4-7] */
   2244                 {
   2245 
   2246                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
   2247                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
   2248 
   2249                     /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
   2250                     m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
   2251                     m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
   2252 
   2253                     /* e[0][4-7] stored in pi2_tmp[1][0-7] */
   2254                     /* e[7][4-7] stored in pi2_tmp[1][8-15] */
   2255                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
   2256                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
   2257                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
   2258                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
   2259 
   2260                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2261                     pi2_scratch += out_stride;
   2262                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2263                     pi2_scratch += out_stride;
   2264 
   2265                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
   2266                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
   2267 
   2268                 }
   2269 
   2270                 /* eo1[0-3] */
   2271                 {
   2272                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   2273                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
   2274 
   2275                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
   2276                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
   2277                     m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
   2278 
   2279                     /* e[1][0-3] stored in pi2_tmp[2][0-7] */
   2280                     /* e[6][0-3] stored in pi2_tmp[2][8-15] */
   2281                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
   2282                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
   2283                     m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
   2284                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
   2285 
   2286                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2287                     pi2_scratch += out_stride;
   2288                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2289                     pi2_scratch += out_stride;
   2290 
   2291                 }
   2292 
   2293                 /* eo1[4-7] */
   2294                 {
   2295                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
   2296                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   2297 
   2298                     /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
   2299                     m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
   2300                     m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
   2301 
   2302                     /* e[1][4-7] stored in pi2_tmp[3][0-7] */
   2303                     /* e[6][4-7] stored in pi2_tmp[3][8-15] */
   2304                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
   2305                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
   2306                     m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
   2307                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
   2308 
   2309                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2310                     pi2_scratch += out_stride;
   2311                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2312                     pi2_scratch += out_stride;
   2313                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
   2314                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
   2315                 }
   2316 
   2317                 /* eo2[0-3] */
   2318                 {
   2319                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2320                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
   2321 
   2322                     /* e[2][0-3] stored in pi2_tmp[4][0-7] */
   2323                     /* e[5][0-3] stored in pi2_tmp[4][8-15] */
   2324                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
   2325                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
   2326                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
   2327                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
   2328 
   2329                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2330                     pi2_scratch += out_stride;
   2331                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2332                     pi2_scratch += out_stride;
   2333                 }
   2334 
   2335                 /* eo2[4-7] */
   2336                 {
   2337                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
   2338                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
   2339 
   2340                     /* e[2][4-7] stored in pi2_tmp[5][0-7] */
   2341                     /* e[5][4-7] stored in pi2_tmp[5][8-15] */
   2342                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
   2343                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
   2344                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
   2345                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
   2346 
   2347                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2348                     pi2_scratch += out_stride;
   2349                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2350                     pi2_scratch += out_stride;
   2351 
   2352                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
   2353                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
   2354 
   2355                 }
   2356 
   2357                 /* eo3[0-3] */
   2358                 {
   2359                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
   2360                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
   2361 
   2362                     /* e[3][0-3] stored in pi2_tmp[6][0-7] */
   2363                     /* e[4][0-3] stored in pi2_tmp[6][8-15] */
   2364                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
   2365                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
   2366                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
   2367                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
   2368 
   2369 
   2370                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2371                     pi2_scratch += out_stride;
   2372                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2373                     pi2_scratch += out_stride;
   2374                 }
   2375 
   2376                 /* eo3[4-7] */
   2377                 {
   2378                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
   2379                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   2380 
   2381                     /* e[3][4-7] stored in pi2_tmp[7][0-7] */
   2382                     /* e[4][4-7] stored in pi2_tmp[7][8-15] */
   2383                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
   2384                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
   2385                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
   2386                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
   2387 
   2388                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
   2389                     pi2_scratch += out_stride;
   2390                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
   2391                     pi2_scratch += out_stride;
   2392                 }
   2393             }
   2394         }
   2395 
   2396         if(zero_last12_rows_stg2)
   2397         {
   2398             /* o & stage 2 pre-transposed out */
   2399             {
   2400                 WORD32 j;
   2401                 WORD16 *pi2_src_scratch = temp_array;
   2402                 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
   2403                 WORD32 out_stride = (trans_size);
   2404                 WORD32 in_stride = (8) * 4;
   2405 
   2406                 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
   2407 
   2408                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
   2409 
   2410                 pi2_src_temp += (stride * 9);
   2411 
   2412                 if(0 == i)
   2413                 {
   2414                     pi2_src_temp -= (stride * 2 - 8);
   2415                 }
   2416                 else
   2417                 {
   2418                     pi2_src_temp -= (stride * 6 - 8);
   2419                 }
   2420                 pi2_src_temp -= (stride * 9);
   2421 
   2422                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
   2423 
   2424 
   2425                 for(j = 0; j < 2; j++)
   2426                 {
   2427                     if(j)
   2428                     {
   2429                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
   2430                     }
   2431                     else
   2432                     {
   2433                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
   2434                     }
   2435                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
   2436 
   2437                     /* o0[0-3] */
   2438                     {
   2439                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2440 
   2441                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2442                         pi2_src_scratch += in_stride;
   2443 
   2444                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
   2445 
   2446                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2447                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2448 
   2449 
   2450                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2451                         m_count = _mm_cvtsi32_si128(i4_shift);
   2452                         m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
   2453 
   2454 
   2455                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2456                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2457                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2458                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2459 
   2460                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2461 
   2462                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2463                         pi2_dst_scratch += out_stride;
   2464                     }
   2465 
   2466                     /* o1[0-3] */
   2467                     {
   2468                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2469 
   2470                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2471                         pi2_src_scratch += in_stride;
   2472 
   2473                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
   2474 
   2475                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2476                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2477 
   2478                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2479                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2480                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2481                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2482 
   2483                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2484 
   2485                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2486                         pi2_dst_scratch += ((!i) * out_stride + 8);
   2487                     }
   2488 
   2489                     /* o2[0-3] */
   2490                     {
   2491                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2492 
   2493                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2494                         pi2_src_scratch += in_stride;
   2495 
   2496                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
   2497 
   2498                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2499                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2500 
   2501                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2502                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2503                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2504                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2505 
   2506                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2507 
   2508                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2509                         pi2_dst_scratch += out_stride;
   2510                     }
   2511 
   2512                     /* o3[0-3] */
   2513                     {
   2514                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2515 
   2516                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2517                         pi2_src_scratch += 8;
   2518 
   2519                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
   2520 
   2521                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2522                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2523 
   2524                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2525                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2526                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2527                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2528 
   2529                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2530 
   2531                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2532                         pi2_dst_scratch += (i * out_stride + 8);
   2533                     }
   2534 
   2535                     /* o4[0-3] */
   2536                     {
   2537                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2538 
   2539                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2540                         pi2_src_scratch -= in_stride;
   2541 
   2542                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
   2543 
   2544                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2545                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2546 
   2547 
   2548                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2549                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2550                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2551                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2552 
   2553                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2554 
   2555                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2556                         pi2_dst_scratch += out_stride;
   2557                     }
   2558 
   2559                     /* o5[0-3] */
   2560                     {
   2561                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2562 
   2563                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2564                         pi2_src_scratch -= in_stride;
   2565 
   2566                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
   2567 
   2568                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2569                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2570 
   2571 
   2572                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2573                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2574                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2575                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2576 
   2577                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2578 
   2579                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2580                         pi2_dst_scratch += ((!i) * out_stride + 8);
   2581                     }
   2582 
   2583                     /* o6[0-3] */
   2584                     {
   2585                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2586 
   2587                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2588                         pi2_src_scratch -= in_stride;
   2589 
   2590                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   2591 
   2592                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2593                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2594 
   2595                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2596                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2597                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2598                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2599 
   2600                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2601 
   2602                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2603                         pi2_dst_scratch += out_stride;
   2604                     }
   2605 
   2606                     /* o7[0-3] */
   2607                     {
   2608                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2609 
   2610                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2611                         pi2_src_scratch += 8;
   2612 
   2613                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2614                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2615 
   2616                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2617                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2618                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2619                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2620 
   2621                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2622 
   2623                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2624                         pi2_dst_scratch += (i * out_stride + 8);
   2625                     }
   2626 
   2627 
   2628                 }
   2629             }
   2630         }
   2631         else if(zero_last8_rows_stg2)
   2632         {
   2633             /* o & stage 2 pre-transposed out */
   2634             {
   2635                 WORD32 j;
   2636                 WORD16 *pi2_src_scratch = temp_array;
   2637                 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
   2638                 WORD32 out_stride = (trans_size);
   2639                 WORD32 in_stride = (8) * 4;
   2640 
   2641                 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
   2642 
   2643 
   2644                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
   2645                 pi2_src_temp += (stride);
   2646                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
   2647                 pi2_src_temp += (stride * 8);
   2648 
   2649                 if(0 == i)
   2650                 {
   2651                     pi2_src_temp -= (stride * 2 - 8);
   2652                 }
   2653                 else
   2654                 {
   2655                     pi2_src_temp -= (stride * 6 - 8);
   2656                 }
   2657 
   2658                 pi2_src_temp -= (stride * 8);
   2659                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
   2660                 pi2_src_temp -= (stride);
   2661                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
   2662 
   2663 
   2664                 for(j = 0; j < 2; j++)
   2665                 {
   2666                     if(j)
   2667                     {
   2668                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
   2669                         m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
   2670                     }
   2671                     else
   2672                     {
   2673                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
   2674                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
   2675                     }
   2676                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
   2677                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
   2678 
   2679                     /* o0[0-3] */
   2680                     {
   2681                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2682                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2683 
   2684                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2685                         pi2_src_scratch += in_stride;
   2686 
   2687                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
   2688                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
   2689 
   2690                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2691                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2692                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2693 
   2694 
   2695                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2696                         m_count = _mm_cvtsi32_si128(i4_shift);
   2697 
   2698                         m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
   2699 
   2700                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2701                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2702                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2703                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2704 
   2705                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2706 
   2707                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2708                         pi2_dst_scratch += out_stride;
   2709                     }
   2710 
   2711                     /* o1[0-3] */
   2712                     {
   2713                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2714                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   2715 
   2716                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2717                         pi2_src_scratch += in_stride;
   2718 
   2719                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
   2720                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
   2721 
   2722                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   2723                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2724                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2725 
   2726                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2727                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2728                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2729                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2730 
   2731                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2732 
   2733                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2734                         pi2_dst_scratch += ((!i) * out_stride + 8);
   2735                     }
   2736 
   2737                     /* o2[0-3] */
   2738                     {
   2739                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2740                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2741 
   2742                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2743                         pi2_src_scratch += in_stride;
   2744 
   2745                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
   2746                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
   2747 
   2748                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   2749                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2750                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2751 
   2752                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2753                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2754                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2755                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2756 
   2757                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2758 
   2759                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2760                         pi2_dst_scratch += out_stride;
   2761                     }
   2762 
   2763                     /* o3[0-3] */
   2764                     {
   2765                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2766                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   2767 
   2768                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2769                         pi2_src_scratch += 8;
   2770 
   2771                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
   2772                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
   2773 
   2774                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
   2775                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2776                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2777 
   2778                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2779                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2780                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2781                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2782 
   2783                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2784 
   2785                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2786                         pi2_dst_scratch += (i * out_stride + 8);
   2787                     }
   2788 
   2789                     /* o4[0-3] */
   2790                     {
   2791                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2792                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2793 
   2794                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2795                         pi2_src_scratch -= in_stride;
   2796 
   2797                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
   2798                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
   2799 
   2800                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   2801                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2802                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2803 
   2804 
   2805                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2806                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2807                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2808                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2809 
   2810                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2811 
   2812                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2813                         pi2_dst_scratch += out_stride;
   2814                     }
   2815 
   2816                     /* o5[0-3] */
   2817                     {
   2818                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2819                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   2820 
   2821                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2822                         pi2_src_scratch -= in_stride;
   2823 
   2824                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
   2825                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
   2826 
   2827                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   2828                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2829                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2830 
   2831 
   2832                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2833                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2834                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2835                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2836 
   2837                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2838 
   2839                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2840                         pi2_dst_scratch += ((!i) * out_stride + 8);
   2841                     }
   2842 
   2843                     /* o6[0-3] */
   2844                     {
   2845                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2846                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2847 
   2848                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2849                         pi2_src_scratch -= in_stride;
   2850 
   2851                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   2852                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
   2853 
   2854                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2855                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2856                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2857 
   2858                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2859                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2860                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2861                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2862 
   2863                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2864 
   2865                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2866                         pi2_dst_scratch += out_stride;
   2867                     }
   2868 
   2869                     /* o7[0-3] */
   2870                     {
   2871                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2872                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   2873 
   2874                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2875                         pi2_src_scratch += 8;
   2876 
   2877                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   2878                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   2879                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   2880 
   2881                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2882                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2883                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2884                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2885 
   2886                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2887 
   2888                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2889                         pi2_dst_scratch += (i * out_stride + 8);
   2890                     }
   2891                 }
   2892             }
   2893         }
   2894         else
   2895         {
   2896             /* o & stage 2 pre-transposed out */
   2897             {
   2898                 WORD32 j;
   2899                 WORD16 *pi2_src_scratch = temp_array;
   2900                 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
   2901                 WORD32 out_stride = (trans_size);
   2902                 WORD32 in_stride = (8) * 4;
   2903 
   2904                 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
   2905 
   2906 
   2907                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
   2908                 pi2_src_temp += (stride);
   2909                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
   2910                 pi2_src_temp += (stride * 7);
   2911                 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
   2912                 pi2_src_temp += (stride);
   2913                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
   2914                 if(0 == i)
   2915                 {
   2916                     pi2_src_temp -= (stride * 2 - 8);
   2917                 }
   2918                 else
   2919                 {
   2920                     pi2_src_temp -= (stride * 6 - 8);
   2921                 }
   2922                 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
   2923                 pi2_src_temp -= (stride);
   2924                 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
   2925                 pi2_src_temp -= (stride * 7);
   2926                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
   2927                 pi2_src_temp -= (stride);
   2928                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
   2929 
   2930 
   2931                 for(j = 0; j < 2; j++)
   2932                 {
   2933 
   2934                     if(j) //H8B= higher 8 bytes L8B lower 8 bytes
   2935                     {
   2936                         m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
   2937                         m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
   2938                         m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
   2939                         m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
   2940                     }
   2941                     else
   2942                     {
   2943                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
   2944                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
   2945                         m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
   2946                         m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
   2947                     }
   2948                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
   2949                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
   2950                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
   2951                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
   2952 
   2953 
   2954                     /* o0[0-3] */
   2955                     {
   2956                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   2957                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   2958                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   2959                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   2960 
   2961 
   2962                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   2963                         pi2_src_scratch += in_stride;
   2964 
   2965                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
   2966                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
   2967                         m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
   2968                         m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
   2969 
   2970                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   2971                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
   2972                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   2973                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   2974                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   2975 
   2976                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
   2977                         m_count = _mm_cvtsi32_si128(i4_shift);
   2978                         m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
   2979 
   2980                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   2981                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   2982                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   2983                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   2984 
   2985                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   2986 
   2987                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   2988                         pi2_dst_scratch += out_stride;
   2989                     }
   2990 
   2991                     /* o1[0-3] */
   2992                     {
   2993                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   2994                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   2995                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   2996                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   2997 
   2998 
   2999                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3000                         pi2_src_scratch += in_stride;
   3001 
   3002                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
   3003                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
   3004                         m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
   3005                         m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
   3006 
   3007                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   3008                         m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
   3009                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
   3010                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   3011                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   3012 
   3013                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3014                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3015                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3016                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3017 
   3018                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3019 
   3020                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3021                         pi2_dst_scratch += ((!i) * out_stride + 8);
   3022                     }
   3023 
   3024                     /* o2[0-3] */
   3025                     {
   3026                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3027                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3028                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3029                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3030 
   3031                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3032                         pi2_src_scratch += in_stride;
   3033 
   3034                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
   3035                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
   3036                         m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
   3037                         m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
   3038 
   3039                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   3040                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   3041                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
   3042                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3043                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3044 
   3045                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3046                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3047                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3048                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3049 
   3050                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3051 
   3052                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3053                         pi2_dst_scratch += out_stride;
   3054                     }
   3055 
   3056                     /* o3[0-3] */
   3057                     {
   3058                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   3059                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   3060                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   3061                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   3062 
   3063 
   3064                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3065                         pi2_src_scratch += 8;
   3066 
   3067                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
   3068                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
   3069                         m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
   3070                         m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
   3071 
   3072                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
   3073                         m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
   3074                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
   3075                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   3076                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   3077 
   3078                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3079                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3080                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3081                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3082 
   3083                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3084 
   3085                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3086                         pi2_dst_scratch += (i * out_stride + 8);
   3087                     }
   3088 
   3089                     /* o4[0-3] */
   3090                     {
   3091                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3092                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3093                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3094                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3095 
   3096                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3097                         pi2_src_scratch -= in_stride;
   3098 
   3099                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
   3100                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
   3101                         m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
   3102                         m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
   3103 
   3104                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
   3105                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   3106                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
   3107                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3108                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3109 
   3110                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3111                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3112                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3113                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3114 
   3115                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3116 
   3117                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3118                         pi2_dst_scratch += out_stride;
   3119                     }
   3120 
   3121                     /* o5[0-3] */
   3122                     {
   3123                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   3124                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   3125                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   3126                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   3127 
   3128 
   3129                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3130                         pi2_src_scratch -= in_stride;
   3131 
   3132                         m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
   3133                         m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
   3134                         m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
   3135                         m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
   3136 
   3137                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   3138                         m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
   3139                         m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
   3140                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   3141                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   3142 
   3143                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3144                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3145                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3146                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3147 
   3148                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3149 
   3150                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3151                         pi2_dst_scratch += ((!i) * out_stride + 8);
   3152                     }
   3153 
   3154                     /* o6[0-3] */
   3155                     {
   3156                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
   3157                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
   3158                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
   3159                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
   3160 
   3161 
   3162                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3163                         pi2_src_scratch -= in_stride;
   3164 
   3165                         m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
   3166                         m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
   3167                         m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
   3168                         m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
   3169 
   3170 
   3171                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
   3172                         m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
   3173                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
   3174                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
   3175                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
   3176 
   3177                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3178                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3179                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3180                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3181 
   3182                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3183 
   3184                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3185                         pi2_dst_scratch += out_stride;
   3186                     }
   3187 
   3188                     /* o7[0-3] */
   3189                     {
   3190                         m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
   3191                         m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
   3192                         m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
   3193                         m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
   3194 
   3195                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
   3196                         pi2_src_scratch += 8;
   3197 
   3198                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
   3199                         m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
   3200                         m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
   3201                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
   3202                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
   3203 
   3204 
   3205                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
   3206                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
   3207                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
   3208                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
   3209 
   3210                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
   3211 
   3212                         _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
   3213                         pi2_dst_scratch += (i * out_stride + 8);
   3214                     }
   3215 
   3216                 }
   3217             }
   3218         }
   3219     }
   3220 
   3221     /* Transpose */
   3222     {
   3223         WORD16 *pi2_src_scratch;
   3224         UWORD8 *pu1_pred_temp = pu1_pred;
   3225         WORD32 out_stride = dst_strd;
   3226         WORD32 in_stride = trans_size;
   3227         WORD32 j;
   3228         m_temp_reg_1 = _mm_setzero_si128();
   3229         for(i = 0; i < 2; i++)
   3230         {
   3231             pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
   3232 
   3233             for(j = 0; j < 2; j++)
   3234             {
   3235                 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
   3236                 pi2_src_scratch += in_stride;
   3237                 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
   3238                 pi2_src_scratch += ((!i) * in_stride + 8);
   3239                 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
   3240                 pi2_src_scratch += (in_stride);
   3241                 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
   3242                 pi2_src_scratch += (i * in_stride + 8);
   3243                 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
   3244                 pi2_src_scratch += in_stride;
   3245                 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
   3246                 pi2_src_scratch += ((!i) * in_stride + 8);
   3247                 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
   3248                 pi2_src_scratch += in_stride;
   3249                 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
   3250                 pi2_src_scratch += (i * in_stride + 8);
   3251 
   3252                 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
   3253                 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
   3254 
   3255                 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
   3256                 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
   3257 
   3258                 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
   3259                 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
   3260 
   3261                 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
   3262                 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
   3263 
   3264 
   3265                 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
   3266                 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
   3267 
   3268                 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
   3269                 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
   3270 
   3271                 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
   3272                 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
   3273 
   3274                 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
   3275                 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
   3276 
   3277 
   3278                 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
   3279                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
   3280 
   3281                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
   3282                 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
   3283 
   3284                 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
   3285                 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
   3286                 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
   3287 
   3288                 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
   3289                 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   3290                 pu1_dst += out_stride;
   3291                 pu1_pred_temp += pred_strd;
   3292 
   3293                 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
   3294                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
   3295 
   3296                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
   3297                 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
   3298 
   3299                 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
   3300                 m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
   3301                 m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
   3302 
   3303                 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
   3304                 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   3305                 pu1_dst += out_stride;
   3306                 pu1_pred_temp += pred_strd;
   3307 
   3308                 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
   3309                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
   3310 
   3311                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
   3312                 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
   3313 
   3314                 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0
   3315                 m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
   3316                 m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
   3317 
   3318                 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
   3319                 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   3320                 pu1_dst += out_stride;
   3321                 pu1_pred_temp += pred_strd;
   3322 
   3323                 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
   3324                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
   3325 
   3326                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
   3327                 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
   3328 
   3329                 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0
   3330                 m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
   3331                 m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
   3332 
   3333                 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
   3334                 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
   3335                 pu1_dst += out_stride;
   3336                 pu1_pred_temp += pred_strd;
   3337             }
   3338         }
   3339     }
   3340 }
   3341