/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
 *******************************************************************************
 * @file
 *  ih264_iquant_itrans_recon_sse42.c
 *
 * @brief
 *  Contains function definitions for inverse quantization, inverse
 *  transform and reconstruction
 *
 * @author
 *  Mohit [100664]
 *
 * @par List of Functions:
 *  - ih264_iquant_itrans_recon_4x4_sse42()
 *  - ih264_iquant_itrans_recon_chroma_4x4_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include <immintrin.h>

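/*
 * Reference only: a minimal scalar sketch (kept in this comment, not compiled)
 * of the per-coefficient operation that both SSE4.2 routines below vectorise.
 * Variable names here are illustrative, not taken from the codec headers.
 *
 *   for each coefficient position i in the 4x4 block:
 *       w[i] = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];
 *       if (u4_qp_div_6 >= 4)
 *           w[i] <<= (u4_qp_div_6 - 4);
 *       else
 *           w[i] = (w[i] + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6);
 *
 * The dequantized block w is then put through the 4-point inverse transform
 * (horizontal pass, then vertical pass), rounded with (x + 32) >> 6, added to
 * the prediction samples and clipped to the range [0, 255].
 */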
/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 sub block from quantized residue and
 * prediction buffer
 *
 * @par Description:
 *  The quantized residue is first inverse quantized, then inverse transformed.
 *  This inverse transformed content is added to the prediction buffer to recon-
 *  struct the end output
 *
 * @param[in] pi2_src
 *  quantized 4x4 block
 *
 * @param[in] pu1_pred
 *  prediction 4x4 block
 *
 * @param[out] pu1_out
 *  reconstructed 4x4 block
 *
 * @param[in] pred_strd
 *  prediction buffer stride
 *
 * @param[in] out_strd
 *  recon buffer stride
 *
 * @param[in] pu2_iscal_mat
 *  pointer to the inverse scaling matrix
 *
 * @param[in] pu2_weigh_mat
 *  pointer to the dequantization (weighting) matrix
 *
 * @param[in] u4_qp_div_6
 *  Floor (qp/6)
 *
 * @param[in] pi2_tmp
 *  temporary buffer of size 1*16; unused in this implementation
 *
 * @param[in] iq_start_idx
 *  1 if the DC coefficient is supplied separately via pi2_dc_ld_addr, else 0
 *
 * @param[in] pi2_dc_ld_addr
 *  address holding the separately processed DC coefficient, used when
 *  iq_start_idx is 1
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */
void ih264_iquant_itrans_recon_4x4_sse42(WORD16 *pi2_src,
                                         UWORD8 *pu1_pred,
                                         UWORD8 *pu1_out,
                                         WORD32 pred_strd,
                                         WORD32 out_strd,
                                         const UWORD16 *pu2_iscal_mat,
                                         const UWORD16 *pu2_weigh_mat,
                                         UWORD32 u4_qp_div_6,
                                         WORD16 *pi2_tmp,
                                         WORD32 iq_start_idx,
                                         WORD16 *pi2_dc_ld_addr)
{
    UWORD32 *pu4_out = (UWORD32 *) pu1_out;
    __m128i src_r0_r1, src_r2_r3;
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i scalemat_r0_r1, scalemat_r2_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
    __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
    __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    __m128i resq_r0, resq_r1, resq_r2, resq_r3;
    __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
    __m128i value_32 = _mm_set1_epi32(32);
    UNUSED (pi2_tmp);

    /*************************************************************/
    /* Inverse quantization (dequantization) of coefficients     */
    /*************************************************************/
    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
    dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
    dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8)); //q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits

    temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
    temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3); //b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31 b32*q32 b33*q33 -- 16 bit result

    temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit values zero-extended
    temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit values zero-extended
    temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b); // b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit values zero-extended
    temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b); // b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit values zero-extended

    src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit values zero-extended
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit values zero-extended
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); // a20 0 a21 0 a22 0 a23 0 -- 16 bit values zero-extended
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); // a30 0 a31 0 a32 0 a33 0 -- 16 bit values zero-extended

    temp4 = _mm_madd_epi16(src_r0, temp4); //a00*b00*q00 a01*b01*q01 a02*b02*q02 a03*b03*q03 -- 32 bits long
    temp5 = _mm_madd_epi16(src_r1, temp5);
    temp6 = _mm_madd_epi16(src_r2, temp6);
    temp7 = _mm_madd_epi16(src_r3, temp7);

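    /* Scale the dequantized products: left-shift by (qp/6 - 4) when qp/6 >= 4,
     * otherwise add the rounding term (1 << (3 - qp/6)) and arithmetic
     * right-shift by (4 - qp/6). */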
    if (u4_qp_div_6 >= 4) {
        resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
        resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
        resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
        resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
    } else {
        temp4 = _mm_add_epi32(temp4, add_rshift);
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp6 = _mm_add_epi32(temp6, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
        resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
        resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
        resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
    }

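    /* When iq_start_idx == 1 the DC coefficient has already been handled by a
     * separate DC-block transform; overwrite lane 0 with the value supplied
     * through pi2_dc_ld_addr instead of the dequantized source value. */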
    if (iq_start_idx == 1)
        resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_ld_addr[0], 0);
    /* Perform Inverse transform */
    /*-------------------------------------------------------------*/
    /* IDCT [ Horizontal transformation ]                          */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);                  //a0 b0 a1 b1
    temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);                  //c0 d0 c1 d1
    temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);                  //a2 b2 a3 b3
    temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);                  //c2 d2 c3 d3
    resq_r0 = _mm_unpacklo_epi64(temp1, temp3);                    //a0 b0 c0 d0
    resq_r1 = _mm_unpackhi_epi64(temp1, temp3);                    //a1 b1 c1 d1
    resq_r2 = _mm_unpacklo_epi64(temp2, temp4);                    //a2 b2 c2 d2
    resq_r3 = _mm_unpackhi_epi64(temp2, temp4);                    //a3 b3 c3 d3
    //Transform starts -- horizontal transform
    /*------------------------------------------------------------------*/
    /* z0 = w0 + w2                                             */
    temp0 = _mm_add_epi32(resq_r0, resq_r2);
    /* z1 = w0 - w2                                             */
    temp1 = _mm_sub_epi32(resq_r0, resq_r2);
    /* z2 = (w1 >> 1) - w3                                      */
    temp2 = _mm_srai_epi32(resq_r1, 1);                         //(w1>>1)
    temp2 = _mm_sub_epi32(temp2, resq_r3);                      //(w1>>1) - w3
    /* z3 = w1 + (w3 >> 1)                                      */
    temp3 = _mm_srai_epi32(resq_r3, 1);                         //(w3>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1);                      //(w3>>1) + w1
    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    resq_r0 = _mm_add_epi32(temp0, temp3);
    /* x1 = z1 + z2                                             */
    resq_r1 = _mm_add_epi32(temp1, temp2);
    /* x2 = z1 - z2                                             */
    resq_r2 = _mm_sub_epi32(temp1, temp2);
    /* x3 = z0 - z3                                             */
    resq_r3 = _mm_sub_epi32(temp0, temp3);
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);                  //a0 a1 b0 b1
    temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);                  //a2 a3 b2 b3
    temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);                  //c0 c1 d0 d1
    temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);                  //c2 c3 d2 d3
    resq_r0 = _mm_unpacklo_epi64(temp1, temp3);                    //a0 a1 a2 a3
    resq_r1 = _mm_unpackhi_epi64(temp1, temp3);                    //b0 b1 b2 b3
    resq_r2 = _mm_unpacklo_epi64(temp2, temp4);                    //c0 c1 c2 c3
    resq_r3 = _mm_unpackhi_epi64(temp2, temp4);                    //d0 d1 d2 d3
    //Transform ends -- horizontal transform

    //Load pred buffer
    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits

    pred_r0 = _mm_cvtepu8_epi32(pred_r0); //p00 p01 p02 p03 -- all 32 bits
    pred_r1 = _mm_cvtepu8_epi32(pred_r1); //p10 p11 p12 p13 -- all 32 bits
    pred_r2 = _mm_cvtepu8_epi32(pred_r2); //p20 p21 p22 p23 -- all 32 bits
    pred_r3 = _mm_cvtepu8_epi32(pred_r3); //p30 p31 p32 p33 -- all 32 bits

    /*--------------------------------------------------------------*/
    /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
    /*                                                              */
    /* Add the prediction and store it back to same buffer          */
    /*--------------------------------------------------------------*/
    /* z0j = y0j + y2j                                              */
    temp0 = _mm_add_epi32(resq_r0, resq_r2);
    /* z1j = y0j - y2j                                              */
    temp1 = _mm_sub_epi32(resq_r0, resq_r2);
    /* z2j = (y1j>>1) - y3j                                         */
    temp2 = _mm_srai_epi32(resq_r1, 1);                             //(y1j>>1)
    temp2 = _mm_sub_epi32(temp2, resq_r3);
    /* z3j = y1j + (y3j>>1)                                         */
    temp3 = _mm_srai_epi32(resq_r3, 1);                             //(y3j>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1);

    /* x0j = z0j + z3j                                              */
    temp4 = _mm_add_epi32(temp0, temp3);
    temp4 = _mm_add_epi32(temp4, value_32);
    temp4 = _mm_srai_epi32(temp4, 6);
    temp4 = _mm_add_epi32(temp4, pred_r0);
    /* x1j = z1j + z2j                                              */
    temp5 = _mm_add_epi32(temp1, temp2);
    temp5 = _mm_add_epi32(temp5, value_32);
    temp5 = _mm_srai_epi32(temp5, 6);
    temp5 = _mm_add_epi32(temp5, pred_r1);
    /* x2j = z1j - z2j                                              */
    temp6 = _mm_sub_epi32(temp1, temp2);
    temp6 = _mm_add_epi32(temp6, value_32);
    temp6 = _mm_srai_epi32(temp6, 6);
    temp6 = _mm_add_epi32(temp6, pred_r2);
    /* x3j = z0j - z3j                                              */
    temp7 = _mm_sub_epi32(temp0, temp3);
    temp7 = _mm_add_epi32(temp7, value_32);
    temp7 = _mm_srai_epi32(temp7, 6);
    temp7 = _mm_add_epi32(temp7, pred_r3);

    // 32-bit to 16-bit conversion
    temp0 = _mm_packs_epi32(temp4, temp5);
    temp1 = _mm_packs_epi32(temp6, temp7);
    /*------------------------------------------------------------------*/
    //Clip the results to 8 bits: negative values are clamped to zero here,
    //and the unsigned saturation in _mm_packus_epi16 below caps them at 255
    sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);      // sign check
    temp0 = _mm_and_si128(temp0, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
    temp1 = _mm_and_si128(temp1, sign_reg);

    resq_r0 = _mm_packus_epi16(temp0, temp1);
    resq_r1 = _mm_srli_si128(resq_r0, 4);
    resq_r2 = _mm_srli_si128(resq_r1, 4);
    resq_r3 = _mm_srli_si128(resq_r2, 4);

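    /* Each reconstructed row now occupies the low 4 bytes of resq_r0..resq_r3;
     * extract those 32 bits and store one row per output stride. */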
    *pu4_out = _mm_cvtsi128_si32(resq_r0);
    pu1_out += out_strd;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
    pu1_out += out_strd;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
    pu1_out += out_strd;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
}

/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 chroma sub block from quantized
 * residue and prediction buffer
 *
 * @par Description:
 *  The quantized residue is first inverse quantized, then inverse transformed.
 *  This inverse transformed content is added to the prediction buffer to recon-
 *  struct the end output. The prediction and recon buffers hold interleaved
 *  Cb/Cr samples.
 *
 * @param[in] pi2_src
 *  quantized 4x4 block
 *
 * @param[in] pu1_pred
 *  prediction 4x4 block (interleaved chroma)
 *
 * @param[out] pu1_out
 *  reconstructed 4x4 block (interleaved chroma)
 *
 * @param[in] pred_strd
 *  prediction buffer stride
 *
 * @param[in] out_strd
 *  recon buffer stride
 *
 * @param[in] pu2_iscal_mat
 *  pointer to the inverse scaling matrix
 *
 * @param[in] pu2_weigh_mat
 *  pointer to the dequantization (weighting) matrix
 *
 * @param[in] u4_qp_div_6
 *  Floor (qp/6)
 *
 * @param[in] pi2_tmp
 *  temporary buffer of size 1*16; unused in this implementation
 *
 * @param[in] pi2_dc_ld_addr
 *  address holding the separately processed DC coefficient, which always
 *  replaces the dequantized DC value for chroma
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */
void ih264_iquant_itrans_recon_chroma_4x4_sse42(WORD16 *pi2_src,
                                                UWORD8 *pu1_pred,
                                                UWORD8 *pu1_out,
                                                WORD32 pred_strd,
                                                WORD32 out_strd,
                                                const UWORD16 *pu2_iscal_mat,
                                                const UWORD16 *pu2_weigh_mat,
                                                UWORD32 u4_qp_div_6,
                                                WORD16 *pi2_tmp,
                                                WORD16 *pi2_dc_ld_addr)
{
    __m128i src_r0_r1, src_r2_r3;
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i scalemat_r0_r1, scalemat_r2_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
    __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
    __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    __m128i resq_r0, resq_r1, resq_r2, resq_r3;
    __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
    __m128i value_32 = _mm_set1_epi32(32);
    __m128i chroma_mask = _mm_set1_epi16(0xFF);
    __m128i out_r0, out_r1, out_r2, out_r3;
    UNUSED (pi2_tmp);

    /*************************************************************/
    /* Inverse quantization (dequantization) of coefficients     */
    /*************************************************************/
    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
    dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
    dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8)); //q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits

    temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1); //b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
    temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3); //b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31 b32*q32 b33*q33 -- 16 bit result

    temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit values zero-extended
    temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit values zero-extended
    temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b); // b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit values zero-extended
    temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b); // b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit values zero-extended

    src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit values zero-extended
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit values zero-extended
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); // a20 0 a21 0 a22 0 a23 0 -- 16 bit values zero-extended
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); // a30 0 a31 0 a32 0 a33 0 -- 16 bit values zero-extended

    temp4 = _mm_madd_epi16(src_r0, temp4); //a00*b00*q00 a01*b01*q01 a02*b02*q02 a03*b03*q03 -- 32 bits long
    temp5 = _mm_madd_epi16(src_r1, temp5);
    temp6 = _mm_madd_epi16(src_r2, temp6);
    temp7 = _mm_madd_epi16(src_r3, temp7);

    if (u4_qp_div_6 >= 4) {
        resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
        resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
        resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
        resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
    } else {
        temp4 = _mm_add_epi32(temp4, add_rshift);
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp6 = _mm_add_epi32(temp6, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
        resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
        resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
        resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
    }

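    /* For chroma the DC coefficient always comes from the separately
     * transformed 2x2 DC block, so lane 0 is unconditionally replaced with
     * the value at pi2_dc_ld_addr. */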
    resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_ld_addr[0], 0);
    /* Perform Inverse transform */
    /*-------------------------------------------------------------*/
    /* IDCT [ Horizontal transformation ]                          */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);                  //a0 b0 a1 b1
    temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);                  //c0 d0 c1 d1
    temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);                  //a2 b2 a3 b3
    temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);                  //c2 d2 c3 d3
    resq_r0 = _mm_unpacklo_epi64(temp1, temp3);                    //a0 b0 c0 d0
    resq_r1 = _mm_unpackhi_epi64(temp1, temp3);                    //a1 b1 c1 d1
    resq_r2 = _mm_unpacklo_epi64(temp2, temp4);                    //a2 b2 c2 d2
    resq_r3 = _mm_unpackhi_epi64(temp2, temp4);                    //a3 b3 c3 d3
    //Transform starts -- horizontal transform
    /*------------------------------------------------------------------*/
    /* z0 = w0 + w2                                             */
    temp0 = _mm_add_epi32(resq_r0, resq_r2);
    /* z1 = w0 - w2                                             */
    temp1 = _mm_sub_epi32(resq_r0, resq_r2);
    /* z2 = (w1 >> 1) - w3                                      */
    temp2 = _mm_srai_epi32(resq_r1, 1);                         //(w1>>1)
    temp2 = _mm_sub_epi32(temp2, resq_r3);                      //(w1>>1) - w3
    /* z3 = w1 + (w3 >> 1)                                      */
    temp3 = _mm_srai_epi32(resq_r3, 1);                         //(w3>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1);                      //(w3>>1) + w1
    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    resq_r0 = _mm_add_epi32(temp0, temp3);
    /* x1 = z1 + z2                                             */
    resq_r1 = _mm_add_epi32(temp1, temp2);
    /* x2 = z1 - z2                                             */
    resq_r2 = _mm_sub_epi32(temp1, temp2);
    /* x3 = z0 - z3                                             */
    resq_r3 = _mm_sub_epi32(temp0, temp3);
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);                  //a0 a1 b0 b1
    temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);                  //a2 a3 b2 b3
    temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);                  //c0 c1 d0 d1
    temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);                  //c2 c3 d2 d3
    resq_r0 = _mm_unpacklo_epi64(temp1, temp3);                    //a0 a1 a2 a3
    resq_r1 = _mm_unpackhi_epi64(temp1, temp3);                    //b0 b1 b2 b3
    resq_r2 = _mm_unpacklo_epi64(temp2, temp4);                    //c0 c1 c2 c3
    resq_r3 = _mm_unpackhi_epi64(temp2, temp4);                    //d0 d1 d2 d3
    //Transform ends -- horizontal transform

    //Load pred buffer
    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 o00 p01 o01 p02 o02 p03 o03 -- 8 interleaved chroma bytes
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 o10 p11 o11 p12 o12 p13 o13 -- 8 interleaved chroma bytes
    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 o20 p21 o21 p22 o22 p23 o23 -- 8 interleaved chroma bytes
    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 o30 p31 o31 p32 o32 p33 o33 -- 8 interleaved chroma bytes

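    /* The prediction (and recon) buffers hold interleaved Cb/Cr samples; keep
     * only this component's bytes (o** above is the other component) by
     * masking with 0x00FF, then widen the four samples per row to 32 bits. */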
    pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
    pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
    pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
    pred_r3 = _mm_and_si128(pred_r3, chroma_mask);

    pred_r0 = _mm_cvtepu16_epi32(pred_r0); //p00 p01 p02 p03 -- all 32 bits
    pred_r1 = _mm_cvtepu16_epi32(pred_r1); //p10 p11 p12 p13 -- all 32 bits
    pred_r2 = _mm_cvtepu16_epi32(pred_r2); //p20 p21 p22 p23 -- all 32 bits
    pred_r3 = _mm_cvtepu16_epi32(pred_r3); //p30 p31 p32 p33 -- all 32 bits

    /*--------------------------------------------------------------*/
    /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
    /*                                                              */
    /* Add the prediction and store it back to same buffer          */
    /*--------------------------------------------------------------*/
    /* z0j = y0j + y2j                                              */
    temp0 = _mm_add_epi32(resq_r0, resq_r2);
    /* z1j = y0j - y2j                                              */
    temp1 = _mm_sub_epi32(resq_r0, resq_r2);
    /* z2j = (y1j>>1) - y3j                                         */
    temp2 = _mm_srai_epi32(resq_r1, 1);                             //(y1j>>1)
    temp2 = _mm_sub_epi32(temp2, resq_r3);
    /* z3j = y1j + (y3j>>1)                                         */
    temp3 = _mm_srai_epi32(resq_r3, 1);                             //(y3j>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1);

    /* x0j = z0j + z3j                                              */
    temp4 = _mm_add_epi32(temp0, temp3);
    temp4 = _mm_add_epi32(temp4, value_32);
    temp4 = _mm_srai_epi32(temp4, 6);
    temp4 = _mm_add_epi32(temp4, pred_r0);
    /* x1j = z1j + z2j                                              */
    temp5 = _mm_add_epi32(temp1, temp2);
    temp5 = _mm_add_epi32(temp5, value_32);
    temp5 = _mm_srai_epi32(temp5, 6);
    temp5 = _mm_add_epi32(temp5, pred_r1);
    /* x2j = z1j - z2j                                              */
    temp6 = _mm_sub_epi32(temp1, temp2);
    temp6 = _mm_add_epi32(temp6, value_32);
    temp6 = _mm_srai_epi32(temp6, 6);
    temp6 = _mm_add_epi32(temp6, pred_r2);
    /* x3j = z0j - z3j                                              */
    temp7 = _mm_sub_epi32(temp0, temp3);
    temp7 = _mm_add_epi32(temp7, value_32);
    temp7 = _mm_srai_epi32(temp7, 6);
    temp7 = _mm_add_epi32(temp7, pred_r3);

    // 32-bit to 16-bit conversion
    temp0 = _mm_packs_epi32(temp4, temp5);
    temp1 = _mm_packs_epi32(temp6, temp7);
    /*------------------------------------------------------------------*/
    //Clip the results to 8 bits: negative values are clamped to zero here,
    //and the unsigned saturation in _mm_packus_epi16 below caps them at 255
    sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);      // sign check
    temp0 = _mm_and_si128(temp0, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
    temp1 = _mm_and_si128(temp1, sign_reg);

    resq_r0 = _mm_packus_epi16(temp0, temp1);
    resq_r1 = _mm_srli_si128(resq_r0, 4);
    resq_r2 = _mm_srli_si128(resq_r1, 4);
    resq_r3 = _mm_srli_si128(resq_r2, 4);

    resq_r0 = _mm_cvtepu8_epi16(resq_r0); //r00 r01 r02 r03 -- recon row 0 widened to 16 bits
    resq_r1 = _mm_cvtepu8_epi16(resq_r1); //r10 r11 r12 r13 -- recon row 1 widened to 16 bits
    resq_r2 = _mm_cvtepu8_epi16(resq_r2); //r20 r21 r22 r23 -- recon row 2 widened to 16 bits
    resq_r3 = _mm_cvtepu8_epi16(resq_r3); //r30 r31 r32 r33 -- recon row 3 widened to 16 bits

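    /* The recon buffer is also interleaved: keep the other component's bytes
     * (mask 0xFF00), merge the reconstructed samples into the low bytes of
     * each pair, and store the 8-byte rows back. */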
    chroma_mask = _mm_set1_epi16(0xFF00);
    out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
    out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[out_strd]));
    out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * out_strd]));
    out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * out_strd]));

    out_r0 = _mm_and_si128(out_r0, chroma_mask);
    out_r1 = _mm_and_si128(out_r1, chroma_mask);
    out_r2 = _mm_and_si128(out_r2, chroma_mask);
    out_r3 = _mm_and_si128(out_r3, chroma_mask);

    out_r0 = _mm_add_epi8(out_r0, resq_r0);
    out_r1 = _mm_add_epi8(out_r1, resq_r1);
    out_r2 = _mm_add_epi8(out_r2, resq_r2);
    out_r3 = _mm_add_epi8(out_r3, resq_r3);

    _mm_storel_epi64((__m128i *)(&pu1_out[0]), out_r0);
    _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), out_r1);
    _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), out_r2);
    _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), out_r3);
}