      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /**
     21  *******************************************************************************
     22  * @file
     23  *  ih264_resi_trans_quant_sse42.c
     24  *
     25  * @brief
 *  Contains function definitions for the single-stage forward transform for
 *  H.264. The functions compute the residue, apply the forward core transform
 *  and then quantize the result.
     28  *
     29  * @author
     30  *  Mohit [100664]
     31  *
 * @par List of Functions:
 *  - ih264_resi_trans_quant_4x4_sse42()
 *  - ih264_resi_trans_quant_chroma_4x4_sse42()
 *  - ih264_hadamard_quant_4x4_sse42()
 *  - ih264_hadamard_quant_2x2_uv_sse42()
     35  *
     36  * @remarks
     37  *  None
     38  *
     39  *******************************************************************************
     40  */
     41 /* System include files */
     42 #include <stddef.h>
     43 
     44 /* User include files */
     45 #include "ih264_typedefs.h"
     46 #include "ih264_defs.h"
     47 #include "ih264_size_defs.h"
     48 #include "ih264_macros.h"
     49 #include "ih264_trans_macros.h"
     50 #include "ih264_trans_data.h"
     51 #include "ih264_structs.h"
     52 #include "ih264_trans_quant_itrans_iquant.h"
     53 #include <immintrin.h>
     54 /**
     55  *******************************************************************************
     56  *
     57  * @brief
     58  *   This function performs forward transform and quantization on a 4*4 block
     59  *
     60  * @par Description:
 *   The function accepts a source buffer and a prediction buffer. From these,
 *   it computes the residue, which is then transformed and quantized. The
 *   transform and quantization are computed in place, using the residue
 *   buffer as working storage.
     65  *
     66  * @param[in] pu1_src
     67  *   Pointer to source sub-block
     68  *
     69  * @param[in] pu1_pred
     70  *   Pointer to prediction sub-block
     71  *
     72  * @param[in] pi2_out
     73  *   Pointer to residual sub-block
     74  *
     75  * @param[in] src_strd
     76  *   Source stride
     77  *
     78  * @param[in] pred_strd
     79  *   Prediction stride
     80  *
 * @param[out] pi2_alt_dc_addr
 *   Address at which the unquantized DC coefficient of the transformed block
 *   is stored
     83  *
     84  * @param[in] u4_qbits
     85  *    QP_BITS_h264_4x4 + floor(QP/6)
     86  *
     87  * @param[in] pu2_threshold_matrix
     88  *   Pointer to Forward Quant Threshold Matrix
     89  *
     90  * @param[in] pu2_scale_matrix
     91  *   Pointer to Forward Quant Scale Matrix
     92  *
     93  * @param[in] u4_round_factor
     94  *   Quantization Round factor
     95  *
     96  * @param[out] pu1_nnz
     97  *   Total non-zero coefficients in the current sub-block
     98  *
     99  * @returns
    100  *
    101  * @remarks
    102  *   None
    103  *
    104  *******************************************************************************
    105  */
    106 void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred,
    107                                       WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
    108                                       const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix,
    109                                       UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD8 *pu1_nnz,
    110                                       WORD16 *pi2_alt_dc_addr)
    111 {
    112     WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
    113     WORD32 mask0, mask1;
    114     __m128i sum0, sum1, sum2, cmp0, cmp1;
    115     __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    116     __m128i temp_2 = _mm_set1_epi16(2);
    117     __m128i temp_1 = _mm_set1_epi16(1);
    118     __m128i src_r0, src_r1, src_r2, src_r3;
    119     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    120     __m128i temp0, temp1, temp2, temp3;
    121     __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
    122     __m128i sign_reg0, sign_reg2;
    123     __m128i scalemat_r0_r1, scalemat_r2_r3;
    124 
    125     UNUSED (pu2_threshold_matrix);
    126 
    127     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    128     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
    129     src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
    130     src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
    131     src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
    132     src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits
    133 
    134     src_r0 = _mm_cvtepu8_epi16(src_r0);
    135     src_r1 = _mm_cvtepu8_epi16(src_r1);
    136     src_r2 = _mm_cvtepu8_epi16(src_r2);
    137     src_r3 = _mm_cvtepu8_epi16(src_r3);
    138 
    139     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    140     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    141     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    142     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
    143 
    144     pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
    145     pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
    146     pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
    147     pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits
    148 
    149     src_r0 = _mm_sub_epi16(src_r0, pred_r0);
    150     src_r1 = _mm_sub_epi16(src_r1, pred_r1);
    151     src_r2 = _mm_sub_epi16(src_r2, pred_r2);
    152     src_r3 = _mm_sub_epi16(src_r3, pred_r3);
    153 
    154     /* Perform Forward transform */
    155     /*-------------------------------------------------------------*/
    156     /* DCT [ Horizontal transformation ]                          */
    157     /*-------------------------------------------------------------*/
    158     // Matrix transpose
    159     /*
    160      *  a0 a1 a2 a3
    161      *  b0 b1 b2 b3
    162      *  c0 c1 c2 c3
    163      *  d0 d1 d2 d3
    164      */
    165     temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 b0 a1 b1 a2 b2 a3 b3
    166     temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //c0 d0 c1 d1 c2 d2 c3 d3
    167     temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 b0 c0 d0 a1 b1 c1 d1
    168     temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //a2 b2 c2 d2 a3 b3 c3 d3
    169 
    170     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 b0 c0 d0
    171     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //a1 b1 c1 d1
    172     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //a2 b2 c2 d2
    173     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //a3 b3 c3 d3
    174 
    175     /*----------------------------------------------------------*/
    176     /* x0 = z0 + z3                                             */
    177     temp0 = _mm_add_epi16(src_r0, src_r3);
    178     /* x1 = z1 + z2                                             */
    179     temp1 = _mm_add_epi16(src_r1, src_r2);
    180     /* x2 = z1 - z2                                             */
    181     temp2 = _mm_sub_epi16(src_r1, src_r2);
    182     /* x3 = z0 - z3                                             */
    183     temp3 = _mm_sub_epi16(src_r0, src_r3);
    184 
    185     /* z0 = x0 + x1                                             */
    186     src_r0 = _mm_add_epi16(temp0, temp1);
    187     /* z1 = (x3 << 1) + x2                                      */
    188     src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
    189     src_r1 = _mm_add_epi16(src_r1, temp2);
    190     /* z2 = x0 - x1                                             */
    191     src_r2 = _mm_sub_epi16(temp0, temp1);
    192     /* z3 = x3 - (x2 << 1)                                      */
    193     src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
    194     src_r3 = _mm_sub_epi16(temp3, src_r3);
    195 
    196     // Matrix transpose
    197     /*
    198      *  a0 b0 c0 d0
    199      *  a1 b1 c1 d1
    200      *  a2 b2 c2 d2
    201      *  a3 b3 c3 d3
    202      */
    203     temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 a1 b0 b1 c0 c1 d0 d1
    204     temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //a2 a3 b2 b3 c2 c3 d2 d3
    205     temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 a1 a2 a3 b0 b1 b2 b3
    206     temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //c0 c1 c2 c3 d0 d1 d2 d3
    207 
    208     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 a1 a2 a3
    209     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //b0 b1 b2 b3
    210     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //c0 c1 c2 c3
    211     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //d0 d1 d2 d3
    212 
    213     /*----------------------------------------------------------*/
    214     /* x0 = z0 + z3                                             */
    215     temp0 = _mm_add_epi16(src_r0, src_r3);
    216     /* x1 = z1 + z2                                             */
    217     temp1 = _mm_add_epi16(src_r1, src_r2);
    218     /* x2 = z1 - z2                                             */
    219     temp2 = _mm_sub_epi16(src_r1, src_r2);
    220     /* x3 = z0 - z3                                             */
    221     temp3 = _mm_sub_epi16(src_r0, src_r3);
    222 
    223     /* z0 = x0 + x1                                             */
    224     src_r0 = _mm_add_epi16(temp0, temp1);
    225     /* z1 = (x3 << 1) + x2                                      */
    226     src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
    227     src_r1 = _mm_add_epi16(src_r1, temp2);
    228     /* z2 = x0 - x1                                             */
    229     src_r2 = _mm_sub_epi16(temp0, temp1);
    230     /* z3 = x3 - (x2 << 1)                                      */
    231     src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
    232     src_r3 = _mm_sub_epi16(temp3, src_r3);
    233 
    234     tmp_dc = _mm_extract_epi16(src_r0,0);                       //a0
    235     *pi2_alt_dc_addr = tmp_dc;
    236 
    237     src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);                //a0 a1 a2 a3 b0 b1 b2 b3
    238     src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);                //c0 c1 c2 c3 d0 d1 d2 d3
    239     sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0);
    240     sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2);
    241 
    242     sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0);
    243     sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2);
    244 
    245     sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);
    246     sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);
    247 
    248     src_r0 = _mm_abs_epi16(src_r0);
    249     src_r2 = _mm_abs_epi16(src_r2);
    250 
    251     src_r1 = _mm_srli_si128(src_r0, 8);
    252     src_r0 = _mm_cvtepu16_epi32(src_r0);
    253     src_r1 = _mm_cvtepu16_epi32(src_r1);
    254     src_r3 = _mm_srli_si128(src_r2, 8);
    255     src_r2 = _mm_cvtepu16_epi32(src_r2);
    256     src_r3 = _mm_cvtepu16_epi32(src_r3);
    257 
    258     temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    259     scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    260     temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    261     scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    262     temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    263     temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    264 
    265     temp0 = _mm_mullo_epi32(temp0, src_r0);
    266     temp1 = _mm_mullo_epi32(temp1, src_r1);
    267     temp2 = _mm_mullo_epi32(temp2, src_r2);
    268     temp3 = _mm_mullo_epi32(temp3, src_r3);
    269 
    270     temp0 = _mm_add_epi32(temp0,rnd_fact);
    271     temp1 = _mm_add_epi32(temp1,rnd_fact);
    272     temp2 = _mm_add_epi32(temp2,rnd_fact);
    273     temp3 = _mm_add_epi32(temp3,rnd_fact);
    274 
    275     temp0 = _mm_srli_epi32(temp0,u4_qbits);
    276     temp1 = _mm_srli_epi32(temp1,u4_qbits);
    277     temp2 = _mm_srli_epi32(temp2,u4_qbits);
    278     temp3 = _mm_srli_epi32(temp3,u4_qbits);
    279 
    280     temp0 =  _mm_packs_epi32 (temp0,temp1);
    281     temp2 =  _mm_packs_epi32 (temp2,temp3);
    282 
    283     temp0 =  _mm_sign_epi16(temp0, sign_reg0);
    284     temp2 =  _mm_sign_epi16(temp2, sign_reg2);
    285 
    286     _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    287     _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
    288 
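    /*
     * Count zero coefficients: compare the packed results against zero, then
     * either take the all-zero fast path or sum the per-lane 0/1 flags with
     * horizontal adds. The same pattern is reused in the functions below.
     */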
    289     cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    290     cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
    291 
    292     mask0 = _mm_movemask_epi8(cmp0);
    293     mask1 = _mm_movemask_epi8(cmp1);
    294     u4_zero_coeff = 0;
    295     if(mask0)
    296     {
    297         if(mask0 == 0xffff)
    298             u4_zero_coeff+=8;
    299         else
    300         {
    301             cmp0 = _mm_and_si128(temp_1, cmp0);
    302             sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
    303             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
    304             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
    305             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
    306         }
    307     }
    308     if(mask1)
    309     {
    310         if(mask1 == 0xffff)
    311             u4_zero_coeff+=8;
    312         else
    313         {
    314             cmp1 = _mm_and_si128(temp_1, cmp1);
    315             sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
    316             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
    317             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
    318             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
    319         }
    320     }
    321 
    322     /* Return total nonzero coefficients in the current sub block */
    323     u4_nonzero_coeff = 16 - u4_zero_coeff;
    324     *pu1_nnz =  u4_nonzero_coeff;
    325 }
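
/*
 * Illustrative scalar sketch -- not part of the original file. The function
 * name is hypothetical and the routine is not called anywhere; it only spells
 * out, in plain C, the arithmetic the SSE4.2 routine above performs: residue,
 * 4x4 forward core transform (rows first, then columns, matching the two
 * transpose/butterfly passes above), export of the unquantized DC value, and
 * scale/round/shift quantization with sign restoration.
 */
static void resi_trans_quant_4x4_scalar_sketch(const UWORD8 *pu1_src, const UWORD8 *pu1_pred,
                                               WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
                                               const UWORD16 *pu2_scale_matrix, UWORD32 u4_qbits,
                                               UWORD32 u4_round_factor, UWORD8 *pu1_nnz,
                                               WORD16 *pi2_alt_dc_addr)
{
    WORD32 i, j, nnz = 0;
    WORD32 resi[4][4], tmp[4][4];

    /* Residue = source - prediction */
    for(i = 0; i < 4; i++)
        for(j = 0; j < 4; j++)
            resi[i][j] = pu1_src[i * src_strd + j] - pu1_pred[i * pred_strd + j];

    /* Horizontal 1-D core transform on each row */
    for(i = 0; i < 4; i++)
    {
        WORD32 x0 = resi[i][0] + resi[i][3];
        WORD32 x1 = resi[i][1] + resi[i][2];
        WORD32 x2 = resi[i][1] - resi[i][2];
        WORD32 x3 = resi[i][0] - resi[i][3];

        tmp[i][0] = x0 + x1;
        tmp[i][1] = (x3 << 1) + x2;
        tmp[i][2] = x0 - x1;
        tmp[i][3] = x3 - (x2 << 1);
    }

    /* Vertical 1-D core transform on each column */
    for(j = 0; j < 4; j++)
    {
        WORD32 x0 = tmp[0][j] + tmp[3][j];
        WORD32 x1 = tmp[1][j] + tmp[2][j];
        WORD32 x2 = tmp[1][j] - tmp[2][j];
        WORD32 x3 = tmp[0][j] - tmp[3][j];

        resi[0][j] = x0 + x1;
        resi[1][j] = (x3 << 1) + x2;
        resi[2][j] = x0 - x1;
        resi[3][j] = x3 - (x2 << 1);
    }

    /* Unquantized DC coefficient is exported separately */
    *pi2_alt_dc_addr = (WORD16) resi[0][0];

    /* Quantization: level = (|coef| * scale + round) >> qbits, sign restored */
    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            WORD32 coef  = resi[i][j];
            WORD32 level = ((coef < 0 ? -coef : coef) * pu2_scale_matrix[i * 4 + j]
                            + u4_round_factor) >> u4_qbits;

            if(coef < 0)
                level = -level;

            pi2_out[i * 4 + j] = (WORD16) level;
            nnz += (level != 0);
        }
    }
    *pu1_nnz = (UWORD8) nnz;
}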
    326 
    327 /**
    328  *******************************************************************************
    329  *
    330  * @brief
    331  *   This function performs forward transform and quantization on a 4*4 chroma block
    332  *
    333  * @par Description:
 *   The function accepts a source buffer and a prediction buffer. From these,
 *   it computes the residue of one chroma plane, which is then transformed
 *   and quantized. The transform and quantization are computed in place,
 *   using the residue buffer as working storage.
    338  *
    339  * @param[in] pu1_src
    340  *   Pointer to source sub-block
    341  *
    342  * @param[in] pu1_pred
    343  *   Pointer to prediction sub-block
    344  *
    345  * @param[in] pi2_out
    346  *   Pointer to residual sub-block
    347  *
    348  * @param[in] src_strd
    349  *   Source stride
    350  *
    351  * @param[in] pred_strd
    352  *   Prediction stride
    353  *
 * @param[out] pi2_alt_dc_addr
 *   Address at which the unquantized DC coefficient of the transformed block
 *   is stored
    356  *
    357  * @param[in] u4_qbits
    358  *    QP_BITS_h264_4x4 + floor(QP/6)
    359  *
    360  * @param[in] pu2_threshold_matrix
    361  *   Pointer to Forward Quant Threshold Matrix
    362  *
    363  * @param[in] pu2_scale_matrix
    364  *   Pointer to Forward Quant Scale Matrix
    365  *
    366  * @param[in] u4_round_factor
    367  *   Quantization Round factor
    368  *
    369  * @param[out] pu1_nnz
    370  *   Total non-zero coefficients in the current sub-block
    371  *
    372  * @returns
    373  *
    374  * @remarks
    375  *   None
    376  *
    377  *******************************************************************************
    378  */
    379 void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src,UWORD8 *pu1_pred,WORD16 *pi2_out,
    380                                             WORD32 src_strd,WORD32 pred_strd,
    381                                             const UWORD16 *pu2_scale_matrix,
    382                                             const UWORD16 *pu2_threshold_matrix,
    383                                             UWORD32 u4_qbits,UWORD32 u4_round_factor,
    384                                             UWORD8  *pu1_nnz, WORD16 *pi2_alt_dc_addr)
    385 {
    386     WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
    387     WORD32 mask0, mask1;
    388     __m128i cmp0, cmp1, sum0, sum1, sum2;
    389     __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    390     __m128i temp_2 = _mm_set1_epi16(2);
    391     __m128i temp_1 = _mm_set1_epi16(1);
    392     __m128i src_r0, src_r1, src_r2, src_r3;
    393     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    394     __m128i temp0, temp1, temp2, temp3;
    395     __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
    396     __m128i sign_reg0, sign_reg2;
    397     __m128i scalemat_r0_r1, scalemat_r2_r3;
    __m128i chroma_mask = _mm_set1_epi16 (0xFF); // keep the low byte of each 16-bit lane (one plane of the interleaved chroma samples)
    399 
    400     UNUSED (pu2_threshold_matrix);
    401 
    402     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    403     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
    404     src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
    405     src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
    406     src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
    407     src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits
    408 
    409     src_r0 = _mm_and_si128(src_r0, chroma_mask);
    410     src_r1 = _mm_and_si128(src_r1, chroma_mask);
    411     src_r2 = _mm_and_si128(src_r2, chroma_mask);
    412     src_r3 = _mm_and_si128(src_r3, chroma_mask);
    413 //  src_r0 = _mm_cvtepu8_epi16(src_r0);
    414 //  src_r1 = _mm_cvtepu8_epi16(src_r1);
    415 //  src_r2 = _mm_cvtepu8_epi16(src_r2);
    416 //  src_r3 = _mm_cvtepu8_epi16(src_r3);
    417 
    418     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    419     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    420     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    421     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
    422 
    423     pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
    424     pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
    425     pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
    426     pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
    427 //  pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
    428 //  pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
    429 //  pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
    430 //  pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits
    431 
    432     src_r0 = _mm_sub_epi16(src_r0, pred_r0);
    433     src_r1 = _mm_sub_epi16(src_r1, pred_r1);
    434     src_r2 = _mm_sub_epi16(src_r2, pred_r2);
    435     src_r3 = _mm_sub_epi16(src_r3, pred_r3);
    436 
    437     /* Perform Forward transform */
    438     /*-------------------------------------------------------------*/
    439     /* DCT [ Horizontal transformation ]                          */
    440     /*-------------------------------------------------------------*/
    441     // Matrix transpose
    442     /*
    443      *  a0 a1 a2 a3
    444      *  b0 b1 b2 b3
    445      *  c0 c1 c2 c3
    446      *  d0 d1 d2 d3
    447      */
    448     temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 b0 a1 b1 a2 b2 a3 b3
    449     temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //c0 d0 c1 d1 c2 d2 c3 d3
    450     temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 b0 c0 d0 a1 b1 c1 d1
    451     temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //a2 b2 c2 d2 a3 b3 c3 d3
    452 
    453     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 b0 c0 d0
    454     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //a1 b1 c1 d1
    455     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //a2 b2 c2 d2
    456     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //a3 b3 c3 d3
    457 
    458     /*----------------------------------------------------------*/
    459     /* x0 = z0 + z3                                             */
    460     temp0 = _mm_add_epi16(src_r0, src_r3);
    461     /* x1 = z1 + z2                                             */
    462     temp1 = _mm_add_epi16(src_r1, src_r2);
    463     /* x2 = z1 - z2                                             */
    464     temp2 = _mm_sub_epi16(src_r1, src_r2);
    465     /* x3 = z0 - z3                                             */
    466     temp3 = _mm_sub_epi16(src_r0, src_r3);
    467 
    468     /* z0 = x0 + x1                                             */
    469     src_r0 = _mm_add_epi16(temp0, temp1);
    470     /* z1 = (x3 << 1) + x2                                      */
    471     src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
    472     src_r1 = _mm_add_epi16(src_r1, temp2);
    473     /* z2 = x0 - x1                                             */
    474     src_r2 = _mm_sub_epi16(temp0, temp1);
    475     /* z3 = x3 - (x2 << 1)                                      */
    476     src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
    477     src_r3 = _mm_sub_epi16(temp3, src_r3);
    478 
    479     // Matrix transpose
    480     /*
    481      *  a0 b0 c0 d0
    482      *  a1 b1 c1 d1
    483      *  a2 b2 c2 d2
    484      *  a3 b3 c3 d3
    485      */
    486     temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 a1 b0 b1 c0 c1 d0 d1
    487     temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //a2 a3 b2 b3 c2 c3 d2 d3
    488     temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 a1 a2 a3 b0 b1 b2 b3
    489     temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //c0 c1 c2 c3 d0 d1 d2 d3
    490 
    491     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 a1 a2 a3
    492     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //b0 b1 b2 b3
    493     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //c0 c1 c2 c3
    494     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //d0 d1 d2 d3
    495 
    496     /*----------------------------------------------------------*/
    497     /* x0 = z0 + z3                                             */
    498     temp0 = _mm_add_epi16(src_r0, src_r3);
    499     /* x1 = z1 + z2                                             */
    500     temp1 = _mm_add_epi16(src_r1, src_r2);
    501     /* x2 = z1 - z2                                             */
    502     temp2 = _mm_sub_epi16(src_r1, src_r2);
    503     /* x3 = z0 - z3                                             */
    504     temp3 = _mm_sub_epi16(src_r0, src_r3);
    505 
    506     /* z0 = x0 + x1                                             */
    507     src_r0 = _mm_add_epi16(temp0, temp1);
    508     /* z1 = (x3 << 1) + x2                                      */
    509     src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
    510     src_r1 = _mm_add_epi16(src_r1, temp2);
    511     /* z2 = x0 - x1                                             */
    512     src_r2 = _mm_sub_epi16(temp0, temp1);
    513     /* z3 = x3 - (x2 << 1)                                      */
    514     src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
    515     src_r3 = _mm_sub_epi16(temp3, src_r3);
    516 
    517     tmp_dc = _mm_extract_epi16(src_r0,0);                       //a0
    518     *pi2_alt_dc_addr = tmp_dc;
    519 
    520     src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);                //a0 a1 a2 a3 b0 b1 b2 b3
    521     src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);                //c0 c1 c2 c3 d0 d1 d2 d3
    522     sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0);
    523     sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2);
    524 
    525     sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0);
    526     sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2);
    527 
    528     sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);
    529     sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);
    530 
    531     src_r0 = _mm_abs_epi16(src_r0);
    532     src_r2 = _mm_abs_epi16(src_r2);
    533 
    534     src_r1 = _mm_srli_si128(src_r0, 8);
    535     src_r0 = _mm_cvtepu16_epi32(src_r0);
    536     src_r1 = _mm_cvtepu16_epi32(src_r1);
    537     src_r3 = _mm_srli_si128(src_r2, 8);
    538     src_r2 = _mm_cvtepu16_epi32(src_r2);
    539     src_r3 = _mm_cvtepu16_epi32(src_r3);
    540 
    541     temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    542     scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    543     temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    544     scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    545     temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    546     temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    547 
    548     temp0 = _mm_mullo_epi32(temp0, src_r0);
    549     temp1 = _mm_mullo_epi32(temp1, src_r1);
    550     temp2 = _mm_mullo_epi32(temp2, src_r2);
    551     temp3 = _mm_mullo_epi32(temp3, src_r3);
    552 
    553     temp0 = _mm_add_epi32(temp0,rnd_fact);
    554     temp1 = _mm_add_epi32(temp1,rnd_fact);
    555     temp2 = _mm_add_epi32(temp2,rnd_fact);
    556     temp3 = _mm_add_epi32(temp3,rnd_fact);
    557 
    558     temp0 = _mm_srli_epi32(temp0,u4_qbits);
    559     temp1 = _mm_srli_epi32(temp1,u4_qbits);
    560     temp2 = _mm_srli_epi32(temp2,u4_qbits);
    561     temp3 = _mm_srli_epi32(temp3,u4_qbits);
    562 
    563     temp0 =  _mm_packs_epi32 (temp0,temp1);
    564     temp2 =  _mm_packs_epi32 (temp2,temp3);
    565 
    566     temp0 =  _mm_sign_epi16(temp0, sign_reg0);
    567     temp2 =  _mm_sign_epi16(temp2, sign_reg2);
    568 
    569     //temp0 = _mm_insert_epi16(temp0, tmp_dc, 0);
    570 
    571     _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    572     _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
    573 
    574     cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    575     cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
    576 
    577     mask0 = _mm_movemask_epi8(cmp0);
    578     mask1 = _mm_movemask_epi8(cmp1);
    579     u4_zero_coeff = 0;
    580     if(mask0)
    581     {
    582         if(mask0 == 0xffff)
    583             u4_zero_coeff+=8;
    584         else
    585         {
    586             cmp0 = _mm_and_si128(temp_1, cmp0);
    587             sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
    588             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
    589             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
    590             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
    591         }
    592     }
    593     if(mask1)
    594     {
    595         if(mask1 == 0xffff)
    596             u4_zero_coeff+=8;
    597         else
    598         {
    599             cmp1 = _mm_and_si128(temp_1, cmp1);
    600             sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
    601             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
    602             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
    603             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
    604         }
    605     }
    606 
    607     /* Return total nonzero coefficients in the current sub block */
    608     u4_nonzero_coeff = 16 - u4_zero_coeff;
    609     *pu1_nnz =  u4_nonzero_coeff;
    610 
    611 }
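
/*
 * Illustrative helper -- not part of the original file; the name is
 * hypothetical and the routine is not called anywhere. It isolates the one
 * step in which the chroma routine above differs from the luma routine: the
 * source and prediction rows are assumed to hold interleaved Cb/Cr samples,
 * so eight bytes are loaded and only the low byte of each 16-bit pair is
 * kept, which selects one plane and widens its four samples to 16 bits in a
 * single AND (instead of _mm_cvtepu8_epi16()). For the other plane the caller
 * would pass pu1_row + 1.
 */
static __m128i load_one_interleaved_chroma_row(UWORD8 *pu1_row)
{
    __m128i chroma_mask = _mm_set1_epi16(0xFF);            // 0x00FF in every 16-bit lane
    __m128i row = _mm_loadl_epi64((__m128i *) (pu1_row));  // u0 v0 u1 v1 u2 v2 u3 v3
    return _mm_and_si128(row, chroma_mask);                // u0 u1 u2 u3 as 16-bit lanes
}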
    612 
    613 
    614 /**
    615  *******************************************************************************
    616  *
    617  * @brief
 *   This function performs the forward Hadamard transform and quantization on a 4x4 block of DC coefficients
    619  *
    620  * @par Description:
 *   The function accepts a 4x4 block of DC coefficients, applies the forward
 *   Hadamard transform to it and quantizes the transformed values into the
 *   destination buffer. The transformed values are halved before quantization.
    625  *
 * @param[in] pi2_src
 *   Pointer to the 4x4 block of DC coefficients to be transformed
 *
 * @param[out] pi2_dst
 *   Pointer to the output buffer for the quantized coefficients
 *
    644  * @param[in] u4_qbits
    645  *    QP_BITS_h264_4x4 + floor(QP/6)
    646  *
    647  * @param[in] pu2_threshold_matrix
    648  *   Pointer to Forward Quant Threshold Matrix
    649  *
    650  * @param[in] pu2_scale_matrix
    651  *   Pointer to Forward Quant Scale Matrix
    652  *
    653  * @param[in] u4_round_factor
    654  *   Quantization Round factor
    655  *
    656  * @param[out] pu1_nnz
    657  *   Total non-zero coefficients in the current sub-block
    658  *
    659  * @returns
    660  *
    661  * @remarks
    662  *   None
    663  *
    664  */
    665 
    666 void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
    667                           const UWORD16 *pu2_scale_matrix,
    668                           const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
    669                           UWORD32 u4_round_factor,UWORD8  *pu1_nnz
    670                           )
    671 {
    672     WORD32 u4_zero_coeff,u4_nonzero_coeff=0;
    673     __m128i cmp0, cmp1, sum0, sum1, sum2;
    674     WORD32 mask0, mask1;
    675     __m128i src_r0_r1, src_r2_r3, sign_reg;
    676     __m128i src_r0, src_r1, src_r2, src_r3;
    677     __m128i zero_8x16b = _mm_setzero_si128();
    678     __m128i temp0, temp1, temp2, temp3;
    679     __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3;
    680     __m128i temp_1 = _mm_set1_epi16(1);
    681     __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    682     __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
    683 
    684     UNUSED (pu2_threshold_matrix);
    685 
    686     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
    687     src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
    688     sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
    689     src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg);   //a0 a1 a2 a3
    690     src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg);   //b0 b1 b2 b3
    691     sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
    692     src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg);   //c0 c1 c2 c3
    693     src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg);   //d0 d1 d2 d3
    694 
    /* Perform forward transform */
    696     /*-------------------------------------------------------------*/
    697     /* Forward DC transform [ Horizontal transformation ]                          */
    698     /*-------------------------------------------------------------*/
    699     // Matrix transpose
    700     /*
    701      *  a0 a1 a2 a3
    702      *  b0 b1 b2 b3
    703      *  c0 c1 c2 c3
    704      *  d0 d1 d2 d3
    705      */
    706     temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 b0 a1 b1
    707     temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //c0 d0 c1 d1
    708     temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //a2 b2 a3 b3
    709     temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 d2 c3 d3
    710     src_r0 = _mm_unpacklo_epi64(temp0, temp2);                    //a0 b0 c0 d0
    711     src_r1 = _mm_unpackhi_epi64(temp0, temp2);                    //a1 b1 c1 d1
    712     src_r2 = _mm_unpacklo_epi64(temp1, temp3);                    //a2 b2 c2 d2
    713     src_r3 = _mm_unpackhi_epi64(temp1, temp3);                    //a3 b3 c3 d3
    714 
    715     temp0 = _mm_add_epi32(src_r0, src_r3);
    716     temp1 = _mm_add_epi32(src_r1, src_r2);
    717     temp2 = _mm_sub_epi32(src_r1, src_r2);
    718     temp3 = _mm_sub_epi32(src_r0, src_r3);
    719 
    720     src_r0 = _mm_add_epi32(temp0, temp1);
    721     src_r1 = _mm_add_epi32(temp2, temp3);
    722     src_r2 = _mm_sub_epi32(temp0, temp1);
    723     src_r3 = _mm_sub_epi32(temp3, temp2);
    724 
    725     /*-------------------------------------------------------------*/
    726     /* Forward DC transform [ Vertical transformation ]                          */
    727     /*-------------------------------------------------------------*/
    728     // Matrix transpose
    729     /*
    730      *  a0 b0 c0 d0
    731      *  a1 b1 c1 d1
    732      *  a2 b2 c2 d2
    733      *  a3 b3 c3 d3
    734      */
    735     temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 a1 b0 b1
    736     temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //a2 a3 b2 b3
    737     temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //c0 c1 d0 d1
    738     temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 c3 d2 d3
    739     src_r0 = _mm_unpacklo_epi64(temp0, temp2);                   //a0 a1 a2 a3
    740     src_r1 = _mm_unpackhi_epi64(temp0, temp2);                   //b0 b1 b2 b3
    741     src_r2 = _mm_unpacklo_epi64(temp1, temp3);                   //c0 c1 c2 c3
    742     src_r3 = _mm_unpackhi_epi64(temp1, temp3);                   //d0 d1 d2 d3
    743 
    744     temp0 = _mm_add_epi32(src_r0, src_r3);
    745     temp1 = _mm_add_epi32(src_r1, src_r2);
    746     temp2 = _mm_sub_epi32(src_r1, src_r2);
    747     temp3 = _mm_sub_epi32(src_r0, src_r3);
    748 
    749     src_r0 = _mm_add_epi32(temp0, temp1);
    750     src_r1 = _mm_add_epi32(temp2, temp3);
    751     src_r2 = _mm_sub_epi32(temp0, temp1);
    752     src_r3 = _mm_sub_epi32(temp3, temp2);
    753 
    754     src_r0 = _mm_srai_epi32(src_r0, 1);
    755     src_r1 = _mm_srai_epi32(src_r1, 1);
    756     src_r2 = _mm_srai_epi32(src_r2, 1);
    757     src_r3 = _mm_srai_epi32(src_r3, 1);
    758 
    759     // Quantization
    760     sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, src_r0);        //Find sign of each value for later restoration
    761     sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1);
    762     sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2);
    763     sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3);
    764 
    765     sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1);      //Sign = -1 or 0 depending on <0 or >0 respectively
    766     sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3);
    767 
    768     sign_reg0 = _mm_slli_epi16(sign_reg0, 1);               //Sign = -2 or 0 depending on <0 or >0 respectively
    769     sign_reg2 = _mm_slli_epi16(sign_reg2, 1);
    770 
    771     sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);            //Sign = -1 or 1 depending on <0 or >0 respectively
    772     sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);
    773 
    774     src_r0 = _mm_abs_epi32(src_r0);                         //Absolute values
    775     src_r1 = _mm_abs_epi32(src_r1);
    776     src_r2 = _mm_abs_epi32(src_r2);
    777     src_r3 = _mm_abs_epi32(src_r3);
    778 
    779     temp0 = _mm_mullo_epi32(scale_val, src_r0);             //multiply by pu2_scale_matrix[0]
    780     temp1 = _mm_mullo_epi32(scale_val, src_r1);
    781     temp2 = _mm_mullo_epi32(scale_val, src_r2);
    782     temp3 = _mm_mullo_epi32(scale_val, src_r3);
    783 
    784     temp0 = _mm_add_epi32(temp0,rnd_fact);                  //Add round factor
    785     temp1 = _mm_add_epi32(temp1,rnd_fact);
    786     temp2 = _mm_add_epi32(temp2,rnd_fact);
    787     temp3 = _mm_add_epi32(temp3,rnd_fact);
    788 
    temp0 = _mm_srli_epi32(temp0,u4_qbits);                 //Right shift by qbits; values are non-negative, so a logical shift works
    790     temp1 = _mm_srli_epi32(temp1,u4_qbits);
    791     temp2 = _mm_srli_epi32(temp2,u4_qbits);
    792     temp3 = _mm_srli_epi32(temp3,u4_qbits);
    793 
    794     temp0 =  _mm_packs_epi32 (temp0,temp1);                 //Final values are 16-bits only.
    795     temp2 =  _mm_packs_epi32 (temp2,temp3);
    796 
    797     temp0 =  _mm_sign_epi16(temp0, sign_reg0);              //Sign restoration
    798     temp2 =  _mm_sign_epi16(temp2, sign_reg2);
    799 
    800     _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
    801     _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2);
    802 
    803     cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    804     cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
    805 
    806     mask0 = _mm_movemask_epi8(cmp0);
    807     mask1 = _mm_movemask_epi8(cmp1);
    808     u4_zero_coeff = 0;
    809     if(mask0)
    810     {
    811         if(mask0 == 0xffff)
    812             u4_zero_coeff+=8;
    813         else
    814         {
    815             cmp0 = _mm_and_si128(temp_1, cmp0);
    816             sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
    817             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
    818             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
    819             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
    820         }
    821     }
    822     if(mask1)
    823     {
    824         if(mask1 == 0xffff)
    825             u4_zero_coeff+=8;
    826         else
    827         {
    828             cmp1 = _mm_and_si128(temp_1, cmp1);
    829             sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
    830             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
    831             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
    832             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
    833         }
    834     }
    835 
    836     /* Return total nonzero coefficients in the current sub block */
    837     u4_nonzero_coeff = 16 - u4_zero_coeff;
    838     pu1_nnz[0] =  u4_nonzero_coeff;
    839 }
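
/*
 * Illustrative scalar sketch -- not part of the original file. The function
 * name is hypothetical and the routine is not called anywhere; it shows, in
 * plain C, what the SSE4.2 routine above computes: the 4x4 Hadamard transform
 * of the DC block (matrix [1 1 1 1; 1 1 -1 -1; 1 -1 -1 1; 1 -1 1 -1] applied
 * to rows and columns), a halving of the transformed values, and quantization
 * with the single scale value pu2_scale_matrix[0].
 */
static void hadamard_quant_4x4_scalar_sketch(const WORD16 *pi2_src, WORD16 *pi2_dst,
                                             const UWORD16 *pu2_scale_matrix, UWORD32 u4_qbits,
                                             UWORD32 u4_round_factor, UWORD8 *pu1_nnz)
{
    WORD32 i, j, nnz = 0;
    WORD32 tmp[4][4], out[4][4];

    /* Horizontal pass */
    for(i = 0; i < 4; i++)
    {
        WORD32 s0 = pi2_src[i * 4 + 0], s1 = pi2_src[i * 4 + 1];
        WORD32 s2 = pi2_src[i * 4 + 2], s3 = pi2_src[i * 4 + 3];

        tmp[i][0] = s0 + s1 + s2 + s3;
        tmp[i][1] = s0 + s1 - s2 - s3;
        tmp[i][2] = s0 - s1 - s2 + s3;
        tmp[i][3] = s0 - s1 + s2 - s3;
    }

    /* Vertical pass, then halve as the >> 1 above does */
    for(j = 0; j < 4; j++)
    {
        WORD32 t0 = tmp[0][j], t1 = tmp[1][j], t2 = tmp[2][j], t3 = tmp[3][j];

        out[0][j] = (t0 + t1 + t2 + t3) >> 1;
        out[1][j] = (t0 + t1 - t2 - t3) >> 1;
        out[2][j] = (t0 - t1 - t2 + t3) >> 1;
        out[3][j] = (t0 - t1 + t2 - t3) >> 1;
    }

    /* Quantize every DC coefficient with the same scale value */
    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            WORD32 coef  = out[i][j];
            WORD32 level = ((coef < 0 ? -coef : coef) * pu2_scale_matrix[0]
                            + u4_round_factor) >> u4_qbits;

            if(coef < 0)
                level = -level;

            pi2_dst[i * 4 + j] = (WORD16) level;
            nnz += (level != 0);
        }
    }
    pu1_nnz[0] = (UWORD8) nnz;
}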
    840 
    841 
    842 /**
    843  *******************************************************************************
    844  *
    845  * @brief
 *   This function performs the forward Hadamard transform and quantization on
 *   the 2x2 chroma DC blocks of both the U and V planes
    848  *
    849  * @par Description:
 *   The function accepts the 2x2 DC coefficients of both chroma planes,
 *   packed one plane after the other in the source buffer. Each plane is
 *   transformed with the 2x2 Hadamard transform and quantized into the
 *   destination buffer.
    854  *
 * @param[in] pi2_src
 *   Pointer to the 2x2 DC coefficients of one chroma plane followed by those
 *   of the other plane
 *
 * @param[out] pi2_dst
 *   Pointer to the output buffer for the quantized coefficients
 *
    873  * @param[in] u4_qbits
    874  *    QP_BITS_h264_4x4 + floor(QP/6)
    875  *
    876  * @param[in] pu2_threshold_matrix
    877  *   Pointer to Forward Quant Threshold Matrix
    878  *
    879  * @param[in] pu2_scale_matrix
    880  *   Pointer to Forward Quant Scale Matrix
    881  *
    882  * @param[in] u4_round_factor
    883  *   Quantization Round factor
    884  *
 * @param[out] pu1_nnz
 *   Total non-zero coefficients, one entry per chroma plane
    887  *
    888  * @returns
    889  *
    890  * @remarks
 *   NNZ of the two chroma DC blocks is written to pu1_nnz[0] and pu1_nnz[1]
    892  *
    893  */
    894 
    895 void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
    896                             const UWORD16 *pu2_scale_matrix,
    897                             const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
    898                             UWORD32 u4_round_factor,UWORD8  *pu1_nnz)
    899 {
    900     WORD32 val, nonzero_coeff_0=0, nonzero_coeff_1=0;
    901     __m128i cmp, cmp0, cmp1;
    902     __m128i sum0, sum1;
    903     WORD32 mask, mask0, mask1;
    904     __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
    905     __m128i zero_8x16b = _mm_setzero_si128();
    906     __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
    907     __m128i sign_reg0, sign_reg1;
    908     __m128i temp_1 = _mm_set1_epi16(1);
    909     __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    910 
    911     UNUSED (pu2_threshold_matrix);
    912 
    913     src = _mm_loadu_si128((__m128i *)pi2_src);          //a0 a1 a2 a3 b0 b1 b2 b3
    914     sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
    915     plane_0 = _mm_unpacklo_epi16(src, sign_reg);        //a0 a1 a2 a3 -- 32 bits
    916     plane_1 = _mm_unpackhi_epi16(src, sign_reg);        //b0 b1 b2 b3 -- 32 bits
    917 
    918     temp0 = _mm_hadd_epi32(plane_0, plane_1);           //a0+a1 a2+a3 b0+b1 b2+b3
    919     temp1 = _mm_hsub_epi32(plane_0, plane_1);           //a0-a1 a2-a3 b0-b1 b2-b3
    920 
    921     plane_0 = _mm_hadd_epi32(temp0, temp1);             //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
    922     plane_1 = _mm_hsub_epi32(temp0, temp1);             //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3
    923 
    924     temp0 = _mm_unpacklo_epi32(plane_0, plane_1);       //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
    925     temp1 = _mm_unpackhi_epi32(plane_0, plane_1);       //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3
    926 
    927     plane_0 = _mm_unpacklo_epi64(temp0, temp1);         //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
    928     plane_1 = _mm_unpackhi_epi64(temp0, temp1);         //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3
    929 
    930     plane_0 = _mm_shuffle_epi32(plane_0, 0xd8);         //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
    931     plane_1 = _mm_shuffle_epi32(plane_1, 0xd8);         //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3
    932     // Quantization
    933     sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, plane_0);       //Find sign of each value for later restoration
    934     sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1);
    935 
    936     sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1);      //Sign = -1 or 0 depending on <0 or >0 respectively
    937     sign_reg0 = _mm_slli_epi16(sign_reg0, 1);               //Sign = -2 or 0 depending on <0 or >0 respectively
    938     sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);            //Sign = -1 or 1 depending on <0 or >0 respectively
    939 
    940     plane_0 = _mm_abs_epi32(plane_0);                           //Absolute values
    941     plane_1 = _mm_abs_epi32(plane_1);
    942 
    943     temp0 = _mm_mullo_epi32(scale_val, plane_0);                //multiply by pu2_scale_matrix[0]
    944     temp1 = _mm_mullo_epi32(scale_val, plane_1);                //multiply by pu2_scale_matrix[0]
    945 
    946     temp0 = _mm_add_epi32(temp0,rnd_fact);                  //Add round factor
    947     temp1 = _mm_add_epi32(temp1,rnd_fact);
    948 
    temp0 = _mm_srli_epi32(temp0,u4_qbits);                 //Right shift by qbits; values are non-negative, so a logical shift works
    950     temp1 = _mm_srli_epi32(temp1,u4_qbits);
    951 
    952     temp0 =  _mm_packs_epi32 (temp0,temp1);                 //Final values are 16-bits only.
    953     temp0 =  _mm_sign_epi16(temp0, sign_reg0);              //Sign restoration
    954 
    955     _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
    956 
    957     cmp = _mm_cmpeq_epi16(temp0, zero_8x16b);
    958     mask = _mm_movemask_epi8(cmp);
    959     mask0 = mask & 0xff;
    960     mask1 = mask>>8;
    961     if(mask0)
    962     {
    963         if(mask0 == 0xff)
    964             nonzero_coeff_0 += 4;
    965         else
    966         {
    967             cmp0 = _mm_and_si128(temp_1, cmp);
    968             sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
    969             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
    970             val = _mm_cvtsi128_si32(sum1);
    971             val = val & 0xffff;
    972             nonzero_coeff_0 += val;
    973         }
    974     }
    975     if(mask1)
    976     {
    977         if(mask1 == 0xff)
    978             nonzero_coeff_1 += 4;
    979         else
    980         {
    981             cmp1 = _mm_srli_si128(cmp, 8);
    982             cmp1 = _mm_and_si128(temp_1, cmp1);
    983             sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
    984             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
    985             nonzero_coeff_1 += _mm_cvtsi128_si32(sum1);
    986         }
    987     }
    988 
    989     pu1_nnz[0] = 4 - nonzero_coeff_0;
    990     pu1_nnz[1] = 4 - nonzero_coeff_1;
    991 
    992 }
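
/*
 * Illustrative scalar sketch -- not part of the original file. The function
 * name is hypothetical and the routine is not called anywhere; it shows, in
 * plain C, what the SSE4.2 routine above computes for each of the two chroma
 * planes packed in pi2_src: a 2x2 Hadamard transform followed by quantization
 * with the single scale value pu2_scale_matrix[0], with one NNZ value written
 * per plane.
 */
static void hadamard_quant_2x2_uv_scalar_sketch(const WORD16 *pi2_src, WORD16 *pi2_dst,
                                                const UWORD16 *pu2_scale_matrix, UWORD32 u4_qbits,
                                                UWORD32 u4_round_factor, UWORD8 *pu1_nnz)
{
    WORD32 plane, i;

    for(plane = 0; plane < 2; plane++)
    {
        const WORD16 *src = pi2_src + 4 * plane;    // 4 DC values of this plane
        WORD16 *dst = pi2_dst + 4 * plane;
        WORD32 trans[4];
        WORD32 nnz = 0;

        /* 2x2 Hadamard transform of [src0 src1; src2 src3], raster-order output */
        trans[0] = src[0] + src[1] + src[2] + src[3];
        trans[1] = src[0] - src[1] + src[2] - src[3];
        trans[2] = src[0] + src[1] - src[2] - src[3];
        trans[3] = src[0] - src[1] - src[2] + src[3];

        for(i = 0; i < 4; i++)
        {
            WORD32 coef  = trans[i];
            WORD32 level = ((coef < 0 ? -coef : coef) * pu2_scale_matrix[0]
                            + u4_round_factor) >> u4_qbits;

            if(coef < 0)
                level = -level;

            dst[i] = (WORD16) level;
            nnz += (level != 0);
        }
        pu1_nnz[plane] = (UWORD8) nnz;
    }
}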
    993