Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /**
     21  *******************************************************************************
     22  * @file
     23  *  ih264_iquant_itrans_recon_dc_ssse3.c
     24  *
     25  * @brief
     26  *  Contains function definitions for inverse  quantization, inverse
     27  * transform and reconstruction
     28  *
     29  * @author
     30  *  Mohit [100664]
     31  *
     32  * @par List of Functions:
     33  *  - ih264_iquant_itrans_recon_4x4_dc_ssse3()
     34  *  - ih264_iquant_itrans_recon_8x8_dc_ssse3()
         *  - ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3()
     35  *
     36  * @remarks
     37  *  None
     38  *
     39  *******************************************************************************
     40  */
/* System include files */
#include <string.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include <immintrin.h>
     52 
     53 /*
     54  ********************************************************************************
     55  *
     56  * @brief This function reconstructs a 4x4 sub block from quantized resiude and
     57  * prediction buffer for dc input pattern only, i.e. only the (0,0) element of the input
     58  * 4x4 block is non-zero. For complete function, refer ih264_iquant_itrans_recon_ssse3.c
     59  *
     60  * @par Description:
     61  *  The quantized residue is first inverse quantized, then inverse transformed.
     62  *  This inverse transformed content is added to the prediction buffer to recon-
     63  *  struct the end output
     64  *
     65  * @param[in] pi2_src
     66  *  quantized 4x4 block
     67  *
     68  * @param[in] pu1_pred
     69  *  prediction 4x4 block
     70  *
     71  * @param[out] pu1_out
     72  *  reconstructed 4x4 block
     73  *
     74  * @param[in] src_strd
     75  *  quantization buffer stride
     76  *
     77  * @param[in] pred_strd,
     78  *  Prediction buffer stride
     79  *
     80  * @param[in] out_strd
     81  *  recon buffer Stride
     82  *
     83  * @param[in] pu2_scaling_list
     84  *  pointer to scaling list
     85  *
     86  * @param[in] pu2_norm_adjust
     87  *  pointer to inverse scale matrix
     88  *
     89  * @param[in] u4_qp_div_6
     90  *  Floor (qp/6)
     91  *
     92  * @param[in] pi4_tmp
     93  * temporary buffer of size 1*16
     94  *
     95  * @returns none
     96  *
     97  * @remarks none
     98  *
     99  *******************************************************************************
    100  */
    101 void ih264_iquant_itrans_recon_4x4_dc_ssse3(WORD16 *pi2_src,
    102                                    UWORD8 *pu1_pred,
    103                                    UWORD8 *pu1_out,
    104                                    WORD32 pred_strd,
    105                                    WORD32 out_strd,
    106                                    const UWORD16 *pu2_iscal_mat,
    107                                    const UWORD16 *pu2_weigh_mat,
    108                                    UWORD32 u4_qp_div_6,
    109                                    WORD16 *pi2_tmp,
    110                                    WORD32 iq_start_idx,
    111                                    WORD16 *pi2_dc_ld_addr)
    112 {
    113     UWORD32 *pu4_out = (UWORD32 *)pu1_out;
    114     WORD32 q0 = pi2_src[0];
    115     WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    116 
    117     __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3;
    118     __m128i sign_reg;
    119     __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
    120     __m128i temp4, temp5, temp6, temp7;
    121     __m128i value_add;
    122 
    123     UNUSED (pi2_tmp);
    124 
    125     INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
    126 
    127     if (iq_start_idx != 0 )
    128         q0 = pi2_dc_ld_addr[0];     // Restoring dc value for intra case
    129 
    130     i_macro = ((q0 + 32) >> 6);
    131 
    132     value_add = _mm_set1_epi16(i_macro);
    133 
    134     zero_8x16b = _mm_setzero_si128();                  // all bits reset to zero
    135     //Load pred buffer
    136     predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    137     pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p00 p01 p02 p03 0 0 0 0 -- all 16 bits
    138     predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    139     pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p10 p11 p12 p13 0 0 0 0 -- all 16 bits
    140     predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[2*pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    141     pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p20 p21 p22 p23 0 0 0 0 -- all 16 bits
    142     predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[3*pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
    143     pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p30 p31 p32 p33 0 0 0 0 -- all 16 bits
    144 
    145     pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); //p00 p01 p02 p03 p10 p11 p12 p13
    146     pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); //p20 p21 p22p p23 p30 p31 p32 p33
    147 
    148     temp4 = _mm_add_epi16(value_add, pred_r0);
    149     temp5 = _mm_add_epi16(value_add, pred_r2);
    150     /*------------------------------------------------------------------*/
    151     //Clipping the results to 8 bits
    152     sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);                 // sign check
    153     temp4 = _mm_and_si128(temp4, sign_reg);
    154     sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);                 // sign check
    155     temp5 = _mm_and_si128(temp5, sign_reg);
    156 
    157     temp4 = _mm_packus_epi16(temp4,temp5);
    158     temp5 = _mm_srli_si128(temp4,4);
    159     temp6 = _mm_srli_si128(temp5,4);
    160     temp7 = _mm_srli_si128(temp6,4);
    161 
    162     *pu4_out = _mm_cvtsi128_si32(temp4);
    163     pu1_out += out_strd;
    164     pu4_out = (UWORD32 *)(pu1_out);
    165     *(pu4_out) = _mm_cvtsi128_si32(temp5);
    166     pu1_out += out_strd;
    167     pu4_out = (UWORD32 *)(pu1_out);
    168     *(pu4_out) = _mm_cvtsi128_si32(temp6);
    169     pu1_out += out_strd;
    170     pu4_out = (UWORD32 *)(pu1_out);
    171     *(pu4_out) = _mm_cvtsi128_si32(temp7);
    172 }
    173 /**
    174  *******************************************************************************
    175  *
    176  * @brief
    177  *  This function performs inverse quant and Inverse transform type Ci4 for 8x8 block
    178  *  for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is
    179  *  non-zero. For complete function, refer ih264_iquant_itrans_recon_ssse3.c
    180  *
    181  * @par Description:
    182  *  Performs inverse transform Ci8 and adds the residue to get the
    183  *  reconstructed block
    184  *
    185  * @param[in] pi2_src
    186  *  Input 8x8coefficients
    187  *
    188  * @param[in] pu1_pred
    189  *  Prediction 8x8 block
    190  *
    191  * @param[out] pu1_recon
    192  *  Output 8x8 block
    193  *
    194  * @param[in] q_div
    195  *  QP/6
    196  *
    197  * @param[in] q_rem
    198  *  QP%6
    199  *
    200  * @param[in] q_lev
    201  *  Quantizer level
    202  *
    203  * @param[in] u4_src_stride
    204  *  Input stride
    205  *
    206  * @param[in] u4_pred_stride,
    207  *  Prediction stride
    208  *
    209  * @param[in] u4_out_stride
    210  *  Output Stride
    211  *
    212  * @param[in] pi4_tmp
    213  *  temporary buffer of size 1*64
    214  *  the tmp for each block
    215  *
    216  * @param[in] pu4_iquant_mat
    217  *  Pointer to the inverse quantization matrix
    218  *
    219  * @returns  Void
    220  *
    221  * @remarks
    222  *  None
    223  *
    224  *******************************************************************************
    225  */
    226 
    227 void ih264_iquant_itrans_recon_8x8_dc_ssse3 (WORD16 *pi2_src,
    228                                          UWORD8 *pu1_pred,
    229                                          UWORD8 *pu1_out,
    230                                          WORD32 pred_strd,
    231                                          WORD32 out_strd,
    232                                          const UWORD16 *pu2_iscale_mat,
    233                                          const UWORD16 *pu2_weigh_mat,
    234                                          UWORD32 qp_div,
    235                                          WORD16 *pi2_tmp,
    236                                          WORD32 iq_start_idx,
    237                                          WORD16 *pi2_dc_ld_addr)
    238 {
    239     WORD32 q0 = pi2_src[0];
    240     WORD16 i_macro, rnd_fact = (qp_div < 6) ? 1 << (5 - qp_div) : 0;
    241 
    242     __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3,pred_r4,pred_r5,pred_r6,pred_r7;
    243     __m128i sign_reg;
    244     __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
    245     __m128i temp1,temp2,temp3,temp4, temp5, temp6, temp7,temp8;
    246     __m128i value_add;
    247 
    248     UNUSED (pi2_tmp);
    249     UNUSED (iq_start_idx);
    250     UNUSED (pi2_dc_ld_addr);
    251 
    252     INV_QUANT(q0, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
    253     i_macro = ((q0 + 32) >> 6);
    254 
    255     value_add = _mm_set1_epi16(i_macro);
    256 
    257     //Load pred buffer row 0
    258     predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[0])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    259     pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    260     //Load pred buffer row 1
    261     predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    262     pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    263     //Load pred buffer row 2
    264     predload_r = _mm_loadl_epi64(
    265                     (__m128i *)(&pu1_pred[2 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    266     pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    267     //Load pred buffer row 3
    268     predload_r = _mm_loadl_epi64(
    269                     (__m128i *)(&pu1_pred[3 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    270     pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    271     //Load pred buffer row 4
    272     predload_r = _mm_loadl_epi64(
    273                     (__m128i *)(&pu1_pred[4 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    274     pred_r4 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    275     //Load pred buffer row 5
    276     predload_r = _mm_loadl_epi64(
    277                     (__m128i *)(&pu1_pred[5 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bit
    278     pred_r5 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    279     //Load pred buffer row 6
    280     predload_r = _mm_loadl_epi64(
    281                     (__m128i *)(&pu1_pred[6 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    282     pred_r6 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    283     //Load pred buffer row 7
    284     predload_r = _mm_loadl_epi64(
    285                     (__m128i *)(&pu1_pred[7 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    286     pred_r7 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    287 
    288     temp1 = _mm_add_epi16(value_add, pred_r0);
    289 
    290     temp2 = _mm_add_epi16(value_add, pred_r1);
    291 
    292     temp3 = _mm_add_epi16(value_add, pred_r2);
    293 
    294     temp4 = _mm_add_epi16(value_add, pred_r3);
    295 
    296     temp5 = _mm_add_epi16(value_add, pred_r4);
    297 
    298     temp6 = _mm_add_epi16(value_add, pred_r5);
    299 
    300     temp7 = _mm_add_epi16(value_add, pred_r6);
    301 
    302     temp8 = _mm_add_epi16(value_add, pred_r7);
    303     /*------------------------------------------------------------------*/
    304     //Clipping the results to 8 bits
    305     sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); // sign check
    306     temp1 = _mm_and_si128(temp1, sign_reg);
    307     sign_reg = _mm_cmpgt_epi16(temp2, zero_8x16b); // sign check
    308     temp2 = _mm_and_si128(temp2, sign_reg);
    309     sign_reg = _mm_cmpgt_epi16(temp3, zero_8x16b); // sign check
    310     temp3 = _mm_and_si128(temp3, sign_reg);
    311     sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check
    312     temp4 = _mm_and_si128(temp4, sign_reg);
    313     sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check
    314     temp5 = _mm_and_si128(temp5, sign_reg);
    315     sign_reg = _mm_cmpgt_epi16(temp6, zero_8x16b); // sign check
    316     temp6 = _mm_and_si128(temp6, sign_reg);
    317     sign_reg = _mm_cmpgt_epi16(temp7, zero_8x16b); // sign check
    318     temp7 = _mm_and_si128(temp7, sign_reg);
    319     sign_reg = _mm_cmpgt_epi16(temp8, zero_8x16b); // sign check
    320     temp8 = _mm_and_si128(temp8, sign_reg);
    321 
    322     temp1 = _mm_packus_epi16(temp1, zero_8x16b);
    323     temp2 = _mm_packus_epi16(temp2, zero_8x16b);
    324     temp3 = _mm_packus_epi16(temp3, zero_8x16b);
    325     temp4 = _mm_packus_epi16(temp4, zero_8x16b);
    326     temp5 = _mm_packus_epi16(temp5, zero_8x16b);
    327     temp6 = _mm_packus_epi16(temp6, zero_8x16b);
    328     temp7 = _mm_packus_epi16(temp7, zero_8x16b);
    329     temp8 = _mm_packus_epi16(temp8, zero_8x16b);
    330 
    331     _mm_storel_epi64((__m128i *)(&pu1_out[0]), temp1);
    332     _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), temp2);
    333     _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), temp3);
    334     _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), temp4);
    335     _mm_storel_epi64((__m128i *)(&pu1_out[4 * out_strd]), temp5);
    336     _mm_storel_epi64((__m128i *)(&pu1_out[5 * out_strd]), temp6);
    337     _mm_storel_epi64((__m128i *)(&pu1_out[6 * out_strd]), temp7);
    338     _mm_storel_epi64((__m128i *)(&pu1_out[7 * out_strd]), temp8);
    339 }
    340 
    341 /*
    342  ********************************************************************************
    343  *
    344  * @brief This function reconstructs a 4x4 sub block from quantized chroma resiude and
    345  * prediction buffer
    346  *
    347  * @par Description:
    348  *  The quantized residue is first inverse quantized, then inverse transformed.
    349  *  This inverse transformed content is added to the prediction buffer to recon-
    350  *  struct the end output
    351  *
    352  * @param[in] pi2_src
    353  *  quantized 4x4 block
    354  *
    355  * @param[in] pu1_pred
    356  *  prediction 4x4 block
    357  *
    358  * @param[out] pu1_out
    359  *  reconstructed 4x4 block
    360  *
    361  * @param[in] src_strd
    362  *  quantization buffer stride
    363  *
    364  * @param[in] pred_strd,
    365  *  Prediction buffer stride
    366  *
    367  * @param[in] out_strd
    368  *  recon buffer Stride
    369  *
    370  * @param[in] pu2_scaling_list
    371  *  pointer to scaling list
    372  *
    373  * @param[in] pu2_norm_adjust
    374  *  pointer to inverse scale matrix
    375  *
    376  * @param[in] u4_qp_div_6
    377  *  Floor (qp/6)
    378  *
    379  * @param[in] pi4_tmp
    380  * temporary buffer of size 1*16
    381  *
    382  * @returns none
    383  *
    384  * @remarks none
    385  *
    386  *******************************************************************************
    387  */
    388 void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src,
    389                                    UWORD8 *pu1_pred,
    390                                    UWORD8 *pu1_out,
    391                                    WORD32 pred_strd,
    392                                    WORD32 out_strd,
    393                                    const UWORD16 *pu2_iscal_mat,
    394                                    const UWORD16 *pu2_weigh_mat,
    395                                    UWORD32 u4_qp_div_6,
    396                                    WORD16 *pi2_tmp,
    397                                    WORD16 *pi2_dc_src)
    398  {
    399     WORD16 q0 = pi2_dc_src[0];      // DC value won't be dequantized for chroma inverse transform
    400     WORD16 i_macro = ((q0 + 32) >> 6);
    401 
    402     __m128i pred_r0, pred_r1, pred_r2, pred_r3, sign_reg;
    403     __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
    404     __m128i chroma_mask = _mm_set1_epi16 (0xFF);
    405     __m128i value_add = _mm_set1_epi16(i_macro);
    406     __m128i out_r0, out_r1, out_r2, out_r3;
    407 
    408     UNUSED (pi2_src);
    409     UNUSED (pu2_iscal_mat);
    410     UNUSED (pu2_weigh_mat);
    411     UNUSED (u4_qp_div_6);
    412     UNUSED (pi2_tmp);
    413 
    414     //Load pred buffer
    415     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    416     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    417     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    418     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
    419 
    420     pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
    421     pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
    422     pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
    423     pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
    424 
    425     pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); //p00 p01 p02 p03 p10 p11 p12 p13
    426     pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); //p20 p21 p22p p23 p30 p31 p32 p33
    427 
    428     pred_r0 = _mm_add_epi16(value_add, pred_r0);
    429     pred_r2 = _mm_add_epi16(value_add, pred_r2);
    430 
    431     /*------------------------------------------------------------------*/
    432     //Clipping the results to 8 bits
    433     sign_reg = _mm_cmpgt_epi16(pred_r0, zero_8x16b);        // sign check
    434     pred_r0 = _mm_and_si128(pred_r0, sign_reg);
    435     sign_reg = _mm_cmpgt_epi16(pred_r2, zero_8x16b);
    436     pred_r2 = _mm_and_si128(pred_r2, sign_reg);
    437 
    438     pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
    439     pred_r1 = _mm_srli_si128(pred_r0, 4);
    440     pred_r2 = _mm_srli_si128(pred_r1, 4);
    441     pred_r3 = _mm_srli_si128(pred_r2, 4);
    442 
    443     pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b); //p00 p01 p02 p03 -- all 16 bits
    444     pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b); //p10 p11 p12 p13 -- all 16 bits
    445     pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b); //p20 p21 p22 p23 -- all 16 bits
    446     pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b); //p30 p31 p32 p33 -- all 16 bits
    447 
    448     chroma_mask = _mm_set1_epi16 (0xFF00);
    449     out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
    450     out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[out_strd]));
    451     out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * out_strd]));
    452     out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * out_strd]));
    453 
    454     out_r0 = _mm_and_si128(out_r0, chroma_mask);
    455     out_r1 = _mm_and_si128(out_r1, chroma_mask);
    456     out_r2 = _mm_and_si128(out_r2, chroma_mask);
    457     out_r3 = _mm_and_si128(out_r3, chroma_mask);
    458 
    459     out_r0 = _mm_add_epi8(out_r0, pred_r0);
    460     out_r1 = _mm_add_epi8(out_r1, pred_r1);
    461     out_r2 = _mm_add_epi8(out_r2, pred_r2);
    462     out_r3 = _mm_add_epi8(out_r3, pred_r3);
    463 
    464     _mm_storel_epi64((__m128i *)(&pu1_out[0]), out_r0);
    465     _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), out_r1);
    466     _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), out_r2);
    467     _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), out_r3);
    468 }
    469 
    470 
    471