Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /**
     21  *******************************************************************************
     22  * @file
     23  *  ih264_luma_intra_pred_filters_ssse3.c
     24  *
     25  * @brief
     26  *  Contains function definitions for luma intra prediction filters in x86
     27  *  intrinsics
     28  *
     29  * @author
     30  *  Ittiam
     31  *
     32  * @par List of Functions:
     33  *  - ih264_intra_pred_luma_4x4_mode_vert_ssse3
     34  *  - ih264_intra_pred_luma_4x4_mode_horz_ssse3
     35  *  - ih264_intra_pred_luma_4x4_mode_dc_ssse3
     36  *  - ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3
     37  *  - ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3
     38  *  - ih264_intra_pred_luma_4x4_mode_vert_r_ssse3
     39  *  - ih264_intra_pred_luma_4x4_mode_horz_d_ssse3
     40  *  - ih264_intra_pred_luma_4x4_mode_vert_l_ssse3
     41  *  - ih264_intra_pred_luma_4x4_mode_horz_u_ssse3
     42  *  - ih264_intra_pred_luma_8x8_mode_vert_ssse3
     43  *  - ih264_intra_pred_luma_8x8_mode_horz_ssse3
     44  *  - ih264_intra_pred_luma_8x8_mode_dc_ssse3
     45  *  - ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3
     46  *  - ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3
     47  *  - ih264_intra_pred_luma_8x8_mode_vert_r_ssse3
     48  *  - ih264_intra_pred_luma_8x8_mode_horz_d_ssse3
     49  *  - ih264_intra_pred_luma_8x8_mode_vert_l_ssse3
     50  *  - ih264_intra_pred_luma_8x8_mode_horz_u_ssse3
     51  *  - ih264_intra_pred_luma_16x16_mode_vert_ssse3
     52  *  - ih264_intra_pred_luma_16x16_mode_horz_ssse3
     53  *  - ih264_intra_pred_luma_16x16_mode_dc_ssse3
     54  *  - ih264_intra_pred_luma_16x16_mode_plane_ssse3
     55  *
     56  * @remarks
     57  *  None
     58  *
     59  ******************************************************************************
     60  */
     61 
     62 /*****************************************************************************/
     63 /* File Includes                                                             */
     64 /*****************************************************************************/
     65 /* System include files */
     66 #include <stdio.h>
     67 #include <stddef.h>
     68 #include <string.h>
     69 #include <immintrin.h>
     70 
     71 /* User include files */
     72 #include "ih264_defs.h"
     73 #include "ih264_typedefs.h"
     74 #include "ih264_macros.h"
     75 #include "ih264_platform_macros.h"
     76 #include "ih264_intra_pred_filters.h"
     77 
     78 
     79 
     80 /*******************    LUMA INTRAPREDICTION    *******************/
     81 
     82 /*******************    4x4 Modes    *******************/
     83 
     84 /**
     85  *******************************************************************************
     86  *
     87  * ih264_intra_pred_luma_4x4_mode_vert_ssse3
     88  *
     89  * @brief
     90  *  Perform Intra prediction for luma_4x4 mode:vertical
     91  *
     92  * @par Description:
     93  *  Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1
     94  *
     95  * @param[in] pu1_src
     96  *  UWORD8 pointer to the source
     97  *
     98  * @param[out] pu1_dst
     99  *  UWORD8 pointer to the destination
    100  *
    101  * @param[in] src_strd
    102  *  integer source stride
    103  *
    104  * @param[in] dst_strd
    105  *  integer destination stride
    106  *
    107  * @param[in] ngbr_avail
    108  * availability of neighbouring pixels(Not used in this function)
    109  *
    110  * @returns
    111  *
    112  * @remarks
    113  *  None
    114  *
    115  *******************************************************************************
    116  */
    117 void ih264_intra_pred_luma_4x4_mode_vert_ssse3(UWORD8 *pu1_src,
    118                                                UWORD8 *pu1_dst,
    119                                                WORD32 src_strd,
    120                                                WORD32 dst_strd,
    121                                                WORD32 ngbr_avail)
    122 {
    123     UWORD8 *pu1_top;
    124     WORD32 dst_strd2, dst_strd3;
    125     WORD32 i4_top;
    126 
    127     UNUSED(src_strd);
    128     UNUSED(ngbr_avail);
    129 
    130     pu1_top = pu1_src + BLK_SIZE + 1;
    131 
    132     i4_top = *((WORD32 *)pu1_top);
    133 
    134     dst_strd2 = dst_strd << 1;
    135     dst_strd3 = dst_strd + dst_strd2;
    136 
    137     *((WORD32 *)(pu1_dst)) = i4_top;
    138     *((WORD32 *)(pu1_dst + dst_strd)) = i4_top;
    139     *((WORD32 *)(pu1_dst + dst_strd2)) = i4_top;
    140     *((WORD32 *)(pu1_dst + dst_strd3)) = i4_top;
    141 }
    142 
    143 /**
    144  *******************************************************************************
    145  *
    146  *ih264_intra_pred_luma_4x4_mode_horz_ssse3
    147  *
    148  * @brief
    149  *  Perform Intra prediction for luma_4x4 mode:horizontal
    150  *
    151  * @par Description:
    152  *  Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2
    153  *
    154  * @param[in] pu1_src
    155  *  UWORD8 pointer to the source
    156  *
    157  * @param[out] pu1_dst
    158  *  UWORD8 pointer to the destination
    159  *
    160  * @param[in] src_strd
    161  *  integer source stride
    162  *
    163  * @param[in] dst_strd
    164  *  integer destination stride
    165  *
    166  * @param[in] ngbr_avail
    167  * availability of neighbouring pixels(Not used in this function)
    168  *
    169  * @returns
    170  *
    171  * @remarks
    172  *  None
    173  *
    174  *******************************************************************************
    175  */
    176 void ih264_intra_pred_luma_4x4_mode_horz_ssse3(UWORD8 *pu1_src,
    177                                                UWORD8 *pu1_dst,
    178                                                WORD32 src_strd,
    179                                                WORD32 dst_strd,
    180                                                WORD32 ngbr_avail)
    181 {
    182     UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
    183     WORD32 row1,row2,row3,row4;
    184     UWORD8 val;
    185     WORD32 dst_strd2, dst_strd3;
    186 
    187     UNUSED(src_strd);
    188     UNUSED(ngbr_avail);
    189     pu1_left = pu1_src + BLK_SIZE - 1;
    190 
    191     val  = *pu1_left;
    192     row1 = val + (val << 8) + (val << 16) + (val << 24);
    193     val  = *(pu1_left - 1);
    194     row2 = val + (val << 8) + (val << 16) + (val << 24);
    195     val  = *(pu1_left - 2);
    196     row3 = val + (val << 8) + (val << 16) + (val << 24);
    197     val  = *(pu1_left - 3);
    198     row4 = val + (val << 8) + (val << 16) + (val << 24);
    199 
    200     dst_strd2 = dst_strd << 1;
    201     dst_strd3 = dst_strd + dst_strd2;
    202 
    203     *((WORD32 *)(pu1_dst)) = row1;
    204     *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    205     *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    206     *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
    207 }
    208 
    209 /**
    210  *******************************************************************************
    211  *
    212  * ih264_intra_pred_luma_4x4_mode_dc_ssse3
    213  *
    214  * @brief
    215  *  Perform Intra prediction for luma_4x4 mode:DC
    216  *
    217  * @par Description:
    218  *  Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3
    219  *
    220  * @param[in] pu1_src
    221  *  UWORD8 pointer to the source
    222  *
    223  * @param[out] pu1_dst
    224  *  UWORD8 pointer to the destination
    225  *
    226  * @param[in] src_strd
    227  *  integer source stride
    228  *
    229  * @param[in] dst_strd
    230  *  integer destination stride
    231  *
    232  * @param[in] ngbr_avail
    233  *  availability of neighbouring pixels
    234  *
    235  * @returns
    236  *
    237  * @remarks
    238  *  None
    239  *
    240  *******************************************************************************/
    241 void ih264_intra_pred_luma_4x4_mode_dc_ssse3(UWORD8 *pu1_src,
    242                                              UWORD8 *pu1_dst,
    243                                              WORD32 src_strd,
    244                                              WORD32 dst_strd,
    245                                              WORD32 ngbr_avail)
    246 {
    247     UWORD8 u1_useleft; /* availability of left predictors (only for DC) */
    248     UWORD8 u1_usetop; /* availability of top predictors (only for DC) */
    249     UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
    250     UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
    251     WORD32 dst_strd2, dst_strd3;
    252     WORD32 val = 0;
    253     UNUSED(src_strd);
    254     UNUSED(ngbr_avail);
    255     u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
    256     u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
    257     pu1_top = pu1_src + BLK_SIZE + 1;
    258     pu1_left = pu1_src + BLK_SIZE - 1;
    259 
    260     if(u1_useleft)
    261     {
    262         val += *pu1_left--;
    263         val += *pu1_left--;
    264         val += *pu1_left--;
    265         val += *pu1_left + 2;
    266     }
    267     if(u1_usetop)
    268     {
    269         val += *pu1_top + *(pu1_top + 1) + *(pu1_top + 2) + *(pu1_top + 3)
    270                         + 2;
    271     }
    272     /* Since 2 is added if either left/top pred is there,
    273      val still being zero implies both preds are not there */
    274     val = (val) ? (val >> (1 + u1_useleft + u1_usetop)) : 128;
    275 
    276     val = val + (val << 8) + (val << 16) + (val << 24);
    277 
    278     dst_strd2 = dst_strd << 1;
    279     dst_strd3 = dst_strd + dst_strd2;
    280 
    281     *((WORD32 *)(pu1_dst)) = val;
    282     *((WORD32 *)(pu1_dst + dst_strd)) = val;
    283     *((WORD32 *)(pu1_dst + dst_strd2)) = val;
    284     *((WORD32 *)(pu1_dst + dst_strd3)) = val;
    285 }
    286 
    287 /**
    288  *******************************************************************************
    289  *
    290  * ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3
    291  *
    292  * @brief
    293  *  Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left
    294  *
    295  * @par Description:
    296  *  Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4
    297  *
    298  * @param[in] pu1_src
    299  *  UWORD8 pointer to the source
    300  *
    301  * @param[out] pu1_dst
    302  *  UWORD8 pointer to the destination
    303  *
    304  * @param[in] src_strd
    305  *  integer source stride
    306  *
    307  * @param[in] dst_strd
    308  *  integer destination stride
    309  *
    310  * @param[in] ngbr_avail
    311  * availability of neighbouring pixels(Not used in this function)
    312  *
    313  * @returns
    314  *
    315  * @remarks
    316  *  None
    317  *
    318  *******************************************************************************/
    319 void ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3(UWORD8 *pu1_src,
    320                                                   UWORD8 *pu1_dst,
    321                                                   WORD32 src_strd,
    322                                                   WORD32 dst_strd,
    323                                                   WORD32 ngbr_avail)
    324 {
    325     UWORD8 *pu1_top;
    326     WORD32 dst_strd2, dst_strd3;
    327 
    328     __m128i top_16x8b, top_8x16b, top_sh_8x16b;
    329     __m128i res1_8x16b, res2_8x16b, res_16x8b;
    330     __m128i zero_vector, const_2_8x16b;
    331     WORD32 row1,row2,row3,row4;
    332 
    333     UNUSED(src_strd);
    334     UNUSED(ngbr_avail);
    335 
    336     pu1_top = pu1_src + BLK_SIZE + 1;
    337 
    338     top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
    339     zero_vector = _mm_setzero_si128();
    340     top_8x16b = _mm_unpacklo_epi8(top_16x8b, zero_vector);    //t0 t1 t2 t3 t4 t5 t6 t7
    341 
    342     top_sh_8x16b = _mm_srli_si128(top_8x16b, 2);              //t1 t2 t3 t4 t5 t6 t7 0
    343     const_2_8x16b = _mm_set1_epi16(2);
    344 
    345     top_sh_8x16b = _mm_shufflehi_epi16(top_sh_8x16b, 0xa4);   //t1 t2 t3 t4 t5 t6 t7 t7
    346     res1_8x16b = _mm_add_epi16(top_8x16b, top_sh_8x16b);
    347     res2_8x16b = _mm_srli_si128(res1_8x16b, 2);
    348 
    349     res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b);
    350     res1_8x16b = _mm_add_epi16(res2_8x16b, res1_8x16b);
    351     res1_8x16b = _mm_srai_epi16(res1_8x16b, 2);
    352 
    353     dst_strd2 = dst_strd << 1;
    354     dst_strd3 = dst_strd + dst_strd2;
    355 
    356     res_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
    357     row1 = _mm_cvtsi128_si32(res_16x8b);
    358     res_16x8b = _mm_srli_si128(res_16x8b, 1);
    359     row2 = _mm_cvtsi128_si32(res_16x8b);
    360     res_16x8b = _mm_srli_si128(res_16x8b, 1);
    361     row3 = _mm_cvtsi128_si32(res_16x8b);
    362     res_16x8b = _mm_srli_si128(res_16x8b, 1);
    363     row4 = _mm_cvtsi128_si32(res_16x8b);
    364 
    365     *((WORD32 *)(pu1_dst)) = row1;
    366     *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    367     *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    368     *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
    369 }
    370 
    371 /**
    372  *******************************************************************************
    373  *
    374  * ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3
    375  *
    376  * @brief
    377  *  Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right
    378  *
    379  * @par Description:
    380  *  Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5
    381  *
    382  * @param[in] pu1_src
    383  *  UWORD8 pointer to the source
    384  *
    385  * @param[out] pu1_dst
    386  *  UWORD8 pointer to the destination
    387  *
    388  * @param[in] src_strd
    389  *  integer source stride
    390  *
    391  * @param[in] dst_strd
    392  *  integer destination stride
    393  *
    394  * @param[in] ngbr_avail
    395  * availability of neighbouring pixels(Not used in this function)
    396  *
    397  * @returns
    398  *
    399  * @remarks
    400  *  None
    401  *
    402  *******************************************************************************/
    403 void ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3(UWORD8 *pu1_src,
    404                                                   UWORD8 *pu1_dst,
    405                                                   WORD32 src_strd,
    406                                                   WORD32 dst_strd,
    407                                                   WORD32 ngbr_avail)
    408 {
    409     UWORD8 *pu1_left;
    410     WORD32 dst_strd2, dst_strd3;
    411 
    412     __m128i top_left_16x8b, top_left_8x16b;
    413     __m128i top_left_sh_16x8b, top_left_sh_8x16b;
    414     __m128i res1_8x16b, res2_8x16b;
    415     __m128i res1_16x8b, res2_16x8b;
    416     __m128i zero_vector, const_2_8x16b;
    417     WORD32 row1,row2,row3,row4;
    418 
    419     UNUSED(src_strd);
    420     UNUSED(ngbr_avail);
    421 
    422     pu1_left = pu1_src + BLK_SIZE - 1;
    423 
    424     top_left_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 3));             //l3 l2 l1 l0 tl t0 t1 t2...
    425     zero_vector = _mm_setzero_si128();
    426     top_left_sh_16x8b = _mm_srli_si128(top_left_16x8b, 1);                   //l2 l1 l0 tl t0 t1 t2 t3...
    427 
    428     top_left_8x16b = _mm_unpacklo_epi8(top_left_16x8b, zero_vector);
    429     top_left_sh_8x16b = _mm_unpacklo_epi8(top_left_sh_16x8b, zero_vector);
    430 
    431     res1_8x16b = _mm_add_epi16(top_left_8x16b, top_left_sh_8x16b);           //l3+l2 l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3...
    432     const_2_8x16b = _mm_set1_epi16(2);
    433     res2_8x16b = _mm_srli_si128(res1_8x16b, 2);                              //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3...
    434 
    435     res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b);
    436     res1_8x16b = _mm_add_epi16(res2_8x16b, res1_8x16b);                      //l3+2*l2+l1+2 l2+2*l1+l0+2...
    437     res1_8x16b = _mm_srai_epi16(res1_8x16b, 2);
    438     res1_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
    439 
    440     dst_strd2 = dst_strd << 1;
    441     dst_strd3 = dst_strd + dst_strd2;
    442 
    443     res2_16x8b = _mm_srli_si128(res1_16x8b, 3);
    444 
    445     row1 = _mm_cvtsi128_si32(res2_16x8b);
    446     res2_16x8b = _mm_srli_si128(res1_16x8b, 2);
    447     row2 = _mm_cvtsi128_si32(res2_16x8b);
    448     res2_16x8b = _mm_srli_si128(res1_16x8b, 1);
    449     row3 = _mm_cvtsi128_si32(res2_16x8b);
    450     row4 = _mm_cvtsi128_si32(res1_16x8b);
    451 
    452     *((WORD32 *)(pu1_dst)) = row1;
    453     *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    454     *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    455     *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
    456 }
    457 
    458 /**
    459  *******************************************************************************
    460  *
    461  * ih264_intra_pred_luma_4x4_mode_vert_r_ssse3
    462  *
    463  * @brief
    464  *  Perform Intra prediction for luma_4x4 mode:Vertical_Right
    465  *
    466  * @par Description:
    467  *  Perform Intra prediction for luma_4x4 mode:Vertical_Right ,described in sec 8.3.1.2.6
    468  *
    469  * @param[in] pu1_src
    470  *  UWORD8 pointer to the source
    471  *
    472  * @param[out] pu1_dst
    473  *  UWORD8 pointer to the destination
    474  *
    475  * @param[in] src_strd
    476  *  integer source stride
    477  *
    478  * @param[in] dst_strd
    479  *  integer destination stride
    480  *
    481  * @param[in] ngbr_avail
    482  * availability of neighbouring pixels(Not used in this function)
    483  *
    484  * @returns
    485  *
    486  * @remarks
    487  *  None
    488  *
    489  *******************************************************************************/
    490 void ih264_intra_pred_luma_4x4_mode_vert_r_ssse3(UWORD8 *pu1_src,
    491                                                  UWORD8 *pu1_dst,
    492                                                  WORD32 src_strd,
    493                                                  WORD32 dst_strd,
    494                                                  WORD32 ngbr_avail)
    495 {
    496     UWORD8 *pu1_left;
    497     WORD32 dst_strd2, dst_strd3;
    498 
    499     __m128i val_16x8b, temp_16x8b;
    500     __m128i w11_a1_16x8b, w11_a2_16x8b;
    501     __m128i w121_a1_8x16b, w121_a2_8x16b, w121_sh_8x16b;
    502     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
    503     __m128i zero_vector, const_2_8x16b;
    504     WORD32 row1,row2,row3,row4;
    505 
    506     UNUSED(src_strd);
    507     UNUSED(ngbr_avail);
    508 
    509     pu1_left = pu1_src + BLK_SIZE - 1;
    510 
    511     val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 2));
    512     zero_vector = _mm_setzero_si128();
    513 
    514     w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector);        //l2 l1 l0 tl t0 t1 t2 t3
    515     w11_a1_16x8b = _mm_srli_si128(val_16x8b, 3);
    516     w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);                 //l1 l0 tl t0 t1 t2 t3 0
    517     w11_a2_16x8b = _mm_srli_si128(val_16x8b, 4);
    518 
    519     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b);      //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3 t3
    520     row1_16x8b = _mm_avg_epu8(w11_a1_16x8b, w11_a2_16x8b);
    521     w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);                 //l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3 t3    0
    522 
    523     const_2_8x16b = _mm_set1_epi16(2);
    524     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b);      //l2+2*l1+l0 l1+2*l0+tl ...
    525     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
    526     w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2);
    527 
    528     w121_sh_8x16b = _mm_shufflelo_epi16(w121_a1_8x16b, 0xe1);
    529     w121_sh_8x16b = _mm_srli_si128(w121_sh_8x16b, 2);
    530 
    531     row4_16x8b = _mm_packus_epi16(w121_sh_8x16b, w121_sh_8x16b);
    532     temp_16x8b = _mm_slli_si128(w121_a1_8x16b, 13);
    533     row2_16x8b = _mm_srli_si128(row4_16x8b, 1);
    534     row3_16x8b = _mm_alignr_epi8(row1_16x8b, temp_16x8b, 15);
    535 
    536     dst_strd2 = dst_strd << 1;
    537     dst_strd3 = dst_strd + dst_strd2;
    538 
    539     row1 = _mm_cvtsi128_si32(row1_16x8b);
    540     row2 = _mm_cvtsi128_si32(row2_16x8b);
    541     row3 = _mm_cvtsi128_si32(row3_16x8b);
    542     row4 = _mm_cvtsi128_si32(row4_16x8b);
    543 
    544     *((WORD32 *)(pu1_dst)) = row1;
    545     *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    546     *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    547     *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
    548 }
    549 
    550 /*
    551  *******************************************************************************
    552  *
    553  * ih264_intra_pred_luma_4x4_mode_horz_d_ssse3
    554  *
    555  * @brief
    556  *  Perform Intra prediction for luma_4x4 mode:Horizontal_Down
    557  *
    558  * @par Description:
    559  *  Perform Intra prediction for luma_4x4 mode:Horizontal_Down ,described in sec 8.3.1.2.7
    560  *
    561  * @param[in] pu1_src
    562  *  UWORD8 pointer to the source
    563  *
    564  * @param[out] pu1_dst
    565  *  UWORD8 pointer to the destination
    566  *
    567  * @param[in] src_strd
    568  *  integer source stride
    569  *
    570  * @param[in] dst_strd
    571  *  integer destination stride
    572  *
    573  * @param[in] ngbr_avail
    574  * availability of neighbouring pixels(Not used in this function)
    575  *
    576  * @returns
    577  *
    578  * @remarks
    579  *  None
    580  *
    581  *******************************************************************************/
    582 void ih264_intra_pred_luma_4x4_mode_horz_d_ssse3(UWORD8 *pu1_src,
    583                                            UWORD8 *pu1_dst,
    584                                            WORD32 src_strd,
    585                                            WORD32 dst_strd,
    586                                            WORD32 ngbr_avail)
    587 {
    588     UWORD8 *pu1_left;
    589     WORD32 dst_strd2, dst_strd3;
    590     WORD32 val_121_t0t1;
    591 
    592     __m128i val_16x8b, val_sh_16x8b;
    593     __m128i w11_16x8b;
    594     __m128i w121_a1_8x16b, w121_a2_8x16b, w121_16x8b;
    595     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
    596 
    597     __m128i zero_vector, const_2_8x16b;
    598     WORD32 row1,row2,row3,row4;
    599 
    600     UNUSED(src_strd);
    601     UNUSED(ngbr_avail);
    602 
    603     pu1_left = pu1_src + BLK_SIZE - 1;
    604 
    605     val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3));
    606     zero_vector = _mm_setzero_si128();
    607     val_sh_16x8b = _mm_srli_si128(val_16x8b, 1);
    608     w11_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b);
    609 
    610     w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector);        //l3 l2 l1 l0 tl t0 t1 t2
    611     w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);                 //l2 l1 l0 tl t0 t1 t2 0
    612     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b);      //l3+l2 l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2
    613     w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);                 //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2    0
    614 
    615     zero_vector = _mm_setzero_si128();
    616     const_2_8x16b = _mm_set1_epi16(2);
    617 
    618     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b);      //l3+2*l2+l1 l2+2*l1+l0 l1+2*l0+tl ...
    619     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
    620     w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2);
    621 
    622     w121_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b);
    623 
    624     row4_16x8b = _mm_unpacklo_epi8(w11_16x8b, w121_16x8b);
    625     val_121_t0t1 = _mm_extract_epi16(w121_16x8b, 2);
    626     row4_16x8b = _mm_insert_epi16(row4_16x8b, val_121_t0t1, 4);
    627 
    628     dst_strd2 = dst_strd << 1;
    629     dst_strd3 = dst_strd + dst_strd2;
    630 
    631     row1_16x8b = _mm_srli_si128(row4_16x8b, 6);
    632     row2_16x8b = _mm_srli_si128(row4_16x8b, 4);
    633     row3_16x8b = _mm_srli_si128(row4_16x8b, 2);
    634 
    635     row1 = _mm_cvtsi128_si32(row1_16x8b);
    636     row2 = _mm_cvtsi128_si32(row2_16x8b);
    637     row3 = _mm_cvtsi128_si32(row3_16x8b);
    638     row4 = _mm_cvtsi128_si32(row4_16x8b);
    639 
    640     *((WORD32 *)(pu1_dst)) = row1;
    641     *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    642     *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    643     *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
    644 }
    645 
    646 /**
    647  *******************************************************************************
    648  *
    649  * ih264_intra_pred_luma_4x4_mode_vert_l_ssse3
    650  *
    651  * @brief
    652  *  Perform Intra prediction for luma_4x4 mode:Vertical_Left
    653  *
    654  * @par Description:
    655  *  Perform Intra prediction for luma_4x4 mode:Vertical_Left ,described in sec 8.3.1.2.8
    656  *
    657  * @param[in] pu1_src
    658  *  UWORD8 pointer to the source
    659  *
    660  * @param[out] pu1_dst
    661  *  UWORD8 pointer to the destination
    662  *
    663  * @param[in] src_strd
    664  *  integer source stride
    665  *
    666  * @param[in] dst_strd
    667  *  integer destination stride
    668  *
    669  * @param[in] ngbr_avail
    670  * availability of neighbouring pixels(Not used in this function)
    671  *
    672  * @returns
    673  *
    674  * @remarks
    675  *  None
    676  *
    677  *******************************************************************************/
    678 void ih264_intra_pred_luma_4x4_mode_vert_l_ssse3(UWORD8 *pu1_src,
    679                                                  UWORD8 *pu1_dst,
    680                                                  WORD32 src_strd,
    681                                                  WORD32 dst_strd,
    682                                                  WORD32 ngbr_avail)
    683 {
    684     UWORD8 *pu1_top;
    685     WORD32 dst_strd2, dst_strd3;
    686 
    687     __m128i val_16x8b, val_sh_16x8b;
    688     __m128i w121_a1_8x16b, w121_a2_8x16b;
    689     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
    690 
    691     __m128i zero_vector, const_2_8x16b;
    692     WORD32 row1,row2,row3,row4;
    693 
    694     UNUSED(src_strd);
    695     UNUSED(ngbr_avail);
    696 
    697     pu1_top = pu1_src +BLK_SIZE + 1;
    698 
    699     val_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
    700     zero_vector = _mm_setzero_si128();
    701     val_sh_16x8b = _mm_srli_si128(val_16x8b, 1);
    702     row1_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b);
    703 
    704     w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector);        //t0 t1 t2 t3 t4 t5...
    705     w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);                 //t1 t2 t3 t4 t5 t6...
    706     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b);      //t0+t1 t1+t2 t2+t3 t3+t4 t4+t5...
    707     w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);                 //t1+t2 t2+t3 t3+t4 t4+t5 t5+t6...
    708 
    709     zero_vector = _mm_setzero_si128();
    710     const_2_8x16b = _mm_set1_epi16(2);
    711 
    712     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b);      //t0+2*t1+t2 t1+2*t2+t3 t2+2*t3+t4...
    713     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
    714     w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2);
    715 
    716     row2_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b);
    717 
    718     dst_strd2 = dst_strd << 1;
    719     dst_strd3 = dst_strd + dst_strd2;
    720 
    721     row3_16x8b = _mm_srli_si128(row1_16x8b, 1);
    722     row4_16x8b = _mm_srli_si128(row2_16x8b, 1);
    723 
    724     row1 = _mm_cvtsi128_si32(row1_16x8b);
    725     row2 = _mm_cvtsi128_si32(row2_16x8b);
    726     row3 = _mm_cvtsi128_si32(row3_16x8b);
    727     row4 = _mm_cvtsi128_si32(row4_16x8b);
    728 
    729     *((WORD32 *)(pu1_dst)) = row1;
    730     *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    731     *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    732     *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
    733 }
    734 
    735 /**
    736  *******************************************************************************
    737  *
    738  * ih264_intra_pred_luma_4x4_mode_horz_u_ssse3
    739  *
    740  * @brief
    741  *  Perform Intra prediction for luma_4x4 mode:Horizontal_Up
    742  *
    743  * @par Description:
    744  *  Perform Intra prediction for luma_4x4 mode:Horizontal_Up ,described in sec 8.3.1.2.9
    745  *
    746  * @param[in] pu1_src
    747  *  UWORD8 pointer to the source
    748  *
    749  * @param[out] pu1_dst
    750  *  UWORD8 pointer to the destination
    751  *
    752  * @param[in] src_strd
    753  *  integer source stride
    754  *
    755  * @param[in] dst_strd
    756  *  integer destination stride
    757  *
    758  * @param[in] ngbr_avail
    759  * availability of neighbouring pixels(Not used in this function)
    760  *
    761  * @returns
    762  *
    763  * @remarks
    764  *  None
    765  *
    766  *******************************************************************************/
    767 void ih264_intra_pred_luma_4x4_mode_horz_u_ssse3(UWORD8 *pu1_src,
    768                                                  UWORD8 *pu1_dst,
    769                                                  WORD32 src_strd,
    770                                                  WORD32 dst_strd,
    771                                                  WORD32 ngbr_avail)
    772 {
    773     UWORD8 *pu1_left;
    774     WORD32 dst_strd2, dst_strd3;
    775 
    776     __m128i val_16x8b, val_sh_16x8b;
    777     __m128i w11_16x8b;
    778     __m128i w121_a1_8x16b, w121_a2_8x16b, w121_16x8b;
    779     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
    780 
    781     __m128i zero_vector, const_2_8x16b, rev_16x8b;
    782     WORD32 row1,row2,row3,row4;
    783 
    784     UNUSED(src_strd);
    785     UNUSED(ngbr_avail);
    786 
    787     pu1_left = pu1_src + BLK_SIZE - 1;
    788 
    789     zero_vector = _mm_setzero_si128();
    790     rev_16x8b = _mm_setr_epi8(3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    791 
    792     val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3));           //l3 l2 l1 l0 0  0  0...
    793     val_16x8b = _mm_shuffle_epi8(val_16x8b, rev_16x8b);                //l0 l1 l2 l3 l3 l3 l3...
    794 
    795     val_sh_16x8b = _mm_srli_si128(val_16x8b, 1);
    796     w11_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b);
    797 
    798     w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector);        //l0 l1 l2 l3 l3 l3...
    799     w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);                 //l1 l2 l3 l3 l3 l3...
    800 
    801     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b);      //l0+t1 l1+l2 l2+l3 2*l3 2*l3...
    802     w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);                 //l1+t2 l2+l3 2*l3  2*l3 2*l3...
    803 
    804     zero_vector = _mm_setzero_si128();
    805     const_2_8x16b = _mm_set1_epi16(2);
    806 
    807     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b);      //l0+2*l1+l2 l1+2*l2+l3 l2+3*l3 4*l3 4*l3...
    808     w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
    809     w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2);
    810 
    811     w121_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b);
    812 
    813     dst_strd2 = dst_strd << 1;
    814     dst_strd3 = dst_strd + dst_strd2;
    815 
    816     row1_16x8b = _mm_unpacklo_epi8(w11_16x8b, w121_16x8b);
    817     row2_16x8b = _mm_srli_si128(row1_16x8b, 2);
    818     row3_16x8b = _mm_srli_si128(row1_16x8b, 4);
    819     row4_16x8b = _mm_srli_si128(row1_16x8b, 6);
    820 
    821     row1 = _mm_cvtsi128_si32(row1_16x8b);
    822     row2 = _mm_cvtsi128_si32(row2_16x8b);
    823     row3 = _mm_cvtsi128_si32(row3_16x8b);
    824     row4 = _mm_cvtsi128_si32(row4_16x8b);
    825 
    826     *((WORD32 *)(pu1_dst)) = row1;
    827     *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    828     *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    829     *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
    830 }
    831 
    832 /*******************    8x8 Modes    *******************/
    833 
    834 /**
    835  *******************************************************************************
    836  *
    837  * ih264_intra_pred_luma_8x8_mode_vert_ssse3
    838  *
    839  * @brief
    840  *  Perform Intra prediction for luma_8x8 mode:vertical
    841  *
    842  * @par Description:
    843  *  Perform Intra prediction for luma_8x8 mode:vertical ,described in sec 8.3.2.2.2
    844  *
    845  * @param[in] pu1_src
    846  *  UWORD8 pointer to the source
    847  *
    848  * @param[out] pu1_dst
    849  *  UWORD8 pointer to the destination
    850  *
    851  * @param[in] src_strd
    852  *  integer source stride
    853  *
    854  * @param[in] dst_strd
    855  *  integer destination stride
    856  *
    857  * @param[in] ngbr_avail
    858  * availability of neighbouring pixels(Not used in this function)
    859  *
    860  * @returns
    861  *
    862  * @remarks
    863  *  None
    864  *
    865  *******************************************************************************
    866  */
    867 void ih264_intra_pred_luma_8x8_mode_vert_ssse3(UWORD8 *pu1_src,
    868                                                UWORD8 *pu1_dst,
    869                                                WORD32 src_strd,
    870                                                WORD32 dst_strd,
    871                                                WORD32 ngbr_avail)
    872 {
    873     UWORD8 *pu1_top = NULL;
    874     __m128i top_8x8b;
    875     UNUSED(src_strd);
    876     UNUSED(ngbr_avail);
    877     pu1_top = pu1_src + BLK8x8SIZE + 1;
    878 
    879     top_8x8b = _mm_loadl_epi64((__m128i *)pu1_top);
    880 
    881     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), top_8x8b);
    882     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), top_8x8b);
    883     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), top_8x8b);
    884     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), top_8x8b);
    885     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), top_8x8b);
    886     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), top_8x8b);
    887     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), top_8x8b);
    888     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), top_8x8b);
    889 }
    890 
    891 /**
    892  *******************************************************************************
    893  *
    894  * ih264_intra_pred_luma_8x8_mode_horz_ssse3
    895  *
    896  * @brief
    897  *  Perform Intra prediction for luma_8x8 mode:horizontal
    898  *
    899  * @par Description:
    900  *  Perform Intra prediction for  uma_8x8 mode:horizontal ,described in sec 8.3.2.2.2
    901  *
    902  * @param[in] pu1_src
    903  *  UWORD8 pointer to the source
    904  *
    905  * @param[out] pu1_dst
    906  *  UWORD8 pointer to the destination
    907  *
    908  * @param[in] src_strd
    909  *  integer source stride
    910  *
    911  * @param[in] dst_strd
    912  *  integer destination stride
    913  *
    914  * @param[in] ngbr_avail
    915  * availability of neighbouring pixels(Not used in this function)
    916  *
    917  * @returns
    918  *
    919  * @remarks
    920  *  None
    921  *
    922  *******************************************************************************
    923  */
    924 void ih264_intra_pred_luma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
    925                                                UWORD8 *pu1_dst,
    926                                                WORD32 src_strd,
    927                                                WORD32 dst_strd,
    928                                                WORD32 ngbr_avail)
    929 {
    930     UWORD8 *pu1_left = pu1_src + BLK8x8SIZE - 1;
    931     __m128i row1_8x8b, row2_8x8b, row3_8x8b, row4_8x8b;
    932     __m128i row5_8x8b, row6_8x8b, row7_8x8b, row8_8x8b;
    933 
    934     UNUSED(src_strd);
    935     UNUSED(ngbr_avail);
    936 
    937     row1_8x8b = _mm_set1_epi8(pu1_left[0]);
    938     row2_8x8b = _mm_set1_epi8(pu1_left[-1]);
    939     row3_8x8b = _mm_set1_epi8(pu1_left[-2]);
    940     row4_8x8b = _mm_set1_epi8(pu1_left[-3]);
    941     row5_8x8b = _mm_set1_epi8(pu1_left[-4]);
    942     row6_8x8b = _mm_set1_epi8(pu1_left[-5]);
    943     row7_8x8b = _mm_set1_epi8(pu1_left[-6]);
    944     row8_8x8b = _mm_set1_epi8(pu1_left[-7]);
    945 
    946     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), row1_8x8b);
    947     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), row2_8x8b);
    948     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), row3_8x8b);
    949     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), row4_8x8b);
    950     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), row5_8x8b);
    951     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), row6_8x8b);
    952     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), row7_8x8b);
    953     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), row8_8x8b);
    954 }
    955 
    956 /**
    957  *******************************************************************************
    958  *
    959  * ih264_intra_pred_luma_8x8_mode_dc_ssse3
    960  *
    961  * @brief
    962  *  Perform Intra prediction for luma_8x8 mode:DC
    963  *
    964  * @par Description:
    965  *  Perform Intra prediction for luma_8x8 mode:DC ,described in sec 8.3.2.2.4
    966  *
    967  * @param[in] pu1_src
    968  *  UWORD8 pointer to the source
    969  *
    970  * @param[out] pu1_dst
    971  *  UWORD8 pointer to the destination
    972  *
    973  * @param[in] src_strd
    974  *  integer source stride
    975  *
    976  * @param[in] dst_strd
    977  *  integer destination stride
    978  *
    979  * @param[in] ngbr_avail
    980  *  availability of neighbouring pixels
    981  *
    982  * @returns
    983  *
    984  * @remarks
    985  *  None
    986  *
    987  *******************************************************************************/
    988 void ih264_intra_pred_luma_8x8_mode_dc_ssse3(UWORD8 *pu1_src,
    989                                              UWORD8 *pu1_dst,
    990                                              WORD32 src_strd,
    991                                              WORD32 dst_strd,
    992                                              WORD32 ngbr_avail)
    993 {
    994     UWORD8 u1_useleft; /* availability of left predictors (only for DC) */
    995     UWORD8 u1_usetop; /* availability of top predictors (only for DC) */
    996     UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
    997     UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
    998     __m128i dc_val_8x8b;
    999     WORD32 dc_val = 0;
   1000     UNUSED(src_strd);
   1001 
   1002     u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
   1003     u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
   1004     pu1_top = pu1_src + BLK8x8SIZE + 1;
   1005     pu1_left = pu1_src + BLK8x8SIZE - 1;
   1006 
   1007     if(u1_useleft || u1_usetop)
   1008     {
   1009         WORD32 shft = 2;
   1010         __m128i val_8x8b, zero_8x8b, sum_8x16b;
   1011 
   1012         zero_8x8b = _mm_setzero_si128();
   1013 
   1014         if(u1_useleft)
   1015         {
   1016             val_8x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 7));
   1017             sum_8x16b = _mm_sad_epu8(zero_8x8b, val_8x8b);
   1018 
   1019             shft++;
   1020             dc_val += 4;
   1021             dc_val += _mm_extract_epi16(sum_8x16b, 0);
   1022         }
   1023         if(u1_usetop)
   1024         {
   1025             val_8x8b = _mm_loadl_epi64((__m128i *)pu1_top);
   1026             sum_8x16b = _mm_sad_epu8(zero_8x8b, val_8x8b);
   1027 
   1028             shft++;
   1029             dc_val += 4;
   1030             dc_val += _mm_extract_epi16(sum_8x16b, 0);
   1031         }
   1032         dc_val = dc_val >> shft;
   1033     }
   1034     else
   1035         dc_val = 128;
   1036 
   1037     dc_val_8x8b = _mm_set1_epi8(dc_val);
   1038 
   1039     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), dc_val_8x8b);
   1040     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), dc_val_8x8b);
   1041     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), dc_val_8x8b);
   1042     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), dc_val_8x8b);
   1043     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), dc_val_8x8b);
   1044     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), dc_val_8x8b);
   1045     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), dc_val_8x8b);
   1046     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), dc_val_8x8b);
   1047 }
   1048 
   1049 /**
   1050  *******************************************************************************
   1051  *
   1052  * ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3
   1053  *
   1054  * @brief
   1055  *  Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left
   1056  *
   1057  * @par Description:
   1058  *  Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.5
   1059  *
   1060  * @param[in] pu1_src
   1061  *  UWORD8 pointer to the source
   1062  *
   1063  * @param[out] pu1_dst
   1064  *  UWORD8 pointer to the destination
   1065  *
   1066  * @param[in] src_strd
   1067  *  integer source stride
   1068  *
   1069  * @param[in] dst_strd
   1070  *  integer destination stride
   1071  *
   1072  * @param[in] ngbr_avail
   1073  * availability of neighbouring pixels(Not used in this function)
   1074  *
   1075  * @returns
   1076  *
   1077  * @remarks
   1078  *  None
   1079  *
   1080  *******************************************************************************/
   1081 void ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3(UWORD8 *pu1_src,
   1082                                                   UWORD8 *pu1_dst,
   1083                                                   WORD32 src_strd,
   1084                                                   WORD32 dst_strd,
   1085                                                   WORD32 ngbr_avail)
   1086 {
   1087     UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
   1088     __m128i top_16x8;
   1089     __m128i out_15x16;
   1090     __m128i a0_8x16, a1_8x16, a2_8x16;
   1091     __m128i temp1, temp2;
   1092     __m128i res1_8x16, res2_8x16;
   1093     __m128i zero = _mm_setzero_si128();
   1094     __m128i const_val2_8x16 = _mm_set1_epi16(2);
   1095 
   1096     UNUSED(src_strd);
   1097     UNUSED(ngbr_avail);
   1098 
   1099     pu1_top = pu1_src + BLK8x8SIZE + 1;
   1100 
   1101     top_16x8 = _mm_loadu_si128((__m128i *)(pu1_top));
   1102 
   1103     temp1 = _mm_srli_si128(top_16x8, 1);
   1104     temp2 = _mm_srli_si128(top_16x8, 2);
   1105     a0_8x16 = _mm_unpacklo_epi8(top_16x8, zero);
   1106     a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
   1107     a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
   1108 
   1109     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1110     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1111     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1112     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1113     res1_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1114 
   1115     temp2 = _mm_srli_si128(top_16x8, 2);
   1116     temp1 = _mm_srli_si128(top_16x8, 1);
   1117     a2_8x16 = _mm_unpackhi_epi8(temp2, zero);
   1118     a0_8x16 = _mm_unpackhi_epi8(top_16x8, zero);
   1119     a2_8x16 = _mm_shufflehi_epi16(a2_8x16, 0x14);
   1120     a1_8x16 = _mm_unpackhi_epi8(temp1, zero);
   1121 
   1122     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1123     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1124     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1125     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1126     res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1127 
   1128     out_15x16 = _mm_packus_epi16(res1_8x16, res2_8x16);
   1129 
   1130     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), out_15x16);
   1131     out_15x16 = _mm_srli_si128(out_15x16, 1);
   1132     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), out_15x16);
   1133     out_15x16 = _mm_srli_si128(out_15x16, 1);
   1134     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), out_15x16);
   1135     out_15x16 = _mm_srli_si128(out_15x16, 1);
   1136     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out_15x16);
   1137     out_15x16 = _mm_srli_si128(out_15x16, 1);
   1138     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), out_15x16);
   1139     out_15x16 = _mm_srli_si128(out_15x16, 1);
   1140     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), out_15x16);
   1141     out_15x16 = _mm_srli_si128(out_15x16, 1);
   1142     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), out_15x16);
   1143     out_15x16 = _mm_srli_si128(out_15x16, 1);
   1144     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out_15x16);
   1145 }
   1146 
   1147 /**
   1148  *******************************************************************************
   1149  *
   1150  * ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3
   1151  *
   1152  * @brief
   1153  *  Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right
   1154  *
   1155  * @par Description:
   1156  *  Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.6
   1157  *
   1158  * @param[in] pu1_src
   1159  *  UWORD8 pointer to the source
   1160  *
   1161  * @param[out] pu1_dst
   1162  *  UWORD8 pointer to the destination
   1163  *
   1164  * @param[in] src_strd
   1165  *  integer source stride
   1166  *
   1167  * @param[in] dst_strd
   1168  *  integer destination stride
   1169  *
   1170  * @param[in] ngbr_avail
   1171  * availability of neighbouring pixels(Not used in this function)
   1172  *
   1173  * @returns
   1174  *
   1175  * @remarks
   1176  *  None
   1177  *
   1178  *******************************************************************************/
   1179 void ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3(UWORD8 *pu1_src,
   1180                                                   UWORD8 *pu1_dst,
   1181                                                   WORD32 src_strd,
   1182                                                   WORD32 dst_strd,
   1183                                                   WORD32 ngbr_avail)
   1184 {
   1185     UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
   1186     UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
   1187     __m128i top_8x8, left_16x8;
   1188     __m128i out_15x16;
   1189     __m128i a0_8x16, a1_8x16, a2_8x16;
   1190     __m128i temp1, temp2;
   1191     __m128i res1_8x16, res2_8x16;
   1192     __m128i zero = _mm_setzero_si128();
   1193     __m128i const_val2_8x16 = _mm_set1_epi16(2);
   1194     __m128i str_8x8;
   1195 
   1196     UNUSED(src_strd);
   1197     UNUSED(ngbr_avail);
   1198 
   1199     pu1_left = pu1_src + BLK8x8SIZE - 1;
   1200     pu1_top = pu1_src + BLK8x8SIZE + 1;
   1201 
   1202     left_16x8 = _mm_loadu_si128((__m128i *)(pu1_left - 7));
   1203 
   1204     temp1 = _mm_srli_si128(left_16x8, 1);
   1205     temp2 = _mm_srli_si128(left_16x8, 2);
   1206     a0_8x16 = _mm_unpacklo_epi8(left_16x8, zero);
   1207     a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
   1208     a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
   1209 
   1210     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1211     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1212     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1213     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1214     res1_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1215 
   1216     top_8x8 = _mm_loadu_si128((__m128i *)(pu1_top - 1));
   1217 
   1218     temp1 = _mm_srli_si128(top_8x8, 1);
   1219     temp2 = _mm_srli_si128(top_8x8, 2);
   1220     a0_8x16 = _mm_unpacklo_epi8(top_8x8, zero);
   1221     a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
   1222     a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
   1223 
   1224     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1225     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1226     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1227     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1228     res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1229 
   1230     out_15x16 = _mm_packus_epi16(res1_8x16, res2_8x16);
   1231 
   1232     str_8x8 = _mm_srli_si128(out_15x16, 7);
   1233     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8);
   1234     str_8x8 = _mm_srli_si128(out_15x16, 6);
   1235     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8);
   1236     str_8x8 = _mm_srli_si128(out_15x16, 5);
   1237     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8);
   1238     str_8x8 = _mm_srli_si128(out_15x16, 4);
   1239     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8);
   1240     str_8x8 = _mm_srli_si128(out_15x16, 3);
   1241     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8);
   1242     str_8x8 = _mm_srli_si128(out_15x16, 2);
   1243     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8);
   1244     str_8x8 = _mm_srli_si128(out_15x16, 1);
   1245     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8);
   1246     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out_15x16);
   1247 }
   1248 
   1249 /**
   1250  *******************************************************************************
   1251  *
   1252  * ih264_intra_pred_luma_8x8_mode_vert_r_ssse3
   1253  *
   1254  * @brief
   1255  *  Perform Intra prediction for luma_8x8 mode:Vertical_Right
   1256  *
   1257  * @par Description:
   1258  *  Perform Intra prediction for luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.7
   1259  *
   1260  * @param[in] pu1_src
   1261  *  UWORD8 pointer to the source
   1262  *
   1263  * @param[out] pu1_dst
   1264  *  UWORD8 pointer to the destination
   1265  *
   1266  * @param[in] src_strd
   1267  *  integer source stride
   1268  *
   1269  * @param[in] dst_strd
   1270  *  integer destination stride
   1271  *
   1272  * @param[in] ngbr_avail
   1273  * availability of neighbouring pixels(Not used in this function)
   1274  *
   1275  * @returns
   1276  *
   1277  * @remarks
   1278  *  None
   1279  *
   1280  *******************************************************************************/
   1281 void ih264_intra_pred_luma_8x8_mode_vert_r_ssse3(UWORD8 *pu1_src,
   1282                                                  UWORD8 *pu1_dst,
   1283                                                  WORD32 src_strd,
   1284                                                  WORD32 dst_strd,
   1285                                                  WORD32 ngbr_avail)
   1286 {
   1287     UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
   1288     UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
   1289     __m128i top_8x8, left_16x8;
   1290     __m128i out1_16x16, out2_16x16;
   1291     __m128i a0_8x16, a1_8x16, a2_8x16;
   1292     __m128i temp1, temp2;
   1293     __m128i res1_8x16, res2_8x16, res3_8x16;
   1294     __m128i zero = _mm_setzero_si128();
   1295     __m128i const_val2_8x16 = _mm_set1_epi16(2);
   1296     __m128i str_8x8;
   1297     __m128i mask = _mm_set1_epi32(0xFFFF);
   1298 
   1299     UNUSED(src_strd);
   1300     UNUSED(ngbr_avail);
   1301 
   1302     pu1_left = pu1_src + BLK8x8SIZE - 1;
   1303     pu1_top = pu1_src + BLK8x8SIZE + 1;
   1304 
   1305     left_16x8 = _mm_loadu_si128((__m128i *)(pu1_left - 6));
   1306 
   1307     temp1 = _mm_srli_si128(left_16x8, 1);
   1308     temp2 = _mm_srli_si128(left_16x8, 2);
   1309     a0_8x16 = _mm_unpacklo_epi8(left_16x8, zero);
   1310     a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
   1311     a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
   1312 
   1313     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1314     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1315     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1316     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1317     res1_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1318 
   1319     top_8x8 = _mm_loadu_si128((__m128i *)(pu1_top - 1));
   1320 
   1321     temp1 = _mm_srli_si128(top_8x8, 1);
   1322     temp2 = _mm_srli_si128(top_8x8, 2);
   1323     a0_8x16 = _mm_unpacklo_epi8(top_8x8, zero);
   1324     a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
   1325     a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
   1326 
   1327     res3_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);
   1328 
   1329     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1330     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1331     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1332     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1333     res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1334 
   1335     str_8x8 = _mm_packus_epi16(res3_8x16, zero);
   1336     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8);
   1337 
   1338     temp1 = _mm_and_si128(res1_8x16, mask);
   1339     temp1 = _mm_packs_epi32(temp1, temp1);
   1340     out1_16x16 = _mm_packus_epi16(temp1, res2_8x16);
   1341 
   1342     res1_8x16 = _mm_slli_si128(res1_8x16, 2);
   1343     temp1 = _mm_and_si128(res1_8x16, mask);
   1344     temp1 = _mm_packs_epi32(temp1, temp1);
   1345     out2_16x16 = _mm_packus_epi16(temp1, res3_8x16);
   1346 
   1347     str_8x8 = _mm_srli_si128(out1_16x16, 7);
   1348     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8);
   1349 
   1350     str_8x8 = _mm_srli_si128(out2_16x16, 7);
   1351     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8);
   1352 
   1353     str_8x8 = _mm_srli_si128(out1_16x16, 6);
   1354     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8);
   1355 
   1356     str_8x8 = _mm_srli_si128(out2_16x16, 6);
   1357     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8);
   1358 
   1359     str_8x8 = _mm_srli_si128(out1_16x16, 5);
   1360     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8);
   1361 
   1362     str_8x8 = _mm_srli_si128(out2_16x16, 5);
   1363     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8);
   1364 
   1365     str_8x8 = _mm_srli_si128(out1_16x16, 4);
   1366     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), str_8x8);
   1367 }
   1368 
   1369 /*
   1370  *******************************************************************************
   1371  *
   1372  * ih264_intra_pred_luma_8x8_mode_horz_d_ssse3
   1373  *
   1374  * @brief
   1375  *  Perform Intra prediction for luma_8x8 mode:Horizontal_Down
   1376  *
   1377  * @par Description:
   1378  *  Perform Intra prediction for luma_8x8 mode:Horizontal_Down ,described in sec 8.3.2.2.8
   1379  *
   1380  * @param[in] pu1_src
   1381  *  UWORD8 pointer to the source
   1382  *
   1383  * @param[out] pu1_dst
   1384  *  UWORD8 pointer to the destination
   1385  *
   1386  * @param[in] src_strd
   1387  *  integer source stride
   1388  *
   1389  * @param[in] dst_strd
   1390  *  integer destination stride
   1391  *
   1392  * @param[in] ngbr_avail
   1393  * availability of neighbouring pixels(Not used in this function)
   1394  *
   1395  * @returns
   1396  *
   1397  * @remarks
   1398  *  None
   1399  *
   1400  *******************************************************************************/
   1401 void ih264_intra_pred_luma_8x8_mode_horz_d_ssse3(UWORD8 *pu1_src,
   1402                                                  UWORD8 *pu1_dst,
   1403                                                  WORD32 src_strd,
   1404                                                  WORD32 dst_strd,
   1405                                                  WORD32 ngbr_avail)
   1406 {
   1407     UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
   1408     __m128i pels_16x16;
   1409     __m128i temp1, temp2, temp3, temp4;
   1410     __m128i a0_8x16, a1_8x16, a2_8x16;
   1411     __m128i zero = _mm_setzero_si128();
   1412     __m128i const_val2_8x16 = _mm_set1_epi16(2);
   1413     __m128i res1_8x16, res2_8x16;
   1414     __m128i out1_16x16, out2_16x16;
   1415     __m128i str_8x8;
   1416     UNUSED(src_strd);
   1417     UNUSED(ngbr_avail);
   1418 
   1419     pu1_left = pu1_src + BLK8x8SIZE - 1;
   1420 
   1421     pels_16x16 = _mm_loadu_si128((__m128i *)(pu1_left - 7));
   1422 
   1423     temp1 = _mm_srli_si128(pels_16x16, 1);
   1424     temp2 = _mm_srli_si128(pels_16x16, 2);
   1425     a0_8x16 = _mm_unpacklo_epi8(pels_16x16, zero);
   1426     a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
   1427     a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
   1428 
   1429     res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);
   1430 
   1431     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1432     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1433     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1434     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1435     res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1436 
   1437     temp3 = _mm_unpacklo_epi16(res1_8x16, res2_8x16);
   1438     temp4 = _mm_unpackhi_epi16(res1_8x16, res2_8x16);
   1439     out2_16x16 = _mm_packus_epi16(temp3, temp4);
   1440 
   1441     a0_8x16 = _mm_unpackhi_epi8(pels_16x16, zero);
   1442     a1_8x16 = _mm_unpackhi_epi8(temp1, zero);
   1443     a2_8x16 = _mm_unpackhi_epi8(temp2, zero);
   1444 
   1445     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1446     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1447     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1448     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1449     res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1450 
   1451     out1_16x16 = _mm_packus_epi16(res2_8x16, zero);
   1452     temp1 = _mm_srli_si128(out2_16x16, 8);
   1453     out1_16x16 = _mm_unpacklo_epi64(temp1, out1_16x16);
   1454 
   1455     str_8x8 = _mm_srli_si128(out1_16x16, 6);
   1456     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8);
   1457     str_8x8 = _mm_srli_si128(out1_16x16, 4);
   1458     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8);
   1459     str_8x8 = _mm_srli_si128(out1_16x16, 2);
   1460     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8);
   1461     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out1_16x16);
   1462 
   1463     str_8x8 = _mm_srli_si128(out2_16x16, 6);
   1464     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8);
   1465     str_8x8 = _mm_srli_si128(out2_16x16, 4);
   1466     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8);
   1467     str_8x8 = _mm_srli_si128(out2_16x16, 2);
   1468     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8);
   1469     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out2_16x16);
   1470 }
   1471 
   1472 /**
   1473  *******************************************************************************
   1474  *
   1475  * ih264_intra_pred_luma_8x8_mode_vert_l_ssse3
   1476  *
   1477  * @brief
   1478  *  Perform Intra prediction for luma_8x8 mode:Vertical_Left
   1479  *
   1480  * @par Description:
   1481  *  Perform Intra prediction for luma_8x8 mode:Vertical_Left ,described in sec 8.3.2.2.9
   1482  *
   1483  * @param[in] pu1_src
   1484  *  UWORD8 pointer to the source
   1485  *
   1486  * @param[out] pu1_dst
   1487  *  UWORD8 pointer to the destination
   1488  *
   1489  * @param[in] src_strd
   1490  *  integer source stride
   1491  *
   1492  * @param[in] dst_strd
   1493  *  integer destination stride
   1494  *
   1495  * @param[in] ngbr_avail
   1496  * availability of neighbouring pixels(Not used in this function)
   1497  *
   1498  * @returns
   1499  *
   1500  * @remarks
   1501  *  None
   1502  *
   1503  *******************************************************************************/
   1504 
   1505 void ih264_intra_pred_luma_8x8_mode_vert_l_ssse3(UWORD8 *pu1_src,
   1506                                                  UWORD8 *pu1_dst,
   1507                                                  WORD32 src_strd,
   1508                                                  WORD32 dst_strd,
   1509                                                  WORD32 ngbr_avail)
   1510 {
   1511     UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
   1512     __m128i top_16x16;
   1513     __m128i temp1, temp2;
   1514     __m128i a0_8x16, a1_8x16, a2_8x16;
   1515     __m128i zero = _mm_setzero_si128();
   1516     __m128i const_val2_8x16 = _mm_set1_epi16(2);
   1517     __m128i res1_8x16, res2_8x16, res3_8x16, res4_8x16;
   1518     __m128i out1_16x16, out2_16x16;
   1519     UNUSED(src_strd);
   1520     UNUSED(ngbr_avail);
   1521     pu1_top = pu1_src + BLK8x8SIZE + 1;
   1522 
   1523     top_16x16 = _mm_loadu_si128((__m128i *)(pu1_top));
   1524     temp1 = _mm_srli_si128(top_16x16, 1);
   1525     temp2 = _mm_srli_si128(top_16x16, 2);
   1526     a0_8x16 = _mm_unpacklo_epi8(top_16x16, zero);
   1527     a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
   1528     a2_8x16 = _mm_unpacklo_epi8(temp2, zero);
   1529 
   1530     res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);
   1531 
   1532     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1533     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1534     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1535     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1536     res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1537 
   1538     a0_8x16 = _mm_unpackhi_epi8(top_16x16, zero);
   1539     a1_8x16 = _mm_unpackhi_epi8(temp1, zero);
   1540     a2_8x16 = _mm_unpackhi_epi8(temp2, zero);
   1541 
   1542     res3_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);
   1543 
   1544     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1545     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1546     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1547     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1548     res4_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1549 
   1550     out1_16x16 = _mm_packus_epi16(res1_8x16, res3_8x16);
   1551     out2_16x16 = _mm_packus_epi16(res2_8x16, res4_8x16);
   1552 
   1553     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), out1_16x16);
   1554     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), out2_16x16);
   1555     out1_16x16 = _mm_srli_si128(out1_16x16, 1);
   1556     out2_16x16 = _mm_srli_si128(out2_16x16, 1);
   1557     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), out1_16x16);
   1558     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out2_16x16);
   1559     out1_16x16 = _mm_srli_si128(out1_16x16, 1);
   1560     out2_16x16 = _mm_srli_si128(out2_16x16, 1);
   1561     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), out1_16x16);
   1562     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), out2_16x16);
   1563     out1_16x16 = _mm_srli_si128(out1_16x16, 1);
   1564     out2_16x16 = _mm_srli_si128(out2_16x16, 1);
   1565     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), out1_16x16);
   1566     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out2_16x16);
   1567 }
   1568 
   1569 /**
   1570  *******************************************************************************
   1571  *
   1572  * ih264_intra_pred_luma_8x8_mode_horz_u_ssse3
   1573  *
   1574  * @brief
   1575  *  Perform Intra prediction for luma_8x8 mode:Horizontal_Up
   1576  *
   1577  * @par Description:
   1578  *  Perform Intra prediction for luma_8x8 mode:Horizontal_Up ,described in sec 8.3.2.2.10
   1579  *
   1580  * @param[in] pu1_src
   1581  *  UWORD8 pointer to the source
   1582  *
   1583  * @param[out] pu1_dst
   1584  *  UWORD8 pointer to the destination
   1585  *
   1586  * @param[in] src_strd
   1587  *  integer source stride
   1588  *
   1589  * @param[in] dst_strd
   1590  *  integer destination stride
   1591  *
   1592  * @param[in] ngbr_avail
   1593  * availability of neighbouring pixels(Not used in this function)
   1594  *
   1595  * @returns
   1596  *
   1597  * @remarks
   1598  *  None
   1599  *
   1600  *******************************************************************************/
   1601 void ih264_intra_pred_luma_8x8_mode_horz_u_ssse3(UWORD8 *pu1_src,
   1602                                                  UWORD8 *pu1_dst,
   1603                                                  WORD32 src_strd,
   1604                                                  WORD32 dst_strd,
   1605                                                  WORD32 ngbr_avail)
   1606 {
   1607     UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
   1608     __m128i left_16x16;
   1609     __m128i temp1, temp2;
   1610     __m128i a0_8x16, a1_8x16, a2_8x16;
   1611     __m128i zero = _mm_setzero_si128();
   1612     __m128i const_val2_8x16 = _mm_set1_epi16(2);
   1613     __m128i res1_8x16, res2_8x16;
   1614     __m128i out1_16x16;
   1615     __m128i str_8x8;
   1616     __m128i shuffle_16x16;
   1617     UNUSED(src_strd);
   1618     UNUSED(ngbr_avail);
   1619 
   1620     pu1_left = pu1_src + BLK8x8SIZE - 1;
   1621     shuffle_16x16 = _mm_set_epi8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
   1622                                  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,
   1623                                  0x0F);
   1624 
   1625     left_16x16 = _mm_loadu_si128((__m128i *)(pu1_left - 7));
   1626     temp1 = _mm_srli_si128(left_16x16, 1);
   1627     a0_8x16 = _mm_unpacklo_epi8(left_16x16, zero);
   1628     a0_8x16 = _mm_slli_si128(a0_8x16, 2);
   1629     a1_8x16 = _mm_unpacklo_epi8(left_16x16, zero);
   1630     a0_8x16 = _mm_shufflelo_epi16(a0_8x16, 0xE5);
   1631     a2_8x16 = _mm_unpacklo_epi8(temp1, zero);
   1632 
   1633     res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);
   1634 
   1635     a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
   1636     a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
   1637     a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
   1638     a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
   1639     res2_8x16 = _mm_srai_epi16(a0_8x16, 2);
   1640 
   1641     temp1 = _mm_unpacklo_epi16(res1_8x16, res2_8x16);
   1642     temp2 = _mm_unpackhi_epi16(res1_8x16, res2_8x16);
   1643     out1_16x16 = _mm_packus_epi16(temp1, temp2);
   1644     out1_16x16 = _mm_shuffle_epi8(out1_16x16, shuffle_16x16);
   1645 
   1646     str_8x8 = _mm_srli_si128(out1_16x16, 1);
   1647     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8);
   1648     str_8x8 = _mm_srli_si128(out1_16x16, 3);
   1649     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8);
   1650     str_8x8 = _mm_srli_si128(out1_16x16, 5);
   1651     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8);
   1652     str_8x8 = _mm_srli_si128(out1_16x16, 7);
   1653     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8);
   1654     temp1 = _mm_set1_epi8(pu1_left[-7]);
   1655     str_8x8 = _mm_unpacklo_epi64(str_8x8, temp1);
   1656     str_8x8 = _mm_srli_si128(str_8x8, 2);
   1657     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8);
   1658     str_8x8 = _mm_srli_si128(str_8x8, 2);
   1659     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8);
   1660     str_8x8 = _mm_srli_si128(str_8x8, 2);
   1661     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8);
   1662     str_8x8 = _mm_srli_si128(str_8x8, 2);
   1663     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), str_8x8);
   1664 
   1665 }
   1666 
   1667 
   1668 /*******************    16x16 Modes    *******************/
   1669 
   1670 /**
   1671  *******************************************************************************
   1672  *
   1673  *ih264_intra_pred_luma_16x16_mode_vert_ssse3
   1674  *
   1675  * @brief
   1676  *  Perform Intra prediction for luma_16x16 mode:Vertical
   1677  *
   1678  * @par Description:
   1679  *  Perform Intra prediction for luma_16x16 mode:Vertical, described in sec 8.3.3.1
   1680  *
   1681  * @param[in] pu1_src
   1682  *  UWORD8 pointer to the source
   1683  *
   1684  * @param[out] pu1_dst
   1685  *  UWORD8 pointer to the destination
   1686  *
   1687  * @param[in] src_strd
   1688  *  integer source stride
   1689  *
   1690  * @param[in] dst_strd
   1691  *  integer destination stride
   1692  *
   1693  * @param[in] ngbr_avail
   1694  *  availability of neighbouring pixels (Not used in this function)
   1695  *
   1696  * @returns
   1697  *
   1698  * @remarks
   1699  *  None
   1700  *
   1701  *******************************************************************************/
   1702 void ih264_intra_pred_luma_16x16_mode_vert_ssse3(UWORD8 *pu1_src,
   1703                                                  UWORD8 *pu1_dst,
   1704                                                  WORD32 src_strd,
   1705                                                  WORD32 dst_strd,
   1706                                                  WORD32 ngbr_avail)
   1707 {
   1708     UWORD8 *pu1_top;
   1709     WORD32 dst_strd2, dst_strd3, dst_strd4;
   1710 
   1711     __m128i top_16x8b;
   1712 
   1713     UNUSED(src_strd);
   1714     UNUSED(ngbr_avail);
   1715 
   1716     pu1_top = pu1_src + MB_SIZE + 1;
   1717 
   1718     dst_strd2 = dst_strd << 1;
   1719     dst_strd4 = dst_strd << 2;
   1720 
   1721     top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);
   1722 
   1723     dst_strd3 = dst_strd + dst_strd2;
   1724 
   1725     _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
   1726     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
   1727     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
   1728     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
   1729     pu1_dst += dst_strd4;
   1730 
   1731     _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
   1732     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
   1733     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
   1734     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
   1735     pu1_dst += dst_strd4;
   1736 
   1737     _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
   1738     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
   1739     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
   1740     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
   1741     pu1_dst += dst_strd4;
   1742 
   1743     _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
   1744     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
   1745     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
   1746     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
   1747 }
   1748 
   1749 /**
   1750  *******************************************************************************
   1751  *
   1752  *ih264_intra_pred_luma_16x16_mode_horz_ssse3
   1753  *
   1754  * @brief
   1755  *  Perform Intra prediction for luma_16x16 mode:Horizontal
   1756  *
   1757  * @par Description:
   1758  *  Perform Intra prediction for luma_16x16 mode:Horizontal, described in sec 8.3.3.2
   1759  *
   1760  * @param[in] pu1_src
   1761  *  UWORD8 pointer to the source
   1762  *
   1763  * @param[out] pu1_dst
   1764  *  UWORD8 pointer to the destination
   1765  *
   1766  * @param[in] src_strd
   1767  *  integer source stride
   1768  *
   1769  * @param[in] dst_strd
   1770  *  integer destination stride
   1771  *
   1772  * @param[in] ngbr_avail
   1773  * availability of neighbouring pixels(Not used in this function)
   1774  *
   1775  * @returns
   1776  *
   1777  * @remarks
   1778  *  None
   1779  *
   1780  *******************************************************************************/
   1781 void ih264_intra_pred_luma_16x16_mode_horz_ssse3(UWORD8 *pu1_src,
   1782                                                  UWORD8 *pu1_dst,
   1783                                                  WORD32 src_strd,
   1784                                                  WORD32 dst_strd,
   1785                                                  WORD32 ngbr_avail)
   1786 {
   1787     UWORD8 *pu1_left;
   1788     WORD32 dst_strd2, dst_strd3, dst_strd4;
   1789 
   1790     __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
   1791 
   1792     UNUSED(src_strd);
   1793     UNUSED(ngbr_avail);
   1794 
   1795     pu1_left = pu1_src + MB_SIZE - 1;
   1796 
   1797     dst_strd4 = dst_strd << 2;
   1798 
   1799     dst_strd2 = dst_strd << 1;
   1800     dst_strd3 = dst_strd4 - dst_strd;
   1801 
   1802     row1_16x8b = _mm_set1_epi8(*(pu1_left));
   1803     row2_16x8b = _mm_set1_epi8(*(pu1_left - 1));
   1804     row3_16x8b = _mm_set1_epi8(*(pu1_left - 2));
   1805     row4_16x8b = _mm_set1_epi8(*(pu1_left - 3));
   1806 
   1807     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
   1808     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
   1809     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
   1810     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
   1811 
   1812     pu1_dst += dst_strd4;
   1813     row1_16x8b = _mm_set1_epi8(*(pu1_left - 4));
   1814     row2_16x8b = _mm_set1_epi8(*(pu1_left - 5));
   1815     row3_16x8b = _mm_set1_epi8(*(pu1_left - 6));
   1816     row4_16x8b = _mm_set1_epi8(*(pu1_left - 7));
   1817 
   1818     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
   1819     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
   1820     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
   1821     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
   1822 
   1823     pu1_dst += dst_strd4;
   1824     row1_16x8b = _mm_set1_epi8(*(pu1_left - 8));
   1825     row2_16x8b = _mm_set1_epi8(*(pu1_left - 9));
   1826     row3_16x8b = _mm_set1_epi8(*(pu1_left - 10));
   1827     row4_16x8b = _mm_set1_epi8(*(pu1_left - 11));
   1828 
   1829     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
   1830     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
   1831     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
   1832     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
   1833 
   1834     pu1_dst += dst_strd4;
   1835     row1_16x8b = _mm_set1_epi8(*(pu1_left - 12));
   1836     row2_16x8b = _mm_set1_epi8(*(pu1_left - 13));
   1837     row3_16x8b = _mm_set1_epi8(*(pu1_left - 14));
   1838     row4_16x8b = _mm_set1_epi8(*(pu1_left - 15));
   1839 
   1840     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
   1841     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
   1842     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
   1843     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
   1844 }
   1845 
   1846 /**
   1847  *******************************************************************************
   1848  *
   1849  *ih264_intra_pred_luma_16x16_mode_dc_ssse3
   1850  *
   1851  * @brief
   1852  *  Perform Intra prediction for  luma_16x16 mode:DC
   1853  *
   1854  * @par Description:
   1855  *  Perform Intra prediction for  luma_16x16 mode:DC, described in sec 8.3.3.3
   1856  *
   1857  * @param[in] pu1_src
   1858  *  UWORD8 pointer to the source
   1859  *
   1860  * @param[out] pu1_dst
   1861  *  UWORD8 pointer to the destination
   1862  *
   1863  * @param[in] src_strd
   1864  *  integer source stride
   1865  *
   1866  * @param[in] dst_strd
   1867  *  integer destination stride
   1868  *
   1869  ** @param[in] ngbr_avail
   1870  *  availability of neighbouring pixels
   1871  *
   1872  * @returns
   1873  *
   1874  * @remarks
   1875  *  None
   1876  *
   1877  *******************************************************************************/
   1878 void ih264_intra_pred_luma_16x16_mode_dc_ssse3(UWORD8 *pu1_src,
   1879                                                UWORD8 *pu1_dst,
   1880                                                WORD32 src_strd,
   1881                                                WORD32 dst_strd,
   1882                                                WORD32 ngbr_avail)
   1883 {
   1884     WORD8 u1_useleft, u1_usetop;
   1885     WORD32 dc_val;
   1886 
   1887     WORD32 dst_strd2, dst_strd3, dst_strd4;
   1888 
   1889     __m128i dc_val_16x8b;
   1890 
   1891     UNUSED(src_strd);
   1892 
   1893     u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
   1894     u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
   1895 
   1896     if(u1_useleft || u1_usetop)
   1897     {
   1898         WORD32 shft;
   1899         __m128i val_16x8b, zero_16x8b, sum_8x16b;
   1900 
   1901         dc_val = 0;
   1902         shft = 3;
   1903 
   1904         zero_16x8b = _mm_setzero_si128();
   1905 
   1906         if(u1_useleft)
   1907         {
   1908             UWORD8 *pu1_left;
   1909 
   1910             pu1_left = pu1_src + MB_SIZE - 1;
   1911 
   1912             val_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 15));
   1913             sum_8x16b = _mm_sad_epu8(zero_16x8b, val_16x8b);
   1914 
   1915             shft++;
   1916             dc_val += 8;
   1917             dc_val += _mm_extract_epi16(sum_8x16b, 0);
   1918             dc_val += _mm_extract_epi16(sum_8x16b, 4);
   1919         }
   1920         if(u1_usetop)
   1921         {
   1922             UWORD8 *pu1_top;
   1923 
   1924             pu1_top = pu1_src + MB_SIZE + 1;
   1925 
   1926             val_16x8b = _mm_loadu_si128((__m128i *)pu1_top);
   1927             sum_8x16b = _mm_sad_epu8(zero_16x8b, val_16x8b);
   1928 
   1929             shft++;
   1930             dc_val += 8;
   1931             dc_val += _mm_extract_epi16(sum_8x16b, 0);
   1932             dc_val += _mm_extract_epi16(sum_8x16b, 4);
   1933         }
   1934         dc_val = dc_val >> shft;
   1935     }
   1936     else
   1937         dc_val = 128;
   1938 
   1939     dc_val_16x8b =  _mm_set1_epi8(dc_val);
   1940 
   1941     dst_strd2 = dst_strd << 1;
   1942     dst_strd4 = dst_strd << 2;
   1943     dst_strd3 = dst_strd + dst_strd2;
   1944 
   1945     _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
   1946     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
   1947     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
   1948     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
   1949     pu1_dst += dst_strd4;
   1950 
   1951     _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
   1952     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
   1953     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
   1954     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
   1955     pu1_dst += dst_strd4;
   1956 
   1957     _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
   1958     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
   1959     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
   1960     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
   1961     pu1_dst += dst_strd4;
   1962 
   1963     _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
   1964     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
   1965     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
   1966     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
   1967 }
   1968 
   1969 /**
   1970  *******************************************************************************
   1971  *
   1972  *ih264_intra_pred_luma_16x16_mode_plane_ssse3
   1973  *
   1974  * @brief
   1975  *  Perform Intra prediction for  luma_16x16 mode:PLANE
   1976  *
   1977  * @par Description:
   1978  *  Perform Intra prediction for  luma_16x16 mode:PLANE, described in sec 8.3.3.4
   1979  *
   1980  * @param[in] pu1_src
   1981  *  UWORD8 pointer to the source
   1982  *
   1983  * @param[out] pu1_dst
   1984  *  UWORD8 pointer to the destination
   1985  *
   1986  * @param[in] src_strd
   1987  *  integer source stride
   1988  *
   1989  * @param[in] dst_strd
   1990  *  integer destination stride
   1991  *
   1992  * @param[in] ngbr_avail
   1993  * availability of neighbouring pixels(Not used in this function)
   1994  *
   1995  * @returns
   1996  *
   1997  * @remarks
   1998  *  None
   1999  *
   2000  *******************************************************************************/
   2001 void ih264_intra_pred_luma_16x16_mode_plane_ssse3(UWORD8 *pu1_src,
   2002                                                   UWORD8 *pu1_dst,
   2003                                                   WORD32 src_strd,
   2004                                                   WORD32 dst_strd,
   2005                                                   WORD32 ngbr_avail)
   2006 {
   2007     UWORD8 *pu1_left, *pu1_top;
   2008     WORD32 a, b, c;
   2009 
   2010     __m128i rev_8x16b, mul_8x16b, zero_16x8b;
   2011 
   2012     UNUSED(src_strd);
   2013     UNUSED(ngbr_avail);
   2014 
   2015     pu1_top = pu1_src + MB_SIZE + 1;
   2016     pu1_left = pu1_src + MB_SIZE - 1;
   2017 
   2018     rev_8x16b = _mm_setr_epi16(0x0f0e, 0x0d0c, 0x0b0a, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
   2019     //used to reverse the order of 16-bit values in a vector
   2020 
   2021     mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
   2022     zero_16x8b = _mm_setzero_si128();
   2023 
   2024     //calculating a, b and c
   2025     {
   2026         WORD32 h, v;
   2027 
   2028         __m128i h_val1_16x8b, h_val2_16x8b;
   2029         __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
   2030         __m128i v_val1_16x8b, v_val2_16x8b;
   2031         __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
   2032         __m128i hv_val_4x32b;
   2033 
   2034         a = (pu1_top[15] + pu1_left[-15]) << 4;
   2035 
   2036         h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8));
   2037         h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 1));
   2038         v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 15));
   2039         v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 6));
   2040 
   2041         h_val1_8x16b = _mm_unpacklo_epi8(h_val1_16x8b, zero_16x8b);
   2042         h_val2_8x16b = _mm_unpacklo_epi8(h_val2_16x8b, zero_16x8b);
   2043         v_val1_8x16b = _mm_unpacklo_epi8(v_val1_16x8b, zero_16x8b);
   2044         v_val2_8x16b = _mm_unpacklo_epi8(v_val2_16x8b, zero_16x8b);
   2045 
   2046         h_val2_8x16b = _mm_shuffle_epi8(h_val2_8x16b, rev_8x16b);
   2047         v_val1_8x16b = _mm_shuffle_epi8(v_val1_8x16b, rev_8x16b);
   2048 
   2049         h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
   2050         v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);
   2051 
   2052         h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
   2053         v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);
   2054 
   2055         hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);
   2056         hv_val_4x32b = _mm_hadd_epi32(hv_val_4x32b, hv_val_4x32b);
   2057 
   2058         h = _mm_extract_epi16(hv_val_4x32b, 0);
   2059         v = _mm_extract_epi16(hv_val_4x32b, 2);
   2060         h = (h << 16) >> 16;
   2061         v = (v << 16) >> 16;
   2062 
   2063         b = ((h << 2) + h + 32) >> 6;
   2064         c = ((v << 2) + v + 32) >> 6;
   2065     }
   2066 
   2067     //using a, b and c to compute the fitted plane values
   2068     {
   2069         __m128i const_8x16b, b_8x16b, c_8x16b, c2_8x16b;
   2070         __m128i res1_l_8x16b, res1_h_8x16b;
   2071         __m128i res2_l_8x16b, res2_h_8x16b;
   2072         __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b;
   2073         __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b;
   2074 
   2075         b_8x16b = _mm_set1_epi16(b);
   2076         c_8x16b = _mm_set1_epi16(c);
   2077         c2_8x16b = _mm_set1_epi16(c << 1);
   2078         const_8x16b = _mm_set1_epi16(a - c*7 + 16);
   2079 
   2080         res1_h_8x16b = _mm_mullo_epi16(mul_8x16b, b_8x16b);
   2081         //contains {b*1, b*2, b*3,... b*8}
   2082 
   2083         res1_l_8x16b = _mm_shuffle_epi8(res1_h_8x16b, rev_8x16b);
   2084         res1_l_8x16b = _mm_srli_si128(res1_l_8x16b, 2);
   2085         res1_l_8x16b = _mm_sub_epi16(zero_16x8b, res1_l_8x16b);
   2086         //contains {-b*7, -b*6,... -b*1, b*0}
   2087 
   2088         // rows 1, 2
   2089         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
   2090         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
   2091         res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c_8x16b);
   2092         res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c_8x16b);
   2093 
   2094         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
   2095         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
   2096         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
   2097         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
   2098 
   2099         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
   2100         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
   2101 
   2102         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
   2103         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
   2104 
   2105         // rows 3, 4
   2106         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
   2107         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
   2108         res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
   2109         res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
   2110 
   2111         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
   2112         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
   2113         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
   2114         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
   2115 
   2116         pu1_dst += dst_strd << 1;
   2117 
   2118         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
   2119         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
   2120 
   2121         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
   2122         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
   2123 
   2124         // rows 5, 6
   2125         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
   2126         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
   2127         res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
   2128         res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
   2129 
   2130         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
   2131         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
   2132         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
   2133         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
   2134 
   2135         pu1_dst += dst_strd << 1;
   2136 
   2137         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
   2138         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
   2139 
   2140         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
   2141         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
   2142 
   2143         // rows 7, 8
   2144         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
   2145         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
   2146         res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
   2147         res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
   2148 
   2149         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
   2150         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
   2151         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
   2152         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
   2153 
   2154         pu1_dst += dst_strd << 1;
   2155 
   2156         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
   2157         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
   2158 
   2159         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
   2160         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
   2161 
   2162         // rows 9, 10
   2163         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
   2164         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
   2165         res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
   2166         res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
   2167 
   2168         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
   2169         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
   2170         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
   2171         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
   2172 
   2173         pu1_dst += dst_strd << 1;
   2174 
   2175         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
   2176         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
   2177 
   2178         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
   2179         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
   2180 
   2181         // rows 11, 12
   2182         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
   2183         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
   2184         res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
   2185         res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
   2186 
   2187         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
   2188         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
   2189         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
   2190         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
   2191 
   2192         pu1_dst += dst_strd << 1;
   2193 
   2194         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
   2195         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
   2196 
   2197         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
   2198         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
   2199 
   2200         // rows 13, 14
   2201         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
   2202         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
   2203         res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
   2204         res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
   2205 
   2206         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
   2207         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
   2208         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
   2209         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
   2210 
   2211         pu1_dst += dst_strd << 1;
   2212 
   2213         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
   2214         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
   2215 
   2216         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
   2217         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
   2218 
   2219         // rows 15, 16
   2220         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
   2221         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
   2222         res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
   2223         res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
   2224 
   2225         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
   2226         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
   2227         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
   2228         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
   2229 
   2230         pu1_dst += dst_strd << 1;
   2231 
   2232         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
   2233         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
   2234 
   2235         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
   2236         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
   2237     }
   2238 }
   2239