/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_intra_pred_filters_neon_intr.c
*
* @brief
*  Contains function definitions for intra prediction interpolation filters
*
*
* @author
*  Yogeswaran RS
*
* @par List of Functions:
*  - ihevc_intra_pred_luma_planar()
*  - ihevc_intra_pred_luma_dc()
*  - ihevc_intra_pred_luma_horz()
*  - ihevc_intra_pred_luma_ver()
*  - ihevc_intra_pred_luma_mode2()
*  - ihevc_intra_pred_luma_mode_18_34()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>

#include "ihevc_typedefs.h"
#include "ihevc_intra_pred.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "arm_neon.h"
#include "ihevc_platform_macros.h"
#include "ihevc_common_tables.h"

/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64



/*****************************************************************************/
/* Table Look-up                                                             */
/*****************************************************************************/

#define GET_BITS(y,x) (((y) & (1 << (x))) != 0)
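/* Extracts bit x of y as 0 or 1,
   e.g. GET_BITS(0x5, 0) = 1, GET_BITS(0x5, 1) = 0, GET_BITS(0x5, 2) = 1 */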

/*****************************************************************************/
/* Function Definition                                                       */
/*****************************************************************************/

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for pu1_ref substitution
*
*
* @par Description:
*    Reference substitution process for samples unavailable for prediction
*    Refer to section 8.4.4.2.2
*
* @param[in] pu1_top_left
*  UWORD8 pointer to the top-left
*
* @param[in] pu1_top
*  UWORD8 pointer to the top
*
* @param[in] pu1_left
*  UWORD8 pointer to the left
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] nt
*  WORD32 transform block size
*
* @param[in] nbr_flags
*  WORD32 neighbor availability flags
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  WORD32 Destination stride
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
                                                     UWORD8 *pu1_top,
                                                     UWORD8 *pu1_left,
                                                     WORD32 src_strd,
                                                     WORD32 nt,
                                                     WORD32 nbr_flags,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 dst_strd)
{
    UWORD8 pu1_ref;
    WORD32 dc_val, i;
    WORD32 total_samples = (4 * nt) + 1;
    WORD32 two_nt = 2 * nt;
    WORD32 three_nt = 3 * nt;
    WORD32 get_bits;
    WORD32 next;
    WORD32 bot_left, left, top, tp_right, tp_left;
    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
    UNUSED(dst_strd);
    dc_val = 1 << (BIT_DEPTH - 1);

    /* Neighbor Flag Structure */
    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
              1         4         4     4         4
     */
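    /* e.g. nbr_flags = 0x10110 decodes as tp_left = 1, tp_right = 0,
       top = 1, left = 1, bot_left = 0, i.e. only the top-right and
       bottom-left neighbors are unavailable. */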

    /* If no neighbor flags are present, fill the neighbor samples with DC value */
    if(nbr_flags == 0)
    {
        for(i = 0; i < total_samples; i++)
        {
            pu1_dst[i] = dc_val;
        }
    }
    else
    {
        /* Else fill the corresponding samples */
        pu1_dst[two_nt] = *pu1_top_left;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;
        UWORD8 *pu1_top_tmp = pu1_top;
        pu1_dst_tmp2 += two_nt + 1;

        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];

        uint8x8_t src;
        for(i = two_nt; i > 0; i -= 8)
        {
            src = vld1_u8(pu1_top_tmp);
            pu1_top_tmp += 8;
            vst1_u8(pu1_dst_tmp2, src);
            pu1_dst_tmp2 += 8;
        }
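        /* pu1_dst now holds the linear reference array:
           pu1_dst[0 .. two_nt - 1]      - left column, bottom-most sample first
           pu1_dst[two_nt]               - top-left sample
           pu1_dst[two_nt + 1 .. 4 * nt] - top row, left to right */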

        if(nt <= 8)
        {
            /* 1 bit extraction for all the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
            bot_left = nbr_flags & 0x1;
            left = (nbr_flags & 0x10) >> 4;
            top = (nbr_flags & 0x100) >> 8;
            tp_right = (nbr_flags & 0x1000) >> 12;

            next = 1;

            /* If bottom-left is not available, reverse substitution process */
            if(bot_left == 0)
            {
                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };

                /* Check for the 1st available sample from bottom-left */
                while(!a_nbr_flag[next])
                    next++;

                /* If left, top-left are available */
                if(next <= 2)
                {
                    idx = nt * next;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
                else /* If top, top-right are available */
                {
                    /* idx is changed to copy 1 pixel value for top-left, if top-left is not available */
                    idx = (nt * (next - 1)) + 1;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Forward Substitution Process */
            /* If left is unavailable, copy the last bottom-left value */

            if(left == 0)
            {
                uint8x8_t dup_pu1_dst1;
                UWORD8 *pu1_dst_const_nt = pu1_dst;
                pu1_dst_const_nt += nt;

                if(0 == (nt & 7))
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
                        pu1_dst_const_nt += 8;
                    }
                }
                else
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
                        pu1_dst_const_nt += 4;
                    }
                }
            }
            if(tp_left == 0)
                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
            if(top == 0)
            {
                if(0 == (nt & 7))
                {
                    uint8x8_t dup_pu1_dst2;
                    UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
                    pu1_dst_const_two_nt_1 += (two_nt + 1);
                    dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
                        pu1_dst_const_two_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = 0; i < nt; i++)
                        pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
                }
            }
            if(tp_right == 0)
            {
                uint8x8_t dup_pu1_dst3;
                UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
                pu1_dst_const_three_nt_1 += (three_nt + 1);
                dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]);
                if(0 == (nt & 7))
                {
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
                        pu1_dst_const_three_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
                        pu1_dst_const_three_nt_1 += 4;
                    }
                }
            }
        }
        if(nt == 16)
        {
            WORD32 nbr_flags_temp = 0;
            nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
                            + ((nbr_flags & 0x300) >> 4)
                            + ((nbr_flags & 0x3000) >> 6)
                            + ((nbr_flags & 0x10000) >> 8);
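            /* When nt == 16 each 4-bit field of nbr_flags carries only 2
               useful bits (one flag bit per 8-pel unit), so the fields are
               packed into contiguous 2-bit fields: bits [1:0] bottom-left,
               [3:2] left, [5:4] top, [7:6] top-right, and bit 8 top-left.
               e.g. all-available 0x13333 packs to 0x1FF. */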

            /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (see section 8.4.4.2.2) */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */

                if(nbr_id_from_bl == 64)
                    nbr_id_from_bl = 32;

                if(nbr_id_from_bl == 32)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right;  8 pels per nbr bit */
                    }
                }
                /* Reverse Substitution Process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                    {
                        pu1_dst[i] = pu1_ref;
                    }
                }
            }

            /* Loop over the 4*nt + 1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T16_4NT) + 1))
            {
                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T16_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags_temp, 8);

                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
            }
        }

        if(nt == 32)
        {
            /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (see section 8.4.4.2.2) */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */

                if(nbr_id_from_bl == 64)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
                    {
                        /* top left not available */
                        nbr_id_from_bl++;
                        /* top and top right;  8 pels per nbr bit */
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
                    }
                }
                /* Reverse Substitution Process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Loop over the 4*nt + 1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T32_4NT) + 1))
            {
                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T32_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags, 16);
                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
            }
        }

    }

}

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for ref_filtering
*
*
* @par Description:
*    Reference DC filtering for neighboring samples dependent on TU size and
*    mode. Refer to section 8.4.4.2.3 in the standard
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] nt
*  integer Transform Block size
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] mode
*  integer intra prediction mode
*
* @param[in] strong_intra_smoothing_enable_flag
*  WORD32 flag enabling strong (bilinear) smoothing for 32x32 blocks
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
                                             WORD32 nt,
                                             UWORD8 *pu1_dst,
                                             WORD32 mode,
                                             WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i = 0;
    WORD32 four_nt = 4 * nt;

    WORD32 src_4nt;
    WORD32 src_0nt;
    /* Naming follows the functionality, e.g. pu1_src_tmp_1 denotes pu1_src + 1,      */
    /* src_val_1 loads from pointer pu1_src_tmp_1, add_res holds the sum of 2 values  */
    UWORD8 *pu1_src_tmp_0 = pu1_src;
    UWORD8 *pu1_src_tmp_1;
    UWORD8 *pu1_src_tmp_2;
    UWORD8 *pu1_dst_tmp_0 = pu1_dst;
    UWORD8 *pu1_dst_tmp_1;

    uint8x8_t src_val_0, src_val_2;
    uint8x8_t src_val_1, shift_res;
    uint8x8_t dup_const_2;
    uint16x8_t mul_res, add_res;
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    shift_res = vdup_n_u8(0);

    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
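    /* Each gau1_intra_pred_ref_filter entry carries one filter-enable bit per
       transform size; bit (CTZ(nt) - 2) selects it:
       nt = 4 -> bit 0, nt = 8 -> bit 1, nt = 16 -> bit 2, nt = 32 -> bit 3 */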

    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            for(i = four_nt; i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;
                vst1_u8(pu1_dst_tmp_0, src_val_0);
                pu1_dst_tmp_0 += 8;
            }
            pu1_dst[four_nt] = pu1_src[four_nt];
        }
    }

    else
    {
        /* If strong intra smoothing is enabled and transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong Intra Filtering */
            abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
                            - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
                            - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }

        src_4nt = pu1_src[4 * nt];
        src_0nt = pu1_src[0];
        /* Strong filtering of reference samples */
        if(1 == bi_linear_int_flag)
        {
            WORD32 two_nt = four_nt >> 1;

            WORD32 pu1_src_0_val = pu1_src[0];
            WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
            WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];

            WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
            uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);

            WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
            uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);

            const UWORD8 *const_col_i;
            uint8x8_t const_col_i_val;
            uint16x8_t prod_val_1;
            uint16x8_t prod_val_2;
            uint16x8_t prod_val_3;
            uint16x8_t prod_val_4;
            uint8x8_t res_val_1;
            uint8x8_t res_val_2;
            uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
            uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
            uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
            pu1_dst_tmp_0 = pu1_dst + 1;
            pu1_dst_tmp_1 = pu1_dst + two_nt + 1;

            const_col_i = gau1_ihevc_planar_factor + 1;

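            /* Bilinear (strong) filtering per section 8.4.4.2.3; for i = 1 .. two_nt - 1:
               pu1_dst[i]          = ((two_nt - i) * pu1_src[0]      + i * pu1_src[2 * nt] + 32) >> 6
               pu1_dst[two_nt + i] = ((two_nt - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6
               where vrshrn_n_u16(x, 6) supplies the rounding (+32) and the shift
               (two_nt is 64 here, since this path runs only for nt == 32) */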
            for(i = two_nt; i > 0; i -= 8)
            {
                const_col_i_val = vld1_u8(const_col_i);
                const_col_i += 8;

                prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
                prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);

                res_val_1 = vrshrn_n_u16(prod_val_2, 6);
                prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);

                vst1_u8(pu1_dst_tmp_0, res_val_1);
                pu1_dst_tmp_0 += 8;
                prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);

                res_val_2 = vrshrn_n_u16(prod_val_4, 6);
                vst1_u8(pu1_dst_tmp_1, res_val_2);
                pu1_dst_tmp_1 += 8;
            }
            pu1_dst[2 * nt] = pu1_src[2 * nt];
        }
        else
        {
            pu1_src_tmp_1 = pu1_src + 1;
            pu1_src_tmp_2 = pu1_src + 2;
            pu1_dst_tmp_0 += 1;

            dup_const_2 = vdup_n_u8(2);

            /* Extremities untouched */
            pu1_dst[0] = pu1_src[0];

            /* To avoid problems when dest and src share the same pointer, the first load is
             * done outside the loop and each next load is done before the previous store */

            /* Perform bilinear filtering of reference samples */
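            /* Each filtered sample is (src[i - 1] + 2 * src[i] + src[i + 1] + 2) >> 2,
               the [1 2 1]/4 smoothing filter; vrshrn_n_u16(mul_res, 2) supplies
               the rounding (+2) and the shift */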
            for(i = (four_nt - 1); i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;

                src_val_2 = vld1_u8(pu1_src_tmp_2);
                pu1_src_tmp_2 += 8;

                src_val_1 = vld1_u8(pu1_src_tmp_1);
                pu1_src_tmp_1 += 8;

                if(i < four_nt - 1)
                {
                    vst1_u8(pu1_dst_tmp_0, shift_res);
                    pu1_dst_tmp_0 += 8;
                }

                add_res = vaddl_u8(src_val_0, src_val_2);

                mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
                shift_res = vrshrn_n_u16(mul_res, 2);
            }
            vst1_u8(pu1_dst_tmp_0, shift_res);
            pu1_dst_tmp_0 += 8;
        }
        pu1_dst[4 * nt] = src_4nt;
        pu1_dst[0] = src_0nt;
    }

}


/**
*******************************************************************************
*
* @brief
*   Intra prediction interpolation filter for luma planar
*
* @par Description:
*   Planar intra prediction with reference neighboring samples location
*   pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[in] src_strd
*  integer source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intra prediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
                                           WORD32 src_strd,
                                           UWORD8 *pu1_dst,
                                           WORD32 dst_strd,
                                           WORD32 nt,
                                           WORD32 mode)
{
    /* Variables are named after the values they hold: (nt - 1 - col) --> const_nt_1_col        */
    /* (const denotes gau1_ihevc_planar_factor); const_nt_1_col values are loaded into a d reg  */
    /* pu1_ref[nt - 1] --> pu1_ref_nt_1; its value duplicated across a d reg is pu1_ref_nt_1_dup */
    /* log2nt + 1 is taken care of while assigning the values themselves                        */
    /* In the width-multiple-of-4 case the row loop is also unrolled by 2, with the store handled accordingly */

    WORD32 row, col = 0;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt, three_nt;
    UWORD8 *pu1_ref_two_nt_1;
    UWORD8 *pu1_dst_tmp;
    const UWORD8 *const_nt_1_col;
    uint8x8_t const_nt_1_col_t;
    const UWORD8 *const_col_1;
    uint8x8_t const_col_1_t;
    uint8_t const_nt_1_row;
    uint8x8_t const_nt_1_row_dup;
    uint8_t const_row_1;
    uint8x8_t const_row_1_dup;
    uint8_t const_nt = nt;
    uint16x8_t const_nt_dup;
    uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
    uint8x8_t pu1_ref_nt_1_dup;
    uint8_t pu1_ref_two_nt_1_row;
    uint8_t pu1_ref_three_nt_1;
    uint8x8_t pu1_ref_two_nt_1_row_dup;
    uint8x8_t pu1_ref_two_nt_1_t;
    uint8x8_t pu1_ref_three_nt_1_dup;
    uint16x8_t prod_t1;
    uint16x8_t prod_t2;
    uint16x8_t sto_res_tmp;
    uint8x8_t sto_res;
    int16x8_t log2nt_dup;
    UNUSED(src_strd);
    UNUSED(mode);
    log2nt_plus1 = 32 - CLZ(nt);
    two_nt = 2 * nt;
    three_nt = 3 * nt;
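    /* Planar prediction (section 8.4.4.2.4): for each sample,
       dst[row][col] = ((nt - 1 - col) * pu1_ref[two_nt - 1 - row]
                      + (col + 1)      * pu1_ref[three_nt + 1]
                      + (nt - 1 - row) * pu1_ref[two_nt + 1 + col]
                      + (row + 1)      * pu1_ref[nt - 1]
                      + nt) >> log2nt_plus1 */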
    /* Loops have been unrolled considering the fact that the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        pu1_dst_tmp = pu1_dst;
        const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

        const_col_1 = gau1_ihevc_planar_factor + 1;
        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row++)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

            const_col_1 = gau1_ihevc_planar_factor + 1;
            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 8)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 8;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 8;
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 8;
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);
                vst1_u8(pu1_dst_tmp, sto_res);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    /* Loops have been unrolled considering the fact that the width is a multiple of 4 */
    /* If the column count is a multiple of 4, the height should be a multiple of 2    */
    else
    {
        uint8x8_t const_row_1_dup1;
        uint8x8_t pu1_ref_two_nt_1_t1;
        uint8x8_t const_nt_1_col_t1;
        uint8x8_t const_col_1_t1;
        uint8x8_t pu1_ref_two_nt_1_row_dup1;
        uint8x8_t const_nt_1_row_dup1;

        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row += 2)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
            pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
            const_nt_1_row = nt - 2 - row;
            const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
            const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);
            const_row_1 = row + 2;
            const_row_1_dup1 = vdup_n_u8(const_row_1);
            const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;

            const_col_1 = gau1_ihevc_planar_factor + 1;

            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 4)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 4;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 4;
                const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));

                pu1_dst_tmp = pu1_dst;
                const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);

                const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 4;
                const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);

                pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);

                vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
                pu1_dst_tmp += dst_strd;

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }

}
/* INTRA_PRED_LUMA_PLANAR */

/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma dc
*
* @par Description:
*    Intra prediction for DC mode with reference neighboring samples location
*    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[in] src_strd
*  integer source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intra prediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
    WORD32 i = 0;
    WORD32 row = 0, col = 0, col_count;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt = 0;
    uint16x8_t ref_load_q;
    uint16x8_t three_dc_val_t;
    uint8x8_t sto_res_tmp;
    uint8x8_t sto_res_tmp1;
    uint8x8_t sto_res_tmp2;
    uint8x8_t sto_res_tmp3;
    uint8x8_t sto_res_tmp4;
    uint8x8_t dc_val_t;

    UWORD8 *pu1_ref_tmp;
    UWORD8 *pu1_ref_tmp1;
    UWORD8 *pu1_dst_tmp;
    UWORD8 *pu1_dst_tmp1;
    UWORD8 *pu1_dst_tmp2;
    UNUSED(src_strd);
    UNUSED(mode);

    /* log2nt + 1 is taken care of while assigning the values themselves */
    log2nt_plus1 = 32 - CLZ(nt);

    /* Loops have been unrolled considering the fact that the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        uint8x8_t ref_load1;
        uint8x8_t ref_load2;
        uint16x4_t acc_dc_pair1;
        uint32x2_t acc_dc_pair2;
        uint64x1_t acc_dc = vdup_n_u64(col);

        two_nt = 2 * nt;
        pu1_ref_tmp = pu1_ref + nt;
        pu1_ref_tmp1 = pu1_ref + two_nt + 1;

        for(i = two_nt; i > nt; i -= 8)
        {
            ref_load1 = vld1_u8(pu1_ref_tmp);
            pu1_ref_tmp += 8;
            acc_dc_pair1 = vpaddl_u8(ref_load1);

            ref_load2 = vld1_u8(pu1_ref_tmp1);
            pu1_ref_tmp1 += 8;

            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);

            acc_dc_pair1 = vpaddl_u8(ref_load2);
            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
        }

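        /* The vpaddl/vpadal chain pairwise-accumulates the nt left neighbors
           (pu1_ref[nt .. two_nt - 1]) and the nt top neighbors
           (pu1_ref[two_nt + 1 .. 3 * nt]) into a single 64-bit sum */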
        dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
        dc_val_t = vdup_n_u8(dc_val);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val += 2;
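
        /* dc_val = (sum of nt left + nt top neighbors + nt) >> (log2nt + 1).
           For nt < 32 the block boundary is then smoothed:
           first row    : (pu1_ref[two_nt + 1 + col] + 3 * dc_val + 2) >> 2
           first column : (pu1_ref[two_nt - 1 - row] + 3 * dc_val + 2) >> 2
           corner       : (pu1_ref[two_nt - 1] + 2 * dc_val + pu1_ref[two_nt + 1] + 2) >> 2 */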

        three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
        pu1_ref_tmp = pu1_ref + two_nt + 1 + 0;
        pu1_dst_tmp = pu1_dst;


        if(nt == 32)
        {
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    vst1_u8(pu1_dst_tmp, dc_val_t);
                    pu1_dst_tmp += 8;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = nt; col > 0; col -= 8)
            {
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp += 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);
                vst1_u8(pu1_dst_tmp, sto_res_tmp);
                pu1_dst_tmp += 8;
            }

            pu1_ref_tmp = pu1_ref + two_nt - 9;
            pu1_dst_tmp = pu1_dst + dst_strd;
            col_count = nt - 8;

            /* Except the first row, the remaining rows are done here                        */
            /* Both column and row have been unrolled by 8                                   */
            /* The stores have been arranged to match the unrolling                          */
            /* Except for the 1st column of the remaining rows (other than the 1st row), the */
            /* values are constant, hence they are filled with a constant value and stored   */
            /* If the column count is greater than 8, the remaining values are constant,     */
            /* which is handled in the inner for loop                                        */

            for(row = nt; row > 0; row -= 8)
            {
                pu1_dst_tmp1 = pu1_dst_tmp + 8;
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp -= 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);

                sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
                sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;
                /* For the last set of 8 rows only 7 rows need to be updated since the first row is already written */
                if(row != 8)
                    vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                for(col = col_count; col > 0; col -= 8)
                {
                    pu1_dst_tmp2 = pu1_dst_tmp1;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;

                    /* For the last set of 8 rows only 7 rows need to be updated since the first row is already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
                }
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
    /* Loops have been unrolled considering the fact that the width is a multiple of 4 */
    else
    {
        WORD32 acc_dc;
        two_nt = 2 * nt;

        acc_dc = 0;
        pu1_ref_tmp = pu1_ref + nt + 1;
        for(i = nt; i < two_nt; i++)
        {
            acc_dc += pu1_ref[i];
            acc_dc += pu1_ref_tmp[i];
        }
        dc_val = (acc_dc + nt) >> (log2nt_plus1);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val = three_dc_val + 2;
        dc_val_t = vdup_n_u8(dc_val);

        if(nt == 32)
        {
            pu1_dst_tmp = pu1_dst;
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = 1; col < nt; col++)
            {
                pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
            }

            pu1_dst_tmp = pu1_dst + dst_strd + 0;
            /* Since the first row is already updated above, the loop count is nt - 1 */
            for(row = nt - 1; row > 0; row -= 1)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }

            for(row = 1; row < nt; row++)
            {
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
}
/* INTRA_PRED_LUMA_DC */

/**
*******************************************************************************
*
* @brief
*   Intra prediction interpolation filter for luma horizontal mode
*
* @par Description:
*   Horizontal intra prediction with reference neighboring samples location
*   pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[in] src_strd
*  integer source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intra prediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{

    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;


    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD32 pu1_val;
    uint8x8_t pu1_val_two_nt_1_row;
    if(nt == 32)
    {
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            pu1_val = pu1_ref[two_nt - 1 - row];
            pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
            for(col = nt; col > 0; col -= 8)
            {
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    else

    /* The row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables */
    /* Variables are named according to the operation (instruction) they take part in    */
    /* (e.g. shift_val, which contains the shifted value,                                */
    /* add_sat, which holds the add-and-saturate result)                                 */
    /* Loops are unrolled by 4 and 8, given that the input width is a multiple of 4 or 8 */
    /* Rows and columns are unrolled by 4 when the width is a multiple of 4              */
    {
        if(0 != (nt & 7))      /* cond for multiple of 4 */
        {
            UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
            UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
            UWORD8 *pu1_dst_4 = pu1_dst;
            UWORD8 *pu1_dst_4_tmp = pu1_dst;

            uint32x2_t pu1_ref_val1, pu1_ref_val2;
            uint8x8_t dup_sub, round_val, dup_val;
            uint16x8_t dup_add, sub_val;
            int16x8_t shift_val, add_sat;

            pu1_ref_val1 = vdup_n_u32(0);
            pu1_ref_val2 = vdup_n_u32(0);

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);

            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_4_two_nt_plus1 += (two_nt + 1);

            pu1_ref_4_two_nt_minus_nt += (two_nt - nt);

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
                    shift_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);

                    add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(add_sat);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
                    pu1_dst_4 += dst_strd;

                    pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;
                }
                /* worst cases */
                pu1_ref_4_two_nt_minus_nt += 3;
                pu1_ref_4_two_nt_plus1 += 4;
                pu1_dst_4 = (pu1_dst_4_tmp + 4);
            }

        }

        /* dup_1 - dup_8 hold the values duplicated from the loaded source                   */
        /* Variables are named according to the operation (instruction) they take part in    */
        /* Loops are unrolled by 4 and 8, given that the input width is a multiple of 4 or 8 */
        /* Rows and columns are unrolled by 8 when the width is a multiple of 8              */

        else
        {
            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;

            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;

            uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
            uint16x8_t sub_res, dup_add;
            int16x8_t shift_res, add_res;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_tmp_1 += (two_nt + 1);
            pu1_ref_tmp_2 += (two_nt - 1);

   1313             for(col = nt; col > 0; col -= 8)
   1314             {
   1315                 src_tmp = vld1_u8(pu1_ref_tmp_1);
   1316                 pu1_ref_tmp_1 += 8;
   1317 
   1318                 sub_res = vsubl_u8(src_tmp, dup_sub);
   1319                 shift_res  = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
   1320                 add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
   1321                 round_val = vqmovun_s16(add_res);
   1322                 vst1_u8(pu1_dst_tmp_1, round_val);
   1323                 pu1_dst_tmp_1 += 8;
   1324             }
   1325 
   1326             for(row = nt; row > 0; row -= 8)
   1327             {
   1328                 pu1_ref_tmp_2 -= 8;
   1329 
   1330                 src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
   1331                 rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */
   1332 
   1333                 dup_1 = vdup_lane_u8(rev_res, 0);
   1334                 dup_2 = vdup_lane_u8(rev_res, 1);
   1335                 dup_3 = vdup_lane_u8(rev_res, 2);
   1336                 dup_4 = vdup_lane_u8(rev_res, 3);
   1337                 dup_5 = vdup_lane_u8(rev_res, 4);
   1338                 dup_6 = vdup_lane_u8(rev_res, 5);
   1339                 dup_7 = vdup_lane_u8(rev_res, 6);
   1340                 dup_8 = vdup_lane_u8(rev_res, 7);
   1341 
   1342                 for(col = nt; col > 0; col -= 8)
   1343                 {
   1344                     pu1_dst_tmp_2 = pu1_dst_tmp_3;
   1345 
   1346                     vst1_u8(pu1_dst_tmp_2, dup_1);
   1347                     pu1_dst_tmp_2 += dst_strd;
   1348 
   1349                     vst1_u8(pu1_dst_tmp_2, dup_2);
   1350                     pu1_dst_tmp_2 += dst_strd;
   1351 
   1352                     vst1_u8(pu1_dst_tmp_2, dup_3);
   1353                     pu1_dst_tmp_2 += dst_strd;
   1354 
   1355                     vst1_u8(pu1_dst_tmp_2, dup_4);
   1356                     pu1_dst_tmp_2 += dst_strd;
   1357 
   1358                     vst1_u8(pu1_dst_tmp_2, dup_5);
   1359                     pu1_dst_tmp_2 += dst_strd;
   1360 
   1361                     vst1_u8(pu1_dst_tmp_2, dup_6);
   1362                     pu1_dst_tmp_2 += dst_strd;
   1363 
   1364                     vst1_u8(pu1_dst_tmp_2, dup_7);
   1365                     pu1_dst_tmp_2 += dst_strd;
   1366 
    1367                 /* For the last set of 8 rows, only 7 rows need updating since the first row is already written */
   1368                     if(row != 8)
   1369                         vst1_u8(pu1_dst_tmp_2, dup_8);
   1370                     pu1_dst_tmp_2 += dst_strd;
   1371 
   1372                     pu1_dst_tmp_3 += 8;
   1373                 }
   1374                 pu1_dst_tmp_2 -= (nt - 8);
   1375                 pu1_dst_tmp_3 = pu1_dst_tmp_2;
   1376             }
   1377         }
   1378     }
   1379 }
   1380 /* INTRA_PRED_LUMA_HORZ */
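
         /* Illustrative scalar equivalent of the horizontal prediction paths above
          * (a minimal sketch with a hypothetical name, not called by the library):
          * in the non-32 paths, row 0 is gradient-filtered from the top reference
          * row and the remaining rows replicate the left reference column. The
          * vqaddq_s16/vqmovun_s16 pair above performs the same add with unsigned
          * 8-bit saturation. */
         static void ihevc_intra_pred_luma_horz_scalar_sketch(UWORD8 *pu1_ref,
                                                              UWORD8 *pu1_dst,
                                                              WORD32 dst_strd,
                                                              WORD32 nt)
         {
             WORD32 row, col, s2_predpixel;
             WORD32 two_nt = 2 * nt;

             /* Row 0: pu1_ref[two_nt - 1] plus half the horizontal gradient of the
              * top reference, saturated to [0, 255] */
             for(col = 0; col < nt; col++)
             {
                 s2_predpixel = pu1_ref[two_nt - 1]
                              + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1);
                 pu1_dst[col] = (UWORD8)(s2_predpixel < 0 ? 0 :
                                         (s2_predpixel > 255 ? 255 : s2_predpixel));
             }

             /* Remaining rows: replicate the left reference sample of that row */
             for(row = 1; row < nt; row++)
                 for(col = 0; col < nt; col++)
                     pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];
         }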
   1381 
   1382 /**
   1383 *******************************************************************************
   1384 *
   1385 * @brief
    1386 *    Intra prediction interpolation filter for luma vertical mode.
   1387 *
   1388 * @par Description:
    1389 *    Vertical intraprediction with reference neighboring samples location
    1390 *    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
   1391 *
    1392 * @param[in] pu1_ref
    1393 *  UWORD8 pointer to the reference samples
   1394 *
   1395 * @param[out] pu1_dst
   1396 *  UWORD8 pointer to the destination
   1397 *
   1398 * @param[in] src_strd
   1399 *  integer source stride
   1400 *
   1401 * @param[in] dst_strd
   1402 *  integer destination stride
   1403 *
   1404 * @param[in] nt
   1405 *  integer Transform Block size
   1406 *
    1407 * @param[in] mode
    1408 *  integer intraprediction mode
   1409 *
   1410 * @returns
   1411 *
   1412 * @remarks
   1413 *  None
   1414 *
   1415 *******************************************************************************
   1416 */
   1417 
   1418 void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
   1419                                         WORD32 src_strd,
   1420                                         UWORD8 *pu1_dst,
   1421                                         WORD32 dst_strd,
   1422                                         WORD32 nt,
   1423                                         WORD32 mode)
   1424 {
   1425     WORD32 row, col;
   1426     WORD32 two_nt;
   1427     UNUSED(src_strd);
   1428     UNUSED(mode);
   1429 
   1430     two_nt = 2 * nt;
   1431 
   1432     UWORD8 *pu1_dst_tmp = pu1_dst;
   1433     UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
   1434     uint8x8_t pu1_val_two_nt_1_col;
   1435     if(nt == 32)
   1436     {
   1437         pu1_dst_tmp = pu1_dst;
   1438         for(row = 0; row < nt; row++)
   1439         {
   1440             for(col = nt; col > 0; col -= 8)
   1441             {
   1442                 pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
   1443                 pu1_ref_tmp_1 += 8;
   1444                 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
   1445                 pu1_dst_tmp += 8;
   1446             }
   1447             pu1_ref_tmp_1 -= nt;
   1448             pu1_dst_tmp += dst_strd - nt;
   1449         }
   1450     }
   1451     else
   1452 
   1453     {
    1454             /* variables are named after the operation (instruction) they perform                   */
    1455             /* (e.g. shift_val holds the shifted value,                                             */
    1456             /* add_sat the add-and-saturate result)                                                 */
    1457             /* loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8          */
    1458             /* rows and columns are unrolled by 4 when the width is a multiple of 4                 */
   1459 
   1460         if(0 != (nt & 7))
   1461         {
   1462             WORD32 cond_4 = 0;
   1463             UWORD8 *pu1_ref_val1 = pu1_ref;
   1464             UWORD8 *pu1_ref_val2 = pu1_ref;
   1465             UWORD8 *pu1_ref_val3 = pu1_ref;
   1466 
   1467             UWORD8 *pu1_dst_val1 = pu1_dst;
   1468             UWORD8 *pu1_dst_val2 = pu1_dst;
   1469             UWORD8 *pu1_dst_val3 = pu1_dst;
   1470 
   1471             uint8x8_t dup_2_sub, round_val, vext_val;
   1472             uint16x8_t dup_2_add;
   1473             uint32x2_t src_val1, src_val2, src_val3;
   1474             uint16x8_t sub_val;
   1475             int16x8_t shift_val1, add_sat;
   1476             uint64x1_t shift_val2;
   1477 
   1478             src_val1 = vdup_n_u32(0);
   1479             src_val2 = vdup_n_u32(0);
   1480             src_val3 = vdup_n_u32(0);
   1481             pu1_ref_val1 += (two_nt - nt);
   1482             pu1_ref_val3 += (two_nt + 2);
   1483             pu1_ref_val2 += (two_nt + 1);
   1484 
   1485             dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
   1486             dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
   1487 
   1488             /* loops to store the first nt sets of values in the destination */
   1489 
   1490             for(row = nt; row > 0; row -= 4)
   1491             {
   1492                 for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
   1493                 {
   1494                     /*  unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here*/
   1495                     src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
   1496                     sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
   1497                     shift_val1  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
   1498                     add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
   1499                     round_val = vqmovun_s16(add_sat);
   1500 
   1501                     /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
   1502                     src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
   1503                     vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
   1504                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
   1505                     pu1_dst_val1 += dst_strd;
   1506 
   1507                     shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
   1508 
   1509                     vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
   1510                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
   1511                     pu1_dst_val1 += dst_strd;
   1512 
   1513                     shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
   1514 
   1515                     vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
   1516                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
   1517                     pu1_dst_val1 += dst_strd;
   1518 
   1519                     shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
   1520 
   1521                     vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
   1522                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
   1523                     pu1_dst_val1 += dst_strd;
   1524 
   1525                     pu1_ref_val1  -= 4;
   1526                 }
   1527 
   1528                 /* loop to store next sets of eight values in the destination */
   1529 
   1530                 for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
   1531                 {
   1532                     src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);
   1533 
   1534                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
   1535                     pu1_dst_val2 += dst_strd;
   1536 
   1537                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
   1538                     pu1_dst_val2 += dst_strd;
   1539 
   1540                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
   1541                     pu1_dst_val2 += dst_strd;
   1542 
   1543                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
   1544                     pu1_dst_val2 += dst_strd;
   1545                 }
   1546                 pu1_ref_val2 += 4;
   1547                 pu1_dst_val3 += 4;
   1548                 pu1_dst_val2 = pu1_dst_val3;
   1549                 cond_4 = 1;
   1550             }
   1551         }
   1552 
   1553         /* rows and columns are unrolled by 8, when the width is multiple of 8          */
   1554         else
   1555         {
   1556             WORD32 cond = 0, col_1;
   1557             UWORD8 *pu1_dst_tmp_1 = pu1_dst;
   1558             UWORD8 *pu1_dst_tmp_2 = pu1_dst;
   1559             UWORD8 *pu1_dst_tmp_3 = pu1_dst;
   1560 
   1561             UWORD8 *pu1_ref_tmp_1 = pu1_ref;
   1562             UWORD8 *pu1_ref_tmp_2 = pu1_ref;
   1563             UWORD8 *pu1_ref_tmp_3 = pu1_ref;
   1564 
   1565             uint8x8_t pu1_src_tmp1;
   1566             uint8x8_t pu1_src_tmp2;
   1567 
   1568             uint8x8_t dup_sub;
   1569             uint16x8_t dup_add;
   1570             int16x8_t subsh_val;
   1571             int16x8_t addsat_val;
   1572             uint16x8_t sub_val;
   1573             uint8x8_t round_val;
   1574             uint8x8_t vext_t;
   1575             uint64x1_t shift_64;
   1576 
   1577             dup_sub = vdup_n_u8(pu1_ref[two_nt]);
   1578             dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
   1579 
   1580             pu1_ref_tmp_1 += (two_nt);
   1581             pu1_ref_tmp_1 -= 8;
   1582             pu1_ref_tmp_2 += (two_nt + 2);
   1583             pu1_ref_tmp_3 += (two_nt + 1);
   1584 
   1585             /* loops to store the first nt sets of values in the destination */
   1586 
   1587             for(row = nt; row > 0; row -= 8)
   1588             {
   1589                 for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
   1590                 {
   1591                     pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);
   1592 
   1593                     sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
   1594                     subsh_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
   1595                     addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
   1596                     round_val = vqmovun_s16(addsat_val);
   1597 
   1598                     /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
   1599 
   1600                     pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
   1601                     vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
   1602                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1603                     pu1_dst_tmp_1 += dst_strd;
   1604 
   1605                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
   1606 
   1607                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1608                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1609                     pu1_dst_tmp_1 += dst_strd;
   1610 
   1611                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
   1612                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1613                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1614                     pu1_dst_tmp_1 += dst_strd;
   1615 
   1616                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
   1617                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1618                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1619                     pu1_dst_tmp_1 += dst_strd;
   1620 
   1621                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
   1622                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1623                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1624                     pu1_dst_tmp_1 += dst_strd;
   1625 
   1626                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
   1627                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1628                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1629                     pu1_dst_tmp_1 += dst_strd;
   1630 
   1631                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
   1632                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1633                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1634                     pu1_dst_tmp_1 += dst_strd;
   1635 
   1636                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
   1637                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1638                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1639                     pu1_dst_tmp_1 += dst_strd;
   1640 
   1641                     pu1_ref_tmp_1 -= 8;
   1642                 }
   1643 
   1644                 /* loop to store next sets of eight values in the destination */
   1645 
   1646                 for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
   1647                 {
   1648                     pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);
   1649 
   1650                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1651                     pu1_dst_tmp_2 += dst_strd;
   1652 
   1653                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1654                     pu1_dst_tmp_2 += dst_strd;
   1655 
   1656                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1657                     pu1_dst_tmp_2 += dst_strd;
   1658 
   1659                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1660                     pu1_dst_tmp_2 += dst_strd;
   1661 
   1662                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1663                     pu1_dst_tmp_2 += dst_strd;
   1664 
   1665                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1666                     pu1_dst_tmp_2 += dst_strd;
   1667 
   1668                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1669                     pu1_dst_tmp_2 += dst_strd;
   1670 
   1671                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1672                     pu1_dst_tmp_2 += dst_strd;
   1673                 }
   1674                 pu1_ref_tmp_3 += 8;
   1675                 pu1_dst_tmp_3 += 8;
   1676                 pu1_dst_tmp_2 = pu1_dst_tmp_3;
   1677                 cond = 1;
   1678             }
   1679         }
   1680     }
   1681 }
   1682 /* INTRA_PRED_LUMA_VER */
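
         /* Illustrative scalar equivalent of ihevc_intra_pred_luma_ver_neonintr()
          * (a minimal sketch with a hypothetical name, not called by the library):
          * every row copies the top reference row, and in the non-32 paths column 0
          * is gradient-filtered against the left reference, exactly the s2_predpixel
          * expression quoted in the comments above. */
         static void ihevc_intra_pred_luma_ver_scalar_sketch(UWORD8 *pu1_ref,
                                                             UWORD8 *pu1_dst,
                                                             WORD32 dst_strd,
                                                             WORD32 nt)
         {
             WORD32 row, col, s2_predpixel;
             WORD32 two_nt = 2 * nt;

             for(row = 0; row < nt; row++)
             {
                 /* Column 0: pu1_ref[two_nt + 1] plus half the vertical gradient of
                  * the left reference, saturated to [0, 255] */
                 s2_predpixel = pu1_ref[two_nt + 1]
                              + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
                 pu1_dst[row * dst_strd] = (UWORD8)(s2_predpixel < 0 ? 0 :
                                           (s2_predpixel > 255 ? 255 : s2_predpixel));

                 /* Remaining columns: copy the top reference row */
                 for(col = 1; col < nt; col++)
                     pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col];
             }
         }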
   1683 
   1684 /**
   1685 *******************************************************************************
   1686 *
   1687 * @brief
   1688 *    Intra prediction interpolation filter for luma mode2.
   1689 *
   1690 * @par Description:
   1691 *    Intraprediction for mode 2 (sw angle) with reference  neighboring samples
   1692 *    location pointed by 'pu1_ref' to the  TU block location pointed by
   1693 *    'pu1_dst'
   1694 *
    1695 * @param[in] pu1_ref
    1696 *  UWORD8 pointer to the reference samples
   1697 *
   1698 * @param[out] pu1_dst
   1699 *  UWORD8 pointer to the destination
   1700 *
   1701 * @param[in] src_strd
   1702 *  integer source stride
   1703 *
   1704 * @param[in] dst_strd
   1705 *  integer destination stride
   1706 *
   1707 * @param[in] nt
   1708 *  integer Transform Block size
   1709 *
    1710 * @param[in] mode
    1711 *  integer intraprediction mode
   1712 *
   1713 * @returns
   1714 *
   1715 * @remarks
   1716 *  None
   1717 *
   1718 *******************************************************************************
   1719 */
   1720 
   1721 void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
   1722                                           WORD32 src_strd,
   1723                                           UWORD8 *pu1_dst,
   1724                                           WORD32 dst_strd,
   1725                                           WORD32 nt,
   1726                                           WORD32 mode)
   1727 {
   1728 
   1729     WORD32 row, col;
   1730     WORD32 two_nt;
   1731     UNUSED(src_strd);
   1732     UNUSED(mode);
   1733 
    1734     /* rev_res holds the reversed result value                                          */
    1735     /* loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8      */
    1736     /* rows and columns are unrolled by 4 when the width is a multiple of 4             */
   1737 
   1738     if(0 != (nt & 7))
   1739     {
   1740         UWORD8 *pu1_ref_tmp = pu1_ref;
   1741         UWORD8 *pu1_dst_tmp = pu1_dst;
   1742         uint8x8_t pu1_src_val, rev_res;
   1743         uint64x1_t shift_res;
   1744 
   1745         for(col = nt; col > 0; col -= 4)
   1746         {
   1747             for(row = nt; row > 0; row -= 4)
   1748             {
   1749                 /* unrolling all col & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */
   1750 
   1751                 pu1_src_val = vld1_u8(pu1_ref_tmp);
   1752                 shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
   1753                 rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));
   1754 
   1755                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
   1756                 pu1_dst_tmp += dst_strd;
   1757 
   1758                 shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
   1759                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
   1760                 pu1_dst_tmp += dst_strd;
   1761 
   1762                 shift_res = vshr_n_u64(shift_res, 8);
   1763                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
   1764                 pu1_dst_tmp += dst_strd;
   1765 
   1766                 shift_res = vshr_n_u64(shift_res, 8);
   1767                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
   1768                 pu1_dst_tmp += dst_strd;
   1769             }
   1770         }
   1771     }
   1772 
    1773     /* rev_val_first and rev_val_second reverse the loaded values to get them in the right order */
    1774     /* shift_val shifts the reversed second vector so the needed sample lines up                 */
    1775     /* rows and columns are unrolled by 8 when the width is a multiple of 8                      */
   1776 
   1777     else
   1778     {
   1779         UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
   1780         UWORD8 *pu1_dst_tmp = pu1_dst;
   1781         UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
   1782 
   1783         uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
   1784         uint64x1_t shift_val;
   1785 
   1786         two_nt = 2 * nt;
   1787         pu1_ref_two_nt_minus2 += (two_nt);
   1788         pu1_ref_two_nt_minus2 -= 8;
   1789 
   1790         for(col = nt; col > 0; col -= 8)
   1791         {
   1792             for(row = nt; row > 0; row -= 8)
   1793             {
   1794                 pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
   1795                 rev_val_first = vrev64_u8(pu1_src_val2);
   1796 
   1797                 pu1_ref_two_nt_minus2 -= 8;
   1798                 pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
   1799                 rev_val_second = vrev64_u8(pu1_src_val1);
   1800 
   1801                 vext_t = vext_u8(rev_val_first, rev_val_second, 1);
   1802                 vst1_u8(pu1_dst_tmp, vext_t);
   1803                 pu1_dst_tmp += dst_strd;
   1804 
   1805                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
   1806                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1807                 vst1_u8(pu1_dst_tmp, vext_t);
   1808                 pu1_dst_tmp += dst_strd;
   1809 
   1810                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
   1811                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1812                 vst1_u8(pu1_dst_tmp, vext_t);
   1813                 pu1_dst_tmp += dst_strd;
   1814 
   1815                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
   1816                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1817                 vst1_u8(pu1_dst_tmp, vext_t);
   1818                 pu1_dst_tmp += dst_strd;
   1819 
   1820                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
   1821                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1822                 vst1_u8(pu1_dst_tmp, vext_t);
   1823                 pu1_dst_tmp += dst_strd;
   1824 
   1825                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
   1826                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1827                 vst1_u8(pu1_dst_tmp, vext_t);
   1828                 pu1_dst_tmp += dst_strd;
   1829 
   1830                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
   1831                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1832                 vst1_u8(pu1_dst_tmp, vext_t);
   1833                 pu1_dst_tmp += dst_strd;
   1834 
   1835                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
   1836                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1837                 vst1_u8(pu1_dst_tmp, vext_t);
   1838                 pu1_dst_tmp += dst_strd;
   1839             }
   1840             pu1_dst_tmp_plus8 += 8;
   1841             pu1_dst_tmp = pu1_dst_tmp_plus8;
   1842             pu1_ref_two_nt_minus2 += (nt - 8);
   1843         }
   1844     }
   1845 }
   1846 /* INTRA_PRED_LUMA_MODE2 */
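
         /* Illustrative scalar equivalent of ihevc_intra_pred_luma_mode2_neonintr()
          * (a minimal sketch with a hypothetical name, not called by the library):
          * mode 2 is the pure 45-degree sw direction, so with intraPredAngle = 32
          * the indexing quoted in the comments above collapses to a reversed copy
          * of the left reference, shifted one sample further per row and column. */
         static void ihevc_intra_pred_luma_mode2_scalar_sketch(UWORD8 *pu1_ref,
                                                               UWORD8 *pu1_dst,
                                                               WORD32 dst_strd,
                                                               WORD32 nt)
         {
             WORD32 row, col;
             WORD32 two_nt = 2 * nt;

             for(row = 0; row < nt; row++)
                 for(col = 0; col < nt; col++)
                     pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - row - col - 2];
         }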
   1847 
   1848 /**
   1849 *******************************************************************************
   1850 *
   1851 * @brief
   1852 *   Intra prediction interpolation filter for luma mode 18 & mode 34.
   1853 *
   1854 * @par Description:
    1855 *    Intraprediction for mode 18 (nw angle) and mode 34 (ne angle) with
    1856 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
    1857 *    block location pointed by 'pu1_dst'
   1858 *
    1859 * @param[in] pu1_ref
    1860 *  UWORD8 pointer to the reference samples
   1861 *
   1862 * @param[out] pu1_dst
   1863 *  UWORD8 pointer to the destination
   1864 *
   1865 * @param[in] src_strd
   1866 *  integer source stride
   1867 *
   1868 * @param[in] dst_strd
   1869 *  integer destination stride
   1870 *
   1871 * @param[in] nt
   1872 *  integer Transform Block size
   1873 *
    1874 * @param[in] mode
    1875 *  integer intraprediction mode
   1876 *
   1877 * @returns
   1878 *
   1879 * @remarks
   1880 *  None
   1881 *
   1882 *******************************************************************************
   1883 */
   1884 
   1885 void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
   1886                                                WORD32 src_strd,
   1887                                                UWORD8 *pu1_dst,
   1888                                                WORD32 dst_strd,
   1889                                                WORD32 nt,
   1890                                                WORD32 mode)
   1891 {
   1892 
   1893     WORD32 row, col, idx;
   1894     WORD32 intraPredAngle = 32;
   1895     WORD32 two_nt;
   1896     UNUSED(src_strd);
   1897     two_nt = 2 * nt;
   1898 
   1899     UWORD8 *pu1_ref_tmp = pu1_ref;
   1900     UWORD8 *pu1_ref_tmp1 = pu1_ref;
   1901     UWORD8 *pu1_dst_tmp = pu1_dst;
   1902     UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
   1903 
   1904     uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;
   1905 
    1906     /* src_tmp_1st and src_tmp_2nd hold the first eight and the next eight values loaded from source (pu1_ref) */
    1907     /* vext1 - vext7 hold the vext results of the two loaded vectors, interleaved with the stores to help dual issue */
    1908     /* loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8                               */
    1909     /* rows and columns are unrolled by 8 when the width is a multiple of 8                                      */
    1910     /* separate loops are maintained for mode 18 and mode 34                                                     */
   1911 
   1912     /* cond to allow multiples of 8 */
   1913     if(0 == (nt & 7))
   1914     {
   1915         if(mode == 34)
   1916         {
   1917             pu1_ref_tmp += (two_nt + 2);
   1918 
   1919             for(row = nt; row > 0; row -= 8)
   1920             {
   1921                 for(col = nt; col > 0; col -= 8)
   1922                 {
   1923                     /* Loading 1st eight values */
   1924                     src_tmp_1st = vld1_u8(pu1_ref_tmp);
   1925                     pu1_ref_tmp += 8;
   1926 
   1927                     /* Loading next eight values */
   1928                     src_tmp_2nd = vld1_u8(pu1_ref_tmp);
   1929 
   1930                     /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
   1931                     vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
   1932                     vst1_u8(pu1_dst_tmp, src_tmp_1st);
   1933                     pu1_dst_tmp += dst_strd;
   1934 
   1935                     vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
   1936                     vst1_u8(pu1_dst_tmp, vext1);
   1937                     pu1_dst_tmp += dst_strd;
   1938 
   1939                     vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
   1940                     vst1_u8(pu1_dst_tmp, vext2);
   1941                     pu1_dst_tmp += dst_strd;
   1942 
   1943                     vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
   1944                     vst1_u8(pu1_dst_tmp, vext3);
   1945                     pu1_dst_tmp += dst_strd;
   1946 
   1947                     vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
   1948                     vst1_u8(pu1_dst_tmp, vext4);
   1949                     pu1_dst_tmp += dst_strd;
   1950 
   1951                     vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
   1952                     vst1_u8(pu1_dst_tmp, vext5);
   1953                     pu1_dst_tmp += dst_strd;
   1954 
   1955                     vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
   1956                     vst1_u8(pu1_dst_tmp, vext6);
   1957                     pu1_dst_tmp += dst_strd;
   1958 
   1959                     vst1_u8(pu1_dst_tmp, vext7);
   1960                     pu1_dst_tmp += dst_strd;
   1961                 }
   1962 
   1963                 pu1_dst_tmp_plus8 += 8;
   1964                 pu1_dst_tmp = pu1_dst_tmp_plus8;
   1965                 pu1_ref_tmp -= (nt - 8);
   1966             }
   1967         }
   1968         else /* Loop for mode 18 */
   1969         {
   1970             pu1_ref_tmp += (two_nt);
   1971 
   1972             for(row = nt; row > 0; row -= 8)
   1973             {
   1974                 for(col = nt; col > 0; col -= 8)
   1975                 {
   1976                     /* Loading 1st eight values */
   1977                     src_tmp_1st = vld1_u8(pu1_ref_tmp);
   1978                     pu1_ref_tmp -= 8;
   1979 
   1980                     /* Loading next eight values */
   1981                     src_tmp_2nd = vld1_u8(pu1_ref_tmp);
   1982 
   1983                     /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
   1984                     vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
   1985                     vst1_u8(pu1_dst_tmp, src_tmp_1st);
   1986                     pu1_dst_tmp += dst_strd;
   1987 
   1988                     vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
   1989                     vst1_u8(pu1_dst_tmp, vext1);
   1990                     pu1_dst_tmp += dst_strd;
   1991 
   1992                     vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
   1993                     vst1_u8(pu1_dst_tmp, vext2);
   1994                     pu1_dst_tmp += dst_strd;
   1995 
   1996                     vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
   1997                     vst1_u8(pu1_dst_tmp, vext3);
   1998                     pu1_dst_tmp += dst_strd;
   1999 
   2000                     vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
   2001                     vst1_u8(pu1_dst_tmp, vext4);
   2002                     pu1_dst_tmp += dst_strd;
   2003 
   2004                     vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
   2005                     vst1_u8(pu1_dst_tmp, vext5);
   2006                     pu1_dst_tmp += dst_strd;
   2007 
   2008                     vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
   2009                     vst1_u8(pu1_dst_tmp, vext6);
   2010                     pu1_dst_tmp += dst_strd;
   2011 
   2012                     vst1_u8(pu1_dst_tmp, vext7);
   2013                     pu1_dst_tmp += dst_strd;
   2014                 }
   2015                 pu1_dst_tmp_plus8 += 8;
   2016                 pu1_dst_tmp = pu1_dst_tmp_plus8;
   2017                 pu1_ref_tmp += (nt + 8);
   2018             }
   2019         }
   2020     }
   2021 
   2022     /* rows and columns are unrolled by 4, when the width is multiple of 4  */
   2023 
   2024     else /* loop for multiples of 4 */
   2025     {
   2026         uint8x8_t src_val1;
   2027         uint8x8_t src_val2;
   2028 
   2029         if(mode == 18)
   2030             intraPredAngle = -32;
   2031         else if(mode == 34)
   2032             intraPredAngle = 32;
   2033 
   2034         for(row = 0; row < nt; row += 2)
   2035         {
   2036             /* unrolling 2 rows */
   2037             idx = ((row + 1) * intraPredAngle) >> 5;
   2038             pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
   2039             src_val1 = vld1_u8(pu1_ref_tmp);
   2040 
   2041             idx = ((row + 2) * intraPredAngle) >> 5;
   2042             pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
   2043             src_val2 = vld1_u8(pu1_ref_tmp1);
   2044 
   2045             /* unrolling 4 col */
   2046             for(col = nt; col > 0; col -= 4)
   2047             {
   2048                 pu1_dst_tmp = pu1_dst;
   2049                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
   2050                 pu1_dst_tmp += dst_strd;
   2051                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
   2052                 pu1_dst += 4;
   2053             }
   2054             pu1_dst += 2 * dst_strd - nt;
   2055         }
   2056     }
   2057 }
   2058 /* INTRA_PRED_LUMA_MODE_18_34 */
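
         /* Illustrative scalar equivalent of the function above (a minimal sketch
          * with a hypothetical name, not called by the library): both diagonals
          * have |intraPredAngle| = 32, so idx advances by exactly one reference
          * sample per row and no fractional interpolation is needed - which is why
          * the vector paths above get away with plain vext_u8 shuffles. */
         static void ihevc_intra_pred_luma_mode_18_34_scalar_sketch(UWORD8 *pu1_ref,
                                                                    UWORD8 *pu1_dst,
                                                                    WORD32 dst_strd,
                                                                    WORD32 nt,
                                                                    WORD32 mode)
         {
             WORD32 row, col, idx;
             WORD32 intra_pred_angle = (mode == 18) ? -32 : 32;
             WORD32 two_nt = 2 * nt;

             for(row = 0; row < nt; row++)
             {
                 idx = ((row + 1) * intra_pred_angle) >> 5; /* +/-(row + 1) */
                 for(col = 0; col < nt; col++)
                     pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1];
             }
         }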
   2059 
   2060 /**
   2061  *******************************************************************************
   2062  *
   2063  * @brief
   2064  *    Intra prediction interpolation filter for luma mode 3 to mode 9
   2065  *
   2066  * @par Description:
   2067  *    Intraprediction for mode 3 to 9  (positive angle, horizontal mode ) with
   2068  *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
   2069  *    block location pointed by 'pu1_dst'
   2070  *
    2071  * @param[in] pu1_ref
    2072  *  UWORD8 pointer to the reference samples
   2073  *
   2074  * @param[out] pu1_dst
   2075  *  UWORD8 pointer to the destination
   2076  *
   2077  * @param[in] src_strd
   2078  *  integer source stride
   2079  *
   2080  * @param[in] dst_strd
   2081  *  integer destination stride
   2082  *
   2083  * @param[in] nt
   2084  *  integer Transform Block size
   2085  *
   2086  * @param[in] mode
   2087  *  integer intraprediction mode
   2088  *
   2089  * @returns
   2090  *
   2091  * @remarks
   2092  *  None
   2093  *
   2094  *******************************************************************************
   2095  */
   2096 
   2097 
   2098 void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
   2099                                                 WORD32 src_strd,
   2100                                                 UWORD8 *pu1_dst,
   2101                                                 WORD32 dst_strd,
   2102                                                 WORD32 nt,
   2103                                                 WORD32 mode)
   2104 {
   2105 
   2106     WORD32 row, col;
   2107     WORD32 intra_pred_ang;
    2108     WORD32 pos, fract = 100, fract_prev; /* fract starts above the valid 0..31 range so the first column never advances the pointers */
   2109     UNUSED(src_strd);
   2110     if(0 == (nt & 7))
   2111     {
   2112 
   2113         UWORD8 *pu1_ref_main_idx = pu1_ref;
   2114         UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
   2115 
   2116         UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2117         UWORD8 *pu1_dst_tmp2 = pu1_dst;
   2118 
   2119         WORD32 two_nt = 2 * nt;
   2120 
   2121         pu1_ref_main_idx += two_nt;
   2122         pu1_ref_main_idx_1 += two_nt - 1;
   2123 
   2124         uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
   2125         uint8x8_t shift_res;
   2126         uint16x8_t mul_res1, mul_res2, add_res;
   2127 
   2128         /* Intra Pred Angle according to the mode */
   2129         intra_pred_ang = gai4_ihevc_ang_table[mode];
   2130 
   2131         pu1_ref_main_idx -= 8;
   2132         pu1_ref_main_idx_1 -= 8;
   2133 
   2134         for(col = 0; col < nt; col++)
   2135         {
   2136             fract_prev = fract;
   2137 
   2138             pos = ((col + 1) * intra_pred_ang);
   2139             fract = pos & (31);
   2140 
   2141             if(fract_prev < fract)
   2142             {
   2143                 pu1_ref_main_idx += 1;
   2144                 pu1_ref_main_idx_1 += 1;
   2145             }
   2146 
   2147             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2148             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2149 
   2150             for(row = nt; row > 0; row -= 8)
   2151             {
   2152                 ref_main_idx = vld1_u8(pu1_ref_main_idx);
   2153                 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
   2154 
   2155                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
   2156                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
   2157 
   2158                 add_res = vaddq_u16(mul_res1, mul_res2);
   2159 
   2160                 shift_res = vrshrn_n_u16(add_res, 5);
   2161 
   2162                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
   2163                 pu1_dst_tmp1 += dst_strd;
   2164 
   2165                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
   2166                 pu1_dst_tmp1 += dst_strd;
   2167 
   2168                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
   2169                 pu1_dst_tmp1 += dst_strd;
   2170 
   2171                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
   2172                 pu1_dst_tmp1 += dst_strd;
   2173 
   2174                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
   2175                 pu1_dst_tmp1 += dst_strd;
   2176 
   2177                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
   2178                 pu1_dst_tmp1 += dst_strd;
   2179 
   2180                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
   2181                 pu1_dst_tmp1 += dst_strd;
   2182 
   2183                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
   2184                 pu1_dst_tmp1 += dst_strd;
   2185 
   2186                 pu1_ref_main_idx -= 8;
   2187                 pu1_ref_main_idx_1 -= 8;
   2188 
   2189             }
   2190             pu1_dst_tmp2 += 1;
   2191             pu1_dst_tmp1 = pu1_dst_tmp2;
   2192 
   2193             pu1_ref_main_idx += nt;
   2194             pu1_ref_main_idx_1 += nt;
   2195 
   2196             pu1_ref_main_idx -= 1;
   2197             pu1_ref_main_idx_1 -= 1;
   2198 
   2199         }
   2200     }
   2201     else
   2202     {
   2203         UWORD8 *pu1_ref_tmp1 = pu1_ref;
   2204         UWORD8 *pu1_ref_tmp2 = pu1_ref;
   2205         UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2206         UWORD8 *pu1_dst_tmp2 = pu1_dst;
   2207 
   2208         pu1_ref_tmp1 += nt;
   2209         pu1_ref_tmp2 += (nt - 1);
   2210 
   2211         uint8x8_t dup_fract, dup_32_fract, shift_res;
   2212         uint16x8_t mul_res1, mul_res2, add_res;
   2213         uint32x2_t  pu1_ref_val1, pu1_ref_val2;
   2214 
   2215         pu1_ref_val1 = vdup_n_u32(0);
   2216         pu1_ref_val2 = vdup_n_u32(0);
   2217 
   2218         /* Intra Pred Angle according to the mode */
   2219         intra_pred_ang = gai4_ihevc_ang_table[mode];
   2220 
   2221 
   2222         for(col = 0; col < nt; col++)
   2223         {
   2224             fract_prev = fract;
   2225             pos = ((col + 1) * intra_pred_ang);
   2226             fract = pos & (31);
   2227             if(fract_prev < fract)
   2228             {
   2229                 pu1_ref_tmp1 += 1;
   2230                 pu1_ref_tmp2 += 1;
   2231             }
   2232             dup_fract = vdup_n_u8((uint8_t)fract);
   2233             dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2234 
   2235             for(row = nt; row > 0; row -= 4)
   2236             {
   2237                 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
   2238                 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
   2239 
   2240                 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
   2241                 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
   2242 
   2243                 add_res = vaddq_u16(mul_res1, mul_res2);
   2244 
   2245                 shift_res = vrshrn_n_u16(add_res, 5);
   2246 
   2247                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
   2248                 pu1_dst_tmp1 += dst_strd;
   2249 
   2250                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
   2251                 pu1_dst_tmp1 += dst_strd;
   2252 
   2253                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
   2254                 pu1_dst_tmp1 += dst_strd;
   2255 
   2256                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
   2257 
   2258             }
   2259             pu1_ref_tmp1 -= 1;
   2260             pu1_ref_tmp2 -= 1;
   2261 
   2262             pu1_dst_tmp2 += 1;
   2263             pu1_dst_tmp1 = pu1_dst_tmp2;
   2264 
   2265         }
   2266 
   2267 
   2268     }
   2269 
   2270 }
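
         /* Illustrative scalar equivalent of the function above (a minimal sketch
          * with a hypothetical name, not called by the library; the index
          * arithmetic is transcribed from the pointer walk in the vector code).
          * pos accumulates the angle in 1/32-sample units; idx = pos >> 5 selects
          * the reference sample and fract the two-tap weights, with
          * vrshrn_n_u16(add_res, 5) above matching the "+ 16 then >> 5" rounding.
          * Since the angle is below 32, idx steps by at most one per column, and
          * comparing successive fract values (the fract_prev checks above) detects
          * exactly when it steps, avoiding a per-column recomputation of idx. */
         static void ihevc_intra_pred_luma_mode_3_to_9_scalar_sketch(UWORD8 *pu1_ref,
                                                                     UWORD8 *pu1_dst,
                                                                     WORD32 dst_strd,
                                                                     WORD32 nt,
                                                                     WORD32 intra_pred_ang)
         {
             WORD32 row, col, pos, idx, fract;
             WORD32 two_nt = 2 * nt;

             for(col = 0; col < nt; col++)
             {
                 pos = (col + 1) * intra_pred_ang; /* gai4_ihevc_ang_table[mode] */
                 idx = pos >> 5;
                 fract = pos & 31;

                 /* Two-tap filter between neighbouring left-reference samples,
                  * walked top-down (the vector code reads them 8 at a time) */
                 for(row = 0; row < nt; row++)
                     pu1_dst[(row * dst_strd) + col] =
                         (UWORD8)(((32 - fract) * pu1_ref[two_nt - row - idx - 1]
                                 + fract * pu1_ref[two_nt - row - idx - 2] + 16) >> 5);
             }
         }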
   2271 
   2272 /**
   2273  *******************************************************************************
   2274  *
   2275  * @brief
   2276  *   Intra prediction interpolation filter for luma mode 11 to mode 17
   2277  *
   2278  * @par Description:
   2279  *    Intraprediction for mode 11 to 17  (negative angle, horizontal mode )
   2280  *    with reference  neighboring samples location pointed by 'pu1_ref' to the
   2281  *    TU block location pointed by 'pu1_dst'
   2282  *
    2283  * @param[in] pu1_ref
    2284  *  UWORD8 pointer to the reference samples
   2285  *
   2286  * @param[out] pu1_dst
   2287  *  UWORD8 pointer to the destination
   2288  *
   2289  * @param[in] src_strd
   2290  *  integer source stride
   2291  *
   2292  * @param[in] dst_strd
   2293  *  integer destination stride
   2294  *
   2295  * @param[in] nt
   2296  *  integer Transform Block size
   2297  *
   2298  * @param[in] mode
   2299  *  integer intraprediction mode
   2300  *
   2301  * @returns
   2302  *
   2303  * @remarks
   2304  *  None
   2305  *
   2306  *******************************************************************************
   2307  */
   2308 
   2309 
   2310 void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
   2311                                                   WORD32 src_strd,
   2312                                                   UWORD8 *pu1_dst,
   2313                                                   WORD32 dst_strd,
   2314                                                   WORD32 nt,
   2315                                                   WORD32 mode)
   2316 {
   2317 
   2318     WORD32 row, col, k;
   2319     WORD32 two_nt;
   2320     WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
    2321     WORD32 pos, fract = 1000, fract_prev; /* fract starts above the valid 0..31 range so the first column never moves the pointers */
   2322     WORD32  ref_idx;
   2323 
   2324     UWORD8 *ref_main;
   2325     UWORD8 *ref_main_tmp;
   2326 
   2327     UWORD8 *pu1_ref_tmp1 = pu1_ref;
   2328     UWORD8 *pu1_ref_tmp2 = pu1_ref;
   2329     UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2330     UWORD8 *pu1_dst_tmp2 = pu1_dst;
   2331 
   2332     UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
   2333 
   2334     uint16x8_t mul_res1, mul_res2, add_res;
   2335     uint8x8_t dup_const_fract, dup_const_32_fract;
   2336     uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
   2337     uint8x8_t ref_left_t;
   2338     uint32x2_t  ref_left_tmp;
   2339     UNUSED(src_strd);
   2340     ref_left_tmp = vdup_n_u32(0);
   2341 
   2342     inv_ang_sum = 128;
   2343     two_nt = 2 * nt;
   2344 
   2345     intra_pred_ang = gai4_ihevc_ang_table[mode];
   2346 
   2347     inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
   2348 
   2349     pu1_ref_tmp1 += two_nt;
   2350 
   2351     ref_main = ref_temp + (nt - 1);
   2352     ref_main_tmp = ref_main;
   2353 
   2354     if(0 == (nt & 7))
   2355     {
   2356         pu1_ref_tmp2 += (two_nt - 7);
   2357 
   2358         for(k = nt - 1; k >= 0; k -= 8)
   2359         {
   2360 
   2361             ref_left_t = vld1_u8(pu1_ref_tmp2);
   2362 
   2363             ref_left_t = vrev64_u8(ref_left_t);
   2364             vst1_u8(ref_main_tmp, ref_left_t);
   2365             ref_main_tmp += 8;
   2366             pu1_ref_tmp2 -= 8;
   2367 
   2368         }
   2369 
   2370     }
   2371     else
   2372     {
   2373         uint8x8_t rev_val;
   2374         pu1_ref_tmp2 += (two_nt - (nt - 1));
   2375 
   2376         for(k = nt - 1; k >= 0; k -= 8)
   2377         {
   2378 
   2379             ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);
   2380 
   2381             rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
   2382             vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
   2383 
   2384         }
   2385 
   2386     }
   2387 
   2388     ref_main[nt] = pu1_ref[two_nt - nt];
   2389 
   2390     /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
   2391 
   2392     ref_idx = (nt * intra_pred_ang) >> 5;
   2393 
    2394     /* SIMD optimization can be done using a look-up table for this loop */
    2395     /* For negative angles, derive the main reference samples from the side */
    2396     /* reference samples; refer to section 8.4.4.2.6 */
   2397     for(k = -1; k > ref_idx; k--)
   2398     {
   2399         inv_ang_sum += inv_ang;
   2400         ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
   2401     }
   2402 
   2403     UWORD8 *ref_main_tmp1 = ref_main;
   2404     UWORD8 *ref_main_tmp2 = ref_main;
   2405 
   2406     ref_main_tmp2 += 1;
   2407 
   2408     if(0 == (nt & 7))
   2409     {
    2410         /* For angles other than 45 degrees, interpolate between 2 neighboring */
    2411         /* samples, weighted by the fractional distance, to obtain each destination sample */
   2412         for(col = 0; col < nt; col++)
   2413         {
   2414 
   2415             fract_prev = fract;
   2416             pos = ((col + 1) * intra_pred_ang);
   2417             fract = pos & (31);
   2418 
   2419             if(fract_prev < fract)
   2420             {
   2421                 ref_main_tmp1 -= 1;
   2422                 ref_main_tmp2 -= 1;
   2423             }
   2424 
   2425             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2426             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2427 
   2428             // Do linear filtering
   2429             for(row = nt; row > 0; row -= 8)
   2430             {
   2431                 ref_main_idx = vld1_u8(ref_main_tmp1);
   2432 
   2433                 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
   2434 
   2435                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
   2436                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
   2437 
   2438                 add_res = vaddq_u16(mul_res1, mul_res2);
   2439 
   2440                 shift_res = vrshrn_n_u16(add_res, 5);
   2441 
   2442                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
   2443                 pu1_dst_tmp1 += dst_strd;
   2444 
   2445                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
   2446                 pu1_dst_tmp1 += dst_strd;
   2447 
   2448                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
   2449                 pu1_dst_tmp1 += dst_strd;
   2450 
   2451                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
   2452                 pu1_dst_tmp1 += dst_strd;
   2453 
   2454                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
   2455                 pu1_dst_tmp1 += dst_strd;
   2456 
   2457                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
   2458                 pu1_dst_tmp1 += dst_strd;
   2459 
   2460                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
   2461                 pu1_dst_tmp1 += dst_strd;
   2462 
   2463                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
   2464                 pu1_dst_tmp1 += dst_strd;
   2465 
   2466                 ref_main_tmp1 += 8;
   2467                 ref_main_tmp2 += 8;
   2468             }
   2469 
   2470             ref_main_tmp1 -= nt;
   2471             ref_main_tmp2 -= nt;
   2472 
   2473             pu1_dst_tmp2 += 1;
   2474             pu1_dst_tmp1 = pu1_dst_tmp2;
   2475         }
   2476     }
   2477     else
   2478     {
   2479         uint32x2_t ref_main_idx1, ref_main_idx2;
   2480 
   2481         ref_main_idx1 = vdup_n_u32(0);
   2482         ref_main_idx2 = vdup_n_u32(0);
   2483 
   2484         for(col = 0; col < nt; col++)
   2485         {
   2486             fract_prev = fract;
   2487             pos = ((col + 1) * intra_pred_ang);
   2488             fract = pos & (31);
   2489 
   2490             if(fract_prev < fract)
   2491             {
   2492                 ref_main_tmp1 -= 1;
   2493                 ref_main_tmp2 -= 1;
   2494             }
   2495 
   2496             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2497             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2498 
   2499             for(row = nt; row > 0; row -= 4)
   2500             {
   2501 
   2502                 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
   2503                 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
   2504 
   2505                 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
   2506                 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
   2507 
   2508                 add_res = vaddq_u16(mul_res1, mul_res2);
   2509 
   2510                 shift_res = vrshrn_n_u16(add_res, 5);
   2511 
   2512                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
   2513                 pu1_dst_tmp1 += dst_strd;
   2514 
   2515                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
   2516                 pu1_dst_tmp1 += dst_strd;
   2517 
   2518                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
   2519                 pu1_dst_tmp1 += dst_strd;
   2520 
   2521                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
   2522                 pu1_dst_tmp1 += dst_strd;
   2523 
   2524             }
   2525 
   2526             pu1_dst_tmp2 += 1;
   2527             pu1_dst_tmp1 = pu1_dst_tmp2;
   2528 
   2529         }
   2530 
   2531     }
   2532 }
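
         /* Illustrative scalar equivalent of the filtering stage above (a minimal
          * sketch with a hypothetical name, not called by the library), run after
          * ref_temp has been filled as in the function: the reversed left column,
          * plus the inverse-angle projected samples at negative indices. idx is
          * negative for these modes, which is why the vector code walks ref_main
          * one sample backwards whenever fract wraps upwards. */
         static void ihevc_intra_pred_luma_mode_11_to_17_scalar_sketch(UWORD8 *ref_main,
                                                                       UWORD8 *pu1_dst,
                                                                       WORD32 dst_strd,
                                                                       WORD32 nt,
                                                                       WORD32 intra_pred_ang)
         {
             WORD32 row, col, pos, idx, fract;

             for(col = 0; col < nt; col++)
             {
                 pos = (col + 1) * intra_pred_ang; /* intra_pred_ang is negative here */
                 idx = pos >> 5;                   /* arithmetic shift: floor         */
                 fract = pos & 31;

                 for(row = 0; row < nt; row++)
                     pu1_dst[(row * dst_strd) + col] =
                         (UWORD8)(((32 - fract) * ref_main[row + idx + 1]
                                 + fract * ref_main[row + idx + 2] + 16) >> 5);
             }
         }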
   2533 
   2534 /**
   2535  *******************************************************************************
   2536  *
   2537  * @brief
   2538  *   Intra prediction interpolation filter for luma mode 19 to mode 25
   2539  *
   2540  * @par Description:
   2541  *    Intraprediction for mode 19 to 25  (negative angle, vertical mode ) with
   2542  *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
   2543  *    block location pointed by 'pu1_dst'
   2544  *
    2545  * @param[in] pu1_ref
    2546  *  UWORD8 pointer to the reference samples
   2547  *
   2548  * @param[out] pu1_dst
   2549  *  UWORD8 pointer to the destination
   2550  *
   2551  * @param[in] src_strd
   2552  *  integer source stride
   2553  *
   2554  * @param[in] dst_strd
   2555  *  integer destination stride
   2556  *
   2557  * @param[in] nt
   2558  *  integer Transform Block size
   2559  *
   2560  * @param[in] mode
   2561  *  integer intraprediction mode
   2562  *
   2563  * @returns
   2564  *
   2565  * @remarks
   2566  *  None
   2567  *
   2568  *******************************************************************************
   2569  */
   2570 
   2571 
   2572 void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
   2573                                                   WORD32 src_strd,
   2574                                                   UWORD8 *pu1_dst,
   2575                                                   WORD32 dst_strd,
   2576                                                   WORD32 nt,
   2577                                                   WORD32 mode)
   2578 {
   2579 
   2580     WORD32 row, col, k;
   2581     WORD32 two_nt, intra_pred_ang;
    2582     WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev; /* fract starts above the valid 0..31 range */
   2583     WORD32 ref_idx;
   2584     UWORD8 *ref_main;
   2585     UWORD8 *ref_main_tmp;
   2586     UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
   2587 
   2588     UWORD8 *pu1_ref_tmp1 = pu1_ref;
   2589     UWORD8 *pu1_ref_tmp2 = pu1_ref;
   2590     UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2591 
   2592     uint16x8_t mul_res1, mul_res2, add_res;
   2593     uint8x8_t dup_const_fract, dup_const_32_fract;
   2594     uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
   2595     uint8x8_t ref_above_t;
   2596     uint32x2_t ref_above_tmp;
   2597     UNUSED(src_strd);
   2598     ref_above_tmp = vdup_n_u32(0);
   2599 
   2600     two_nt = 2 * nt;
   2601     intra_pred_ang = gai4_ihevc_ang_table[mode];
   2602     inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
   2603 
   2604     /* Intermediate reference samples for negative angle modes */
    2605     /* These have to be removed during optimization */
   2606     pu1_ref_tmp1 += two_nt;
   2607 
   2608 
   2609     ref_main = ref_temp + (nt - 1);
   2610     ref_main_tmp = ref_main;
   2611 
   2612     if(0 == (nt & 7))
   2613     {
   2614         pu1_ref_tmp2 += (two_nt - 7);
   2615         for(k = nt - 1; k >= 0; k -= 8)
   2616         {
   2617 
   2618             ref_above_t = vld1_u8(pu1_ref_tmp1);
   2619             vst1_u8(ref_main_tmp, ref_above_t);
   2620             ref_main_tmp += 8;
   2621             pu1_ref_tmp1 += 8;
   2622 
   2623         }
   2624 
   2625     }
   2626     else
   2627     {
   2628         pu1_ref_tmp2 += (two_nt - (nt - 1));
   2629 
   2630         for(k = nt - 1; k >= 0; k -= 4)
   2631         {
   2632 
   2633             ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
   2634             vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
   2635 
   2636         }
   2637 
   2638     }
   2639 
   2640     ref_main[nt] = pu1_ref[two_nt + nt];
   2641 
    2642     /* For vertical modes, (ref main = ref above) (ref side = ref left) */
   2643 
   2644     ref_idx = (nt * intra_pred_ang) >> 5;
   2645     inv_ang_sum = 128;
   2646 
    2647     /* SIMD optimization can be done using a look-up table for this loop */
    2648     /* For negative angles, derive the main reference samples from the side */
    2649     /* reference samples; refer to section 8.4.4.2.6 */
   2650     for(k = -1; k > ref_idx; k--)
   2651     {
   2652         inv_ang_sum += inv_ang;
   2653         ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
   2654     }
   2655 
   2656     UWORD8 *ref_main_tmp1 = ref_main;
   2657     UWORD8 *ref_main_tmp2 = ref_main;
   2658 
   2659     ref_main_tmp2 += 1;
   2660 
   2661     if(0 == (nt & 7))
   2662     {
    2663         /* For angles other than 45 degrees, interpolate between 2 neighboring */
    2664         /* samples, weighted by the fractional distance, to obtain each destination sample */
   2665         for(row = 0; row < nt; row++)
   2666         {
   2667 
   2668             fract_prev = fract;
   2669             pos = ((row + 1) * intra_pred_ang);
   2670             fract = pos & (31);
   2671 
   2672             if(fract_prev < fract)
   2673             {
   2674                 ref_main_tmp1 -= 1;
   2675                 ref_main_tmp2 -= 1;
   2676             }
   2677 
   2678             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2679             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2680 
   2681             // Do linear filtering
   2682             for(col = nt; col > 0; col -= 8)
   2683             {
   2684                 ref_main_idx = vld1_u8(ref_main_tmp1);
   2685 
   2686                 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
   2687 
   2688                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
   2689                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
   2690 
   2691                 add_res = vaddq_u16(mul_res1, mul_res2);
   2692 
   2693                 shift_res = vrshrn_n_u16(add_res, 5);
   2694 
   2695                 vst1_u8(pu1_dst_tmp1, shift_res);
   2696                 pu1_dst_tmp1 += 8;
   2697 
   2698                 ref_main_tmp1 += 8;
   2699                 ref_main_tmp2 += 8;
   2700             }
   2701 
   2702             ref_main_tmp1 -= nt;
   2703             ref_main_tmp2 -= nt;
   2704 
   2705             pu1_dst_tmp1 += (dst_strd - nt);
   2706         }
   2707     }
   2708     else
   2709     {
   2710         uint32x2_t ref_main_idx1, ref_main_idx2;
   2711 
   2712         ref_main_idx1 = vdup_n_u32(0);
   2713         ref_main_idx2 = vdup_n_u32(0);
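        /* vld1_lane_u32 fills only one lane, so initialize the vectors first
         * to avoid reading undefined lanes */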
   2714 
   2715         for(row = 0; row < nt; row++)
   2716         {
   2717             fract_prev = fract;
   2718             pos = ((row + 1) * intra_pred_ang);
   2719             fract = pos & (31);
   2720 
   2721             if(fract_prev < fract)
   2722             {
   2723                 ref_main_tmp1 -= 1;
   2724                 ref_main_tmp2 -= 1;
   2725             }
   2726 
   2727             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2728             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2729 
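            /* nt = 4 here, so this is a single 4-pixel iteration */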
   2730             for(col = nt; col > 0; col -= 4)
   2731             {
   2732 
   2733                 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
   2734                 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
   2735 
   2736                 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
   2737                 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
   2738 
   2739                 add_res = vaddq_u16(mul_res1, mul_res2);
   2740 
   2741                 shift_res = vrshrn_n_u16(add_res, 5);
   2742 
   2743                 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
   2744                 pu1_dst_tmp1 += 4;
   2745 
   2746             }
   2747             pu1_dst_tmp1 += (dst_strd - nt);
   2748         }
   2749 
   2750     }
   2751 
   2752 }
   2753 
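/* A minimal scalar sketch of the per-sample operation that both NEON paths
 * above vectorize; the helper name is hypothetical and it is not part of the
 * library. Each destination sample is a two-tap blend of the two nearest
 * main-reference samples, with weights summing to 32 and a rounding offset
 * of 16; in the vector code, vrshrn_n_u16(add_res, 5) performs the
 * "+ 16, >> 5" step in a single instruction. */
static inline UWORD8 angular_interp_sample_sketch(const UWORD8 *ref_main,
                                                  WORD32 idx,
                                                  WORD32 fract)
{
    /* (32 - fract) weights the left sample, fract weights the right one */
    return (UWORD8)(((32 - fract) * ref_main[idx]
                     + fract * ref_main[idx + 1] + 16) >> 5);
}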
   2754 /**
   2755  *******************************************************************************
   2756  *
   2757  * @brief
   2758  *    Intra prediction interpolation filter for luma mode 27 to mode 33
   2759  *
   2760  * @par Description:
    2761  *    Intra prediction for modes 27 to 33 (positive angle, vertical modes) from
    2762  *    the reference neighboring samples pointed to by 'pu1_ref' into the TU
    2763  *    block pointed to by 'pu1_dst'
    2764  *
    2765  * @param[in] pu1_ref
    2766  *  UWORD8 pointer to the source reference samples
   2767  *
   2768  * @param[out] pu1_dst
   2769  *  UWORD8 pointer to the destination
   2770  *
   2771  * @param[in] src_strd
   2772  *  integer source stride
   2773  *
   2774  * @param[in] dst_strd
   2775  *  integer destination stride
   2776  *
   2777  * @param[in] nt
   2778  *  integer Transform Block size
   2779  *
   2780  * @param[in] mode
   2781  *  integer intraprediction mode
   2782  *
   2783  * @returns
   2784  *
   2785  * @remarks
   2786  *  None
   2787  *
   2788  *******************************************************************************
   2789  */
   2790 
   2791 
   2792 void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
   2793                                                   WORD32 src_strd,
   2794                                                   UWORD8 *pu1_dst,
   2795                                                   WORD32 dst_strd,
   2796                                                   WORD32 nt,
   2797                                                   WORD32 mode)
   2798 {
   2799 
   2800     WORD32 row, col;
   2801     WORD32 intra_pred_ang;
   2802     WORD32 pos, fract = 0, fract_prev;
   2803 
   2804     WORD32 two_nt = 2 * nt;
   2805     UNUSED(src_strd);
   2806     if(0 == (nt & 7))
   2807     {
   2808 
   2809         UWORD8 *pu1_ref_main_idx = pu1_ref;
   2810         UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
   2811 
   2812         UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2813         pu1_ref_main_idx += (two_nt + 1);
   2814         pu1_ref_main_idx_1 += (two_nt + 2);
   2815 
   2816         uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
   2817         uint8x8_t shift_res;
   2818         uint16x8_t mul_res1, mul_res2, add_res;
   2819 
   2820         /* Intra Pred Angle according to the mode */
   2821         intra_pred_ang = gai4_ihevc_ang_table[mode];
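        /* The table entry is the step along the reference row, in 1/32-sample
         * units per destination row */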
   2822 
   2823         for(row = 0; row < nt; row++)
   2824         {
   2825             fract_prev = fract;
   2826 
   2827             pos = ((row + 1) * intra_pred_ang);
   2828             fract = pos & (31);
   2829 
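            /* fract wrapping past 32 means the integer part of pos advanced,
             * so step the reference pointers forward one sample */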
   2830             if(fract_prev > fract)
   2831             {
   2832                 pu1_ref_main_idx += 1;
   2833                 pu1_ref_main_idx_1 += 1;
   2834             }
   2835 
   2836             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2837             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2838 
   2839             for(col = nt; col > 0; col -= 8)
   2840             {
   2841                 ref_main_idx = vld1_u8(pu1_ref_main_idx);
   2842                 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
   2843 
   2844                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
   2845                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
   2846 
   2847                 add_res = vaddq_u16(mul_res1, mul_res2);
   2848 
   2849                 shift_res = vrshrn_n_u16(add_res, 5);
   2850 
   2851                 vst1_u8(pu1_dst_tmp1, shift_res);
   2852                 pu1_dst_tmp1 += 8;
   2853 
   2854                 pu1_ref_main_idx += 8;
   2855                 pu1_ref_main_idx_1 += 8;
   2856             }
   2857 
   2858             pu1_ref_main_idx -= nt;
   2859             pu1_ref_main_idx_1 -= nt;
   2860 
   2861             pu1_dst_tmp1 += (dst_strd - nt);
   2862         }
   2863 
   2864     }
   2865     else
   2866     {
   2867         UWORD8 *pu1_ref_tmp1 = pu1_ref;
   2868         UWORD8 *pu1_ref_tmp2 = pu1_ref;
   2869         UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2870 
    2871         pu1_ref_tmp1 += (two_nt + 1);
    2872         pu1_ref_tmp2 += (two_nt + 2);
   2873 
   2874         uint8x8_t dup_fract, dup_32_fract, shift_res;
   2875         uint16x8_t mul_res1, mul_res2, add_res;
   2876         uint32x2_t  pu1_ref_val1, pu1_ref_val2;
   2877 
   2878         pu1_ref_val1 = vdup_n_u32(0);
   2879         pu1_ref_val2 = vdup_n_u32(0);
   2880 
   2881         /* Intra Pred Angle according to the mode */
   2882         intra_pred_ang = gai4_ihevc_ang_table[mode];
   2883 
   2884         for(row = 0; row < nt; row++)
   2885         {
   2886             fract_prev = fract;
   2887             pos = ((row + 1) * intra_pred_ang);
   2888             fract = pos & (31);
   2889             if(fract_prev > fract)
   2890             {
   2891                 pu1_ref_tmp1 += 1;
   2892                 pu1_ref_tmp2 += 1;
   2893             }
   2894             dup_fract = vdup_n_u8((uint8_t)fract);
   2895             dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2896 
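            /* nt = 4 here, so this is a single 4-pixel iteration */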
   2897             for(col = nt; col > 0; col -= 4)
   2898             {
   2899                 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
   2900                 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
   2901 
   2902                 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
   2903                 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
   2904 
   2905                 add_res = vaddq_u16(mul_res1, mul_res2);
   2906 
   2907                 shift_res = vrshrn_n_u16(add_res, 5);
   2908 
   2909                 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
   2910                 pu1_dst_tmp1 += 4;
   2911 
   2912             }
   2913 
   2914             pu1_dst_tmp1 += (dst_strd - nt);
   2915 
   2916         }
   2917 
   2918 
   2919     }
   2920 
   2921 }
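/* Worked example of the row stepping used above, taking mode 30, whose HEVC
 * angle-table entry is 13: row 0 gives pos = 13 (fract 13), row 1 gives
 * pos = 26 (fract 26), row 2 gives pos = 39 (fract 7). The drop from 26 to 7
 * is the fract_prev > fract case, so the reference pointers advance by one
 * sample, which is equivalent to recomputing idx = pos >> 5 for every row. */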
   2922