/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_intra_pred_filters_neon_intr.c
*
* @brief
*  Contains function definitions for the intra prediction interpolation
*  filters
*
* @author
*  Yogeswaran RS
*
* @par List of Functions:
*  - ihevc_intra_pred_luma_planar()
*  - ihevc_intra_pred_luma_dc()
*  - ihevc_intra_pred_luma_horz()
*  - ihevc_intra_pred_luma_ver()
*  - ihevc_intra_pred_luma_mode2()
*  - ihevc_intra_pred_luma_mode_18_34()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>

#include "ihevc_typedefs.h"
#include "ihevc_intra_pred.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "arm_neon.h"
#include "ihevc_platform_macros.h"
#include "ihevc_common_tables.h"

/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64


/*****************************************************************************/
/* Table Look-up                                                             */
/*****************************************************************************/
#define GET_BITS(y,x) (((y) & (1 << (x))) && (1 << (x)))
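
/*
 * GET_BITS(y, x) is effectively a single-bit test: it yields 1 when bit x
 * of y is set and 0 otherwise. A minimal stand-alone sketch of its
 * behaviour (illustrative only, never compiled into the decoder):
 */
#if 0
#include <assert.h>
int main(void)
{
    assert(GET_BITS(0x10, 4) == 1); /* bit 4 set   */
    assert(GET_BITS(0x10, 3) == 0); /* bit 3 clear */
    return 0;
}
#endif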

/*****************************************************************************/
/* Function Definition                                                       */
/*****************************************************************************/

/**
 *******************************************************************************
 *
 * @brief
 *  Intra prediction interpolation filter for pu1_ref substitution
 *
 *
 * @par Description:
 *  Reference substitution process for samples unavailable for prediction.
 *  Refer to section 8.4.4.2.2
 *
 * @param[in] pu1_top_left
 *  UWORD8 pointer to the top-left sample
 *
 * @param[in] pu1_top
 *  UWORD8 pointer to the top row of samples
 *
 * @param[in] pu1_left
 *  UWORD8 pointer to the left column of samples
 *
 * @param[in] src_strd
 *  WORD32 source stride
 *
 * @param[in] nt
 *  WORD32 transform block size
 *
 * @param[in] nbr_flags
 *  WORD32 neighbor availability flags
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination reference array
 *
 * @param[in] dst_strd
 *  WORD32 destination stride
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
                                                     UWORD8 *pu1_top,
                                                     UWORD8 *pu1_left,
                                                     WORD32 src_strd,
                                                     WORD32 nt,
                                                     WORD32 nbr_flags,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 dst_strd)
{
    UWORD8 pu1_ref;
    WORD32 dc_val, i;
    WORD32 total_samples = (4 * nt) + 1;
    WORD32 two_nt = 2 * nt;
    WORD32 three_nt = 3 * nt;
    WORD32 get_bits;
    WORD32 next;
    WORD32 bot_left, left, top, tp_right, tp_left;
    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
    UNUSED(dst_strd);
    dc_val = 1 << (BIT_DEPTH - 1);

    /* Neighbor flag structure (bit widths per field):
           Top-Left | Top-Right | Top | Left | Bottom-Left
               1          4        4      4         4
     */

    /* If no neighbor flags are present, fill the neighbor samples with DC value */
    if(nbr_flags == 0)
    {
        for(i = 0; i < total_samples; i++)
        {
            pu1_dst[i] = dc_val;
        }
    }
    else
    {
        /* Else fill the corresponding samples */
        pu1_dst[two_nt] = *pu1_top_left;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;
        UWORD8 *pu1_top_tmp = pu1_top;
        pu1_dst_tmp2 += two_nt + 1;

        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];

        uint8x8_t src;
        for(i = two_nt; i > 0; i -= 8)
        {
            src = vld1_u8(pu1_top_tmp);
            pu1_top_tmp += 8;
            vst1_u8(pu1_dst_tmp2, src);
            pu1_dst_tmp2 += 8;
        }

        if(nt <= 8)
        {
            /* 1-bit extraction for all the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
            bot_left = nbr_flags & 0x1;
            left = (nbr_flags & 0x10) >> 4;
            top = (nbr_flags & 0x100) >> 8;
            tp_right = (nbr_flags & 0x1000) >> 12;

            next = 1;

            /* If bottom-left is not available, apply the reverse substitution process */
            if(bot_left == 0)
            {
                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };

                /* Check for the 1st available sample from bottom-left */
                while(!a_nbr_flag[next])
                    next++;

                /* If left or top-left is available */
                if(next <= 2)
                {
                    idx = nt * next;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
                else /* If top or top-right is available */
                {
                    /* idx is changed to copy 1 pixel value for top-left, if top-left is not available */
                    idx = (nt * (next - 1)) + 1;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Forward substitution process */
            /* If left is unavailable, copy the last bottom-left value */

            if(left == 0)
            {
                uint8x8_t dup_pu1_dst1;
                UWORD8 *pu1_dst_const_nt = pu1_dst;
                pu1_dst_const_nt += nt;

                if(0 == (nt & 7))
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
                        pu1_dst_const_nt += 8;
                    }
                }
                else
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
                        pu1_dst_const_nt += 4;
                    }
                }
            }
            if(tp_left == 0)
                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
            if(top == 0)
            {
                if(0 == (nt & 7))
                {
                    uint8x8_t dup_pu1_dst2;
                    UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
                    pu1_dst_const_two_nt_1 += (two_nt + 1);
                    dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
                        pu1_dst_const_two_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = 0; i < nt; i++)
                        pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
                }
            }
            if(tp_right == 0)
            {
                uint8x8_t dup_pu1_dst3;
                UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
                pu1_dst_const_three_nt_1 += (three_nt + 1);
                /* Pad from the last available top sample (forward substitution) */
                dup_pu1_dst3 = vdup_n_u8(pu1_dst[three_nt]);
                if(0 == (nt & 7))
                {
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
                        pu1_dst_const_three_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
                        pu1_dst_const_three_nt_1 += 4;
                    }
                }
            }
        }
        if(nt == 16)
        {
            WORD32 nbr_flags_temp = 0;
            nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
                            + ((nbr_flags & 0x300) >> 4)
                            + ((nbr_flags & 0x3000) >> 6)
                            + ((nbr_flags & 0x10000) >> 8);

            /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (refer to section 8.4.4.2.2) */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below-left and left */

                if(nbr_id_from_bl == 64)
                    nbr_id_from_bl = 32;

                if(nbr_id_from_bl == 32)
                {
                    /* for top-left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top-right; 8 pels per nbr bit */
                    }
                }
                /* Reverse substitution process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                    {
                        pu1_dst[i] = pu1_ref;
                    }
                }
            }

            /* Loop over the 4*nt + 1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T16_4NT) + 1))
            {
                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T16_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags_temp, 8);

                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
            }
        }

        if(nt == 32)
        {
            /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (refer to section 8.4.4.2.2) */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below-left and left */

                if(nbr_id_from_bl == 64)
                {
                    /* for top-left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
                    {
                        /* top-left not available */
                        nbr_id_from_bl++;
                        /* top and top-right; 8 pels per nbr bit */
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
                    }
                }
                /* Reverse substitution process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Loop over the 4*nt + 1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T32_4NT) + 1))
            {
                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T32_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags, 16);
                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
            }
        }
    }
}
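
/*
 * Usage sketch (illustrative only; buffer names and sizes are hypothetical).
 * The function packs the neighbours into one linear array of 4*nt + 1 samples:
 *   pu1_dst[0 .. 2*nt-1]    : below-left + left neighbours, bottom sample first
 *   pu1_dst[2*nt]           : top-left neighbour
 *   pu1_dst[2*nt+1 .. 4*nt] : top + top-right neighbours
 * substituting any unavailable samples as per section 8.4.4.2.2.
 */
#if 0
static void example_ref_substitution(void)
{
    UWORD8 au1_left[2 * 8];     /* left + below-left column for nt = 8  */
    UWORD8 au1_top[2 * 8];      /* top + top-right row                  */
    UWORD8 u1_top_left = 128;
    UWORD8 au1_ref[4 * 8 + 1];  /* packed output reference array        */
    WORD32 nbr_flags = 0x11111; /* all five neighbour groups available  */

    ihevc_intra_pred_luma_ref_substitution_neonintr(&u1_top_left, au1_top,
                                                    au1_left, 1 /* src_strd */,
                                                    8 /* nt */, nbr_flags,
                                                    au1_ref, 0 /* dst_strd, unused */);
}
#endif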

/**
 *******************************************************************************
 *
 * @brief
 *  Intra prediction interpolation filter for ref_filtering
 *
 *
 * @par Description:
 *  Reference DC filtering for neighboring samples, dependent on TU size and
 *  mode. Refer to section 8.4.4.2.3 in the standard
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] nt
 *  integer transform block size
 *
 * @param[in] mode
 *  integer intra prediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */


void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
                                             WORD32 nt,
                                             UWORD8 *pu1_dst,
                                             WORD32 mode,
                                             WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i = 0;
    WORD32 four_nt = 4 * nt;

    WORD32 src_4nt;

    /* Variables are named after the functionality they serve, e.g. pu1_src_tmp_1 denotes pu1_src + 1, */
    /* src_val_1 loads values from pu1_src_tmp_1, and add_res holds the result of adding two values    */
    UWORD8 *pu1_src_tmp_0 = pu1_src;
    UWORD8 *pu1_src_tmp_1;
    UWORD8 *pu1_src_tmp_2;
    UWORD8 *pu1_dst_tmp_0 = pu1_dst;
    UWORD8 *pu1_dst_tmp_1;

    uint8x8_t src_val_0, src_val_2;
    uint8x8_t src_val_1, shift_res;
    uint8x8_t dup_const_2;
    uint16x8_t mul_res, add_res;
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    shift_res = vdup_n_u8(0);

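    /* gau1_intra_pred_ref_filter[mode] packs one filter-enable bit per
     * transform size; CTZ(nt) == log2(nt) for the power-of-two sizes, so
     * bit (log2(nt) - 2) selects nt = 4/8/16/32 respectively. */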
    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));

    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            for(i = four_nt; i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;
                vst1_u8(pu1_dst_tmp_0, src_val_0);
                pu1_dst_tmp_0 += 8;
            }
            pu1_dst[four_nt] = pu1_src[four_nt];
        }
    }

    else
    {
        /* If strong intra smoothing is enabled and the transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong intra filtering */
            abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
                            - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
                            - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }

        src_4nt = pu1_src[4 * nt];
        /* Strong filtering of reference samples */
        if(1 == bi_linear_int_flag)
        {
            WORD32 two_nt = four_nt >> 1;

            WORD32 pu1_src_0_val = pu1_src[0];
            WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
            WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];

            WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
            uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);

            WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
            uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);

            const UWORD8 *const_col_i;
            uint8x8_t const_col_i_val;
            uint16x8_t prod_val_1;
            uint16x8_t prod_val_2;
            uint16x8_t prod_val_3;
            uint16x8_t prod_val_4;
            uint8x8_t res_val_1;
            uint8x8_t res_val_2;
            uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
            uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
            uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
            pu1_dst_tmp_0 = pu1_dst + 1;
            pu1_dst_tmp_1 = pu1_dst + two_nt + 1;

            const_col_i = gau1_ihevc_planar_factor + 1;

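            /* Bi-linear interpolation of the two reference halves (two_nt = 64
             * here, since nt == 32), matching the strong-filter arithmetic below:
             *   left half: dst[i]        = ((64 - i) * src[0]    + i * src[2*nt] + 32) >> 6
             *   top half : dst[2*nt + i] = ((64 - i) * src[2*nt] + i * src[4*nt] + 32) >> 6
             * for i = 1 .. 64; the corner samples are copied over unfiltered. */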
            for(i = two_nt; i > 0; i -= 8)
            {
                const_col_i_val = vld1_u8(const_col_i);
                const_col_i += 8;

                prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
                prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);

                res_val_1 = vrshrn_n_u16(prod_val_2, 6);
                prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);

                vst1_u8(pu1_dst_tmp_0, res_val_1);
                pu1_dst_tmp_0 += 8;
                prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);

                res_val_2 = vrshrn_n_u16(prod_val_4, 6);
                vst1_u8(pu1_dst_tmp_1, res_val_2);
                pu1_dst_tmp_1 += 8;
            }
            pu1_dst[2 * nt] = pu1_src[2 * nt];
        }
        else
        {
            pu1_src_tmp_1 = pu1_src + 1;
            pu1_src_tmp_2 = pu1_src + 2;
            pu1_dst_tmp_0 += 1;

            dup_const_2 = vdup_n_u8(2);

            /* Extremities untouched */
            pu1_dst[0] = pu1_src[0];

            /* To avoid issues when dest and src share the same pointer, the first load is done
             * outside the loop, and each subsequent load is done before the store of the previous result */

            /* Perform bilinear filtering of reference samples */
            for(i = (four_nt - 1); i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;

                src_val_2 = vld1_u8(pu1_src_tmp_2);
                pu1_src_tmp_2 += 8;

                src_val_1 = vld1_u8(pu1_src_tmp_1);
                pu1_src_tmp_1 += 8;

                if(i < four_nt - 1)
                {
                    vst1_u8(pu1_dst_tmp_0, shift_res);
                    pu1_dst_tmp_0 += 8;
                }

                add_res = vaddl_u8(src_val_0, src_val_2);

                mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
                shift_res = vrshrn_n_u16(mul_res, 2);
            }
            vst1_u8(pu1_dst_tmp_0, shift_res);
            pu1_dst_tmp_0 += 8;
        }
        pu1_dst[4 * nt] = src_4nt;
    }
}
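
/*
 * Scalar sketch of the default [1 2 1]/4 smoothing performed above
 * (illustrative only; the end samples pu1_src[0] and pu1_src[4*nt] are
 * copied untouched, matching the NEON path):
 */
#if 0
static void ref_filter_121_scalar(const UWORD8 *pu1_src, UWORD8 *pu1_dst, WORD32 nt)
{
    WORD32 i;
    pu1_dst[0] = pu1_src[0];
    for(i = 1; i < 4 * nt; i++)
        pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2;
    pu1_dst[4 * nt] = pu1_src[4 * nt];
}
#endif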



/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma planar
*
* @par Description:
*  Planar intra prediction with reference neighboring samples at the location
*  pointed to by 'pu1_ref', written to the TU block location pointed to by
*  'pu1_dst'
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer transform block size
*
* @param[in] wd
*  integer width of the array
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
                                           WORD32 src_strd,
                                           UWORD8 *pu1_dst,
                                           WORD32 dst_strd,
                                           WORD32 nt,
                                           WORD32 mode)
{
    /* (nt - 1 - col) is named const_nt_1_col (const denotes gau1_ihevc_planar_factor)          */
    /* const_nt_1_col values are loaded into a d register                                       */
    /* pu1_ref[nt - 1] is named pu1_ref_nt_1                                                    */
    /* the value of pu1_ref_nt_1 is duplicated to a d register, hence pu1_ref_nt_1_dup          */
    /* log2nt + 1 is taken care of while assigning the values themselves                        */
    /* in the width-multiple-of-4 case the row loop is also unrolled by 2, with matching stores */

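    /* Each output sample follows the planar prediction equation implemented
     * below (row r, column c, both 0-based):
     *   pred[r][c] = ( (nt-1-c) * left[r] + (c+1) * top_right
     *                + (nt-1-r) * top[c]  + (r+1) * bottom_left
     *                + nt ) >> (log2(nt) + 1)
     * with left[r] = pu1_ref[2*nt-1-r], top[c] = pu1_ref[2*nt+1+c],
     * top_right = pu1_ref[3*nt+1] and bottom_left = pu1_ref[nt-1]. */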
    WORD32 row, col = 0;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt, three_nt;
    UWORD8 *pu1_ref_two_nt_1;
    UWORD8 *pu1_dst_tmp;
    const UWORD8 *const_nt_1_col;
    uint8x8_t const_nt_1_col_t;
    const UWORD8 *const_col_1;
    uint8x8_t const_col_1_t;
    uint8_t const_nt_1_row;
    uint8x8_t const_nt_1_row_dup;
    uint8_t const_row_1;
    uint8x8_t const_row_1_dup;
    uint8_t const_nt = nt;
    uint16x8_t const_nt_dup;
    uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
    uint8x8_t pu1_ref_nt_1_dup;
    uint8_t pu1_ref_two_nt_1_row;
    uint8_t pu1_ref_three_nt_1;
    uint8x8_t pu1_ref_two_nt_1_row_dup;
    uint8x8_t pu1_ref_two_nt_1_t;
    uint8x8_t pu1_ref_three_nt_1_dup;
    uint16x8_t prod_t1;
    uint16x8_t prod_t2;
    uint16x8_t sto_res_tmp;
    uint8x8_t sto_res;
    int16x8_t log2nt_dup;
    UNUSED(src_strd);
    UNUSED(mode);
    log2nt_plus1 = 32 - CLZ(nt);
    two_nt = 2 * nt;
    three_nt = 3 * nt;
    /* loops have been unrolled, relying on the width being a multiple of 8 */
    if(0 == (nt & 7))
    {
        pu1_dst_tmp = pu1_dst;
        const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

        const_col_1 = gau1_ihevc_planar_factor + 1;
        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row++)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

            const_col_1 = gau1_ihevc_planar_factor + 1;
            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 8)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 8;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 8;
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 8;
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);
                vst1_u8(pu1_dst_tmp, sto_res);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    /* loops have been unrolled, relying on the width being a multiple of 4 */
    /* if the column count is a multiple of 4, the height should be a multiple of 2 */
    else
    {
        uint8x8_t const_row_1_dup1;
        uint8x8_t pu1_ref_two_nt_1_t1;
        uint8x8_t const_nt_1_col_t1;
        uint8x8_t const_col_1_t1;
        uint8x8_t pu1_ref_two_nt_1_row_dup1;
        uint8x8_t const_nt_1_row_dup1;

        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row += 2)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
            pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
            const_nt_1_row = nt - 2 - row;
            const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
            const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);
            const_row_1 = row + 2;
            const_row_1_dup1 = vdup_n_u8(const_row_1);
            const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;

            const_col_1 = gau1_ihevc_planar_factor + 1;

            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 4)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 4;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 4;
                const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));

                pu1_dst_tmp = pu1_dst;
                const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);

                const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 4;
                const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);

                pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);

                vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
                pu1_dst_tmp += dst_strd;

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }
}
/* INTRA_PRED_LUMA_PLANAR */
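
/*
 * Scalar sketch of one planar output sample, matching the vectorised
 * arithmetic above (illustrative only; r = row, c = col, both 0-based):
 */
#if 0
static UWORD8 planar_sample_scalar(const UWORD8 *pu1_ref, WORD32 nt,
                                   WORD32 r, WORD32 c)
{
    WORD32 two_nt = 2 * nt;
    WORD32 val = (nt - 1 - c) * pu1_ref[two_nt - 1 - r]  /* left        */
               + (c + 1) * pu1_ref[3 * nt + 1]           /* top-right   */
               + (nt - 1 - r) * pu1_ref[two_nt + 1 + c]  /* top         */
               + (r + 1) * pu1_ref[nt - 1]               /* bottom-left */
               + nt;
    return (UWORD8)(val >> (32 - CLZ(nt)));              /* log2(nt) + 1 */
}
#endif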

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma DC
*
* @par Description:
*  Intra prediction for DC mode with reference neighboring samples at the
*  location pointed to by 'pu1_ref', written to the TU block location pointed
*  to by 'pu1_dst'
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer transform block size
*
* @param[in] wd
*  integer width of the array
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
    WORD32 i = 0;
    WORD32 row = 0, col = 0, col_count;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt = 0;
    uint16x8_t ref_load_q;
    uint16x8_t three_dc_val_t;
    uint8x8_t sto_res_tmp;
    uint8x8_t sto_res_tmp1;
    uint8x8_t sto_res_tmp2;
    uint8x8_t sto_res_tmp3;
    uint8x8_t sto_res_tmp4;
    uint8x8_t dc_val_t;

    UWORD8 *pu1_ref_tmp;
    UWORD8 *pu1_ref_tmp1;
    UWORD8 *pu1_dst_tmp;
    UWORD8 *pu1_dst_tmp1;
    UWORD8 *pu1_dst_tmp2;
    UNUSED(src_strd);
    UNUSED(mode);

    /* log2nt + 1 is taken care of while assigning the values themselves */
    log2nt_plus1 = 32 - CLZ(nt);

    /* loops have been unrolled, relying on the width being a multiple of 8 */
    if(0 == (nt & 7))
    {
        uint8x8_t ref_load1;
        uint8x8_t ref_load2;
        uint16x4_t acc_dc_pair1;
        uint32x2_t acc_dc_pair2;
        uint64x1_t acc_dc = vdup_n_u64(col);

        two_nt = 2 * nt;
        pu1_ref_tmp = pu1_ref + nt;
        pu1_ref_tmp1 = pu1_ref + two_nt + 1;

        for(i = two_nt; i > nt; i -= 8)
        {
            ref_load1 = vld1_u8(pu1_ref_tmp);
            pu1_ref_tmp += 8;
            acc_dc_pair1 = vpaddl_u8(ref_load1);

            ref_load2 = vld1_u8(pu1_ref_tmp1);
            pu1_ref_tmp1 += 8;

            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);

            acc_dc_pair1 = vpaddl_u8(ref_load2);
            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
        }

        dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
        dc_val_t = vdup_n_u8(dc_val);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val += 2;

        three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
        pu1_ref_tmp = pu1_ref + two_nt + 1;
        pu1_dst_tmp = pu1_dst;

        if(nt == 32)
        {
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    vst1_u8(pu1_dst_tmp, dc_val_t);
                    pu1_dst_tmp += 8;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
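            /* DC boundary smoothing (nt < 32): the first row uses
             * dst[col] = (top[col] + 3*dc_val + 2) >> 2, the first column uses
             * dst[row*strd] = (left[row] + 3*dc_val + 2) >> 2, and the corner is
             * dst[0] = (left[0] + 2*dc_val + top[0] + 2) >> 2. */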
            for(col = nt; col > 0; col -= 8)
            {
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp += 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);
                vst1_u8(pu1_dst_tmp, sto_res_tmp);
                pu1_dst_tmp += 8;
            }

            pu1_ref_tmp = pu1_ref + two_nt - 9;
            pu1_dst_tmp = pu1_dst + dst_strd;
            col_count = nt - 8;

            /* Except for the first row, the remaining rows are done here                      */
            /* Both the column and row loops have been unrolled by 8                           */
            /* The stores take care of the unrolling                                           */
            /* Except for the 1st column of the remaining rows (other than the 1st row), the   */
            /* values are constant, hence they are extracted with a constant value and stored  */
            /* If the column count is greater than 8, the remaining values are constant, which */
            /* is taken care of in the inner for loop                                          */

            for(row = nt; row > 0; row -= 8)
            {
                pu1_dst_tmp1 = pu1_dst_tmp + 8;
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp -= 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);

                sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
                sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;
                /* For the last set of 8 rows only 7 rows need to be updated, since the first row is already written */
                if(row != 8)
                    vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                for(col = col_count; col > 0; col -= 8)
                {
                    pu1_dst_tmp2 = pu1_dst_tmp1;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;

                    /* For the last set of 8 rows only 7 rows need to be updated, since the first row is already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
                }
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
    /* loops have been unrolled, relying on the width being a multiple of 4 */
    else
    {
        WORD32 acc_dc;
        two_nt = 2 * nt;

        acc_dc = 0;
        pu1_ref_tmp = pu1_ref + nt + 1;
        for(i = nt; i < two_nt; i++)
        {
            acc_dc += pu1_ref[i];
            acc_dc += pu1_ref_tmp[i];
        }
        dc_val = (acc_dc + nt) >> (log2nt_plus1);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val = three_dc_val + 2;
        dc_val_t = vdup_n_u8(dc_val);

        if(nt == 32)
        {
            pu1_dst_tmp = pu1_dst;
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = 1; col < nt; col++)
            {
                pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
            }

            pu1_dst_tmp = pu1_dst + dst_strd;
            /* Since the first row is already updated above, the loop count is nt - 1 */
            for(row = nt - 1; row > 0; row -= 1)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }

            for(row = 1; row < nt; row++)
            {
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
}
/* INTRA_PRED_LUMA_DC */
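
/*
 * Scalar sketch of the DC value computed above (illustrative only): the sum
 * runs over the nt left neighbours pu1_ref[nt .. 2*nt-1] and the nt top
 * neighbours pu1_ref[2*nt+1 .. 3*nt], then rounds and shifts by log2(nt) + 1:
 */
#if 0
static WORD32 dc_value_scalar(const UWORD8 *pu1_ref, WORD32 nt)
{
    WORD32 acc = 0, i;
    for(i = nt; i < 2 * nt; i++)
        acc += pu1_ref[i];
    for(i = 2 * nt + 1; i <= 3 * nt; i++)
        acc += pu1_ref[i];
    return (acc + nt) >> (32 - CLZ(nt)); /* log2(nt) + 1 */
}
#endif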

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for horizontal luma prediction.
*
* @par Description:
*  Horizontal intra prediction with reference neighboring samples at the
*  location pointed to by 'pu1_ref', written to the TU block location pointed
*  to by 'pu1_dst'
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer transform block size
*
* @param[in] wd
*  integer width of the array
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;

    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD32 pu1_val;
    uint8x8_t pu1_val_two_nt_1_row;
    if(nt == 32)
    {
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            pu1_val = pu1_ref[two_nt - 1 - row];
            pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
            for(col = nt; col > 0; col -= 8)
            {
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    else

    /* the row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables        */
    /* variables are named after the operation (instruction) they perform                       */
    /* (e.g. shift_val contains the shifted value, add_sat the add-and-saturate result)         */
    /* loops are unrolled by 4 and 8, relying on the input width being a multiple of 4 or 8     */
    /* rows and columns are unrolled by 4 when the width is a multiple of 4                     */
    {
        if(0 != (nt & 7))      /* condition for multiple of 4 */
        {
            UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
            UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
            UWORD8 *pu1_dst_4 = pu1_dst;
            UWORD8 *pu1_dst_4_tmp = pu1_dst;

            uint32x2_t pu1_ref_val1, pu1_ref_val2;
            uint8x8_t dup_sub, round_val, dup_val;
            uint16x8_t dup_add, sub_val;
            int16x8_t shift_val, add_sat;

            pu1_ref_val1 = vdup_n_u32(0);
            pu1_ref_val2 = vdup_n_u32(0);

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);

            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_4_two_nt_plus1 += (two_nt + 1);

            pu1_ref_4_two_nt_minus_nt += (two_nt - nt);

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
                    shift_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);

                    add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(add_sat);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
                    pu1_dst_4 += dst_strd;

                    pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;
                }
                /* worst cases */
                pu1_ref_4_two_nt_minus_nt += 3;
                pu1_ref_4_two_nt_plus1 += 4;
                pu1_dst_4 = (pu1_dst_4_tmp + 4);
            }
        }

        /* dup_1 - dup_8 are variables that hold the duplicated values from the loaded source   */
        /* variables are named after the operation (instruction) they perform                   */
        /* loops are unrolled by 4 and 8, relying on the input width being a multiple of 4 or 8 */
        /* rows and columns are unrolled by 8 when the width is a multiple of 8                 */

        else
        {
            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;

            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;

            uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
            uint16x8_t sub_res, dup_add;
            int16x8_t shift_res, add_res;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_tmp_1 += (two_nt + 1);
            pu1_ref_tmp_2 += (two_nt - 1);
   1311 
   1312             for(col = nt; col > 0; col -= 8)
   1313             {
   1314                 src_tmp = vld1_u8(pu1_ref_tmp_1);
   1315                 pu1_ref_tmp_1 += 8;
   1316 
   1317                 sub_res = vsubl_u8(src_tmp, dup_sub);
   1318                 shift_res  = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
   1319                 add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
   1320                 round_val = vqmovun_s16(add_res);
   1321                 vst1_u8(pu1_dst_tmp_1, round_val);
   1322                 pu1_dst_tmp_1 += 8;
   1323             }
   1324 
   1325             for(row = nt; row > 0; row -= 8)
   1326             {
   1327                 pu1_ref_tmp_2 -= 8;
   1328 
   1329                 src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
   1330                 rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */
   1331 
   1332                 dup_1 = vdup_lane_u8(rev_res, 0);
   1333                 dup_2 = vdup_lane_u8(rev_res, 1);
   1334                 dup_3 = vdup_lane_u8(rev_res, 2);
   1335                 dup_4 = vdup_lane_u8(rev_res, 3);
   1336                 dup_5 = vdup_lane_u8(rev_res, 4);
   1337                 dup_6 = vdup_lane_u8(rev_res, 5);
   1338                 dup_7 = vdup_lane_u8(rev_res, 6);
   1339                 dup_8 = vdup_lane_u8(rev_res, 7);
   1340 
   1341                 for(col = nt; col > 0; col -= 8)
   1342                 {
   1343                     pu1_dst_tmp_2 = pu1_dst_tmp_3;
   1344 
   1345                     vst1_u8(pu1_dst_tmp_2, dup_1);
   1346                     pu1_dst_tmp_2 += dst_strd;
   1347 
   1348                     vst1_u8(pu1_dst_tmp_2, dup_2);
   1349                     pu1_dst_tmp_2 += dst_strd;
   1350 
   1351                     vst1_u8(pu1_dst_tmp_2, dup_3);
   1352                     pu1_dst_tmp_2 += dst_strd;
   1353 
   1354                     vst1_u8(pu1_dst_tmp_2, dup_4);
   1355                     pu1_dst_tmp_2 += dst_strd;
   1356 
   1357                     vst1_u8(pu1_dst_tmp_2, dup_5);
   1358                     pu1_dst_tmp_2 += dst_strd;
   1359 
   1360                     vst1_u8(pu1_dst_tmp_2, dup_6);
   1361                     pu1_dst_tmp_2 += dst_strd;
   1362 
   1363                     vst1_u8(pu1_dst_tmp_2, dup_7);
   1364                     pu1_dst_tmp_2 += dst_strd;
   1365 
   1366                     /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
   1367                     if(row != 8)
   1368                         vst1_u8(pu1_dst_tmp_2, dup_8);
   1369                     pu1_dst_tmp_2 += dst_strd;
   1370 
   1371                     pu1_dst_tmp_3 += 8;
   1372                 }
   1373                 pu1_dst_tmp_2 -= (nt - 8);
   1374                 pu1_dst_tmp_3 = pu1_dst_tmp_2;
   1375             }
   1376         }
   1377     }
   1378 }
   1379 /* INTRA_PRED_LUMA_HORZ */
   1380 
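/* Editorial note: a minimal scalar sketch of the horizontal prediction the
 * NEON paths above implement, added for illustration only; it is never
 * called and the function name is an assumption, not part of the library.
 * It follows the formulas visible in the intrinsics: every row replicates
 * the left reference sample of that row, and the first row is
 * gradient-filtered against the top references.  The explicit clamp below
 * corresponds to the saturating vqaddq_s16/vqmovun_s16 pair. */
static void ihevc_intra_pred_luma_horz_scalar_sketch(UWORD8 *pu1_ref,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 dst_strd,
                                                     WORD32 nt)
{
    WORD32 row, col;
    WORD32 two_nt = 2 * nt;

    /* Every row replicates the left reference sample of that row */
    for(row = 0; row < nt; row++)
        for(col = 0; col < nt; col++)
            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];

    /* First row only: gradient filter against the top references */
    for(col = 0; col < nt; col++)
    {
        WORD32 s2_predpixel = pu1_ref[two_nt - 1]
                + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1);
        if(s2_predpixel < 0) s2_predpixel = 0;
        else if(s2_predpixel > 255) s2_predpixel = 255;
        pu1_dst[col] = (UWORD8)s2_predpixel;
    }
}
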
   1381 /**
   1382 *******************************************************************************
   1383 *
    1384 * @brief
    1385 *    Intra prediction interpolation filter for vertical luma mode.
    1386 *
    1387 * @par Description:
    1388 *    Vertical intra prediction with reference neighboring samples location
    1389 *    pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
   1390 *
    1391 * @param[in] pu1_ref
    1392 *  UWORD8 pointer to the reference samples
   1393 *
   1394 * @param[out] pu1_dst
   1395 *  UWORD8 pointer to the destination
   1396 *
   1397 * @param[in] src_strd
   1398 *  integer source stride
   1399 *
   1400 * @param[in] dst_strd
   1401 *  integer destination stride
   1402 *
   1403 * @param[in] nt
   1404 *  integer Transform Block size
   1405 *
    1406 * @param[in] mode
    1407 *  integer intra prediction mode
   1408 *
   1409 * @returns
   1410 *
   1411 * @remarks
   1412 *  None
   1413 *
   1414 *******************************************************************************
   1415 */
   1416 
   1417 void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
   1418                                         WORD32 src_strd,
   1419                                         UWORD8 *pu1_dst,
   1420                                         WORD32 dst_strd,
   1421                                         WORD32 nt,
   1422                                         WORD32 mode)
   1423 {
   1424     WORD32 row, col;
   1425     WORD32 two_nt;
   1426     UNUSED(src_strd);
   1427     UNUSED(mode);
   1428 
   1429     two_nt = 2 * nt;
   1430 
   1431     UWORD8 *pu1_dst_tmp = pu1_dst;
   1432     UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
   1433     uint8x8_t pu1_val_two_nt_1_col;
   1434     if(nt == 32)
   1435     {
   1436         pu1_dst_tmp = pu1_dst;
   1437         for(row = 0; row < nt; row++)
   1438         {
   1439             for(col = nt; col > 0; col -= 8)
   1440             {
   1441                 pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
   1442                 pu1_ref_tmp_1 += 8;
   1443                 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
   1444                 pu1_dst_tmp += 8;
   1445             }
   1446             pu1_ref_tmp_1 -= nt;
   1447             pu1_dst_tmp += dst_strd - nt;
   1448         }
   1449     }
   1450     else
   1451 
   1452     {
    1453         /* variables are named after the operation (instruction) they perform                             */
    1454         /* (e.g. shift_val holds the shifted value,                                                       */
    1455         /* add_sat the result of the saturating add)                                                      */
    1456         /* loops are unrolled by 4 and 8, since the input width is a multiple of 4 or 8                   */
    1457         /* rows and columns are unrolled by 4 when the width is a multiple of 4                           */
   1458 
   1459         if(0 != (nt & 7))
   1460         {
   1461             WORD32 cond_4 = 0;
   1462             UWORD8 *pu1_ref_val1 = pu1_ref;
   1463             UWORD8 *pu1_ref_val2 = pu1_ref;
   1464             UWORD8 *pu1_ref_val3 = pu1_ref;
   1465 
   1466             UWORD8 *pu1_dst_val1 = pu1_dst;
   1467             UWORD8 *pu1_dst_val2 = pu1_dst;
   1468             UWORD8 *pu1_dst_val3 = pu1_dst;
   1469 
   1470             uint8x8_t dup_2_sub, round_val, vext_val;
   1471             uint16x8_t dup_2_add;
   1472             uint32x2_t src_val1, src_val2, src_val3;
   1473             uint16x8_t sub_val;
   1474             int16x8_t shift_val1, add_sat;
   1475             uint64x1_t shift_val2;
   1476 
   1477             src_val1 = vdup_n_u32(0);
   1478             src_val2 = vdup_n_u32(0);
   1479             src_val3 = vdup_n_u32(0);
   1480             pu1_ref_val1 += (two_nt - nt);
   1481             pu1_ref_val3 += (two_nt + 2);
   1482             pu1_ref_val2 += (two_nt + 1);
   1483 
   1484             dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
   1485             dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
   1486 
   1487             /* loops to store the first nt sets of values in the destination */
   1488 
   1489             for(row = nt; row > 0; row -= 4)
   1490             {
   1491                 for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
   1492                 {
    1493                     /* vectorized form of: s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1) */
   1494                     src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
   1495                     sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
   1496                     shift_val1  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
   1497                     add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
   1498                     round_val = vqmovun_s16(add_sat);
   1499 
    1500                     /* vectorized form of: pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col] */
   1501                     src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
   1502                     vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
   1503                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
   1504                     pu1_dst_val1 += dst_strd;
   1505 
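                    /* Editorial note: round_val holds the filtered first-column
                     * samples for the current four rows in byte lanes 7..4
                     * (row 0 in lane 7).  Each vshl_n_u64 below moves the next
                     * row's sample into lane 7, and vext_u8(..., 7) splices
                     * that byte in front of the top references loaded from
                     * pu1_ref_val3, so one 32-bit store writes
                     * { filtered(row), pu1_ref[two_nt + 2], pu1_ref[two_nt + 3],
                     *   pu1_ref[two_nt + 4] }. */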
   1506                     shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
   1507 
   1508                     vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
   1509                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
   1510                     pu1_dst_val1 += dst_strd;
   1511 
   1512                     shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
   1513 
   1514                     vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
   1515                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
   1516                     pu1_dst_val1 += dst_strd;
   1517 
   1518                     shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
   1519 
   1520                     vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
   1521                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
   1522                     pu1_dst_val1 += dst_strd;
   1523 
   1524                     pu1_ref_val1  -= 4;
   1525                 }
   1526 
   1527                 /* loop to store next sets of eight values in the destination */
   1528 
   1529                 for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
   1530                 {
   1531                     src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);
   1532 
   1533                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
   1534                     pu1_dst_val2 += dst_strd;
   1535 
   1536                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
   1537                     pu1_dst_val2 += dst_strd;
   1538 
   1539                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
   1540                     pu1_dst_val2 += dst_strd;
   1541 
   1542                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
   1543                     pu1_dst_val2 += dst_strd;
   1544                 }
   1545                 pu1_ref_val2 += 4;
   1546                 pu1_dst_val3 += 4;
   1547                 pu1_dst_val2 = pu1_dst_val3;
   1548                 cond_4 = 1;
   1549             }
   1550         }
   1551 
    1552         /* rows and columns are unrolled by 8 when the width is a multiple of 8          */
   1553         else
   1554         {
   1555             WORD32 cond = 0, col_1;
   1556             UWORD8 *pu1_dst_tmp_1 = pu1_dst;
   1557             UWORD8 *pu1_dst_tmp_2 = pu1_dst;
   1558             UWORD8 *pu1_dst_tmp_3 = pu1_dst;
   1559 
   1560             UWORD8 *pu1_ref_tmp_1 = pu1_ref;
   1561             UWORD8 *pu1_ref_tmp_2 = pu1_ref;
   1562             UWORD8 *pu1_ref_tmp_3 = pu1_ref;
   1563 
   1564             uint8x8_t pu1_src_tmp1;
   1565             uint8x8_t pu1_src_tmp2;
   1566 
   1567             uint8x8_t dup_sub;
   1568             uint16x8_t dup_add;
   1569             int16x8_t subsh_val;
   1570             int16x8_t addsat_val;
   1571             uint16x8_t sub_val;
   1572             uint8x8_t round_val;
   1573             uint8x8_t vext_t;
   1574             uint64x1_t shift_64;
   1575 
   1576             dup_sub = vdup_n_u8(pu1_ref[two_nt]);
   1577             dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
   1578 
   1579             pu1_ref_tmp_1 += (two_nt);
   1580             pu1_ref_tmp_1 -= 8;
   1581             pu1_ref_tmp_2 += (two_nt + 2);
   1582             pu1_ref_tmp_3 += (two_nt + 1);
   1583 
   1584             /* loops to store the first nt sets of values in the destination */
   1585 
   1586             for(row = nt; row > 0; row -= 8)
   1587             {
   1588                 for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
   1589                 {
   1590                     pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);
   1591 
   1592                     sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
   1593                     subsh_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
   1594                     addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
   1595                     round_val = vqmovun_s16(addsat_val);
   1596 
    1597                     /* vectorized form of: pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col] */
   1598 
   1599                     pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
   1600                     vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
   1601                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1602                     pu1_dst_tmp_1 += dst_strd;
   1603 
   1604                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
   1605 
   1606                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1607                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1608                     pu1_dst_tmp_1 += dst_strd;
   1609 
   1610                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
   1611                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1612                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1613                     pu1_dst_tmp_1 += dst_strd;
   1614 
   1615                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
   1616                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1617                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1618                     pu1_dst_tmp_1 += dst_strd;
   1619 
   1620                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
   1621                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1622                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1623                     pu1_dst_tmp_1 += dst_strd;
   1624 
   1625                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
   1626                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1627                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1628                     pu1_dst_tmp_1 += dst_strd;
   1629 
   1630                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
   1631                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1632                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1633                     pu1_dst_tmp_1 += dst_strd;
   1634 
   1635                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
   1636                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
   1637                     vst1_u8(pu1_dst_tmp_1, vext_t);
   1638                     pu1_dst_tmp_1 += dst_strd;
   1639 
   1640                     pu1_ref_tmp_1 -= 8;
   1641                 }
   1642 
   1643                 /* loop to store next sets of eight values in the destination */
   1644 
   1645                 for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
   1646                 {
   1647                     pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);
   1648 
   1649                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1650                     pu1_dst_tmp_2 += dst_strd;
   1651 
   1652                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1653                     pu1_dst_tmp_2 += dst_strd;
   1654 
   1655                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1656                     pu1_dst_tmp_2 += dst_strd;
   1657 
   1658                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1659                     pu1_dst_tmp_2 += dst_strd;
   1660 
   1661                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1662                     pu1_dst_tmp_2 += dst_strd;
   1663 
   1664                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1665                     pu1_dst_tmp_2 += dst_strd;
   1666 
   1667                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1668                     pu1_dst_tmp_2 += dst_strd;
   1669 
   1670                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
   1671                     pu1_dst_tmp_2 += dst_strd;
   1672                 }
   1673                 pu1_ref_tmp_3 += 8;
   1674                 pu1_dst_tmp_3 += 8;
   1675                 pu1_dst_tmp_2 = pu1_dst_tmp_3;
   1676                 cond = 1;
   1677             }
   1678         }
   1679     }
   1680 }
   1681 /* INTRA_PRED_LUMA_VER */
   1682 
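/* Editorial note: a minimal scalar sketch of the vertical prediction the
 * NEON paths above implement (illustration only, never called; the function
 * name is an assumption).  It restates the two formulas quoted in the
 * comments above for the nt < 32 case; the nt == 32 branch is a plain copy
 * of the top references with no first-column filter. */
static void ihevc_intra_pred_luma_ver_scalar_sketch(UWORD8 *pu1_ref,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 dst_strd,
                                                    WORD32 nt)
{
    WORD32 row, col;
    WORD32 two_nt = 2 * nt;

    for(row = 0; row < nt; row++)
    {
        /* First column: gradient filter against the left references */
        WORD32 s2_predpixel = pu1_ref[two_nt + 1]
                + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
        if(s2_predpixel < 0) s2_predpixel = 0;
        else if(s2_predpixel > 255) s2_predpixel = 255;
        pu1_dst[row * dst_strd] = (UWORD8)s2_predpixel;

        /* Remaining columns copy the row of references above the block */
        for(col = 1; col < nt; col++)
            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col];
    }
}
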
   1683 /**
   1684 *******************************************************************************
   1685 *
   1686 * @brief
    1687 *    Intra prediction interpolation filter for luma mode 2.
    1688 *
    1689 * @par Description:
    1690 *    Intra prediction for mode 2 (south-west angle) with reference neighboring
    1691 *    samples location pointed by 'pu1_ref' to the TU block location pointed by
    1692 *    'pu1_dst'
    1693 *
    1694 * @param[in] pu1_ref
    1695 *  UWORD8 pointer to the reference samples
   1696 *
   1697 * @param[out] pu1_dst
   1698 *  UWORD8 pointer to the destination
   1699 *
   1700 * @param[in] src_strd
   1701 *  integer source stride
   1702 *
   1703 * @param[in] dst_strd
   1704 *  integer destination stride
   1705 *
   1706 * @param[in] nt
   1707 *  integer Transform Block size
   1708 *
    1709 * @param[in] mode
    1710 *  integer intra prediction mode
   1711 *
   1712 * @returns
   1713 *
   1714 * @remarks
   1715 *  None
   1716 *
   1717 *******************************************************************************
   1718 */
   1719 
   1720 void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
   1721                                           WORD32 src_strd,
   1722                                           UWORD8 *pu1_dst,
   1723                                           WORD32 dst_strd,
   1724                                           WORD32 nt,
   1725                                           WORD32 mode)
   1726 {
   1727 
   1728     WORD32 row, col;
   1729     WORD32 two_nt;
   1730     UNUSED(src_strd);
   1731     UNUSED(mode);
   1732 
    1733     /* rev_res holds the byte-reversed result                                                         */
    1734     /* loops are unrolled by 4 and 8, since the input width is a multiple of 4 or 8                   */
    1735     /* rows and columns are unrolled by 4 when the width is a multiple of 4                           */
   1736 
   1737     if(0 != (nt & 7))
   1738     {
   1739         UWORD8 *pu1_ref_tmp = pu1_ref;
   1740         UWORD8 *pu1_dst_tmp = pu1_dst;
   1741         uint8x8_t pu1_src_val, rev_res;
   1742         uint64x1_t shift_res;
   1743 
   1744         for(col = nt; col > 0; col -= 4)
   1745         {
   1746             for(row = nt; row > 0; row -= 4)
   1747             {
    1748                 /* vectorized over all rows & cols: pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1] */
   1749 
   1750                 pu1_src_val = vld1_u8(pu1_ref_tmp);
   1751                 shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
   1752                 rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));
   1753 
   1754                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
   1755                 pu1_dst_tmp += dst_strd;
   1756 
   1757                 shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
   1758                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
   1759                 pu1_dst_tmp += dst_strd;
   1760 
   1761                 shift_res = vshr_n_u64(shift_res, 8);
   1762                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
   1763                 pu1_dst_tmp += dst_strd;
   1764 
   1765                 shift_res = vshr_n_u64(shift_res, 8);
   1766                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
   1767                 pu1_dst_tmp += dst_strd;
   1768             }
   1769         }
   1770     }
   1771 
    1772     /* rev_val_first and rev_val_second hold the byte-reversed loads, so the samples are in the required order */
    1773     /* shift_val shifts the reversed second load to expose the next sample                                     */
    1774     /* rows and columns are unrolled by 8 when the width is a multiple of 8                                    */
   1775 
   1776     else
   1777     {
   1778         UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
   1779         UWORD8 *pu1_dst_tmp = pu1_dst;
   1780         UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
   1781 
   1782         uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
   1783         uint64x1_t shift_val;
   1784 
   1785         two_nt = 2 * nt;
   1786         pu1_ref_two_nt_minus2 += (two_nt);
   1787         pu1_ref_two_nt_minus2 -= 8;
   1788 
   1789         for(col = nt; col > 0; col -= 8)
   1790         {
   1791             for(row = nt; row > 0; row -= 8)
   1792             {
   1793                 pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
   1794                 rev_val_first = vrev64_u8(pu1_src_val2);
   1795 
   1796                 pu1_ref_two_nt_minus2 -= 8;
   1797                 pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
   1798                 rev_val_second = vrev64_u8(pu1_src_val1);
   1799 
   1800                 vext_t = vext_u8(rev_val_first, rev_val_second, 1);
   1801                 vst1_u8(pu1_dst_tmp, vext_t);
   1802                 pu1_dst_tmp += dst_strd;
   1803 
   1804                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
   1805                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1806                 vst1_u8(pu1_dst_tmp, vext_t);
   1807                 pu1_dst_tmp += dst_strd;
   1808 
   1809                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
   1810                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1811                 vst1_u8(pu1_dst_tmp, vext_t);
   1812                 pu1_dst_tmp += dst_strd;
   1813 
   1814                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
   1815                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1816                 vst1_u8(pu1_dst_tmp, vext_t);
   1817                 pu1_dst_tmp += dst_strd;
   1818 
   1819                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
   1820                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1821                 vst1_u8(pu1_dst_tmp, vext_t);
   1822                 pu1_dst_tmp += dst_strd;
   1823 
   1824                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
   1825                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1826                 vst1_u8(pu1_dst_tmp, vext_t);
   1827                 pu1_dst_tmp += dst_strd;
   1828 
   1829                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
   1830                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1831                 vst1_u8(pu1_dst_tmp, vext_t);
   1832                 pu1_dst_tmp += dst_strd;
   1833 
   1834                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
   1835                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
   1836                 vst1_u8(pu1_dst_tmp, vext_t);
   1837                 pu1_dst_tmp += dst_strd;
   1838             }
   1839             pu1_dst_tmp_plus8 += 8;
   1840             pu1_dst_tmp = pu1_dst_tmp_plus8;
   1841             pu1_ref_two_nt_minus2 += (nt - 8);
   1842         }
   1843     }
   1844 }
   1845 /* INTRA_PRED_LUMA_MODE2 */
   1846 
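/* Editorial note: scalar sketch of mode 2 (illustration only, never called;
 * the name is an assumption).  With the mode-2 angle of exactly 32/32, the
 * index formula quoted in the loop comment above reduces to a one-sample
 * shift per row, so every output row copies the reversed left reference
 * advanced by one position. */
static void ihevc_intra_pred_luma_mode2_scalar_sketch(UWORD8 *pu1_ref,
                                                      UWORD8 *pu1_dst,
                                                      WORD32 dst_strd,
                                                      WORD32 nt)
{
    WORD32 row, col;
    WORD32 two_nt = 2 * nt;

    for(row = 0; row < nt; row++)
        for(col = 0; col < nt; col++)
            pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 2 - row - col];
}
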
   1847 /**
   1848 *******************************************************************************
   1849 *
   1850 * @brief
    1851 *   Intra prediction interpolation filter for luma mode 18 & mode 34.
    1852 *
    1853 * @par Description:
    1854 *    Intra prediction for mode 18 (nw angle) and mode 34 (ne angle) with
    1855 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
    1856 *    block location pointed by 'pu1_dst'
    1857 *
    1858 * @param[in] pu1_ref
    1859 *  UWORD8 pointer to the reference samples
   1860 *
   1861 * @param[out] pu1_dst
   1862 *  UWORD8 pointer to the destination
   1863 *
   1864 * @param[in] src_strd
   1865 *  integer source stride
   1866 *
   1867 * @param[in] dst_strd
   1868 *  integer destination stride
   1869 *
   1870 * @param[in] nt
   1871 *  integer Transform Block size
   1872 *
    1873 * @param[in] mode
    1874 *  integer intra prediction mode
   1875 *
   1876 * @returns
   1877 *
   1878 * @remarks
   1879 *  None
   1880 *
   1881 *******************************************************************************
   1882 */
   1883 
   1884 void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
   1885                                                WORD32 src_strd,
   1886                                                UWORD8 *pu1_dst,
   1887                                                WORD32 dst_strd,
   1888                                                WORD32 nt,
   1889                                                WORD32 mode)
   1890 {
   1891 
   1892     WORD32 row, col, idx;
   1893     WORD32 intraPredAngle = 32;
   1894     WORD32 two_nt;
   1895     UNUSED(src_strd);
   1896     two_nt = 2 * nt;
   1897 
   1898     UWORD8 *pu1_ref_tmp = pu1_ref;
   1899     UWORD8 *pu1_ref_tmp1 = pu1_ref;
   1900     UWORD8 *pu1_dst_tmp = pu1_dst;
   1901     UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
   1902 
   1903     uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;
   1904 
    1905     /* src_tmp_1st and src_tmp_2nd hold the first eight and the next eight values loaded from pu1_ref */
    1906     /* vext1 - vext7 hold the vext results of the two loads; separate registers help dual issue        */
    1907     /* loops are unrolled by 4 and 8, since the input width is a multiple of 4 or 8                    */
    1908     /* rows and columns are unrolled by 8 when the width is a multiple of 8                            */
    1909     /* separate loops are maintained for mode 18 and mode 34                                           */
   1910 
    1911     /* path taken when nt is a multiple of 8 */
   1912     if(0 == (nt & 7))
   1913     {
   1914         if(mode == 34)
   1915         {
   1916             pu1_ref_tmp += (two_nt + 2);
   1917 
   1918             for(row = nt; row > 0; row -= 8)
   1919             {
   1920                 for(col = nt; col > 0; col -= 8)
   1921                 {
   1922                     /* Loading 1st eight values */
   1923                     src_tmp_1st = vld1_u8(pu1_ref_tmp);
   1924                     pu1_ref_tmp += 8;
   1925 
   1926                     /* Loading next eight values */
   1927                     src_tmp_2nd = vld1_u8(pu1_ref_tmp);
   1928 
   1929                     /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
   1930                     vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
   1931                     vst1_u8(pu1_dst_tmp, src_tmp_1st);
   1932                     pu1_dst_tmp += dst_strd;
   1933 
   1934                     vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
   1935                     vst1_u8(pu1_dst_tmp, vext1);
   1936                     pu1_dst_tmp += dst_strd;
   1937 
   1938                     vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
   1939                     vst1_u8(pu1_dst_tmp, vext2);
   1940                     pu1_dst_tmp += dst_strd;
   1941 
   1942                     vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
   1943                     vst1_u8(pu1_dst_tmp, vext3);
   1944                     pu1_dst_tmp += dst_strd;
   1945 
   1946                     vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
   1947                     vst1_u8(pu1_dst_tmp, vext4);
   1948                     pu1_dst_tmp += dst_strd;
   1949 
   1950                     vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
   1951                     vst1_u8(pu1_dst_tmp, vext5);
   1952                     pu1_dst_tmp += dst_strd;
   1953 
   1954                     vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
   1955                     vst1_u8(pu1_dst_tmp, vext6);
   1956                     pu1_dst_tmp += dst_strd;
   1957 
   1958                     vst1_u8(pu1_dst_tmp, vext7);
   1959                     pu1_dst_tmp += dst_strd;
   1960                 }
   1961 
   1962                 pu1_dst_tmp_plus8 += 8;
   1963                 pu1_dst_tmp = pu1_dst_tmp_plus8;
   1964                 pu1_ref_tmp -= (nt - 8);
   1965             }
   1966         }
   1967         else /* Loop for mode 18 */
   1968         {
   1969             pu1_ref_tmp += (two_nt);
   1970 
   1971             for(row = nt; row > 0; row -= 8)
   1972             {
   1973                 for(col = nt; col > 0; col -= 8)
   1974                 {
   1975                     /* Loading 1st eight values */
   1976                     src_tmp_1st = vld1_u8(pu1_ref_tmp);
   1977                     pu1_ref_tmp -= 8;
   1978 
   1979                     /* Loading next eight values */
   1980                     src_tmp_2nd = vld1_u8(pu1_ref_tmp);
   1981 
   1982                     /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
   1983                     vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
   1984                     vst1_u8(pu1_dst_tmp, src_tmp_1st);
   1985                     pu1_dst_tmp += dst_strd;
   1986 
   1987                     vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
   1988                     vst1_u8(pu1_dst_tmp, vext1);
   1989                     pu1_dst_tmp += dst_strd;
   1990 
   1991                     vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
   1992                     vst1_u8(pu1_dst_tmp, vext2);
   1993                     pu1_dst_tmp += dst_strd;
   1994 
   1995                     vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
   1996                     vst1_u8(pu1_dst_tmp, vext3);
   1997                     pu1_dst_tmp += dst_strd;
   1998 
   1999                     vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
   2000                     vst1_u8(pu1_dst_tmp, vext4);
   2001                     pu1_dst_tmp += dst_strd;
   2002 
   2003                     vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
   2004                     vst1_u8(pu1_dst_tmp, vext5);
   2005                     pu1_dst_tmp += dst_strd;
   2006 
   2007                     vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
   2008                     vst1_u8(pu1_dst_tmp, vext6);
   2009                     pu1_dst_tmp += dst_strd;
   2010 
   2011                     vst1_u8(pu1_dst_tmp, vext7);
   2012                     pu1_dst_tmp += dst_strd;
   2013                 }
   2014                 pu1_dst_tmp_plus8 += 8;
   2015                 pu1_dst_tmp = pu1_dst_tmp_plus8;
   2016                 pu1_ref_tmp += (nt + 8);
   2017             }
   2018         }
   2019     }
   2020 
    2021     /* rows and columns are unrolled by 4 when the width is a multiple of 4 */
   2022 
   2023     else /* loop for multiples of 4 */
   2024     {
   2025         uint8x8_t src_val1;
   2026         uint8x8_t src_val2;
   2027 
   2028         if(mode == 18)
   2029             intraPredAngle = -32;
   2030         else if(mode == 34)
   2031             intraPredAngle = 32;
   2032 
   2033         for(row = 0; row < nt; row += 2)
   2034         {
   2035             /* unrolling 2 rows */
   2036             idx = ((row + 1) * intraPredAngle) >> 5;
   2037             pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
   2038             src_val1 = vld1_u8(pu1_ref_tmp);
   2039 
   2040             idx = ((row + 2) * intraPredAngle) >> 5;
   2041             pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
   2042             src_val2 = vld1_u8(pu1_ref_tmp1);
   2043 
   2044             /* unrolling 4 col */
   2045             for(col = nt; col > 0; col -= 4)
   2046             {
   2047                 pu1_dst_tmp = pu1_dst;
   2048                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
   2049                 pu1_dst_tmp += dst_strd;
   2050                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
   2051                 pu1_dst += 4;
   2052             }
   2053             pu1_dst += 2 * dst_strd - nt;
   2054         }
   2055     }
   2056 }
   2057 /* INTRA_PRED_LUMA_MODE_18_34 */
   2058 
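/* Editorial note: scalar sketch of modes 18 and 34 (illustration only,
 * never called; the name is an assumption).  It is the loop quoted in the
 * UNROLLED comments above: with an angle of -32 (mode 18) or +32 (mode 34),
 * idx is a whole-sample offset of -(row + 1) or +(row + 1), so every row is
 * the previous one shifted by one reference sample. */
static void ihevc_intra_pred_luma_mode_18_34_scalar_sketch(UWORD8 *pu1_ref,
                                                           UWORD8 *pu1_dst,
                                                           WORD32 dst_strd,
                                                           WORD32 nt,
                                                           WORD32 mode)
{
    WORD32 row, col, idx;
    WORD32 two_nt = 2 * nt;
    WORD32 intra_pred_ang = (mode == 18) ? -32 : 32;

    for(row = 0; row < nt; row++)
    {
        idx = ((row + 1) * intra_pred_ang) >> 5;
        for(col = 0; col < nt; col++)
            pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1];
    }
}
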
   2059 /**
   2060  *******************************************************************************
   2061  *
   2062  * @brief
   2063  *    Intra prediction interpolation filter for luma mode 3 to mode 9
   2064  *
   2065  * @par Description:
   2066  *    Intraprediction for mode 3 to 9  (positive angle, horizontal mode ) with
   2067  *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
   2068  *    block location pointed by 'pu1_dst'
   2069  *
    2070  * @param[in] pu1_ref
    2071  *  UWORD8 pointer to the reference samples
   2072  *
   2073  * @param[out] pu1_dst
   2074  *  UWORD8 pointer to the destination
   2075  *
   2076  * @param[in] src_strd
   2077  *  integer source stride
   2078  *
   2079  * @param[in] dst_strd
   2080  *  integer destination stride
   2081  *
   2082  * @param[in] nt
   2083  *  integer Transform Block size
   2084  *
   2085  * @param[in] mode
   2086  *  integer intraprediction mode
   2087  *
   2088  * @returns
   2089  *
   2090  * @remarks
   2091  *  None
   2092  *
   2093  *******************************************************************************
   2094  */
   2095 
   2096 
   2097 void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
   2098                                                 WORD32 src_strd,
   2099                                                 UWORD8 *pu1_dst,
   2100                                                 WORD32 dst_strd,
   2101                                                 WORD32 nt,
   2102                                                 WORD32 mode)
   2103 {
   2104 
   2105     WORD32 row, col;
   2106     WORD32 intra_pred_ang;
    2107     WORD32 pos, fract = 100, fract_prev; /* fract starts above 31 so the first iteration does not advance the reference pointers */
   2108     UNUSED(src_strd);
   2109     if(0 == (nt & 7))
   2110     {
   2111 
   2112         UWORD8 *pu1_ref_main_idx = pu1_ref;
   2113         UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
   2114 
   2115         UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2116         UWORD8 *pu1_dst_tmp2 = pu1_dst;
   2117 
   2118         WORD32 two_nt = 2 * nt;
   2119 
   2120         pu1_ref_main_idx += two_nt;
   2121         pu1_ref_main_idx_1 += two_nt - 1;
   2122 
   2123         uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
   2124         uint8x8_t shift_res;
   2125         uint16x8_t mul_res1, mul_res2, add_res;
   2126 
   2127         /* Intra Pred Angle according to the mode */
   2128         intra_pred_ang = gai4_ihevc_ang_table[mode];
   2129 
   2130         pu1_ref_main_idx -= 8;
   2131         pu1_ref_main_idx_1 -= 8;
   2132 
   2133         for(col = 0; col < nt; col++)
   2134         {
   2135             fract_prev = fract;
   2136 
   2137             pos = ((col + 1) * intra_pred_ang);
   2138             fract = pos & (31);
   2139 
   2140             if(fract_prev < fract)
   2141             {
   2142                 pu1_ref_main_idx += 1;
   2143                 pu1_ref_main_idx_1 += 1;
   2144             }
   2145 
   2146             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2147             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2148 
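            /* Editorial note: the vmull/vaddq/vrshrn sequence below computes,
             * per lane, ((32 - fract) * ref[x] + fract * ref[x + 1] + 16) >> 5;
             * vrshrn_n_u16(..., 5) is a rounding narrowing shift, which
             * supplies the +16 rounding offset. */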
   2149             for(row = nt; row > 0; row -= 8)
   2150             {
   2151                 ref_main_idx = vld1_u8(pu1_ref_main_idx);
   2152                 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
   2153 
   2154                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
   2155                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
   2156 
   2157                 add_res = vaddq_u16(mul_res1, mul_res2);
   2158 
   2159                 shift_res = vrshrn_n_u16(add_res, 5);
   2160 
   2161                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
   2162                 pu1_dst_tmp1 += dst_strd;
   2163 
   2164                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
   2165                 pu1_dst_tmp1 += dst_strd;
   2166 
   2167                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
   2168                 pu1_dst_tmp1 += dst_strd;
   2169 
   2170                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
   2171                 pu1_dst_tmp1 += dst_strd;
   2172 
   2173                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
   2174                 pu1_dst_tmp1 += dst_strd;
   2175 
   2176                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
   2177                 pu1_dst_tmp1 += dst_strd;
   2178 
   2179                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
   2180                 pu1_dst_tmp1 += dst_strd;
   2181 
   2182                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
   2183                 pu1_dst_tmp1 += dst_strd;
   2184 
   2185                 pu1_ref_main_idx -= 8;
   2186                 pu1_ref_main_idx_1 -= 8;
   2187 
   2188             }
   2189             pu1_dst_tmp2 += 1;
   2190             pu1_dst_tmp1 = pu1_dst_tmp2;
   2191 
   2192             pu1_ref_main_idx += nt;
   2193             pu1_ref_main_idx_1 += nt;
   2194 
   2195             pu1_ref_main_idx -= 1;
   2196             pu1_ref_main_idx_1 -= 1;
   2197 
   2198         }
   2199     }
   2200     else
   2201     {
   2202         UWORD8 *pu1_ref_tmp1 = pu1_ref;
   2203         UWORD8 *pu1_ref_tmp2 = pu1_ref;
   2204         UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2205         UWORD8 *pu1_dst_tmp2 = pu1_dst;
   2206 
   2207         pu1_ref_tmp1 += nt;
   2208         pu1_ref_tmp2 += (nt - 1);
   2209 
   2210         uint8x8_t dup_fract, dup_32_fract, shift_res;
   2211         uint16x8_t mul_res1, mul_res2, add_res;
   2212         uint32x2_t  pu1_ref_val1, pu1_ref_val2;
   2213 
   2214         pu1_ref_val1 = vdup_n_u32(0);
   2215         pu1_ref_val2 = vdup_n_u32(0);
   2216 
   2217         /* Intra Pred Angle according to the mode */
   2218         intra_pred_ang = gai4_ihevc_ang_table[mode];
   2219 
   2220 
   2221         for(col = 0; col < nt; col++)
   2222         {
   2223             fract_prev = fract;
   2224             pos = ((col + 1) * intra_pred_ang);
   2225             fract = pos & (31);
   2226             if(fract_prev < fract)
   2227             {
   2228                 pu1_ref_tmp1 += 1;
   2229                 pu1_ref_tmp2 += 1;
   2230             }
   2231             dup_fract = vdup_n_u8((uint8_t)fract);
   2232             dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2233 
   2234             for(row = nt; row > 0; row -= 4)
   2235             {
   2236                 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
   2237                 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
   2238 
   2239                 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
   2240                 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
   2241 
   2242                 add_res = vaddq_u16(mul_res1, mul_res2);
   2243 
   2244                 shift_res = vrshrn_n_u16(add_res, 5);
   2245 
   2246                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
   2247                 pu1_dst_tmp1 += dst_strd;
   2248 
   2249                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
   2250                 pu1_dst_tmp1 += dst_strd;
   2251 
   2252                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
   2253                 pu1_dst_tmp1 += dst_strd;
   2254 
   2255                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
   2256 
   2257             }
   2258             pu1_ref_tmp1 -= 1;
   2259             pu1_ref_tmp2 -= 1;
   2260 
   2261             pu1_dst_tmp2 += 1;
   2262             pu1_dst_tmp1 = pu1_dst_tmp2;
   2263 
   2264         }
   2265 
   2266 
   2267     }
   2268 
   2269 }
   2270 
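/* Editorial note: scalar sketch of the two-tap angular filter for modes 3
 * to 9 (illustration only, never called; the name is an assumption).  The
 * NEON code above walks the left reference with moving pointers; the
 * equivalent index arithmetic is the pos/idx/fract split shown here. */
static void ihevc_intra_pred_luma_mode_3_to_9_scalar_sketch(UWORD8 *pu1_ref,
                                                            UWORD8 *pu1_dst,
                                                            WORD32 dst_strd,
                                                            WORD32 nt,
                                                            WORD32 mode)
{
    WORD32 row, col, pos, idx, fract;
    WORD32 two_nt = 2 * nt;
    WORD32 intra_pred_ang = gai4_ihevc_ang_table[mode];

    for(col = 0; col < nt; col++)
    {
        pos = (col + 1) * intra_pred_ang;
        idx = pos >> 5;       /* whole-sample step along the left reference */
        fract = pos & (31);   /* 1/32-sample fractional position */

        for(row = 0; row < nt; row++)
            pu1_dst[row * dst_strd + col] =
                (UWORD8)(((32 - fract) * pu1_ref[two_nt - row - idx - 1]
                          + fract * pu1_ref[two_nt - row - idx - 2] + 16) >> 5);
    }
}
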
   2271 /**
   2272  *******************************************************************************
   2273  *
   2274  * @brief
   2275  *   Intra prediction interpolation filter for luma mode 11 to mode 17
   2276  *
   2277  * @par Description:
   2278  *    Intraprediction for mode 11 to 17  (negative angle, horizontal mode )
   2279  *    with reference  neighboring samples location pointed by 'pu1_ref' to the
   2280  *    TU block location pointed by 'pu1_dst'
   2281  *
    2282  * @param[in] pu1_ref
    2283  *  UWORD8 pointer to the reference samples
   2284  *
   2285  * @param[out] pu1_dst
   2286  *  UWORD8 pointer to the destination
   2287  *
   2288  * @param[in] src_strd
   2289  *  integer source stride
   2290  *
   2291  * @param[in] dst_strd
   2292  *  integer destination stride
   2293  *
   2294  * @param[in] nt
   2295  *  integer Transform Block size
   2296  *
   2297  * @param[in] mode
   2298  *  integer intraprediction mode
   2299  *
   2300  * @returns
   2301  *
   2302  * @remarks
   2303  *  None
   2304  *
   2305  *******************************************************************************
   2306  */
   2307 
   2308 
   2309 void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
   2310                                                   WORD32 src_strd,
   2311                                                   UWORD8 *pu1_dst,
   2312                                                   WORD32 dst_strd,
   2313                                                   WORD32 nt,
   2314                                                   WORD32 mode)
   2315 {
   2316 
   2317     WORD32 row, col, k;
   2318     WORD32 two_nt;
   2319     WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
    2320     WORD32 pos, fract = 1000, fract_prev; /* fract starts above 31; see note in mode 3 to 9 */
   2321     WORD32  ref_idx;
   2322 
   2323     UWORD8 *ref_main;
   2324     UWORD8 *ref_main_tmp;
   2325 
   2326     UWORD8 *pu1_ref_tmp1 = pu1_ref;
   2327     UWORD8 *pu1_ref_tmp2 = pu1_ref;
   2328     UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2329     UWORD8 *pu1_dst_tmp2 = pu1_dst;
   2330 
   2331     UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
   2332 
   2333     uint16x8_t mul_res1, mul_res2, add_res;
   2334     uint8x8_t dup_const_fract, dup_const_32_fract;
   2335     uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
   2336     uint8x8_t ref_left_t;
   2337     uint32x2_t  ref_left_tmp;
   2338     UNUSED(src_strd);
   2339     ref_left_tmp = vdup_n_u32(0);
   2340 
   2341     inv_ang_sum = 128;
   2342     two_nt = 2 * nt;
   2343 
   2344     intra_pred_ang = gai4_ihevc_ang_table[mode];
   2345 
   2346     inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
   2347 
   2348     pu1_ref_tmp1 += two_nt;
   2349 
   2350     ref_main = ref_temp + (nt - 1);
   2351     ref_main_tmp = ref_main;
   2352 
   2353     if(0 == (nt & 7))
   2354     {
   2355         pu1_ref_tmp2 += (two_nt - 7);
   2356 
   2357         for(k = nt - 1; k >= 0; k -= 8)
   2358         {
   2359 
   2360             ref_left_t = vld1_u8(pu1_ref_tmp2);
   2361 
   2362             ref_left_t = vrev64_u8(ref_left_t);
   2363             vst1_u8(ref_main_tmp, ref_left_t);
   2364             ref_main_tmp += 8;
   2365             pu1_ref_tmp2 -= 8;
   2366 
   2367         }
   2368 
   2369     }
   2370     else
   2371     {
   2372         uint8x8_t rev_val;
   2373         pu1_ref_tmp2 += (two_nt - (nt - 1));
   2374 
   2375         for(k = nt - 1; k >= 0; k -= 8)
   2376         {
   2377 
   2378             ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);
   2379 
   2380             rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
   2381             vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
   2382 
   2383         }
   2384 
   2385     }
   2386 
   2387     ref_main[nt] = pu1_ref[two_nt - nt];
   2388 
   2389     /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
   2390 
   2391     ref_idx = (nt * intra_pred_ang) >> 5;
   2392 
    2393     /* SIMD optimization of this loop is possible using a look-up table */
    2394     /* For negative angles, derive the main reference samples from the side */
    2395     /* reference samples; refer to section 8.4.4.2.6 of the HEVC spec */
   2396     for(k = -1; k > ref_idx; k--)
   2397     {
   2398         inv_ang_sum += inv_ang;
   2399         ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
   2400     }
   2401 
   2402     UWORD8 *ref_main_tmp1 = ref_main;
   2403     UWORD8 *ref_main_tmp2 = ref_main;
   2404 
   2405     ref_main_tmp2 += 1;
   2406 
   2407     if(0 == (nt & 7))
   2408     {
    2409         /* For angles other than 45 degrees, interpolate between 2 neighboring */
    2410         /* samples, weighted by distance, to obtain the destination sample */
   2411         for(col = 0; col < nt; col++)
   2412         {
   2413 
   2414             fract_prev = fract;
   2415             pos = ((col + 1) * intra_pred_ang);
   2416             fract = pos & (31);
   2417 
   2418             if(fract_prev < fract)
   2419             {
   2420                 ref_main_tmp1 -= 1;
   2421                 ref_main_tmp2 -= 1;
   2422             }
   2423 
   2424             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2425             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2426 
   2427             // Do linear filtering
   2428             for(row = nt; row > 0; row -= 8)
   2429             {
   2430                 ref_main_idx = vld1_u8(ref_main_tmp1);
   2431 
   2432                 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
   2433 
   2434                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
   2435                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
   2436 
   2437                 add_res = vaddq_u16(mul_res1, mul_res2);
   2438 
   2439                 shift_res = vrshrn_n_u16(add_res, 5);
   2440 
   2441                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
   2442                 pu1_dst_tmp1 += dst_strd;
   2443 
   2444                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
   2445                 pu1_dst_tmp1 += dst_strd;
   2446 
   2447                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
   2448                 pu1_dst_tmp1 += dst_strd;
   2449 
   2450                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
   2451                 pu1_dst_tmp1 += dst_strd;
   2452 
   2453                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
   2454                 pu1_dst_tmp1 += dst_strd;
   2455 
   2456                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
   2457                 pu1_dst_tmp1 += dst_strd;
   2458 
   2459                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
   2460                 pu1_dst_tmp1 += dst_strd;
   2461 
   2462                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
   2463                 pu1_dst_tmp1 += dst_strd;
   2464 
   2465                 ref_main_tmp1 += 8;
   2466                 ref_main_tmp2 += 8;
   2467             }
   2468 
   2469             ref_main_tmp1 -= nt;
   2470             ref_main_tmp2 -= nt;
   2471 
   2472             pu1_dst_tmp2 += 1;
   2473             pu1_dst_tmp1 = pu1_dst_tmp2;
   2474         }
   2475     }
   2476     else
   2477     {
   2478         uint32x2_t ref_main_idx1, ref_main_idx2;
   2479 
   2480         ref_main_idx1 = vdup_n_u32(0);
   2481         ref_main_idx2 = vdup_n_u32(0);
   2482 
   2483         for(col = 0; col < nt; col++)
   2484         {
   2485             fract_prev = fract;
   2486             pos = ((col + 1) * intra_pred_ang);
   2487             fract = pos & (31);
   2488 
   2489             if(fract_prev < fract)
   2490             {
   2491                 ref_main_tmp1 -= 1;
   2492                 ref_main_tmp2 -= 1;
   2493             }
   2494 
   2495             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2496             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2497 
   2498             for(row = nt; row > 0; row -= 4)
   2499             {
   2500 
   2501                 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
   2502                 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
   2503 
   2504                 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
   2505                 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
   2506 
   2507                 add_res = vaddq_u16(mul_res1, mul_res2);
   2508 
   2509                 shift_res = vrshrn_n_u16(add_res, 5);
   2510 
   2511                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
   2512                 pu1_dst_tmp1 += dst_strd;
   2513 
   2514                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
   2515                 pu1_dst_tmp1 += dst_strd;
   2516 
   2517                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
   2518                 pu1_dst_tmp1 += dst_strd;
   2519 
   2520                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
   2521                 pu1_dst_tmp1 += dst_strd;
   2522 
   2523             }
   2524 
   2525             pu1_dst_tmp2 += 1;
   2526             pu1_dst_tmp1 = pu1_dst_tmp2;
   2527 
   2528         }
   2529 
   2530     }
   2531 }
   2532 
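/* Editorial note: scalar sketch of how modes 11 to 17 assemble their main
 * reference and filter from it (illustration only, never called; the name
 * is an assumption).  It restates the steps above: copy the left column in
 * reverse into ref_temp, project the above references for negative indices
 * using the inverse angle, then apply the same two-tap filter as modes 3
 * to 9 but indexed through ref_main. */
static void ihevc_intra_pred_luma_mode_11_to_17_scalar_sketch(UWORD8 *pu1_ref,
                                                              UWORD8 *pu1_dst,
                                                              WORD32 dst_strd,
                                                              WORD32 nt,
                                                              WORD32 mode)
{
    WORD32 row, col, k, pos, idx, fract;
    WORD32 two_nt = 2 * nt;
    WORD32 intra_pred_ang = gai4_ihevc_ang_table[mode];
    WORD32 inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
    WORD32 inv_ang_sum = 128;
    WORD32 ref_idx = (nt * intra_pred_ang) >> 5;

    UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
    UWORD8 *ref_main = ref_temp + (nt - 1);

    /* main reference = left column, reversed into raster order */
    for(k = 0; k < nt + 1; k++)
        ref_main[k] = pu1_ref[two_nt - k];

    /* extend ref_main below index 0 by projecting the above references */
    for(k = -1; k > ref_idx; k--)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
    }

    for(col = 0; col < nt; col++)
    {
        pos = (col + 1) * intra_pred_ang;   /* negative for these modes */
        idx = pos >> 5;
        fract = pos & (31);

        for(row = 0; row < nt; row++)
            pu1_dst[row * dst_strd + col] =
                (UWORD8)(((32 - fract) * ref_main[row + idx + 1]
                          + fract * ref_main[row + idx + 2] + 16) >> 5);
    }
}
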
   2533 /**
   2534  *******************************************************************************
   2535  *
   2536  * @brief
   2537  *   Intra prediction interpolation filter for luma mode 19 to mode 25
   2538  *
   2539  * @par Description:
   2540  *    Intraprediction for mode 19 to 25  (negative angle, vertical mode ) with
   2541  *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
   2542  *    block location pointed by 'pu1_dst'
   2543  *
    2544  * @param[in] pu1_ref
    2545  *  UWORD8 pointer to the reference samples
   2546  *
   2547  * @param[out] pu1_dst
   2548  *  UWORD8 pointer to the destination
   2549  *
   2550  * @param[in] src_strd
   2551  *  integer source stride
   2552  *
   2553  * @param[in] dst_strd
   2554  *  integer destination stride
   2555  *
   2556  * @param[in] nt
   2557  *  integer Transform Block size
   2558  *
   2559  * @param[in] mode
   2560  *  integer intraprediction mode
   2561  *
   2562  * @returns
   2563  *
   2564  * @remarks
   2565  *  None
   2566  *
   2567  *******************************************************************************
   2568  */
   2569 
   2570 
   2571 void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
   2572                                                   WORD32 src_strd,
   2573                                                   UWORD8 *pu1_dst,
   2574                                                   WORD32 dst_strd,
   2575                                                   WORD32 nt,
   2576                                                   WORD32 mode)
   2577 {
   2578 
   2579     WORD32 row, col, k;
   2580     WORD32 two_nt, intra_pred_ang;
    2581     WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev; /* fract starts above 31; see note in mode 3 to 9 */
   2582     WORD32 ref_idx;
   2583     UWORD8 *ref_main;
   2584     UWORD8 *ref_main_tmp;
   2585     UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
   2586 
   2587     UWORD8 *pu1_ref_tmp1 = pu1_ref;
   2588     UWORD8 *pu1_ref_tmp2 = pu1_ref;
   2589     UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2590 
   2591     uint16x8_t mul_res1, mul_res2, add_res;
   2592     uint8x8_t dup_const_fract, dup_const_32_fract;
   2593     uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
   2594     uint8x8_t ref_above_t;
   2595     uint32x2_t ref_above_tmp;
   2596     UNUSED(src_strd);
   2597     ref_above_tmp = vdup_n_u32(0);
   2598 
   2599     two_nt = 2 * nt;
   2600     intra_pred_ang = gai4_ihevc_ang_table[mode];
   2601     inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
   2602 
   2603     /* Intermediate reference samples for negative angle modes */
    2604     /* This has to be removed during optimization */
   2605     pu1_ref_tmp1 += two_nt;
   2606 
   2607 
   2608     ref_main = ref_temp + (nt - 1);
   2609     ref_main_tmp = ref_main;
   2610 
   2611     if(0 == (nt & 7))
   2612     {
   2613         pu1_ref_tmp2 += (two_nt - 7);
   2614         for(k = nt - 1; k >= 0; k -= 8)
   2615         {
   2616 
   2617             ref_above_t = vld1_u8(pu1_ref_tmp1);
   2618             vst1_u8(ref_main_tmp, ref_above_t);
   2619             ref_main_tmp += 8;
   2620             pu1_ref_tmp1 += 8;
   2621 
   2622         }
   2623 
   2624     }
   2625     else
   2626     {
   2627         pu1_ref_tmp2 += (two_nt - (nt - 1));
   2628 
   2629         for(k = nt - 1; k >= 0; k -= 4)
   2630         {
   2631 
   2632             ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
   2633             vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
   2634 
   2635         }
   2636 
   2637     }
   2638 
   2639     ref_main[nt] = pu1_ref[two_nt + nt];
   2640 
    2641     /* For vertical modes, (ref main = ref above) (ref side = ref left) */
   2642 
   2643     ref_idx = (nt * intra_pred_ang) >> 5;
   2644     inv_ang_sum = 128;
   2645 
    2646     /* SIMD optimization of this loop is possible using a look-up table */
    2647     /* For negative angles, derive the main reference samples from the side */
    2648     /* reference samples; refer to section 8.4.4.2.6 of the HEVC spec */
   2649     for(k = -1; k > ref_idx; k--)
   2650     {
   2651         inv_ang_sum += inv_ang;
   2652         ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
   2653     }
   2654 
   2655     UWORD8 *ref_main_tmp1 = ref_main;
   2656     UWORD8 *ref_main_tmp2 = ref_main;
   2657 
   2658     ref_main_tmp2 += 1;
   2659 
   2660     if(0 == (nt & 7))
   2661     {
    2662         /* For angles other than 45 degrees, interpolate between 2 neighboring */
    2663         /* samples, weighted by distance, to obtain the destination sample */
   2664         for(row = 0; row < nt; row++)
   2665         {
   2666 
   2667             fract_prev = fract;
   2668             pos = ((row + 1) * intra_pred_ang);
   2669             fract = pos & (31);
   2670 
   2671             if(fract_prev < fract)
   2672             {
   2673                 ref_main_tmp1 -= 1;
   2674                 ref_main_tmp2 -= 1;
   2675             }
   2676 
   2677             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2678             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2679 
   2680             // Do linear filtering
   2681             for(col = nt; col > 0; col -= 8)
   2682             {
   2683                 ref_main_idx = vld1_u8(ref_main_tmp1);
   2684 
   2685                 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
   2686 
   2687                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
   2688                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
   2689 
   2690                 add_res = vaddq_u16(mul_res1, mul_res2);
   2691 
   2692                 shift_res = vrshrn_n_u16(add_res, 5);
   2693 
   2694                 vst1_u8(pu1_dst_tmp1, shift_res);
   2695                 pu1_dst_tmp1 += 8;
   2696 
   2697                 ref_main_tmp1 += 8;
   2698                 ref_main_tmp2 += 8;
   2699             }
   2700 
   2701             ref_main_tmp1 -= nt;
   2702             ref_main_tmp2 -= nt;
   2703 
   2704             pu1_dst_tmp1 += (dst_strd - nt);
   2705         }
   2706     }
   2707     else
   2708     {
   2709         uint32x2_t ref_main_idx1, ref_main_idx2;
   2710 
   2711         ref_main_idx1 = vdup_n_u32(0);
   2712         ref_main_idx2 = vdup_n_u32(0);
   2713 
   2714         for(row = 0; row < nt; row++)
   2715         {
   2716             fract_prev = fract;
   2717             pos = ((row + 1) * intra_pred_ang);
   2718             fract = pos & (31);
   2719 
   2720             if(fract_prev < fract)
   2721             {
   2722                 ref_main_tmp1 -= 1;
   2723                 ref_main_tmp2 -= 1;
   2724             }
   2725 
   2726             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2727             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2728 
   2729             for(col = nt; col > 0; col -= 4)
   2730             {
   2731 
   2732                 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
   2733                 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
   2734 
   2735                 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
   2736                 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
   2737 
   2738                 add_res = vaddq_u16(mul_res1, mul_res2);
   2739 
   2740                 shift_res = vrshrn_n_u16(add_res, 5);
   2741 
   2742                 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
   2743                 pu1_dst_tmp1 += 4;
   2744 
   2745             }
   2746             pu1_dst_tmp1 += (dst_strd - nt);
   2747         }
   2748 
   2749     }
   2750 
   2751 }
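
/* A minimal scalar sketch (a hypothetical helper, not part of the library
 * API) of the two-tap filter that the NEON loops above vectorize.  Each
 * output sample is a rounded blend of two adjacent main reference samples:
 *     dst = ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5
 * which is exactly what vmull_u8 + vaddq_u16 + vrshrn_n_u16(.., 5) compute,
 * eight (or four) lanes at a time. */
static void intra_pred_ang_scalar_sketch(UWORD8 *ref_main,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 intra_pred_ang)
{
    WORD32 row, col, pos, idx, fract;

    for(row = 0; row < nt; row++)
    {
        pos   = (row + 1) * intra_pred_ang;
        idx   = pos >> 5;   /* integer part of the projected displacement  */
        fract = pos & 31;   /* 5-bit fractional part used as filter weight */

        for(col = 0; col < nt; col++)
        {
            pu1_dst[row * dst_strd + col] =
                (UWORD8)(((32 - fract) * ref_main[idx + col]
                          + fract * ref_main[idx + col + 1] + 16) >> 5);
        }
    }
}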
   2752 
   2753 /**
   2754  *******************************************************************************
   2755  *
   2756  * @brief
   2757  *    Intra prediction interpolation filter for luma mode 27 to mode 33
   2758  *
   2759  * @par Description:
 *    Intra prediction for modes 27 to 33 (positive angle, vertical modes)
 *    with reference neighboring samples located at 'pu1_ref' written to the
 *    TU block located at 'pu1_dst'
   2763  *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the source (neighboring reference samples)
   2766  *
   2767  * @param[out] pu1_dst
   2768  *  UWORD8 pointer to the destination
   2769  *
   2770  * @param[in] src_strd
   2771  *  integer source stride
   2772  *
   2773  * @param[in] dst_strd
   2774  *  integer destination stride
   2775  *
   2776  * @param[in] nt
   2777  *  integer Transform Block size
   2778  *
   2779  * @param[in] mode
   2780  *  integer intraprediction mode
   2781  *
   2782  * @returns
   2783  *
   2784  * @remarks
   2785  *  None
   2786  *
   2787  *******************************************************************************
   2788  */
   2789 
   2790 
   2791 void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
   2792                                                   WORD32 src_strd,
   2793                                                   UWORD8 *pu1_dst,
   2794                                                   WORD32 dst_strd,
   2795                                                   WORD32 nt,
   2796                                                   WORD32 mode)
   2797 {
   2798 
   2799     WORD32 row, col;
   2800     WORD32 intra_pred_ang;
   2801     WORD32 pos, fract = 0, fract_prev;
   2802 
   2803     WORD32 two_nt = 2 * nt;
   2804     UNUSED(src_strd);
   2805     if(0 == (nt & 7))
   2806     {
   2807 
   2808         UWORD8 *pu1_ref_main_idx = pu1_ref;
   2809         UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
   2810 
   2811         UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2812         pu1_ref_main_idx += (two_nt + 1);
   2813         pu1_ref_main_idx_1 += (two_nt + 2);
   2814 
   2815         uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
   2816         uint8x8_t shift_res;
   2817         uint16x8_t mul_res1, mul_res2, add_res;
   2818 
   2819         /* Intra Pred Angle according to the mode */
   2820         intra_pred_ang = gai4_ihevc_ang_table[mode];
   2821 
   2822         for(row = 0; row < nt; row++)
   2823         {
   2824             fract_prev = fract;
   2825 
   2826             pos = ((row + 1) * intra_pred_ang);
   2827             fract = pos & (31);
   2828 
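            /* pos grows by intra_pred_ang (positive for these modes) every */
            /* row; when fract wraps past 32 (fract_prev > fract), the       */
            /* integer sample offset has grown by one, so advance the        */
            /* reference pointers                                            */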
   2829             if(fract_prev > fract)
   2830             {
   2831                 pu1_ref_main_idx += 1;
   2832                 pu1_ref_main_idx_1 += 1;
   2833             }
   2834 
   2835             dup_const_fract = vdup_n_u8((uint8_t)fract);
   2836             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2837 
   2838             for(col = nt; col > 0; col -= 8)
   2839             {
   2840                 ref_main_idx = vld1_u8(pu1_ref_main_idx);
   2841                 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
   2842 
   2843                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
   2844                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
   2845 
   2846                 add_res = vaddq_u16(mul_res1, mul_res2);
   2847 
   2848                 shift_res = vrshrn_n_u16(add_res, 5);
   2849 
   2850                 vst1_u8(pu1_dst_tmp1, shift_res);
   2851                 pu1_dst_tmp1 += 8;
   2852 
   2853                 pu1_ref_main_idx += 8;
   2854                 pu1_ref_main_idx_1 += 8;
   2855             }
   2856 
   2857             pu1_ref_main_idx -= nt;
   2858             pu1_ref_main_idx_1 -= nt;
   2859 
   2860             pu1_dst_tmp1 += (dst_strd - nt);
   2861         }
   2862 
   2863     }
   2864     else
   2865     {
   2866         UWORD8 *pu1_ref_tmp1 = pu1_ref;
   2867         UWORD8 *pu1_ref_tmp2 = pu1_ref;
   2868         UWORD8 *pu1_dst_tmp1 = pu1_dst;
   2869 
        pu1_ref_tmp1 += (two_nt + 1);
        pu1_ref_tmp2 += (two_nt + 2);
   2872 
   2873         uint8x8_t dup_fract, dup_32_fract, shift_res;
   2874         uint16x8_t mul_res1, mul_res2, add_res;
   2875         uint32x2_t  pu1_ref_val1, pu1_ref_val2;
   2876 
   2877         pu1_ref_val1 = vdup_n_u32(0);
   2878         pu1_ref_val2 = vdup_n_u32(0);
   2879 
   2880         /* Intra Pred Angle according to the mode */
   2881         intra_pred_ang = gai4_ihevc_ang_table[mode];
   2882 
   2883         for(row = 0; row < nt; row++)
   2884         {
   2885             fract_prev = fract;
   2886             pos = ((row + 1) * intra_pred_ang);
   2887             fract = pos & (31);
   2888             if(fract_prev > fract)
   2889             {
   2890                 pu1_ref_tmp1 += 1;
   2891                 pu1_ref_tmp2 += 1;
   2892             }
   2893             dup_fract = vdup_n_u8((uint8_t)fract);
   2894             dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
   2895 
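            /* nt == 4 path: load four reference bytes as one u32 lane and   */
            /* reinterpret them as u8 lanes so the same widening-multiply    */
            /* path can be reused; only the low four result bytes are stored */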
   2896             for(col = nt; col > 0; col -= 4)
   2897             {
   2898                 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
   2899                 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
   2900 
   2901                 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
   2902                 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
   2903 
   2904                 add_res = vaddq_u16(mul_res1, mul_res2);
   2905 
   2906                 shift_res = vrshrn_n_u16(add_res, 5);
   2907 
   2908                 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
   2909                 pu1_dst_tmp1 += 4;
   2910 
   2911             }
   2912 
   2913             pu1_dst_tmp1 += (dst_strd - nt);
   2914 
   2915         }
   2916 
   2917 
   2918     }
   2919 
   2920 }
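
/* Example usage (a hypothetical sketch; as elsewhere in this file, pu1_ref
 * is assumed to hold the 4 * nt + 1 substituted neighboring samples with
 * the top-left sample at pu1_ref[2 * nt]):
 *
 *     UWORD8 ref[4 * 8 + 1];  // filled by reference substitution/filtering
 *     UWORD8 dst[8 * 8];
 *     ihevc_intra_pred_luma_mode_27_to_33_neonintr(ref, 0, dst, 8, 8, 30);
 */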
   2921