///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_mode_27_to_33.s
//*
//* @brief
//*  contains function definition for intra prediction interpolation filters
//*
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_mode_27_to_33()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//
///**
//*******************************************************************************
//*
//* @brief
//*    intra prediction interpolation filter for luma mode 27 to mode 33
//*
//* @par description:
//*    intraprediction for mode 27 to 33 (positive angle, vertical mode) with
//*    reference neighboring samples location pointed by 'pu1_ref' to the tu
//*    block location pointed by 'pu1_dst'
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source (reference samples)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  integer transform block size
//*
//* @param[in] mode
//*  integer intraprediction mode
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_mode_27_to_33(uword8 *pu1_ref,
//                                        word32 src_strd,
//                                        uword8 *pu1_dst,
//                                        word32 dst_strd,
//                                        word32 nt,
//                                        word32 mode)
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 =>  src_strd
//x2 => *pu1_dst
//x3 =>  dst_strd
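//x4 =>  nt
//x5 =>  mode
//
//reference c sketch of the per-row interpolation this routine vectorizes
//(illustrative only, not the exact library source; intra_pred_ang comes from
//gai4_ihevc_ang_table[mode] and two_nt = 2 * nt):
//
//  for(row = 0; row < nt; row++)
//  {
//      pos   = (row + 1) * intra_pred_ang;
//      idx   = pos >> 5;
//      fract = pos & 31;
//      for(col = 0; col < nt; col++)
//          pu1_dst[row * dst_strd + col] =
//              (uword8)((pu1_ref[two_nt + 1 + idx + col] * (32 - fract)
//                      + pu1_ref[two_nt + 2 + idx + col] * fract + 16) >> 5);
//  }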

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_mode_27_to_33_av8
.extern gai4_ihevc_ang_table
.extern gau1_ihevc_planar_factor

.type ihevc_intra_pred_luma_mode_27_to_33_av8, %function

ihevc_intra_pred_luma_mode_27_to_33_av8:

    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments

    stp         d9,d10,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!

    adrp        x6,  :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
    ldr         x6,  [x6, #:got_lo12:gai4_ihevc_ang_table]

    lsl         x7,x4,#1                    //two_nt

    add         x8,x6,x5,lsl #2             //*gai4_ihevc_ang_table[mode]
    ldr         w9, [x8]                    //intra_pred_ang = gai4_ihevc_ang_table[mode]
    sxtw        x9,w9
    adrp        x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
    ldr         x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
    add         x6,x1,#1

    tst         x4,#7
    add         x8,x0,x7                    //pu1_ref + two_nt
    mov         x14,#0                      //row
    mov         x12,x4
    bne         core_loop_4
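//nt is a multiple of 8 on this path: the block is processed as 8x8 tiles
//below; when nt & 7 is nonzero (i.e. nt == 4) the bne above takes the scalar
//4-sample path at core_loop_4 instead.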

core_loop_8:
    add         x8,x8,#1                    //pu1_ref_main_idx += (two_nt + 1)
    dup         v0.8b,w9                    //intra_pred_ang
    lsr         x12, x4, #3                 //divide by 8

    movi        v1.8b, #32
    mul         x7, x4, x12                 //x7 = nt * (nt / 8), pending 8-pixel units

    movi        v6.8h, #31
    //lsl            x12,x3,#3

    mov         x1,x8                       //save pu1_ref + two_nt + 1 for the column rewind
    //sub            x12,x12,x4
    mov         x5,x4                       //save nt
    mov         x11,#1                      //ld1 post-increment step
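//the 8-wide path is software pipelined: 'prologue' starts the first batch of
//eight rows (stages i..viii), 'kernel_8_rows' overlaps the loads and
//multiply-accumulates of the next batch with the narrowing shifts and stores
//of the current one, and 'epilogue' drains the last three stages.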
prologue:
    ld1         {v3.8b},[x6]                //loads the row value
    umull       v2.8h, v3.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
    xtn         v4.8b,  v4.8h               //narrow the eight fract values to bytes
    shrn        v5.8b, v2.8h,#5             //idx = pos >> 5

    dup         v31.8b, v4.b[0]             //(i)
    add         x0,x2,x3                    //pu1_dst + dst_strd

    umov        w14, v5.s[0]                //(i row)extract idx to the r register
    sxtw        x14,w14

    dup         v29.8b, v4.b[1]             //(ii)
    and         x9,x14,#0xff                //(i row) get the last byte

    add         x10,x8,x9                   //(i row)*pu1_ref[ref_main_idx]

    asr         x14,x14,#8                  //(ii)shift by 8
    ld1         {v23.8b},[x10],x11          //(i row)ref_main_idx
    and         x9,x14,#0xff                //(ii)get the last byte

    asr         x14,x14,#8                  //(iii)
    ld1         {v9.8b},[x10]               //(i row)ref_main_idx_1
    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]

    and         x9,x14,#0xff                //(iii)
    sub         v30.8b,  v1.8b ,  v31.8b    //32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]

    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
    umull       v10.8h, v23.8b, v30.8b      //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
    umlal       v10.8h, v9.8b, v31.8b       //(i row)vmlal_u8(ref_main_idx_1, dup_const_fract)
    asr         x14,x14,#8                  //(iv)

    dup         v27.8b, v4.b[2]             //(iii)
    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
    and         x9,x14,#0xff                //(iv)

    dup         v25.8b, v4.b[3]             //(iv)
    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]

    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmlal_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
    rshrn       v10.8b, v10.8h,#5           //(i row)shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)

    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1

    dup         v31.8b, v4.b[4]             //(v)
    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    umov        w14, v5.s[1]                //extract idx to the r register
    sxtw        x14,w14
    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmlal_u8(ref_main_idx_1, dup_const_fract)

    st1         {v10.8b},[x2],#8            //(i row)
    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)

    and         x9,x14,#0xff                //(v)
    dup         v29.8b, v4.b[5]             //(vi)
    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]

    ld1         {v23.8b},[x10],x11          //(v)ref_main_idx
    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)

    asr         x14,x14,#8                  //(vi)
    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         x9,x14,#0xff                //(vi)

    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmlal_u8(ref_main_idx_1, dup_const_fract)

    st1         {v14.8b},[x0],x3            //(ii)
    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
    dup         v27.8b, v4.b[6]             //(vii)
    asr         x14,x14,#8                  //(vii)

    and         x9,x14,#0xff                //(vii)
    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]

    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
    umull       v10.8h, v23.8b, v30.8b      //(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
    umlal       v10.8h, v9.8b, v31.8b       //(v)vmlal_u8(ref_main_idx_1, dup_const_fract)

    st1         {v18.8b},[x0],x3            //(iii)
    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)

    asr         x14,x14,#8                  //(viii)
    dup         v25.8b, v4.b[7]             //(viii)
    and         x9,x14,#0xff                //(viii)

    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)

    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmlal_u8(ref_main_idx_1, dup_const_fract)
    subs        x4,x4,#8                    //columns left in this tile row

    st1         {v22.8b},[x0],x3            //(iv)
    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)

    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         x20,x8,#8                   //step the reference to the next 8 columns
    csel        x8, x20, x8,gt
    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmlal_u8(ref_main_idx_1, dup_const_fract)
    sub         x20,x7,#8
    csel        x7, x20, x7,gt

    st1         {v10.8b},[x0],x3            //(v)
    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

    ld1         {v5.8b},[x6]                //loads the row value
    umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
    xtn         v4.8b,  v4.8h               //narrow the eight fract values to bytes
    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
    umov        w14, v3.s[0]                //(i)extract idx to the r register
    sxtw        x14,w14
    and         x9,x14,#0xff                //(i)
    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]

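//steady state: each iteration retires one 8x8 tile while starting the next.
//the eight packed idx bytes (shrn result in v3) are peeled one stage at a
//time with umov + and/asr, roughly (illustrative c model):
//  packed = idx_bytes; ref = pu1_ref_main + (packed & 0xff); packed >>= 8;
//x8 and x2 step across the columns and are rewound (csel ..., le) when a
//tile row completes.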
kernel_8_rows:
    asr         x14,x14,#8                  //(ii)
    dup         v31.8b, v4.b[0]             //(i)
    subs        x4,x4,#8                    //columns left in this tile row

    ld1         {v23.8b},[x10],x11          //(i)ref_main_idx
    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
    and         x9,x14,#0xff                //(ii)
    add         x20,x6,#8                   //increment the row value
    csel        x6, x20, x6,le

    ld1         {v9.8b},[x10]               //(i)ref_main_idx_1
    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]

    ld1         {v5.8b},[x6]                //loads the row value
    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmlal_u8(ref_main_idx_1, dup_const_fract)
    asr         x14,x14,#8                  //(iii)

    dup         v29.8b, v4.b[1]             //(ii)
    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
    and         x9,x14,#0xff                //(iii)

    st1         {v14.8b},[x0],x3            //(vi)
    sub         v30.8b,  v1.8b ,  v31.8b    //(i)32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]

    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
    umull       v10.8h, v23.8b, v30.8b      //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
    asr         x14,x14,#8                  //(iv)

    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
    umlal       v10.8h, v9.8b, v31.8b       //(i)vmlal_u8(ref_main_idx_1, dup_const_fract)
    and         x9,x14,#0xff                //(iv)

    umov        w14, v3.s[1]                //extract idx to the r register
    sxtw        x14,w14
    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)

    dup         v27.8b, v4.b[2]             //(iii)
    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
    csel        x4, x5, x4,le               //reload nt

    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]

    st1         {v18.8b},[x0],x3            //(vii)
    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmlal_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
    rshrn       v10.8b, v10.8h,#5           //(i)shift_res = vrshrn_n_u16(add_res, 5)

    dup         v25.8b, v4.b[3]             //(iv)
    umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)

    st1         {v22.8b},[x0]               //(viii)
    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)

    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         x0,x2,x3                    //pu1_dst + dst_strd

    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmlal_u8(ref_main_idx_1, dup_const_fract)
    and         x9,x14,#0xff                //(v)

    dup         v31.8b, v4.b[4]             //(v)
    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]

    st1         {v10.8b},[x2],#8            //(i)
    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
    asr         x14,x14,#8                  //(vi)

    dup         v29.8b, v4.b[5]             //(vi)
    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         x9,x14,#0xff                //(vi)

    dup         v27.8b, v4.b[6]             //(vii)
    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmlal_u8(ref_main_idx_1, dup_const_fract)
    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]

    dup         v25.8b, v4.b[7]             //(viii)
    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
    asr         x14,x14,#8                  //(vii)

    ld1         {v23.8b},[x10],x11          //(v)ref_main_idx
    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
    and         x9,x14,#0xff                //(vii)

    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
    asr         x14,x14,#8                  //(viii)

    st1         {v14.8b},[x0],x3            //(ii)
    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]

    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
    and         x9,x14,#0xff                //(viii)

    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
    umull       v10.8h, v23.8b, v30.8b      //(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    umov        w14, v3.s[0]                //(i)extract idx to the r register
    sxtw        x14,w14
    umlal       v10.8h, v9.8b, v31.8b       //(v)vmlal_u8(ref_main_idx_1, dup_const_fract)
    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]

    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)

    st1         {v18.8b},[x0],x3            //(iii)
    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
    csel        x8, x1, x8,le               //rewind the reference to pu1_ref + two_nt + 1

    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmlal_u8(ref_main_idx_1, dup_const_fract)
    add         x20,x8,#8                   //increment the source next set 8 columns in same row
    csel        x8, x20, x8,gt

    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
    lsl         x20, x3,#3                  //8 * dst_strd
    csel        x12,x20,x12,le

    st1         {v22.8b},[x0],x3            //(iv)
    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
    sub         x20,x12,x5                  //8 * dst_strd - nt
    csel        x12, x20, x12,le

    st1         {v10.8b},[x0],x3            //(v)
    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmlal_u8(ref_main_idx_1, dup_const_fract)
    add         x20,x2,x12                  //increment the dst pointer to 8*dst_strd - nt
    csel        x2, x20, x2,le

    xtn         v4.8b,  v4.8h               //narrow the eight fract values to bytes
    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
    and         x9,x14,#0xff                //(i)

    subs        x7,x7,#8                    //decrement the pending 8-pixel unit count
    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows

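//drain the last three pipeline stages (vi, vii, viii) of the final batch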
epilogue:
    st1         {v14.8b},[x0],x3            //(vi)
    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)

    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmlal_u8(ref_main_idx_1, dup_const_fract)

    st1         {v18.8b},[x0],x3            //(vii)
    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)

    st1         {v22.8b},[x0],x3            //(viii)
    b           end_loops

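//nt == 4 path: fract and idx are tracked in general registers row by row.
//for modes 27 to 33 the angle is below 32, so idx advances by at most one
//per row; the advance is detected when the new fract wraps below the
//previous one (fract_prev > fract), and both reference pointers then step
//forward by one.
//
//(hedged) per-row c model, names illustrative:
//  pos = (row + 1) * intra_pred_ang;
//  fract = pos & 31;
//  if(fract_prev > fract)
//      pu1_ref_main_idx++, pu1_ref_main_idx_1++;
//  dst[col] = (pu1_ref_main_idx[col] * (32 - fract)
//            + pu1_ref_main_idx_1[col] * fract + 16) >> 5;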
core_loop_4:
    add         x10,x8,#1                   //pu1_ref_main_idx += (two_nt + 1)
    add         x11,x8,#2                   //pu1_ref_main_idx_1 += (two_nt + 2)
    mov         x8,#0                       //row

    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#1                  //pu1_ref_main_idx += 1
    csel        x10, x20, x10,gt
    add         x11,x10,#1                  //pu1_ref_main_idx_1 += 1
    dup         v0.8b,w5                    //dup_const_fract
    sub         x20,x5,#32
    neg         x4, x20                     //32 - fract
    dup         v1.8b,w4                    //dup_const_32_fract

//inner_loop_4
    ld1         {v2.s}[0],[x10]             //ref_main_idx
    add         x8,x8,#1                    //row++
    mov         x14,x5                      //fract_prev = fract

    ld1         {v3.s}[0],[x11]             //ref_main_idx_1
    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#1                  //pu1_ref_main_idx += 1
    csel        x10, x20, x10,gt
    add         x11,x10,#1                  //pu1_ref_main_idx_1 += 1

    dup         v6.8b,w5                    //dup_const_fract
    umull       v4.8h, v2.8b, v1.8b         //vmull_u8(ref_main_idx, dup_const_32_fract)

    sub         x20,x5,#32
    neg         x4, x20                     //32 - fract
    dup         v7.8b,w4                    //dup_const_32_fract
    umlal       v4.8h, v3.8b, v0.8b         //vmlal_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v23.s}[0],[x10]            //ref_main_idx
    add         x8,x8,#1                    //row++

    ld1         {v9.s}[0],[x11]             //ref_main_idx_1
    rshrn       v4.8b, v4.8h,#5             //shift_res = vrshrn_n_u16(add_res, 5)

    mov         x14,x5                      //fract_prev = fract
    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#1                  //pu1_ref_main_idx += 1
    csel        x10, x20, x10,gt
    add         x11,x10,#1                  //pu1_ref_main_idx_1 += 1

    dup         v12.8b,w5                   //dup_const_fract
    umull       v10.8h, v23.8b, v7.8b       //vmull_u8(ref_main_idx, dup_const_32_fract)

    sub         x20,x5,#32
    neg         x4, x20                     //32 - fract
    dup         v13.8b,w4                   //dup_const_32_fract
    umlal       v10.8h, v9.8b, v6.8b        //vmlal_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v14.s}[0],[x10]            //ref_main_idx
    add         x8,x8,#1                    //row++

    st1         {v4.s}[0],[x2],x3
    rshrn       v10.8b, v10.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v15.s}[0],[x11]            //ref_main_idx_1
    mov         x14,x5                      //fract_prev = fract
    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#1                  //pu1_ref_main_idx += 1
    csel        x10, x20, x10,gt
    add         x11,x10,#1                  //pu1_ref_main_idx_1 += 1

    dup         v18.8b,w5                   //dup_const_fract
    umull       v16.8h, v14.8b, v13.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)

    sub         x20,x5,#32
    neg         x4, x20                     //32 - fract
    dup         v19.8b,w4                   //dup_const_32_fract
    umlal       v16.8h, v15.8b, v12.8b      //vmlal_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v20.s}[0],[x10]            //ref_main_idx

    st1         {v10.s}[0],[x2],x3
    rshrn       v16.8b, v16.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
    ld1         {v21.s}[0],[x11]            //ref_main_idx_1

    umull       v22.8h, v20.8b, v19.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
    umlal       v22.8h, v21.8b, v18.8b      //vmlal_u8(ref_main_idx_1, dup_const_fract)

    st1         {v16.s}[0],[x2],x3
    rshrn       v22.8b, v22.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)

    st1         {v22.s}[0],[x2],x3

end_loops:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d9,d10,[sp],#16
    ret