// ihevc_intra_pred_chroma_mode_27_to_33.s (arm64)
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_mode_27_to_33.s
//*
//* @brief
//*  contains function definition for intra prediction interpolation filters
//*
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_mode_27_to_33()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

///**
//*******************************************************************************
//*
//* @brief
//*  intra prediction for modes 27 to 33 (positive angle, vertical modes) from
//*  the reference neighboring samples pointed to by 'pu1_ref' into the tu
//*  block pointed to by 'pu1_dst'
//*
//* @par description:
//*
//*
//* @param[in] pu1_src
//*  uword8 pointer to the source
//*
//* @param[in] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  integer transform block size
//*
//* @param[in] mode
//*  integer intra prediction mode
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//.if intra_pred_chroma_27_t0_33 == c
//void ihevc_intra_pred_chroma_mode_27_to_33(uword8 *pu1_ref,
//                                           word32 src_strd,
//                                           uword8 *pu1_dst,
//                                           word32 dst_strd,
//                                           word32 nt,
//                                           word32 mode)

     84 .text
     85 .align 4
     86 .include "ihevc_neon_macros.s"
     87 
     88 
     89 .globl ihevc_intra_pred_chroma_mode_27_to_33_av8
     90 .extern gai4_ihevc_ang_table
     91 .extern gau1_ihevc_planar_factor
     92 
     93 .type ihevc_intra_pred_chroma_mode_27_to_33_av8, %function
     94 
     95 ihevc_intra_pred_chroma_mode_27_to_33_av8:
     96 
     97     // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
     98 
     99     stp         d9,d10,[sp,#-16]!
    100     stp         d12,d13,[sp,#-16]!
    101     stp         d14,d15,[sp,#-16]!
    102     stp         x19, x20,[sp,#-16]!
    103 
    104     adrp        x6,  :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
    105     ldr         x6,  [x6, #:got_lo12:gai4_ihevc_ang_table]
    106 
    107     lsl         x7,x4,#2                    //four_nt
    108 
    109     add         x8,x6,x5,lsl #2             //*gai4_ihevc_ang_table[mode]
    110     ldr         w9, [x8]                    //intra_pred_ang = gai4_ihevc_ang_table[mode]
    111     sxtw        x9,w9
    112     adrp        x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
    113     ldr         x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
    114     add         x6,x1,#1
    115 
    116     tst         x4,#7
    117     add         x8,x0,x7                    //pu1_ref + four_nt
    118     mov         x14,#0                      //row
    119     mov         x12,x4
    120     bne         core_loop_4
    121     lsl         x4,x4,#1
    122     b           core_loop_8
    123 
    124 core_loop_8:
    125     add         x8,x8,#2                    //pu1_ref_main_idx += (four_nt + 1)
    126     dup         v0.8b,w9                    //intra_pred_ang
    127     lsr         x12, x4, #4                 //divide by 8
    128 
    129     movi        v1.8b, #32
    130     mul         x7, x4, x12
    131 
    132     movi        v6.8h, #31
    133 
    134     mov         x1,x8
    135     mov         x5,x4
    136     mov         x11,#2
    137 
    138 prologue:
    139     ld1         {v3.8b},[x6]                //loads the row value
    140     umull       v2.8h, v3.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
    141     and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
    142     xtn         v4.8b,  v4.8h
    143     shrn        v5.8b, v2.8h,#5             //idx = pos >> 5
    144 
    145     dup         v31.8b, v4.8b[0]
    146     add         x0,x2,x3
    147 
    148     smov        x14, v5.2s[0]               //(i row)extract idx to the r register
    149     lsl         x14,x14,#1
    150 
    151     dup         v29.8b, v4.8b[1]            //(ii)
    152     and         x9,x14,#0xff                //(i row) get the last byte
    153 
    154     add         x10,x8,x9                   //(i row)*pu1_ref[ref_main_idx]
    155 
    156     asr         x14,x14,#8                  //(ii)shift by 8
    157     ld1         {v23.8b},[x10],x11          //(i row)ref_main_idx
    158     and         x9,x14,#0xff                //(ii)get the last byte
    159 
    160     asr         x14,x14,#8                  //(iii)
    161     ld1         {v9.8b},[x10]               //(i row)ref_main_idx_1
    162     add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]
    163 
    164     and         x9,x14,#0xff                //(iii)
    165     sub         v30.8b,  v1.8b ,  v31.8b    //32-fract(dup_const_32_fract)
    166     add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
    167 
    168     ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
    169     umull       v10.8h, v23.8b, v30.8b      //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)
    170 
    171     ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
    172     umlal       v10.8h, v9.8b, v31.8b       //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
    173     asr         x14,x14,#8                  //(iv)
    174 
    175     dup         v27.8b, v4.8b[2]            //(iii)
    176     sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
    177     and         x9,x14,#0xff                //(iv)
    178 
    179     dup         v25.8b, v4.8b[3]            //(iv)
    180     umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    181     add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]
    182 
    183     ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
    184     umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
    185 
    186     ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
    187     rshrn       v10.8b, v10.8h,#5           //(i row)shift_res = vrshrn_n_u16(add_res, 5)
    188 
    189     ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
    190     sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)
    191 
    192     ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
    193 
    194     dup         v31.8b, v4.8b[4]            //(v)
    195     umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
    196 
    197     smov        x14, v5.2s[1]               //extract idx to the r register
    198     umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    199     lsl         x14,x14,#1
    200 
    201     st1         {v10.8b},[x2],#8            //(i row)
    202     rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
    203 
    204     and         x9,x14,#0xff                //(v)
    205     dup         v29.8b, v4.8b[5]            //(vi)
    206     add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
    207 
    208     ld1         {v23.8b},[x10],x11          //(v)ref_main_idx
    209     sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
    210 
    211     asr         x14,x14,#8                  //(vi)
    212     umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    213     and         x9,x14,#0xff                //(vi)
    214 
    215     ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
    216     umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
    217 
    218     st1         {v14.8b},[x0],x3            //(ii)
    219     rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
    220 
    221     add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
    222     dup         v27.8b, v4.8b[6]            //(vii)
    223     asr         x14,x14,#8                  //(vii)
    224 
    225     and         x9,x14,#0xff                //(vii)
    226     sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
    227     add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
    228 
    229     ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
    230     umull       v10.8h, v23.8b, v30.8b      //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
    231 
    232     ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
    233     umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
    234 
    235     st1         {v18.8b},[x0],x3            //(iii)
    236     rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
    237 
    238     asr         x14,x14,#8                  //(viii)
    239     dup         v25.8b, v4.8b[7]            //(viii)
    240     and         x9,x14,#0xff                //(viii)
    241 
    242     ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
    243     sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)
    244 
    245     ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
    246     umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
    247 
    248     add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
    249     umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    250     subs        x7,x7,#8
    251 
    252     st1         {v22.8b},[x0],x3            //(iv)
    253     rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)
    254 
    255     ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
    256     sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
    257 
    258     ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
    259     umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
    260 
    261     add         x20,x8,#8
    262     csel        x8, x20, x8,gt
    263     umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    264     sub         x20,x4,#8
    265     csel        x4, x20, x4,gt
    266 
    267     st1         {v10.8b},[x0],x3            //(v)
    268     rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
    269 
    270     beq         epilogue
    271 
    272     ld1         {v5.8b},[x6]                //loads the row value
    273     umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
    274     and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
    275     xtn         v4.8b,  v4.8h
    276     shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
    277     smov        x14, v3.2s[0]               //(i)extract idx to the r register
    278     lsl         x14,x14,#1
    279     and         x9,x14,#0xff                //(i)
    280     add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
    281 
    282 kernel_8_rows:
    283     asr         x14,x14,#8                  //(ii)
    284     dup         v31.8b, v4.8b[0]
    285     subs        x4,x4,#8
    286 
    287     ld1         {v23.8b},[x10],x11          //(i)ref_main_idx
    288     sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
    289     and         x9,x14,#0xff                //(ii)
    290     add         x20,x6,#8                   //increment the row value
    291     csel        x6, x20, x6,le
    292 
    293     ld1         {v9.8b},[x10]               //(i)ref_main_idx_1
    294     umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    295     add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]
    296 
    297     ld1         {v5.8b},[x6]                //loads the row value
    298     umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
    299     asr         x14,x14,#8                  //(iii)
    300 
    301     dup         v29.8b, v4.8b[1]            //(ii)
    302     rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
    303     and         x9,x14,#0xff                //(iii)
    304 
    305     st1         {v14.8b},[x0],x3            //(vi)
    306     sub         v30.8b,  v1.8b ,  v31.8b    //(i)32-fract(dup_const_32_fract)
    307     add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]
    308 
    309     ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
    310     umull       v10.8h, v23.8b, v30.8b      //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
    311     asr         x14,x14,#8                  //(iv)
    312 
    313     ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
    314     umlal       v10.8h, v9.8b, v31.8b       //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
    315     and         x9,x14,#0xff                //(iv)
    316 
    317     smov        x14, v3.2s[1]               //extract idx to the r register
    318     rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)
    319 
    320     dup         v27.8b, v4.8b[2]            //(iii)
    321     sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
    322     csel        x4, x5, x4,le               //reload nt
    323 
    324     ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
    325     umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    326     add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]
    327 
    328     st1         {v18.8b},[x0],x3            //(vii)
    329     umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)
    330 
    331     ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
    332     rshrn       v10.8b, v10.8h,#5           //(i)shift_res = vrshrn_n_u16(add_res, 5)
    333 
    334     dup         v25.8b, v4.8b[3]            //(iv)
    335     umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
    336 
    337     st1         {v22.8b},[x0]               //(viii)
    338     sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)
    339 
    340     ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
    341     umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
    342     lsl         x14,x14,#1
    343 
    344     ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
    345     umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    346     add         x0,x2,x3
    347 
    348     dup         v31.8b, v4.8b[4]            //(v)
    349     rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
    350     and         x9,x14,#0xff                //(v)
    351 
    352     st1         {v10.8b},[x2],#8            //(i)
    353     sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
    354     add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]
    355 
    356     dup         v29.8b, v4.8b[5]            //(vi)
    357     umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    358     asr         x14,x14,#8                  //(vi)
    359 
    360     dup         v27.8b, v4.8b[6]            //(vii)
    361     umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
    362     and         x9,x14,#0xff                //(vi)
    363 
    364     dup         v25.8b, v4.8b[7]            //(viii)
    365     rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
    366     add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
    367 
    368     ld1         {v23.8b},[x10],x11          //(v)ref_main_idx
    369     and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
    370     asr         x14,x14,#8                  //(vii)
    371 
    372     ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
    373     shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
    374     and         x9,x14,#0xff                //(vii)
    375 
    376     st1         {v14.8b},[x0],x3            //(ii)
    377     rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
    378     asr         x14,x14,#8                  //(viii)
    379 
    380     ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
    381     sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
    382     add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]
    383 
    384     ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
    385     umull       v10.8h, v23.8b, v30.8b      //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
    386     and         x9,x14,#0xff                //(viii)
    387 
    388     smov        x14, v3.2s[0]               //(i)extract idx to the r register
    389     umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
    390     add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
    391 
    392     ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
    393     sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)
    394 
    395     st1         {v18.8b},[x0],x3            //(iii)
    396     umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
    397     csel        x8, x1, x8,le               //reload the source to pu1_src+2nt
    398 
    399     ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
    400     umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    401     add         x20,x8,#8                   //increment the source next set 8 columns in same row
    402     csel        x8, x20, x8,gt
    403 
    404     ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
    405     rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)
    406 
    407     ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
    408     sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
    409     lsl         x20, x3,#3
    410     csel        x12,x20,x12,le
    411 
    412     st1         {v22.8b},[x0],x3            //(iv)
    413     umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
    414     sub         x20,x12,x5
    415     csel        x12, x20, x12,le
    416 
    417     st1         {v10.8b},[x0],x3            //(v)
    418     umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    419     add         x20,x2,x12                  //increment the dst pointer to 8*dst_strd - nt
    420     csel        x2, x20, x2,le
    421 
    422     xtn         v4.8b,  v4.8h
    423     rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
    424     lsl         x14,x14,#1
    425 
    426     and         x9,x14,#0xff                //(i)
    427     subs        x7,x7,#8
    428     add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
    429 
    430     bne         kernel_8_rows
    431 
    432 epilogue:
    433     st1         {v14.8b},[x0],x3            //(vi)
    434     rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
    435 
    436     sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
    437     umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    438     umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
    439 
    440     st1         {v18.8b},[x0],x3            //(vii)
    441     rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)
    442 
    443     st1         {v22.8b},[x0],x3            //(viii)
    444     b           end_loops
    445 
    446 core_loop_4:
    447     add         x10,x8,#2                   //pu1_ref_main_idx += (four_nt + 1)
    448     add         x11,x8,#4                   //pu1_ref_main_idx_1 += (four_nt + 2)
    449     mov         x8,#0
    450 
    451     add         x5,x8,#1                    //row + 1
    452     mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    453     and         x5,x5,#31                   //fract = pos & (31)
    454     cmp         x14,x5                      //if(fract_prev > fract)
    455     add         x20,x10,#2                  //pu1_ref_main_idx += 2
    456     csel        x10, x20, x10,gt
    457     add         x11,x10,#2                  //pu1_ref_main_idx_1 += 2
    458     dup         v0.8b,w5                    //dup_const_fract
    459     sub         x20,x5,#32
    460     neg         x4, x20
    461     dup         v1.8b,w4                    //dup_const_32_fract
    462 
    463 //inner_loop_4
    464     ld1         {v2.8b},[x10]               //ref_main_idx
    465     add         x8,x8,#1
    466     mov         x14,x5                      //fract_prev = fract
    467 
    468     ld1         {v3.8b},[x11]               //ref_main_idx_1
    469     add         x5,x8,#1                    //row + 1
    470     mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    471     and         x5,x5,#31                   //fract = pos & (31)
    472     cmp         x14,x5                      //if(fract_prev > fract)
    473     add         x20,x10,#2                  //pu1_ref_main_idx += 1
    474     csel        x10, x20, x10,gt
    475     add         x11,x10,#2                  //pu1_ref_main_idx_1 += 1
    476 
    477     dup         v6.8b,w5                    //dup_const_fract
    478     umull       v4.8h, v2.8b, v1.8b         //vmull_u8(ref_main_idx, dup_const_32_fract)
    479 
    480     sub         x20,x5,#32
    481     neg         x4, x20
    482     dup         v7.8b,w4                    //dup_const_32_fract
    483     umlal       v4.8h, v3.8b, v0.8b         //vmull_u8(ref_main_idx_1, dup_const_fract)
    484 
    485     ld1         {v23.8b},[x10]              //ref_main_idx
    486     add         x8,x8,#1
    487 
    488     ld1         {v9.8b},[x11]               //ref_main_idx_1
    489     rshrn       v4.8b, v4.8h,#5             //shift_res = vrshrn_n_u16(add_res, 5)
    490 
    491     mov         x14,x5                      //fract_prev = fract
    492     add         x5,x8,#1                    //row + 1
    493     mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    494     and         x5,x5,#31                   //fract = pos & (31)
    495     cmp         x14,x5                      //if(fract_prev > fract)
    496     add         x20,x10,#2                  //pu1_ref_main_idx += 1
    497     csel        x10, x20, x10,gt
    498     add         x11,x10,#2                  //pu1_ref_main_idx_1 += 1
    499 
    500     dup         v12.8b,w5                   //dup_const_fract
    501     umull       v10.8h, v23.8b, v7.8b       //vmull_u8(ref_main_idx, dup_const_32_fract)
    502 
    503     sub         x20,x5,#32
    504     neg         x4, x20
    505     dup         v13.8b,w4                   //dup_const_32_fract
    506     umlal       v10.8h, v9.8b, v6.8b        //vmull_u8(ref_main_idx_1, dup_const_fract)
    507 
    508     ld1         {v14.8b},[x10]              //ref_main_idx
    509     add         x8,x8,#1
    510 
    511     st1         {v4.8b},[x2],x3
    512     rshrn       v10.8b, v10.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
    513 
    514     ld1         {v15.8b},[x11]              //ref_main_idx_1
    515     mov         x14,x5                      //fract_prev = fract
    516     add         x5,x8,#1                    //row + 1
    517     mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    518     and         x5,x5,#31                   //fract = pos & (31)
    519     cmp         x14,x5                      //if(fract_prev > fract)
    520     add         x20,x10,#2                  //pu1_ref_main_idx += 1
    521     csel        x10, x20, x10,gt
    522     add         x11,x10,#2                  //pu1_ref_main_idx_1 += 1
    523 
    524     dup         v18.8b,w5                   //dup_const_fract
    525     umull       v16.8h, v14.8b, v13.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
    526 
    527     sub         x20,x5,#32
    528     neg         x4, x20
    529     dup         v19.8b,w4                   //dup_const_32_fract
    530     umlal       v16.8h, v15.8b, v12.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)
    531 
    532     ld1         {v20.8b},[x10]              //ref_main_idx
    533 
    534     st1         {v10.8b},[x2],x3
    535     rshrn       v16.8b, v16.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
    536     ld1         {v21.8b},[x11]              //ref_main_idx_1
    537 
    538     umull       v22.8h, v20.8b, v19.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
    539     umlal       v22.8h, v21.8b, v18.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)
    540 
    541     st1         {v16.8b},[x2],x3
    542     rshrn       v22.8b, v22.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
    543 
    544     st1         {v22.8b},[x2],x3
    545 
    546 end_loops:
    547     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    548     ldp         x19, x20,[sp],#16
    549     ldp         d14,d15,[sp],#16
    550     ldp         d12,d13,[sp],#16
    551     ldp         d9,d10,[sp],#16
    552     ret
    553 
    554 
    555 
    556 
    557