Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_chroma_mode_11_to_17.s
     22 //*
     23 //* @brief
     24 //*  contains function definitions for intra prediction chroma mode 11 to 17
     25 //* functions are coded using neon  intrinsics and can be compiled using
     26 
     27 //* rvct
     28 //*
     29 //* @author
     30 //*  akshaya mukund
     31 //*
     32 //* @par list of functions:
     33 //*
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* @brief
     44 //*    chroma intra prediction filter for angular modes 11 to 17
     45 //*
     46 //* @par description:
     47 //*
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
     60 //* @param[in] nt
     61 //*  size of transform block
     62 //*
     63 //* @param[in] mode
     64 //*  type of filtering
     65 //*
     66 //* @returns
     67 //*
     68 //* @remarks
     69 //*  none
     70 //*
     71 //*******************************************************************************
     72 //*/
     73 
     74 //void ihevc_intra_pred_chroma_mode_11_to_17(uword8* pu1_ref,
     75 //                               word32 src_strd,
     76 //                               uword8* pu1_dst,
     77 //                               word32 dst_strd,
     78 //                               word32 nt,
     79 //                               word32 mode)
     80 //
     81 //**************variables vs registers*****************************************
     82 //x0 => *pu1_ref
     83 //x1 => src_strd
     84 //x2 => *pu1_dst
     85 //x3 => dst_strd
     86 
     87 //arguments 5 and 6 are passed in registers on arm64 (not on the stack)
     88 //x4 => nt
     89 //x5 => mode
     90 
     91 .text
     92 .align 4
     93 .include "ihevc_neon_macros.s"
     94 
     95 
     96 
     97 .globl ihevc_intra_pred_chroma_mode_11_to_17_av8
     98 .extern gai4_ihevc_ang_table
     99 .extern gai4_ihevc_inv_ang_table
    100 .extern col_for_intra_chroma
    101 .extern idx_neg_idx_chroma_11_17
    102 
    103 .type ihevc_intra_pred_chroma_mode_11_to_17_av8, %function
    104 
    105 ihevc_intra_pred_chroma_mode_11_to_17_av8:
    106 
    107     // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
    108 
    109     stp         d12,d13,[sp,#-16]!
    110     stp         d14,d15,[sp,#-16]!
    111     stp         x19, x20,[sp,#-16]!
    112 
    113     adrp        x7,  :got:gai4_ihevc_ang_table
    114     ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]
    115 
    116     adrp        x8,  :got:gai4_ihevc_inv_ang_table
    117     ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
    118 
    119     add         x7, x7, x5, lsl #2          //gai4_ihevc_ang_table[mode]
    120     add         x8, x8, x5, lsl #2          //gai4_ihevc_inv_ang_table[mode - 11]
    121     sub         x8, x8, #44
    122 
    123     ldr         w7,  [x7]                   //intra_pred_ang
    124     sxtw        x7,w7
    125     sub         sp, sp, #132                //ref_temp[2 * max_cu_size + 2]
    126 
    127     ldr         w8,  [x8]                   //inv_ang
    128     sxtw        x8,w8
    129     add         x6, sp, x4, lsl #1          //ref_temp + 2 * nt
    130 
    131     mul         x9, x4, x7                  //nt*intra_pred_ang
    132 
    133     sub         x6, x6, #2                  //ref_temp + 2*nt - 2
    134 
    135     add         x1, x0, x4, lsl #2          //x1 = &src[4nt]
    136     dup         v30.8b,w7                   //intra_pred_ang
    137 
    138     mov         x7, x4
    139 
    140     sub         x1,x1,#6                    //address calculation for copying 4 halfwords
    141 
    142     asr         x9, x9, #5
    143 
    144     ld1         {v0.8b},[x1]
    145     rev64       v0.4h,  v0.4h
    146     st1         {v0.8b},[x6],#8
    147 
    148     sub         x1, x1,#8
    149 
    150     subs        x7, x7, #4
    151     add         x20, x1,#8
    152     csel        x1, x20, x1,eq
    153     beq         end_loop_copy
    154     subs        x7,x7,#4
    155     beq         loop_copy_8
    156     subs        x7,x7,#8
    157     beq         loop_copy_16
    158 
    159 loop_copy_32:
    160     sub         x1, x1,#24
    161     ld1         {v0.16b, v1.16b},[x1]
    162 
    163     sub         x1, x1,#24
    164     ld1         {v0.16b, v1.16b},[x1],#32
    165 
    166     rev64       v6.4h,  v6.4h
    167     rev64       v5.4h,  v5.4h
    168     rev64       v4.4h,  v4.4h
    169     rev64       v3.4h,  v3.4h
    170     rev64       v2.4h,  v2.4h
    171     rev64       v1.4h,  v1.4h
    172     rev64       v0.4h,  v0.4h
    173 
    174     st1         {v6.8b},[x6],#8
    175     st1         {v5.8b},[x6],#8
    176     st1         {v4.8b},[x6],#8
    177     st1         {v3.8b},[x6],#8
    178     st1         {v2.8b},[x6],#8
    179     st1         {v1.8b},[x6],#8
    180     st1         {v0.8b},[x6],#8
    181 
    182     ld1         {v4.8b, v5.8b, v6.8b},[x1],#24
    183     b           end_loop_copy
    184 
    185 loop_copy_16:
    186     sub         x1, x1,#16
    187     ld1         {v0.8b, v1.8b, v2.8b},[x1]
    188 
    189     rev64       v2.4h,  v2.4h
    190     rev64       v1.4h,  v1.4h
    191     rev64       v0.4h,  v0.4h
    192 
    193     st1         {v2.8b},[x6],#8
    194     st1         {v1.8b},[x6],#8
    195     st1         {v0.8b},[x6],#8
    196 
    197     b           end_loop_copy
    198 loop_copy_8:
    199     ld1         {v0.8b},[x1]
    200     rev64       v0.4h,  v0.4h
    201     st1         {v0.8b},[x6],#8
    202 end_loop_copy:
    203     sub         x1, x1,#2
    204 
    205     ldrh        w11, [x1], #-2
    206     sxtw        x11,w11
    207     strh        w11, [x6], #2
    208     sxtw        x11,w11
    209 
    210     cmn         x9, #1
    211     bge         prologue_8_16_32
    212 
    213     add         x6, sp, x4, lsl #1          //ref_temp + 2 * nt
    214     sub         x6, x6, #4                  //ref_temp + 2 * nt - 2 - 2
    215 
    216     mov         x12, #-1
    217 
    218     sub         x20, x9, x12                //count to take care off ref_idx
    219     neg         x9, x20
    220 
    221     add         x1, x0, x4, lsl #2          //x1 = &src[4nt]
    222 
    223     mov         x7, #128                    //inv_ang_sum
    224 
    225 loop_copy_ref_idx:
    226 
    227     add         x7, x7, x8                  //inv_ang_sum += inv_ang
    228 
    229     lsr         x0, x7, #8
    230     lsl         x0, x0, #1
    231 
    232     ldrh        w11, [x1, x0]
    233     sxtw        x11,w11
    234     strh        w11, [x6], #-2
    235     sxtw        x11,w11
    236 
    237     subs        x9, x9, #1
    238 
    239     bne         loop_copy_ref_idx
    240 
    241 prologue_8_16_32:
    242 
    243     adrp        x14,  :got:col_for_intra_chroma
    244     ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
    245 
    246     lsr         x10, x4, #3
    247     ld1         {v31.8b},[x14],#8
    248     mul         x10, x4, x10                //block counter (dec by #8)
    249 
    250     lsl         x11, x4, #1                 //col counter to be inc/dec by #8
    251     smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    252 
    253     sub         x7, x5, #11
    254 
    255     adrp        x12, :got:idx_neg_idx_chroma_11_17 //load least idx table
    256     ldr         x12, [x12, #:got_lo12:idx_neg_idx_chroma_11_17]
    257 
    258     add         x12, x12, x7, lsl #4
    259     mov         x8, x12
    260 
    261     mov         x7, #8
    262     sub         x7, x7, x3, lsl #3          //x7 = 8-8x3
    263 
    264     ldr         w9,  [x8]
    265     sxtw        x9,w9
    266     lsl         x9, x9, #1
    267     add         x1, sp, x4, lsl #1          //ref_temp + 2nt
    268 
    269     xtn         v6.8b,  v22.8h
    270     dup         v26.8b,w9                   //least idx added to final idx values
    271     sub         x1, x1, #2                  //ref_temp + 2nt - 2
    272 
    273     add         x6, x1, x9
    274 
    275     ld1         {v0.16b, v1.16b}, [x6]      //stores the 32 values reqd based on indices values (from least idx)
    276     sshr        v22.8h, v22.8h,#5
    277 
    278 //    mov        x0, #31
    279     movi        v29.8b, #31                 //contains #31 for vand operation
    280 
    281 //    mov        x0, #32
    282     movi        v28.8b, #32
    283 
    284     sqxtn       v19.8b,  v22.8h
    285     shl         v19.8b, v19.8b,#1           // 2 * idx
    286 
    287     and         v6.8b,  v6.8b ,  v29.8b     //fract values in d1/ idx values in d0
    288 
    289 //    mov        x0, #2
    290     movi        v29.8b, #2                  //contains #2 for adding to get ref_main_idx + 1
    291 
    292     mov         x0,#0x100                   // idx value for v is +1 of u
    293     dup         v27.4h,w0
    294     add         v27.8b,  v27.8b ,  v29.8b
    295     mov         x0,#0
    296 
    297     add         v19.8b,  v19.8b ,  v27.8b   //ref_main_idx (add row)
    298     sub         v19.8b,  v19.8b ,  v26.8b   //ref_main_idx (row 0)
    299     add         v21.8b,  v19.8b ,  v29.8b   //ref_main_idx + 1 (row 0)
    300     tbl         v12.8b, {  v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0)
    301     sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
    302 
    303     tbl         v13.8b, {  v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0)
    304     add         v4.8b,  v19.8b ,  v29.8b    //ref_main_idx (row 1)
    305     add         v5.8b,  v21.8b ,  v29.8b    //ref_main_idx + 1 (row 1)
    306 
    307 //    mov        x0, #4                @ 2 *(row * 2 )
    308     movi        v29.8b, #4
    309 
    310     tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
    311     umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    312     umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
    313 
    314     tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
    315     add         v19.8b,  v19.8b ,  v29.8b   //ref_main_idx (row 2)
    316     add         v21.8b,  v21.8b ,  v29.8b   //ref_main_idx + 1 (row 2)
    317 
    318     rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
    319 
    320     tbl         v14.8b, {  v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2)
    321     umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    322     umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
    323 
    324     tbl         v15.8b, {  v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2)
    325     add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 3)
    326     add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 3)
    327 
    328     st1         {v24.8b},[x2], x3           //st (row 0)
    329     rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
    330 
    331     tbl         v23.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
    332     umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    333     umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
    334 
    335     tbl         v25.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
    336     add         v19.8b,  v19.8b ,  v29.8b   //ref_main_idx (row 4)
    337     add         v21.8b,  v21.8b ,  v29.8b   //ref_main_idx + 1 (row 4)
    338 
    339     st1         {v22.8b},[x2], x3           //st (row 1)
    340     rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
    341 
    342     tbl         v12.8b, {  v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4)
    343     umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    344     umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)
    345 
    346     tbl         v13.8b, {  v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4)
    347     add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 5)
    348     add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 5)
    349 
    350     st1         {v20.8b},[x2], x3           //st (row 2)
    351     rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
    352 
    353     tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
    354     umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    355     umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
    356 
    357     tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
    358     add         v19.8b,  v19.8b ,  v29.8b   //ref_main_idx (row 6)
    359     add         v21.8b,  v21.8b ,  v29.8b   //ref_main_idx + 1 (row 6)
    360 
    361     st1         {v18.8b},[x2], x3           //st (row 3)
    362     cmp         x4,#4
    363     beq         end_func
    364     rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
    365 
    366     tbl         v14.8b, {  v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6)
    367     umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    368     umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
    369 
    370     tbl         v15.8b, {  v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6)
    371     add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 7)
    372     add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 7)
    373 
    374     st1         {v24.8b},[x2], x3           //st (row 4)
    375     rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
    376 
    377     tbl         v23.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
    378     umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    379     umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
    380 
    381     tbl         v25.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
    382     umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    383     umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)
    384 
    385     st1         {v22.8b},[x2], x3           //st (row 5)
    386     rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
    387     rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
    388 
    389     st1         {v20.8b},[x2], x3           //st (row 6)
    390 
    391     subs        x10, x10, #4                //subtract 8 and go to end if 8x8
    392 
    393     st1         {v18.8b},[x2], x3           //st (row 7)
    394 
    395     beq         end_func
    396 
    397     subs        x11, x11, #8
    398     add         x20, x8, #4
    399     csel        x8, x20, x8,gt
    400     add         x20, x2, x7
    401     csel        x2, x20, x2,gt
    402     csel        x8, x12, x8,le
    403     sub         x20, x2, x4
    404     csel        x2, x20, x2,le
    405     add         x20, x2, #8
    406     csel        x2, x20, x2,le
    407     lsl         x20, x4,  #1
    408     csel        x11,x20,x11,le
    409     bgt         lbl400
    410     adrp        x14,  :got:col_for_intra_chroma
    411     ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
    412 lbl400:
    413     add         x20, x0, #8
    414     csel        x0, x20, x0,le
    415 
    416     ld1         {v31.8b},[x14],#8
    417     smull       v12.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    418     xtn         v23.8b,  v12.8h
    419     sshr        v12.8h, v12.8h,#5
    420     sqxtn       v25.8b,  v12.8h
    421     shl         v25.8b, v25.8b,#1
    422     orr         x5,x0,x0, lsl#8
    423     add         x5, x5,#0x002
    424     add         x5, x5,#0x300
    425     dup         v27.4h,w5                   //row value inc or reset accordingly
    426     ldr         w9,  [x8]
    427     sxtw        x9,w9
    428     lsl         x9, x9, #1
    429     add         x9, x9, x0, lsl #1
    430 //    sub        x9, x9, #1
    431     dup         v26.8b,w9
    432     add         v19.8b,  v27.8b ,  v25.8b   //ref_main_idx (add row)
    433     mov         x5,x2
    434 
    435 //    sub        x4,x4,#8
    436 
    437 kernel_8_16_32:
    438     movi        v29.8b, #2                  //contains #2 for adding to get ref_main_idx + 1
    439 
    440     sub         v19.8b,  v19.8b ,  v26.8b   //ref_main_idx
    441     mov         v26.8b, v23.8b
    442 
    443     subs        x11, x11, #8
    444     add         x6, x1, x9
    445     tbl         v23.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
    446     add         v21.8b,  v29.8b ,  v19.8b   //ref_main_idx + 1
    447 
    448     umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    449     tbl         v25.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
    450     umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
    451 
    452     add         x20, x0, #8
    453     csel        x0, x20, x0,le
    454     add         x20, x8, #4
    455     csel        x8, x20, x8,gt
    456     ld1         {v0.16b, v1.16b}, [x6]      //stores the 32 values reqd based on indices values (from least idx)
    457 
    458     st1         {v24.8b},[x5], x3           //st (row 4)
    459     rshrn       v24.8b, v22.8h,#5           //round shft (row 5)
    460 
    461     csel        x8, x12, x8,le
    462     orr         x9,x0,x0, lsl#8
    463     lsl         x9, x9, #1
    464     add         x9, x9,#0x002
    465     add         x9, x9,#0x300
    466     dup         v27.4h,w9                   //row value inc or reset accordingly
    467 
    468     bgt         lbl452
    469     adrp        x14,  :got:col_for_intra_chroma
    470     ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
    471 lbl452:
    472 
    473     add         v4.8b,  v29.8b ,  v19.8b    //ref_main_idx (row 1)
    474     tbl         v12.8b, {  v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0)
    475     add         v5.8b,  v29.8b ,  v21.8b    //ref_main_idx + 1 (row 1)
    476 
    477     movi        v29.8b, #31                 //contains #2 for adding to get ref_main_idx + 1
    478 
    479     umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    480     tbl         v13.8b, {  v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0)
    481     umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)
    482 
    483     ld1         {v31.8b},[x14],#8
    484     and         v6.8b,  v29.8b ,  v26.8b    //fract values in d1/ idx values in d0
    485 
    486     movi        v29.8b, #4                  //contains #2 for adding to get ref_main_idx + 1
    487 
    488     st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
    489     rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
    490 
    491     add         v19.8b,  v29.8b ,  v19.8b   //ref_main_idx (row 2)
    492     tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
    493     add         v21.8b,  v29.8b ,  v21.8b   //ref_main_idx + 1 (row 2)
    494 
    495     lsl         x20, x4,  #1
    496     csel        x11,x20,x11,le
    497     ldr         w9,  [x8]
    498     sxtw        x9,w9
    499     lsl         x9, x9, #1
    500     sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
    501 
    502     umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    503     tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
    504     umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
    505 
    506     st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    507     rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
    508 
    509     add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 3)
    510     tbl         v14.8b, {  v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2)
    511     add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 3)
    512 
    513     umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    514     tbl         v15.8b, {  v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2)
    515     umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
    516 
    517     rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
    518     st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)
    519 
    520     add         v19.8b,  v19.8b ,  v29.8b   //ref_main_idx (row 4)
    521     tbl         v23.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
    522     add         v21.8b,  v21.8b ,  v29.8b   //ref_main_idx + 1 (row 4)
    523 
    524     umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    525     tbl         v25.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
    526     umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
    527 
    528     smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    529     add         x5,x2,x3,lsl#2
    530     add         x9, x9, x0, lsl #1
    531 
    532 
    533     st1         {v24.8b},[x2], x3           //st (row 0)
    534     rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
    535 
    536     add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 5)
    537     tbl         v12.8b, {  v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4)
    538     add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 5)
    539 
    540     umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    541     tbl         v13.8b, {  v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4)
    542     umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)
    543 
    544     st1         {v22.8b},[x2], x3           //st (row 1)
    545     rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
    546 
    547     xtn         v23.8b,  v14.8h
    548     sshr        v14.8h, v14.8h,#5
    549 
    550     add         v19.8b,  v19.8b ,  v29.8b   //ref_main_idx (row 6)
    551     tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
    552     add         v21.8b,  v21.8b ,  v29.8b   //ref_main_idx + 1 (row 6)
    553 
    554     umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    555     tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
    556     umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
    557 
    558     st1         {v20.8b},[x2], x3           //st (row 2)
    559     rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
    560 
    561 //    sub        x9, x9, #1
    562     sqxtn       v25.8b,  v14.8h
    563 
    564     add         v4.8b,  v4.8b ,  v29.8b     //ref_main_idx (row 7)
    565     tbl         v14.8b, {  v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6)
    566     add         v5.8b,  v5.8b ,  v29.8b     //ref_main_idx + 1 (row 7)
    567 
    568     shl         v25.8b, v25.8b,#1
    569 
    570     umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    571     tbl         v15.8b, {  v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6)
    572     umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
    573 
    574     add         v19.8b,  v27.8b ,  v25.8b   //ref_main_idx (add row)
    575     dup         v26.8b,w9
    576 
    577     st1         {v18.8b},[x2], x3           //st (row 3)
    578     rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
    579 
    580 
    581     add         x2, x2, x3, lsl #2
    582     add         x20, x7, x2
    583     csel        x2, x20, x2,gt
    584     sub         x20, x2, x4, lsl #1
    585     csel        x2, x20, x2,le
    586     add         x20,x2,#8
    587     csel        x2, x20, x2,le
    588 
    589     subs        x10, x10, #4                //subtract 8 and go to end if 8x8
    590 
    591     bne         kernel_8_16_32
    592 epil_8_16_32:
    593 
    594     tbl         v23.8b, {  v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
    595 
    596     umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    597     tbl         v25.8b, {  v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
    598     umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
    599 
    600     st1         {v24.8b},[x5], x3           //st (row 4)
    601     rshrn       v24.8b, v22.8h,#5           //round shft (row 5)
    602 
    603     umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    604     umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)
    605 
    606     st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
    607     rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
    608 
    609     st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    610     rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
    611 
    612     st1         {v18.8b},[x5], x3           //st (row 7)
    613 
    614 end_func:
    615     add         sp, sp, #132
    616     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    617     ldp         x19, x20,[sp],#16
    618     ldp         d14,d15,[sp],#16
    619     ldp         d12,d13,[sp],#16
    620     ret
    621 
    622 
    623 
    624 
    625 
    626 
    627