///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_mode_11_to_17.s
//*
//* @brief
//*  contains the function definition for luma intra prediction, angular
//*  modes 11 to 17. the function is hand-written aarch64 neon assembly in
//*  gnu assembler syntax.
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_mode_11_to_17_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
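///**
//*******************************************************************************
//*
//* @brief
//*    luma intra prediction for angular modes 11 to 17
//*
//* @par description:
//*    copies the reference samples pu1_ref[2nt] down to pu1_ref[nt] in reverse
//*    order into a local array (ref_temp), extends that array towards negative
//*    indices with samples selected through the inverse angle when the angle
//*    requires it, and then writes each predicted sample as
//*    ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source (reference samples)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this implementation)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block (code paths exist for 4, 8, 16 and 32)
//*
//* @param[in] mode
//*  intra prediction mode, used to index the angle tables; the table
//*  indexing subtracts 11, so values 11 to 17 are expected
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
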
//void ihevc_intra_pred_luma_mode_11_to_17(uword8* pu1_ref,
//                               word32 src_strd,
//                               uword8* pu1_dst,
//                               word32 dst_strd,
//                               word32 nt,
//                               word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode

     // Assembler set-up: everything below is emitted into the code section,
     // aligned to 2^4 = 16 bytes.
.text
.p2align 4

// Project include; going by its name it provides NEON helper macros
// (NOTE(review): contents not visible from this file - confirm).
.include "ihevc_neon_macros.s"

// Data symbols declared external: they are defined outside this file and
// reached through the GOT by the function below.
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_luma
.extern idx_neg_idx_11_17

// Exported entry point, marked as a function symbol.
.globl ihevc_intra_pred_luma_mode_11_to_17_av8
.type ihevc_intra_pred_luma_mode_11_to_17_av8, %function

    //-------------------------------------------------------------------------
    // void ihevc_intra_pred_luma_mode_11_to_17_av8(uword8 *pu1_ref,
    //                                              word32  src_strd,
    //                                              uword8 *pu1_dst,
    //                                              word32  dst_strd,
    //                                              word32  nt,
    //                                              word32  mode)
    //
    // Target: AArch64 + NEON, GNU as syntax, AAPCS64 calling convention.
    //
    // In:    x0 = pu1_ref    reference samples
    //        w1 = src_strd   never read (x1 is reused as a pointer)
    //        x2 = pu1_dst    destination block
    //        w3 = dst_strd   destination stride in bytes
    //        w4 = nt         block size; code paths exist for 4, 8, 16, 32
    //        w5 = mode       indexes the angle tables; the inverse-angle and
    //                        least-idx tables are indexed with (mode - 11)
    // Out:   nt x nt predicted bytes written at pu1_dst (row pitch dst_strd).
    // Clobb: x0-x12, x14, v0-v7, v16-v31, flags.
    //        x19/x20 and d12-d15 are saved on entry and restored on exit.
    // Stack: 48 bytes of register saves + REF_TEMP_BYTES of scratch
    //        (ref_temp, the locally rebuilt reference array).
    //
    // Outline:
    //   1. ref_temp[nt-1+k] = pu1_ref[2nt-k] for k = 0..nt (reversed copy).
    //   2. If (nt * intra_pred_ang) >> 5 is below -1, fill ref_temp below
    //      index nt-1 with pu1_ref[2nt + ((128 + i*inv_ang) >> 8)].
    //   3. For every output sample:
    //        dst = ((32 - fract) * ref[idx] + fract * ref[idx+1] + 16) >> 5
    //      where the two reference bytes are fetched with tbl from a 16-byte
    //      window of ref_temp. nt >= 8 is processed as 8x8 tiles by a
    //      software-pipelined loop, nt == 4 by a straight-line path.
    //-------------------------------------------------------------------------

    // Size of the ref_temp scratch area. The original code reserved 132
    // bytes (comment: ref_temp[2 * max_cu_size + 1]); rounded up to a
    // multiple of 16 so that sp stays 16-byte aligned inside the function.
.equ REF_TEMP_BYTES, 144

ihevc_intra_pred_luma_mode_11_to_17_av8:

    // v12-v15 and x20 are written below and are callee-saved under AAPCS64
    // (x19 is only saved to keep the push a 16-byte pair).
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!

    // The three word32 arguments are used as 64-bit values below (address
    // arithmetic, post-index stride, compares). AAPCS64 leaves the upper 32
    // bits of such argument registers unspecified, so extend them first.
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt
    sxtw        x5, w5                      //mode

    adrp        x7,  :got:gai4_ihevc_ang_table
    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]

    adrp        x8,  :got:gai4_ihevc_inv_ang_table
    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]

    add         x7, x7, x5, lsl #2          //&gai4_ihevc_ang_table[mode]
    add         x8, x8, x5, lsl #2          //&gai4_ihevc_inv_ang_table[mode - 11]
    sub         x8, x8, #44                 //(11 entries * 4 bytes)

    ldrsw       x7,  [x7]                   //intra_pred_ang
    sub         sp, sp, #REF_TEMP_BYTES     //reserve ref_temp

    ldrsw       x8,  [x8]                   //inv_ang
    add         x6, sp, x4                  //ref_temp + nt

    mul         x9, x4, x7                  //nt*intra_pred_ang

    sub         x6, x6, #1                  //ref_temp + nt - 1

    add         x1, x0, x4, lsl #1          //x1 = &src[2nt]
    dup         v30.8b,w7                   //intra_pred_ang

    mov         x7, x4                      //x7 = nt, used to pick the copy path

    // Step 1: reversed copy of src[2nt] .. src[nt] to ref_temp[nt-1] onwards.
    // The first four bytes are always copied with scalar loads/stores.
    ldrb        w11, [x1], #-1

    asr         x9, x9, #5                  //(nt*intra_pred_ang) >> 5

    ldrb        w12, [x1], #-1
    ldrb        w10, [x1], #-1
    ldrb        w14, [x1], #-1

    strb        w11, [x6], #1
    strb        w12, [x6], #1
    strb        w10, [x6], #1
    strb        w14, [x6], #1

    subs        x7, x7, #4
    beq         end_loop_copy               //nt == 4: only the last byte is left

    // nt >= 8: rewind both pointers and redo the copy 8 bytes at a time
    // (this overwrites the four bytes stored above with the same values).
    sub         x6, x6,#4                   //back to ref_temp + nt - 1
    sub         x1, x1,#3                   //x1 = &src[2nt - 7]

    subs        x7,x7,#4
    beq         loop_copy_8                 //nt == 8
    subs        x7,x7,#8
    beq         loop_copy_16                //nt == 16

loop_copy_32:
    ld1         {v0.8b},[x1]
    sub         x1, x1,#8
    ld1         {v1.8b},[x1]
    sub         x1, x1,#8
    ld1         {v2.8b},[x1]
    sub         x1, x1,#8
    ld1         {v3.8b},[x1]

    rev64       v0.8b,  v0.8b               //byte-reverse each 8-byte group
    rev64       v1.8b,  v1.8b
    st1         {v0.8b},[x6],#8
    rev64       v2.8b,  v2.8b
    st1         {v1.8b},[x6],#8
    rev64       v3.8b,  v3.8b
    st1         {v2.8b},[x6],#8
    st1         {v3.8b},[x6],#8
    sub         x1, x1,#1                   //x1 = &src[nt]
    b           end_loop_copy

loop_copy_16:
    ld1         {v0.8b},[x1]
    sub         x1, x1,#8
    ld1         {v1.8b},[x1]

    rev64       v0.8b,  v0.8b
    rev64       v1.8b,  v1.8b

    st1         {v0.8b},[x6],#8
    st1         {v1.8b},[x6],#8
    sub         x1, x1,#1                   //x1 = &src[nt]
    b           end_loop_copy

loop_copy_8:
    ld1         {v0.8b},[x1]
    rev64       v0.8b,  v0.8b
    st1         {v0.8b},[x6],#8
    sub         x1, x1,#1                   //x1 = &src[nt]
end_loop_copy:

    // Last byte of the reversed copy: src[nt] -> ref_temp[2nt - 1].
    ldrb        w11, [x1], #-1
    strb        w11, [x6], #1

    // Step 2 is needed only when (nt*intra_pred_ang) >> 5 is below -1.
    cmn         x9, #1
    bge         prologue_8_16_32

    add         x6, sp, x4                  //ref_temp + nt
    sub         x6, x6, #2                  //ref_temp + nt - 2

    mov         x12, #-1

    sub         x20, x9, x12                //count to take care of ref_idx
    neg         x9, x20                     //x9 = number of bytes to fill (> 0 here)

    add         x1, x0, x4, lsl #1          //x1 = &src[2nt]

    mov         x7, #128                    //inv_ang_sum

loop_copy_ref_idx:

    add         x7, x7, x8                  //inv_ang_sum += inv_ang

    lsr         x20, x7, #8                 //inv_ang_sum >> 8
    ldrb        w11, [x1, x20]              //src[2nt + (inv_ang_sum >> 8)]
    strb        w11, [x6], #-1              //fill ref_temp downwards

    subs        x9, x9, #1

    bne         loop_copy_ref_idx

    // Step 3, nt >= 8: set-up plus the complete first 8x8 tile.
prologue_8_16_32:
    cmp         x4, #4
    beq         sz_4_proc
    adrp        x14,  :got:col_for_intra_luma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]

    lsr         x10, x4, #3
    ld1         {v31.8b},[x14],#8
    mul         x10, x4, x10                //block counter (dec by #8)

    mov         x11, x4                     //col counter to be inc/dec by #8
    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    mov         x0, #1

    sub         x7, x5, #11                 //mode - 11
    dup         v2.8b,w0                    //contains #1 for adding to get ref_main_idx + 1

    adrp        x12, :got:idx_neg_idx_11_17 //load least idx table
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_11_17]

    mov         x0, #2
    dup         v3.8b,w0                    //contains #2: index step between alternate rows

    add         x12, x12, x7, lsl #4        //16 bytes of table per mode
    mov         x8, x12                     //x8 walks the entries, x12 keeps the base

    mov         x7, #8
    sub         x7, x7, x3, lsl #3          //x7 = 8 - 8*dst_strd (bottom of a tile -> top of next tile to the right)

    ldrsw       x9,  [x8]                   //least idx for this column group
    add         x1, sp, x4                  //ref_temp + nt

    xtn         v6.8b,  v22.8h              //low byte of (col+1)*angle, masked to fract below
    dup         v26.8b,w9                   //least idx added to final idx values
    sub         x1, x1, #1                  //ref_temp + nt - 1

    add         x6, x1, x9

    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx (tbl source)
    sshr        v22.8h, v22.8h,#5           //idx = ((col+1)*angle) >> 5

    mov         x0, #31
    dup         v29.8b,w0                   //contains #31 for vand operation

    mov         x0, #32
    dup         v28.8b,w0                   //contains #32 for 32-fract

    sqxtn       v19.8b,  v22.8h

    and         v6.8b,  v6.8b ,  v29.8b     //fract = ((col+1)*angle) & 31

    mov         x0, #1                      //x0 = row value, bumped by 8 per tile row
    dup         v27.8b,w0                   //row value inc or reset accordingly

    add         v19.8b,  v19.8b ,  v27.8b   //ref_main_idx (add row)
    sub         v19.8b,  v19.8b ,  v26.8b   //ref_main_idx (row 0)
    add         v21.8b,  v19.8b ,  v2.8b    //ref_main_idx + 1 (row 0)
    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 0)
    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract

    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 0)
    add         v4.8b,  v19.8b ,  v2.8b     //ref_main_idx (row 1)
    add         v5.8b,  v21.8b ,  v2.8b     //ref_main_idx + 1 (row 1)

    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
    add         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx (row 2)
    add         v21.8b,  v21.8b ,  v3.8b    //ref_main_idx + 1 (row 2)

    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)

    tbl         v14.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 2)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    tbl         v15.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 2)
    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 3)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 3)

    st1         {v24.8b},[x2], x3           //st (row 0)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)

    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
    add         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx (row 4)
    add         v21.8b,  v21.8b ,  v3.8b    //ref_main_idx + 1 (row 4)

    st1         {v22.8b},[x2], x3           //st (row 1)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)

    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 4)
    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)

    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 4)
    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 5)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 5)

    st1         {v20.8b},[x2], x3           //st (row 2)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)

    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
    add         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx (row 6)
    add         v21.8b,  v21.8b ,  v3.8b    //ref_main_idx + 1 (row 6)

    st1         {v18.8b},[x2], x3           //st (row 3)
    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)

    tbl         v14.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 6)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    tbl         v15.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 6)
    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 7)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 7)

    st1         {v24.8b},[x2], x3           //st (row 4)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)

    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    st1         {v22.8b},[x2], x3           //st (row 5)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)

    st1         {v20.8b},[x2], x3           //st (row 6)

    subs        x10, x10, #8                //subtract 8 and go to end if 8x8

    st1         {v18.8b},[x2], x3           //st (row 7)

    beq         end_func

    // Set-up for the second tile. Every csel and the bgt below test the
    // flags of this subs; add/sub/csel in between do not change them.
    //   gt: more columns left in this tile row -> next table entry, x2 to
    //       the top of the next tile to the right.
    //   le: tile row finished -> table entry, column counter and the
    //       col_for_intra_luma pointer restart, x2 moves to the start of
    //       the next tile row, row value += 8.
    subs        x11, x11, #8
    add         x20, x8, #4
    csel        x8, x20, x8,gt
    add         x20, x2, x7
    csel        x2, x20, x2,gt
    csel        x8, x12, x8,le
    sub         x20, x2, x4
    csel        x2, x20, x2,le
    add         x20, x2, #8
    csel        x2, x20, x2,le
    csel        x11, x4, x11,le
    bgt         lbl390
    adrp        x14,  :got:col_for_intra_luma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
lbl390:
    add         x20, x0, #8
    csel        x0, x20, x0,le

    mov         x5,x2                       //x5 = second store pointer used by the kernel (mode is no longer needed)
    ld1         {v31.8b},[x14],#8
    smull       v12.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    xtn         v23.8b,  v12.8h
    sshr        v12.8h, v12.8h,#5
    sqxtn       v25.8b,  v12.8h
    dup         v27.8b,w0                   //row value inc or reset accordingly
    ldrsw       x9,  [x8]
    add         x9, x0, x9
    sub         x9, x9, #1
    dup         v26.8b,w9
    add         v19.8b,  v27.8b ,  v25.8b   //ref_main_idx (add row)

    sub         x4,x4,#8                    //from here on x4 = nt - 8

    // Software-pipelined loop, one iteration per 8x8 tile. While rows 0-3
    // of the current tile are computed and stored through x2, rows 4-7 left
    // over from the previous pass are finished and stored through x5. The
    // csel chain and both conditional branches use the flags of the
    // "subs x11" below; nothing sets flags again until "subs x10" at the
    // bottom of the loop.
kernel_8_16_32:

    sub         v19.8b,  v19.8b ,  v26.8b   //ref_main_idx
    mov         v26.8b, v23.8b

    subs        x11, x11, #8
    add         x6, x1, x9
    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
    add         v21.8b,  v2.8b ,  v19.8b    //ref_main_idx + 1

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    add         x20, x0, #8
    csel        x0, x20, x0,le
    add         x20, x8, #4
    csel        x8, x20, x8,gt
    ld1         {v0.16b}, [x6]              //16 reference bytes for the current tile (tbl source)

    st1         {v24.8b},[x5], x3           //st (row 4)
    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)

    bgt         lbl429
    adrp        x14,  :got:col_for_intra_luma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
lbl429:
    csel        x8, x12, x8,le
    dup         v27.8b,w0                   //row value inc or reset accordingly

    add         v4.8b,  v2.8b ,  v19.8b     //ref_main_idx (row 1)
    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 0)
    add         v5.8b,  v2.8b ,  v21.8b     //ref_main_idx + 1 (row 1)


    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 0)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    ld1         {v31.8b},[x14],#8
    and         v6.8b,  v29.8b ,  v26.8b    //fract values for the current tile

    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)

    add         v19.8b,  v3.8b ,  v19.8b    //ref_main_idx (row 2)
    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
    add         v21.8b,  v3.8b ,  v21.8b    //ref_main_idx + 1 (row 2)

    add         x20, x4, #8
    csel        x11, x20, x11,le            //column counter restarts at nt
    ldrsw       x9,  [x8]
    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract

    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)

    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 3)
    tbl         v14.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 2)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 3)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    tbl         v15.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 2)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)

    add         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx (row 4)
    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
    add         v21.8b,  v21.8b ,  v3.8b    //ref_main_idx + 1 (row 4)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    add         x5,x2,x3,lsl#2              //x5 = row 4 of the current tile
    add         x9, x0, x9


    st1         {v24.8b},[x2], x3           //st (row 0)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)

    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 5)
    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 4)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 5)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 4)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)

    st1         {v22.8b},[x2], x3           //st (row 1)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)

    xtn         v23.8b,  v14.8h
    sshr        v14.8h, v14.8h,#5

    add         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx (row 6)
    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
    add         v21.8b,  v21.8b ,  v3.8b    //ref_main_idx + 1 (row 6)

    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    st1         {v20.8b},[x2], x3           //st (row 2)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)

    sub         x9, x9, #1
    sqxtn       v25.8b,  v14.8h

    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 7)
    tbl         v14.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 6)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 7)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    tbl         v15.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 6)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    add         v19.8b,  v27.8b ,  v25.8b   //ref_main_idx (add row)
    dup         v26.8b,w9

    st1         {v18.8b},[x2], x3           //st (row 3)
    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)


    add         x2, x2, x3, lsl #2          //skip rows 4-7 (stored through x5)
    add         x20, x7, x2
    csel        x2, x20, x2,gt              //gt: top of the next tile to the right
    sub         x20, x2, x4
    csel        x2, x20, x2,le              //le: start of the next tile row (x4 = nt - 8)

    subs        x10, x10, #8                //subtract 8 and go to end if 8x8

    bne         kernel_8_16_32

    // Drain the pipeline: rows 4-7 of the last tile.
epil_8_16_32:

    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    st1         {v24.8b},[x5], x3           //st (row 4)
    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)

    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)

    st1         {v18.8b},[x5], x3           //st (row 7)


    b           end_func

    // Step 3, nt == 4: same arithmetic on 8 lanes, but only the low 4 bytes
    // of each row are stored.
sz_4_proc:
    adrp        x14,  :got:col_for_intra_luma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]

    ld1         {v31.8b},[x14]
    mov         x12, #1

    dup         v2.8b,w12                   //contains #1 for adding to get ref_main_idx + 1
    mov         x0, #2

    dup         v3.8b,w0                    //contains #2: index step between alternate rows
    adrp        x12, :got:idx_neg_idx_11_17 //load least idx table
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_11_17]

    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    sub         x7, x5, #11                 //mode - 11

    add         x12, x12, x7, lsl #4        //16 bytes of table per mode
    mov         x8, x12

    ldrsw       x9,  [x8]

    dup         v26.8b,w9                   //least idx added to final idx values
    add         x6, sp, x4                  //ref_temp + nt

    sub         x6, x6, #1                  //ref_temp + nt - 1
    xtn         v6.8b,  v22.8h
    add         x6, x6, x9

    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx (tbl source)
    mov         x0, #31

    dup         v29.8b,w0                   //contains #31 for vand operation
    mov         x1, #32

    dup         v28.8b,w1                   //contains #32 for 32-fract

    sshr        v22.8h, v22.8h,#5
    sqxtn       v19.8b,  v22.8h

    and         v6.8b,  v6.8b ,  v29.8b     //fract = ((col+1)*angle) & 31
    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract

    add         v19.8b,  v19.8b ,  v2.8b    //ref_main_idx (add 1)
    sub         v19.8b,  v19.8b ,  v26.8b   //ref_main_idx
    add         v21.8b,  v19.8b ,  v2.8b    //ref_main_idx + 1

    add         v4.8b,  v19.8b ,  v2.8b     //row 1 ref_main_idx
    add         v5.8b,  v21.8b ,  v2.8b     //row 1 ref_main_idx + 1

    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 0)
    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 0)


    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx    (row 1)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    add         v19.8b,  v19.8b ,  v3.8b    //idx (row 2)
    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
    add         v21.8b,  v21.8b ,  v3.8b    //idx+1 (row 2)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx    (row 2)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    rshrn       v24.8b, v24.8h,#5           //round shift (row 0)

    add         v4.8b,  v4.8b ,  v3.8b      //idx (row 3)
    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 2)
    add         v5.8b,  v5.8b ,  v3.8b      //idx+1 (row 3)

    umull       v20.8h, v12.8b, v7.8b       //mul (row 2)
    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
    umlal       v20.8h, v13.8b, v6.8b       //mul (row 2)

    st1         {v24.s}[0],[x2], x3         //st row 0
    rshrn       v22.8b, v22.8h,#5           //round shift (row 1)

    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)

    umull       v18.8h, v16.8b, v7.8b       //mul (row 3)
    umlal       v18.8h, v17.8b, v6.8b       //mul (row 3)

    st1         {v22.s}[0],[x2], x3         //st row 1
    rshrn       v20.8b, v20.8h,#5           //round shift (row 2)

    st1         {v20.s}[0],[x2], x3         //st row 2

    rshrn       v18.8b, v18.8h,#5           //round shift (row 3)

    st1         {v18.s}[0],[x2], x3         //st (row 3)

end_func:
    add         sp, sp, #REF_TEMP_BYTES     //release ref_temp
    ldp         x19, x20,[sp],#16           //restore in reverse order of the saves
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ret
