///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_mode_3_to_9.s
//*
//* @brief
//*  contains the function definition for chroma intra prediction, angular
//*  modes 3 to 9. the function is written in AArch64 (ARMv8) NEON assembly
//*  using GNU assembler syntax
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_mode_3_to_9_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
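///**
//*******************************************************************************
//*
//* @brief
//*    chroma intra prediction for angular modes 3 to 9
//*
//* @par description:
//*    every output byte is a two-tap blend of reference samples,
//*    (ref_a * (32 - fract) + ref_b * fract + 16) >> 5, where fract is the low
//*    five bits of (column multiplier * intra_pred_ang) and intra_pred_ang is
//*    gai4_ihevc_ang_table[mode]. the destination is written in tiles of
//*    8 bytes by 8 rows (8 bytes by 4 rows when nt is 4)
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this implementation)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  intra prediction mode; indexes gai4_ihevc_ang_table and, as (mode - 3),
//*  idx_neg_idx_chroma_3_9
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/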
//void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
//                                       word32 src_strd,
//                                       uword8 *pu1_dst,
//                                       word32 dst_strd,
//                                       word32 nt,
//                                       word32 mode)
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode
//
//all six arguments arrive in registers; the function reads no arguments
//from the stack

     89 .text
     90 .align 4
     91 
     92 .include "ihevc_neon_macros.s"
     93 
     94 
     95 
     96 .globl ihevc_intra_pred_chroma_mode_3_to_9_av8
     97 .extern gai4_ihevc_ang_table
     98 .extern gai4_ihevc_inv_ang_table
     99 .extern col_for_intra_chroma
    100 .extern idx_neg_idx_chroma_3_9
    101 
    .type ihevc_intra_pred_chroma_mode_3_to_9_av8, %function

//------------------------------------------------------------------------------
// ihevc_intra_pred_chroma_mode_3_to_9_av8
//
// Chroma intra prediction for angular modes 3 to 9.
//
// ABI:    AAPCS64
// In:     x0 = pu1_ref   (uword8 *)
//         w1 = src_strd  (word32, never read)
//         x2 = pu1_dst   (uword8 *)
//         w3 = dst_strd  (word32)
//         w4 = nt        (word32)
//         w5 = mode      (word32)
// Out:    none; the prediction is written through pu1_dst
// Clobb:  x0-x2, x5-x12, x14, v0-v7, v16-v31, flags
//         (x19, x20 and d8, d13-d15 are saved and restored)
// Stack:  48 bytes, sp stays 16-byte aligned
//
// Every output byte is
//         (tapA * (32 - fract) + tapB * fract + 16) >> 5
// with both taps fetched by TBL from a 32-byte window of the reference
// array held in v0:v1.  Work is done in tiles of 8 bytes x 8 rows; the
// main loop is software pipelined, so rows 4-7 of a tile are finished and
// stored (through x5) while rows 0-3 of the next tile are produced and
// stored (through x2).
//
// Long-lived registers:
//   x1  = pu1_ref + 4 * nt - 26, base for the reference window
//   x3  = dst_strd, x4 = nt
//   x7  = 8 - 8 * dst_strd (back up eight rows, right eight bytes)
//   x8  = cursor into this mode's idx_neg_idx_chroma_3_9 entries, x12 = start
//   x10 = tile counter, x11 = column counter
//   x14 = cursor into col_for_intra_chroma
//   v28 = 32, v30 = intra_pred_ang, v6 = fract, v7 = 32 - fract
//
// NOTE(review): the row-wrap pointer adjustment in the loop (x2 - nt - 8)
// lands on the start of a row only when nt == 16; for nt == 8 the loop
// exits before the adjusted pointer is used and nt == 4 leaves after four
// rows.  Presumably the supported sizes are nt = 4, 8, 16 - confirm with
// the caller.
//------------------------------------------------------------------------------
ihevc_intra_pred_chroma_mode_3_to_9_av8:

    // Save the callee-saved registers that are overwritten below.
    // sp has to stay 16-byte aligned while it is used as a base address, so
    // d15 is pushed as half of a pair; d8 is never written by this function
    // and only fills the other half.
    stp         d13, d14, [sp, #-16]!
    stp         d8, d15, [sp, #-16]!
    stp         x19, x20, [sp, #-16]!

    // AAPCS64 leaves bits 63:32 of a 32-bit argument unspecified.  The three
    // word32 arguments below are used as 64-bit operands in address
    // arithmetic, so widen them first.
    sxtw        x3, w3                          // dst_strd
    sxtw        x4, w4                          // nt
    sxtw        x5, w5                          // mode

    adrp        x7, :got:gai4_ihevc_ang_table
    ldr         x7, [x7, #:got_lo12:gai4_ihevc_ang_table]

    add         x7, x7, x5, lsl #2              // &gai4_ihevc_ang_table[mode]
    ldr         w7, [x7]                        // intra_pred_ang
    sxtw        x7, w7
    dup         v30.8b, w7                      // intra_pred_ang in every byte lane

    adrp        x14, :got:col_for_intra_chroma
    ldr         x14, [x14, #:got_lo12:col_for_intra_chroma]

prologue_8_16_32:
    lsr         x10, x4, #3
    ld1         {v31.8b}, [x14], #8             // per-column multipliers for 8 output bytes
    mul         x10, x4, x10                    // tile counter = nt * (nt / 8), minus 4 per tile

    lsl         x11, x4, #1                     // column counter = 2 * nt, minus 8 per tile
    smull       v22.8h, v30.8b, v31.8b          // multiplier * intra_pred_ang (original note: (col+1)*intra_pred_angle)

    sub         x7, x5, #3                      // mode - 3 selects the table row
    adrp        x12, :got:idx_neg_idx_chroma_3_9
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_chroma_3_9]

    add         x12, x12, x7, lsl #4            // 16 bytes of table per mode
    mov         x8, x12                         // x8 walks the entries, x12 keeps the start

    mov         x7, #8
    sub         x7, x7, x3, lsl #3              // x7 = 8 - 8 * dst_strd

    ldr         w9, [x8]                        // "most idx" entry for this column group
    sxtw        x9, w9
    lsl         x9, x9, #1                      // doubled: used as a byte offset
    add         x1, x0, x4, lsl #2              // pu1_ref + 4 * nt

    xtn         v6.8b, v22.8h                   // low byte of each product (fract before masking)
    dup         v26.8b, w9                      // doubled most idx in every lane
    sub         x1, x1, #26                     // window base: pu1_ref + 4 * nt - 26

    sub         x6, x1, x9                      // window address for this column group

    ld1         {v0.16b, v1.16b}, [x6]          // 32 reference bytes addressed by TBL below
    sshr        v22.8h, v22.8h, #5              // idx = product >> 5

    movi        v29.8b, #31                     // mask for fract

    movi        v28.8b, #32

    sqxtn       v2.8b, v22.8h
    shl         v2.8b, v2.8b, #1                // 2 * idx

    and         v6.8b, v6.8b, v29.8b            // fract = product & 31
    movi        v29.8b, #2                      // index distance between tap A and tap B

    mov         x0, #0x302                      // bytes 02,03 per halfword: v index is +1 of u
    dup         v27.4h, w0
    mov         x0, #0                          // x0 now counts completed rows (steps of 8)

    movi        v3.8b, #22                      // constant index bias for row 0

    sub         v2.8b, v2.8b, v27.8b            // 2 * idx - {2,3}
    sub         v2.8b, v26.8b, v2.8b            // 2 * most_idx - 2 * idx + {2,3}
    add         v2.8b, v2.8b, v3.8b             // + 22: tap A index (row 0)
    sub         v3.8b, v2.8b, v29.8b            // tap B index (row 0) = tap A - 2
    tbl         v25.8b, {v0.16b, v1.16b}, v2.8b // tap A (row 0)
    sub         v7.8b, v28.8b, v6.8b            // 32 - fract

    tbl         v13.8b, {v0.16b, v1.16b}, v3.8b // tap B (row 0)
    sub         v4.8b, v2.8b, v29.8b            // tap A index (row 1)
    sub         v5.8b, v3.8b, v29.8b            // tap B index (row 1)

    movi        v29.8b, #4                      // index step between rows two apart

    tbl         v16.8b, {v0.16b, v1.16b}, v4.8b // tap A (row 1)
    umull       v24.8h, v25.8b, v7.8b           // mul (row 0)
    umlal       v24.8h, v13.8b, v6.8b           // mul (row 0)

    tbl         v17.8b, {v0.16b, v1.16b}, v5.8b // tap B (row 1)
    sub         v2.8b, v2.8b, v29.8b            // tap A index (row 2)
    sub         v3.8b, v3.8b, v29.8b            // tap B index (row 2)

    rshrn       v24.8b, v24.8h, #5              // round shift (row 0)

    tbl         v14.8b, {v0.16b, v1.16b}, v2.8b // tap A (row 2)
    umull       v22.8h, v16.8b, v7.8b           // mul (row 1)
    umlal       v22.8h, v17.8b, v6.8b           // mul (row 1)

    tbl         v15.8b, {v0.16b, v1.16b}, v3.8b // tap B (row 2)
    sub         v4.8b, v4.8b, v29.8b            // tap A index (row 3)
    sub         v5.8b, v5.8b, v29.8b            // tap B index (row 3)

    st1         {v24.8b}, [x2], x3              // st (row 0)
    rshrn       v22.8b, v22.8h, #5              // round shift (row 1)

    tbl         v19.8b, {v0.16b, v1.16b}, v4.8b // tap A (row 3)
    umull       v20.8h, v14.8b, v7.8b           // mul (row 2)
    umlal       v20.8h, v15.8b, v6.8b           // mul (row 2)

    tbl         v23.8b, {v0.16b, v1.16b}, v5.8b // tap B (row 3)
    sub         v2.8b, v2.8b, v29.8b            // tap A index (row 4)
    sub         v3.8b, v3.8b, v29.8b            // tap B index (row 4)

    st1         {v22.8b}, [x2], x3              // st (row 1)
    rshrn       v20.8b, v20.8h, #5              // round shift (row 2)

    tbl         v25.8b, {v0.16b, v1.16b}, v2.8b // tap A (row 4)
    umull       v18.8h, v19.8b, v7.8b           // mul (row 3)
    umlal       v18.8h, v23.8b, v6.8b           // mul (row 3)

    tbl         v13.8b, {v0.16b, v1.16b}, v3.8b // tap B (row 4)
    sub         v4.8b, v4.8b, v29.8b            // tap A index (row 5)
    sub         v5.8b, v5.8b, v29.8b            // tap B index (row 5)

    st1         {v20.8b}, [x2], x3              // st (row 2)
    rshrn       v18.8b, v18.8h, #5              // round shift (row 3)

    tbl         v16.8b, {v0.16b, v1.16b}, v4.8b // tap A (row 5)
    umull       v24.8h, v25.8b, v7.8b           // mul (row 4)
    umlal       v24.8h, v13.8b, v6.8b           // mul (row 4)

    tbl         v17.8b, {v0.16b, v1.16b}, v5.8b // tap B (row 5)
    sub         v2.8b, v2.8b, v29.8b            // tap A index (row 6)
    sub         v3.8b, v3.8b, v29.8b            // tap B index (row 6)

    st1         {v18.8b}, [x2], x3              // st (row 3)
    cmp         x4, #4
    beq         end_func                        // nt == 4: four rows of 8 bytes are the whole block
    rshrn       v24.8b, v24.8h, #5              // round shift (row 4)

    tbl         v14.8b, {v0.16b, v1.16b}, v2.8b // tap A (row 6)
    umull       v22.8h, v16.8b, v7.8b           // mul (row 5)
    umlal       v22.8h, v17.8b, v6.8b           // mul (row 5)

    tbl         v15.8b, {v0.16b, v1.16b}, v3.8b // tap B (row 6)
    sub         v4.8b, v4.8b, v29.8b            // tap A index (row 7)
    sub         v5.8b, v5.8b, v29.8b            // tap B index (row 7)

    st1         {v24.8b}, [x2], x3              // st (row 4)
    rshrn       v22.8b, v22.8h, #5              // round shift (row 5)

    tbl         v19.8b, {v0.16b, v1.16b}, v4.8b // tap A (row 7)
    umull       v20.8h, v14.8b, v7.8b           // mul (row 6)
    umlal       v20.8h, v15.8b, v6.8b           // mul (row 6)

    tbl         v23.8b, {v0.16b, v1.16b}, v5.8b // tap B (row 7)
    umull       v18.8h, v19.8b, v7.8b           // mul (row 7)
    umlal       v18.8h, v23.8b, v6.8b           // mul (row 7)

    st1         {v22.8b}, [x2], x3              // st (row 5)
    rshrn       v20.8b, v20.8h, #5              // round shift (row 6)
    rshrn       v18.8b, v18.8h, #5              // round shift (row 7)

    st1         {v20.8b}, [x2], x3              // st (row 6)

    subs        x10, x10, #4                    // one tile done; zero means it was the only one

    st1         {v18.8b}, [x2], x3              // st (row 7)

    beq         end_func

    // Move to the next tile.  The add/sub + csel pairs stand in for
    // conditional execution: "gt" = columns remain in this row of tiles,
    // "le" = wrap to the first column.
    subs        x11, x11, #8                    // eight more output bytes of this row done
    add         x20, x8, #4
    csel        x8, x20, x8, gt                 // gt: next idx table entry
    add         x20, x2, x7
    csel        x2, x20, x2, gt                 // gt: up eight rows, right eight bytes
    csel        x8, x12, x8, le                 // le: back to the first idx table entry
    sub         x20, x2, x4
    csel        x2, x20, x2, le
    add         x20, x2, #8
    csel        x2, x20, x2, le
    lsl         x20, x4, #1
    csel        x11, x20, x11, le               // le: reload the column counter with 2 * nt
    bgt         lbl284
    adrp        x14, :got:col_for_intra_chroma  // le: restart the column multiplier table
    ldr         x14, [x14, #:got_lo12:col_for_intra_chroma]
lbl284:
    add         x20, x0, #8
    csel        x0, x20, x0, le                 // le: eight more rows completed

    // Index and fract set-up for the tile the first loop pass will produce.
    ld1         {v31.8b}, [x14], #8
    smull       v25.8h, v30.8b, v31.8b          // multiplier * intra_pred_ang
    xtn         v19.8b, v25.8h                  // low bytes, masked to fract inside the loop
    sshr        v25.8h, v25.8h, #5
    sqxtn       v23.8b, v25.8h
    shl         v23.8b, v23.8b, #1              // 2 * idx
    mov         x5, #0x302                      // v index is +1 of u
    dup         v27.4h, w5
    ldr         w9, [x8]                        // most idx entry for the new column group
    sxtw        x9, w9
    lsl         x9, x9, #1
    mov         x5, #22
    sub         x5, x5, x0, lsl #1              // index bias = 22 - 2 * completed rows
    dup         v16.8b, w5
    dup         v26.8b, w9

    mov         x5, x2                          // x5: store cursor for the carried-over rows 4-7
    sub         v23.8b, v23.8b, v27.8b          // 2 * idx - {2,3}

kernel_8_16_32:
    movi        v29.8b, #2                      // index distance between tap A and tap B
    sub         v2.8b, v26.8b, v23.8b           // 2 * most_idx - (2 * idx - {2,3})
    mov         v26.8b, v19.8b                  // carry the unmasked fract bytes for this tile

    subs        x11, x11, #8                    // flags stay live for every csel/bgt in this pass
    sub         x6, x1, x9                      // window address for this tile
    tbl         v19.8b, {v0.16b, v1.16b}, v4.8b // (previous tile) tap A (row 7)
    add         v2.8b, v2.8b, v16.8b            // + index bias: tap A index (row 0)

    umull       v20.8h, v14.8b, v7.8b           // (previous tile) mul (row 6)
    tbl         v23.8b, {v0.16b, v1.16b}, v5.8b // (previous tile) tap B (row 7)
    umlal       v20.8h, v15.8b, v6.8b           // (previous tile) mul (row 6)

    add         x20, x0, #8
    csel        x0, x20, x0, le                 // le: eight more rows completed
    sub         v3.8b, v2.8b, v29.8b            // tap B index (row 0) = tap A - 2
    add         x20, x8, #4
    csel        x8, x20, x8, gt                 // gt: next idx table entry

    ld1         {v0.16b, v1.16b}, [x6]          // reference window for this tile
    rshrn       v22.8b, v22.8h, #5              // (previous tile) round shift (row 5)

    bgt         lbl326
    adrp        x14, :got:col_for_intra_chroma  // le: restart the column multiplier table
    ldr         x14, [x14, #:got_lo12:col_for_intra_chroma]
lbl326:
    st1         {v24.8b}, [x5], x3              // (previous tile) st (row 4)
    csel        x8, x12, x8, le                 // le: back to the first idx table entry

    mov         x9, #0x302
    dup         v27.4h, w9                      // v index is +1 of u
    sub         v4.8b, v2.8b, v29.8b            // tap A index (row 1)

    sub         v5.8b, v3.8b, v29.8b            // tap B index (row 1)
    tbl         v25.8b, {v0.16b, v1.16b}, v2.8b // tap A (row 0)
    movi        v29.8b, #31                     // mask for fract

    umull       v18.8h, v19.8b, v7.8b           // (previous tile) mul (row 7)
    tbl         v13.8b, {v0.16b, v1.16b}, v3.8b // tap B (row 0)
    umlal       v18.8h, v23.8b, v6.8b           // (previous tile) mul (row 7)

    ld1         {v31.8b}, [x14], #8             // column multipliers for the following tile
    and         v6.8b, v29.8b, v26.8b           // fract = product & 31 for this tile

    lsl         x20, x4, #1
    csel        x11, x20, x11, le               // le: reload the column counter with 2 * nt
    movi        v29.8b, #4                      // index step between rows two apart
    ldr         w9, [x8]                        // most idx entry for the following tile
    sxtw        x9, w9

    st1         {v22.8b}, [x5], x3              // (previous tile) st (row 5)
    rshrn       v20.8b, v20.8h, #5              // (previous tile) round shift (row 6)

    sub         v2.8b, v2.8b, v29.8b            // tap A index (row 2)
    tbl         v19.8b, {v0.16b, v1.16b}, v4.8b // tap A (row 1)
    sub         v3.8b, v3.8b, v29.8b            // tap B index (row 2)

    lsl         x9, x9, #1
    sub         v7.8b, v28.8b, v6.8b            // 32 - fract

    umull       v24.8h, v25.8b, v7.8b           // mul (row 0)
    tbl         v17.8b, {v0.16b, v1.16b}, v5.8b // tap B (row 1)
    umlal       v24.8h, v13.8b, v6.8b           // mul (row 0)

    st1         {v20.8b}, [x5], x3              // (previous tile) st (row 6)
    rshrn       v18.8b, v18.8h, #5              // (previous tile) round shift (row 7)

    sub         v4.8b, v4.8b, v29.8b            // tap A index (row 3)
    tbl         v14.8b, {v0.16b, v1.16b}, v2.8b // tap A (row 2)
    sub         v5.8b, v5.8b, v29.8b            // tap B index (row 3)

    umull       v22.8h, v19.8b, v7.8b           // mul (row 1)
    tbl         v15.8b, {v0.16b, v1.16b}, v3.8b // tap B (row 2)
    umlal       v22.8h, v17.8b, v6.8b           // mul (row 1)

    rshrn       v24.8b, v24.8h, #5              // round shift (row 0)
    st1         {v18.8b}, [x5], x3              // (previous tile) st (row 7)

    sub         v2.8b, v2.8b, v29.8b            // tap A index (row 4)
    tbl         v19.8b, {v0.16b, v1.16b}, v4.8b // tap A (row 3)
    sub         v3.8b, v3.8b, v29.8b            // tap B index (row 4)

    umull       v20.8h, v14.8b, v7.8b           // mul (row 2)
    tbl         v23.8b, {v0.16b, v1.16b}, v5.8b // tap B (row 3)
    umlal       v20.8h, v15.8b, v6.8b           // mul (row 2)

    add         x5, x2, x3, lsl #2              // x5 = address of row 4 of this tile
    smull       v14.8h, v30.8b, v31.8b          // following tile: multiplier * intra_pred_ang
    add         x9, x9, x0, lsl #1              // following tile: 2 * most idx + 2 * completed rows

    st1         {v24.8b}, [x2], x3              // st (row 0)
    rshrn       v22.8b, v22.8h, #5              // round shift (row 1)

    sub         v4.8b, v4.8b, v29.8b            // tap A index (row 5)
    tbl         v25.8b, {v0.16b, v1.16b}, v2.8b // tap A (row 4)
    sub         v5.8b, v5.8b, v29.8b            // tap B index (row 5)

    umull       v18.8h, v19.8b, v7.8b           // mul (row 3)
    tbl         v13.8b, {v0.16b, v1.16b}, v3.8b // tap B (row 4)
    umlal       v18.8h, v23.8b, v6.8b           // mul (row 3)

    st1         {v22.8b}, [x2], x3              // st (row 1)
    rshrn       v20.8b, v20.8h, #5              // round shift (row 2)

    xtn         v19.8b, v14.8h                  // following tile: unmasked fract bytes
    sshr        v14.8h, v14.8h, #5              // following tile: idx

    sub         v2.8b, v2.8b, v29.8b            // tap A index (row 6)
    tbl         v21.8b, {v0.16b, v1.16b}, v4.8b // tap A (row 5)
    sub         v3.8b, v3.8b, v29.8b            // tap B index (row 6)

    umull       v24.8h, v25.8b, v7.8b           // mul (row 4)
    tbl         v17.8b, {v0.16b, v1.16b}, v5.8b // tap B (row 5)
    sqxtn       v23.8b, v14.8h

    st1         {v20.8b}, [x2], x3              // st (row 2)
    umlal       v24.8h, v13.8b, v6.8b           // mul (row 4)

    rshrn       v18.8b, v18.8h, #5              // round shift (row 3)
    dup         v26.8b, w9

    sub         v4.8b, v4.8b, v29.8b            // tap A index (row 7)
    tbl         v14.8b, {v0.16b, v1.16b}, v2.8b // tap A (row 6)
    sub         v5.8b, v5.8b, v29.8b            // tap B index (row 7)

    mov         x6, #22
    shl         v23.8b, v23.8b, #1              // following tile: 2 * idx
    sub         x6, x6, x0, lsl #1              // following tile: index bias = 22 - 2 * completed rows

    umull       v22.8h, v21.8b, v7.8b           // mul (row 5)
    tbl         v15.8b, {v0.16b, v1.16b}, v3.8b // tap B (row 6)
    umlal       v22.8h, v17.8b, v6.8b           // mul (row 5)

    st1         {v18.8b}, [x2], x3              // st (row 3)
    rshrn       v24.8b, v24.8h, #5              // round shift (row 4)

    add         x2, x2, x3, lsl #2              // skip rows 4-7, which are stored through x5
    dup         v16.8b, w6
    add         x20, x7, x2
    csel        x2, x20, x2, gt                 // gt: up eight rows, right eight bytes

    sub         x20, x2, x4
    csel        x2, x20, x2, le                 // le: x2 -= nt ...
    sub         v23.8b, v23.8b, v27.8b          // following tile: 2 * idx - {2,3}
    sub         x20, x2, #8
    csel        x2, x20, x2, le                 // le: ... and a further 8 bytes back

    subs        x10, x10, #4                    // one tile done

    bne         kernel_8_16_32

epil_8_16_32:
    // Drain the pipeline: finish and store rows 4-7 of the last tile.
    tbl         v19.8b, {v0.16b, v1.16b}, v4.8b // tap A (row 7)

    umull       v20.8h, v14.8b, v7.8b           // mul (row 6)
    tbl         v23.8b, {v0.16b, v1.16b}, v5.8b // tap B (row 7)
    umlal       v20.8h, v15.8b, v6.8b           // mul (row 6)

    st1         {v24.8b}, [x5], x3              // st (row 4)
    rshrn       v24.8b, v22.8h, #5              // round shift (row 5); v24 is free once row 4 is stored

    umull       v18.8h, v19.8b, v7.8b           // mul (row 7)
    umlal       v18.8h, v23.8b, v6.8b           // mul (row 7)

    st1         {v24.8b}, [x5], x3              // st (row 5)
    rshrn       v20.8b, v20.8h, #5              // round shift (row 6)

    st1         {v20.8b}, [x5], x3              // st (row 6)
    rshrn       v18.8b, v18.8h, #5              // round shift (row 7)

    st1         {v18.8b}, [x5], x3              // st (row 7)

end_func:
    // Restore in the reverse order of the saves; d8 comes back only because
    // it was pushed as the partner of d15.
    ldp         x19, x20, [sp], #16
    ldp         d8, d15, [sp], #16
    ldp         d13, d14, [sp], #16
    ret
    493 
    494 
    495 
    496 
    497 
    498 
    499 
    500 
    501