Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_luma_mode_3_to_9.s
     22 //*
     23 //* @brief
     24 //*  contains the function definition for luma intra prediction,
     25 //*  angular modes 3 to 9. the function is coded in armv8 (aarch64)
     26 //*  neon assembly using gnu assembler syntax.
     27 //*
     28 //*
     29 //* @author
     30 //*  parthiban v
     31 //*
     32 //* @par list of functions:
     33 //*  - ihevc_intra_pred_luma_mode_3_to_9_av8()
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* @brief
     44 //*    luma intra prediction for angular modes 3 to 9
     45 //*
     46 //* @par description:
     47 //*
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
     60 //* @param[in] nt
     61 //*  size of transform block
     62 //*
     63 //* @param[in] mode
     64 //*  intra prediction mode (3 to 9); selects the prediction angle
     65 //*
     66 //* @returns
     67 //*
     68 //* @remarks
     69 //*  none
     70 //*
     71 //*******************************************************************************
     72 //*/
     73 
     74 //void ihevc_intra_pred_luma_mode_3_to_9(uword8* pu1_ref,
     75 //                               word32 src_strd,
     76 //                               uword8* pu1_dst,
     77 //                               word32 dst_strd,
     78 //                               word32 nt,
     79 //                               word32 mode)
     80 //
     81 //**************variables vs registers*****************************************
     82 //x0 => *pu1_ref
     83 //x1 => src_strd
     84 //x2 => *pu1_dst
     85 //x3 => dst_strd
     86 //x4 => nt
     87 //x5 => mode
     88 //
     89 //(nt and mode arrive in registers on arm64; no argument is read from the stack)
     90 
     91 .text
     92 .align 4
     93 .include "ihevc_neon_macros.s"
     94 
     95 
     96 
     97 .globl ihevc_intra_pred_luma_mode_3_to_9_av8
     98 .extern gai4_ihevc_ang_table
     99 .extern gai4_ihevc_inv_ang_table
    100 .extern col_for_intra_luma
    101 .extern idx_neg_idx_3_9
    102 
    103 
    104 .type ihevc_intra_pred_luma_mode_3_to_9_av8, %function
    105 
    106 ihevc_intra_pred_luma_mode_3_to_9_av8:
            // Entry (AAPCS64): x0 = pu1_ref, x1 = src_strd (never read),
            // x2 = pu1_dst, w3 = dst_strd, w4 = nt, w5 = mode. No return value.
            // This section saves the callee-saved registers the routine uses,
            // widens the 32-bit arguments, broadcasts intra_pred_ang for 'mode'
            // and dispatches on nt: 4 -> sz_4_proc, otherwise -> prologue_8_16_32.
    107 
    110     stp         d12,d13,[sp,#-16]!          //v12-v15 are used below; their low 64 bits are callee-saved
    111     stp         d14,d15,[sp,#-16]!
    112     stp         x19, x20,[sp,#-16]!         //x20 is used as scratch; x19 only completes the 16-byte pair
    113 
            // dst_strd, nt and mode are word32. AAPCS64 leaves bits 63:32 of such
            // argument registers unspecified, and the code below uses x3/x4/x5 as
            // full 64-bit values (address offsets, post-index strides, 64-bit
            // compares), so sign-extend them once here.
            sxtw        x3, w3                      //dst_strd
            sxtw        x4, w4                      //nt
            sxtw        x5, w5                      //mode

    114     adrp        x7,  :got:gai4_ihevc_ang_table
    115     ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]
    116 
    120     add         x7, x7, x5, lsl #2          //&gai4_ihevc_ang_table[mode] (32-bit entries)
    121     ldr         w7,  [x7]                   //intra_pred_ang
    122     sxtw        x7,w7
    123     dup         v30.8b,w7                   //v30 = intra_pred_ang in all 8 byte lanes
    124 
    125     adrp        x14,  :got:col_for_intra_luma
    126     ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]   //x14 = col_for_intra_luma, read 8 bytes at a time
    127 
    128     cmp         x4, #4                      //nt == 4 has its own straight-line path
    129 
    130     beq         sz_4_proc
    131     b           prologue_8_16_32
    132 
    133 prologue_8_16_32:
            // Path for nt other than 4. Output is produced in 8x8 tiles; this section
            // sets up the constants, computes and stores the complete first tile, and
            // (if more tiles remain) prepares the state kernel_8_16_32 expects.
            // Per output byte: dst = (A * (32 - fract) + B * fract + 16) >> 5, where
            // A and B are reference bytes picked with tbl from the 16 bytes in v0 and
            // B sits one table index below A. Both indices drop by 1 per row; even
            // rows use the index registers v1/v19, odd rows use v4/v5.
            // Long-lived registers set here:
            //   x10 tile counter, x11 column counter, x12/x8 idx_neg_idx_3_9 row,
            //   x7 = 8 - 8 * dst_strd, x1 = pu1_ref + 2 * nt - 9, x0 = row term,
            //   v2 = 1, v3 = 2, v28 = 32, v29 = 31 (v30 = intra_pred_ang from entry).
    134     lsr         x10, x4, #3                 //nt / 8
    135     ld1         {v31.8b},[x14],#8           //8 bytes of col_for_intra_luma, x14 += 8
    136     mul         x10, x4, x10                //tile counter = nt * (nt / 8), reduced by 8 per 8x8 tile
    137 
    138     mov         x11, x4                     //column counter = nt, reduced by 8 per tile
    139     smull       v22.8h, v30.8b, v31.8b      //pos = intra_pred_ang * table byte for 8 columns (original note: table byte is col + 1)
    140 
    141     sub         x7, x5, #3                  //mode - 3
    142     movi        v2.8b, #1                   //constant 1: index step from sample A to sample B and from row to row
    143     adrp        x12, :got:idx_neg_idx_3_9   //load least idx table
    144     ldr         x12, [x12, #:got_lo12:idx_neg_idx_3_9]
    145     movi        v3.8b, #2                   //constant 2: index step of two rows
    146 
    147     add         x12, x12, x7, lsl #4        //x12 = idx_neg_idx_3_9 + (mode - 3) * 16 bytes
    148     mov         x8, x12                     //x8 walks that 16-byte row, x12 remembers its start
    149 
    150     mov         x7, #8
    151     sub         x7, x7, x3, lsl #3          //x7 = 8 - 8 * dst_strd: from just below a tile to the top of the tile on its right
    152 
    153     ldr         w9,  [x8]                   //first 32-bit entry of the row (the "least idx")
    154     sxtw        x9,w9
    155     add         x1, x0, x4, lsl #1          //pu1_ref + 2 * nt
    156 
    157     xtn         v6.8b,  v22.8h              //low byte of each pos (masked to fract below)
    158     dup         v26.8b,w9                   //least idx added to final idx values
    159     sub         x1, x1, #9                  //x1 = pu1_ref + 2 * nt - 9, base for every 16-byte reference load
    160 
    161     sub         x6, x1, x9                  //x6 = x1 - least idx
    162 
    163     ld1         {v0.16b}, [x6]              //v0 = 16 reference bytes, the lookup table for tbl
    164     sshr        v22.8h, v22.8h,#5           //pos >> 5
    165 
    166     movi        v29.8b, #31                 //constant 31, mask for fract
    167 
    168     movi        v28.8b, #32                 //constant 32
    169 
    170     sqxtn       v1.8b,  v22.8h              //idx = pos >> 5, narrowed to bytes with saturation
    171 
    172     and         v6.8b,  v6.8b ,  v29.8b     //fract = pos & 31
    173 
    174     mov         x0, #1                      //x0 is reused from here on as the row term (pu1_ref is kept via x1)
    175 
    176     movi        v27.8b, #7                  //constant 7
    177 
    178     sub         v1.8b,  v1.8b ,  v2.8b      //idx - 1
    179     sub         v1.8b,  v26.8b ,  v1.8b     //least idx - (idx - 1)
    180     add         v1.8b,  v1.8b ,  v27.8b     //+ 7: table index of sample A (row 0)
    181     sub         v19.8b,  v1.8b ,  v2.8b     //table index of sample B = A index - 1 (row 0)
    182     tbl         v12.8b, {v0.16b},v1.8b      //sample A (row 0)
    183     sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
    184 
    185     tbl         v13.8b, {v0.16b},v19.8b     //sample B (row 0)
    186     sub         v4.8b,  v1.8b ,  v2.8b      //A index (row 1)
    187     sub         v5.8b,  v19.8b ,  v2.8b     //B index (row 1)
    188 
    189     tbl         v16.8b, {v0.16b},v4.8b      //sample A (row 1)
    190     umull       v24.8h, v12.8b, v7.8b       //A * (32 - fract) (row 0)
    191     umlal       v24.8h, v13.8b, v6.8b       //+ B * fract (row 0)
    192 
    193     tbl         v17.8b, {v0.16b},v5.8b      //sample B (row 1)
    194     sub         v1.8b,  v1.8b ,  v3.8b      //A index (row 2)
    195     sub         v19.8b,  v19.8b ,  v3.8b    //B index (row 2)
    196 
    197     rshrn       v24.8b, v24.8h,#5           //(sum + 16) >> 5 (row 0)
    198 
    199     tbl         v14.8b, {v0.16b},v1.8b      //sample A (row 2)
    200     umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    201     umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
    202 
    203     tbl         v15.8b, {v0.16b},v19.8b     //sample B (row 2)
    204     sub         v4.8b,  v4.8b ,  v3.8b      //A index (row 3)
    205     sub         v5.8b,  v5.8b ,  v3.8b      //B index (row 3)
    206 
    207     st1         {v24.8b},[x2], x3           //st (row 0), dst += dst_strd
    208     rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
    209 
    210     tbl         v23.8b, {v0.16b},v4.8b      //sample A (row 3)
    211     umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    212     umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
    213 
    214     tbl         v25.8b, {v0.16b},v5.8b      //sample B (row 3)
    215     sub         v1.8b,  v1.8b ,  v3.8b      //A index (row 4)
    216     sub         v19.8b,  v19.8b ,  v3.8b    //B index (row 4)
    217 
    218     st1         {v22.8b},[x2], x3           //st (row 1)
    219     rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
    220 
    221     tbl         v12.8b, {v0.16b},v1.8b      //sample A (row 4)
    222     umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    223     umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)
    224 
    225     tbl         v13.8b, {v0.16b},v19.8b     //sample B (row 4)
    226     sub         v4.8b,  v4.8b ,  v3.8b      //A index (row 5)
    227     sub         v5.8b,  v5.8b ,  v3.8b      //B index (row 5)
    228 
    229     st1         {v20.8b},[x2], x3           //st (row 2)
    230     rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
    231 
    232     tbl         v16.8b, {v0.16b},v4.8b      //sample A (row 5)
    233     umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    234     umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
    235 
    236     tbl         v17.8b, {v0.16b},v5.8b      //sample B (row 5)
    237     sub         v1.8b,  v1.8b ,  v3.8b      //A index (row 6)
    238     sub         v19.8b,  v19.8b ,  v3.8b    //B index (row 6)
    239 
    240     st1         {v18.8b},[x2], x3           //st (row 3)
    241     rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
    242 
    243     tbl         v14.8b, {v0.16b},v1.8b      //sample A (row 6)
    244     umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    245     umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
    246 
    247     tbl         v15.8b, {v0.16b},v19.8b     //sample B (row 6)
    248     sub         v4.8b,  v4.8b ,  v3.8b      //A index (row 7)
    249     sub         v5.8b,  v5.8b ,  v3.8b      //B index (row 7)
    250 
    251     st1         {v24.8b},[x2], x3           //st (row 4)
    252     rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
    253 
    254     tbl         v23.8b, {v0.16b},v4.8b      //sample A (row 7)
    255     umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    256     umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
    257 
    258     tbl         v25.8b, {v0.16b},v5.8b      //sample B (row 7)
    259     umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    260     umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)
    261 
    262     st1         {v22.8b},[x2], x3           //st (row 5)
    263     rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
    264     rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
    265 
    266     st1         {v20.8b},[x2], x3           //st (row 6)
    267 
    268     subs        x10, x10, #8                //one tile done; counter reaches 0 here only when the block is a single tile
    269 
    270     st1         {v18.8b},[x2], x3           //st (row 7)
    271 
    272     beq         end_func
    273 
            // More tiles remain. Pick the position and tables for the next tile; the
            // gt/le selections and the bgt below all use the flags of this subs.
    274     subs        x11, x11, #8                //8 columns done; gt: another tile lies to the right in this band of 8 rows
    275     add         x20, x8, #4
    276     csel        x8, x20, x8,gt              //gt: step to the next 32-bit least idx entry
    277     add         x20, x2, x7
    278     csel        x2, x20, x2,gt              //gt: dst -> top of the tile to the right
    279     csel        x8, x12, x8,le              //le: back to the first least idx entry
    280     sub         x20, x2, x4
    281     csel        x2, x20, x2,le
    282     add         x20, x2, #8
    283     csel        x2, x20, x2,le              //le: dst = dst - nt + 8, the first tile of the next band
    284     csel        x11, x4, x11,le             //le: column counter restarts at nt
    285     bgt         lbl284
    286     adrp        x14,  :got:col_for_intra_luma   //le: restart at the beginning of col_for_intra_luma
    287     ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
    288 lbl284:
    289     add         x20, x0, #8
    290     csel        x0, x20, x0,le              //le: row term += 8
    291 
    292     mov         x5,x2                       //x5 = x2: the first kernel pass stores through x5 before it stores through x2
    293     ld1         {v31.8b},[x14],#8           //col_for_intra_luma bytes for the next tile
    294     smull       v12.8h, v30.8b, v31.8b      //pos for the next tile's 8 columns
    295     xtn         v23.8b,  v12.8h             //low byte of pos; the kernel masks it to fract
    296     sshr        v12.8h, v12.8h,#5
    297     sqxtn       v25.8b,  v12.8h             //idx for the next tile
    298     ldr         w9,  [x8]                   //least idx for the next tile
    299     sxtw        x9,w9
    300     add         x9, x0, x9
    301     sub         x9, x9, #1                  //x9 = row term + least idx - 1
    302     dup         v26.8b,w9
    303     movi        v16.8b, #8                  //constant 8, added to the index in the kernel
    304 
    305     sub         x4,x4,#8                    //x4 = nt - 8 from here on
    306 
    307 kernel_8_16_32:
            // One pass per remaining 8x8 tile, software pipelined. Each pass
            //   1. finishes rows 4-7 left in flight by the previous pass and stores
            //      them through x5,
            //   2. computes and stores rows 0-3 of the current tile through x2 and
            //      leaves its rows 4-7 in flight (row 4 narrowed, row 5 summed,
            //      row 6 samples fetched, row 7 indices ready),
            //   3. prepares pos / idx / least idx for the tile after this one.
            // On the first pass x5 equals x2 (set before the loop), so the four
            // x5 stores land on rows 0-3 of the current tile and are overwritten by
            // this pass's own row 0-3 stores.
            // Per output byte: dst = (A * (32 - fract) + B * fract + 16) >> 5 with A, B
            // fetched by tbl from v0; B's table index is A's index - 1 and both drop by
            // 1 per row (even rows v1/v19, odd rows v4/v5).
            // The gt/le selections and bgt use the flags of 'subs x11' until
            // 'subs x10' at the bottom; no instruction in between writes the flags.
    308 
    309     sub         v1.8b,  v26.8b ,  v25.8b    //current tile: v26 - v25, both prepared ahead of this pass
    310     mov         v26.8b, v23.8b              //v26 = low bytes of pos for the current tile (masked to fract below)
    311 
    312     subs        x11, x11, #8                //gt: another tile follows to the right in this band; le: this is the band's last tile
    313     sub         x6, x1, x9                  //reference start for the current tile
    314     tbl         v23.8b, {v0.16b},v4.8b      //in-flight tile: sample A (row 7), still from the old v0
    315     add         v1.8b,  v1.8b ,  v16.8b     //+ 8: A index (row 0) of the current tile
    316 
    317     umull       v20.8h, v14.8b, v7.8b       //in-flight tile: mul (row 6)
    318     tbl         v25.8b, {v0.16b},v5.8b      //in-flight tile: sample B (row 7)
    319     umlal       v20.8h, v15.8b, v6.8b       //in-flight tile: mul (row 6)
    320 
    321     sub         v19.8b,  v1.8b ,  v2.8b     //B index = A index - 1 (row 0)
    322     add         x20, x0, #8
    323     csel        x0, x20, x0,le              //le: row term += 8 (takes effect for the next tile)
    324     add         x20, x8, #4
    325     csel        x8, x20, x8,gt              //gt: next 32-bit least idx entry
    326     ld1         {v0.16b}, [x6]              //v0 = 16 reference bytes for the current tile
    327 
    328     st1         {v24.8b},[x5], x3           //in-flight tile: st (row 4)
    329     rshrn       v22.8b, v22.8h,#5           //in-flight tile: round shft (row 5)
    330 
    331     bgt         lbl323
    332     adrp        x14,  :got:col_for_intra_luma   //le: restart at the beginning of col_for_intra_luma
    333     ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
    334 lbl323:
    335     csel        x8, x12, x8,le              //le: back to the first least idx entry
    336     dup         v27.8b,w0                   //row value inc or reset accordingly
    337 
    338     sub         v4.8b,  v1.8b ,  v2.8b      //A index (row 1)
    339     tbl         v12.8b, {v0.16b},v1.8b      //sample A (row 0)
    340     sub         v5.8b,  v19.8b ,  v2.8b     //B index (row 1)
    341 
    342 
    343     umull       v18.8h, v23.8b, v7.8b       //in-flight tile: mul (row 7), still with its own v7/v6
    344     tbl         v13.8b, {v0.16b},v19.8b     //sample B (row 0)
    345     umlal       v18.8h, v25.8b, v6.8b       //in-flight tile: mul (row 7)
    346 
    347     ld1         {v31.8b},[x14],#8           //col_for_intra_luma bytes for the next tile
    348     and         v6.8b,  v29.8b ,  v26.8b    //fract = pos & 31 for the current tile
    349 
    350     st1         {v22.8b},[x5], x3           //(from previous loop)st (row 5)
    351     rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
    352 
    353     sub         v1.8b,  v1.8b ,  v3.8b      //A index (row 2)
    354     tbl         v23.8b, {v0.16b},v4.8b      //sample A (row 1)
    355     sub         v19.8b,  v19.8b ,  v3.8b    //B index (row 2)
    356 
    357     add         x20, x4, #8                 //x4 holds nt - 8 here, so x20 = nt
    358     csel        x11, x20, x11,le            //le: column counter restarts at nt
    359     ldr         w9,  [x8]                   //least idx for the next tile
    360     sxtw        x9,w9
    361     sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
    362 
    363     umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    364     tbl         v17.8b, {v0.16b},v5.8b      //sample B (row 1)
    365     umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
    366 
    367     st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    368     rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
    369 
    370     sub         v4.8b,  v4.8b ,  v3.8b      //A index (row 3)
    371     tbl         v14.8b, {v0.16b},v1.8b      //sample A (row 2)
    372     sub         v5.8b,  v5.8b ,  v3.8b      //B index (row 3)
    373 
    374     umull       v22.8h, v23.8b, v7.8b       //mul (row 1)
    375     tbl         v15.8b, {v0.16b},v19.8b     //sample B (row 2)
    376     umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
    377 
    378     rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
    379     st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)
    380 
    381     sub         v1.8b,  v1.8b ,  v3.8b      //A index (row 4)
    382     tbl         v23.8b, {v0.16b},v4.8b      //sample A (row 3)
    383     sub         v19.8b,  v19.8b ,  v3.8b    //B index (row 4)
    384 
    385     umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    386     tbl         v25.8b, {v0.16b},v5.8b      //sample B (row 3)
    387     umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
    388 
    389     smull       v14.8h, v30.8b, v31.8b      //pos for the next tile's 8 columns (v14 is a temporary here)
    390     add         x5,x2,x3,lsl#2              //x5 = row 4 of the current tile; rows 4-7 are stored through it later
    391     add         x9, x0, x9                  //row term + least idx (next tile)
    392 
    393     st1         {v24.8b},[x2], x3           //st (row 0)
    394     rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
    395 
    396     sub         v4.8b,  v4.8b ,  v3.8b      //A index (row 5)
    397     tbl         v12.8b, {v0.16b},v1.8b      //sample A (row 4)
    398     sub         v5.8b,  v5.8b ,  v3.8b      //B index (row 5)
    399 
    400     umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    401     tbl         v13.8b, {v0.16b},v19.8b     //sample B (row 4)
    402     umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)
    403 
    404     st1         {v22.8b},[x2], x3           //st (row 1)
    405     rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
    406 
    407     xtn         v23.8b,  v14.8h             //low bytes of the next tile's pos, carried in v23 to the next pass
    408     sshr        v14.8h, v14.8h,#5           //next tile: pos >> 5
    409 
    410     sub         v1.8b,  v1.8b ,  v3.8b      //A index (row 6)
    411     tbl         v21.8b, {v0.16b},v4.8b      //sample A (row 5); v21 because v23 now carries the next tile's pos bytes
    412     sub         v19.8b,  v19.8b ,  v3.8b    //B index (row 6)
    413 
    414     umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    415     tbl         v17.8b, {v0.16b},v5.8b      //sample B (row 5)
    416     umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
    417 
    418     st1         {v20.8b},[x2], x3           //st (row 2)
    419     rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
    420 
    421     sub         x9, x9, #1                  //x9 = row term + least idx - 1 (next tile)
    422     sqxtn       v25.8b,  v14.8h             //next tile: idx
    423 
    424     sub         v4.8b,  v4.8b ,  v3.8b      //A index (row 7), used at the top of the next pass or in the epilogue
    425     tbl         v14.8b, {v0.16b},v1.8b      //sample A (row 6)
    426     sub         v5.8b,  v5.8b ,  v3.8b      //B index (row 7)
    427 
    428     umull       v22.8h, v21.8b, v7.8b       //mul (row 5)
    429     tbl         v15.8b, {v0.16b},v19.8b     //sample B (row 6)
    430     umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
    431 
    432     add         v25.8b,  v27.8b ,  v25.8b   //next tile: idx + row term
    433     dup         v26.8b,w9                   //next tile: row term + least idx - 1 in every lane
    434 
    435     st1         {v18.8b},[x2], x3           //st (row 3)
    436     rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
    437 
    438     add         x2, x2, x3, lsl #2          //skip rows 4-7 (stored through x5): x2 = just below the current tile
    439     sub         v25.8b,  v25.8b ,  v2.8b    //next tile: idx + row term - 1
    440     add         x20, x7, x2
    441     csel        x2, x20, x2,gt              //gt: dst -> top of the tile to the right (x7 = 8 - 8 * dst_strd)
    442 
    443     sub         x20, x2, x4
    444     csel        x2, x20, x2,le              //le: dst -= nt - 8, the first tile of the next band
    445 
    446     subs        x10, x10, #8                //one more tile accounted for
    447 
    448     bne         kernel_8_16_32
    449 
    450 epil_8_16_32:
            // Drains the pipeline after the last kernel pass: rows 4-7 of the final
            // tile are completed from the state the kernel left behind (row 4 result
            // in v24, row 5 sum in v22, row 6 samples in v14/v15, row 7 indices in
            // v4/v5) and stored through x5, which points at row 4 of that tile.
    451     tbl         v23.8b, {v0.16b},v4.8b      //sample A (row 7)
    452 
    453     umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    454     tbl         v25.8b, {v0.16b},v5.8b      //sample B (row 7)
    455     umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
    456 
    457     st1         {v24.8b},[x5], x3           //st (row 4)
    458     rshrn       v24.8b, v22.8h,#5           //round shft (row 5), result placed in v24 now that row 4 is stored
    459 
    460     umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    461     umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)
    462 
    463     st1         {v24.8b},[x5], x3           //st (row 5)
    464     rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
    465 
    466     st1         {v20.8b},[x5], x3           //st (row 6)
    467     rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
    468 
    469     st1         {v18.8b},[x5], x3           //st (row 7)
    470 
    471     b           end_func
    472 
    473 sz_4_proc:
            // nt == 4 path: one straight-line pass producing four rows. The arithmetic
            // is done on 8 byte lanes, but each row stores only its low 4 bytes.
            // Per output byte: dst = (A * (32 - fract) + B * fract + 16) >> 5, where
            // A and B are reference bytes picked with tbl from v0; B's table index is
            // A's index - 1 and both drop by 1 per row (rows 0/2 use v1/v19,
            // rows 1/3 use v4/v5).
    474     ld1         {v31.8b},[x14]              //8 bytes of col_for_intra_luma (no pointer update)
    475     movi        v2.8b, #1                   //constant 1: index step from sample A to sample B and from row to row
    476 
    477     movi        v3.8b, #2                   //constant 2: index step of two rows
    478     adrp        x12, :got:idx_neg_idx_3_9   //load least idx table
    479     ldr         x12, [x12, #:got_lo12:idx_neg_idx_3_9]
    480 
    481     smull       v22.8h, v30.8b, v31.8b      //pos = intra_pred_ang * table byte (original note: table byte is col + 1)
    482     sub         x7, x5, #3                  //mode - 3
    483 
    484     add         x12, x12, x7, lsl #4        //x12 = idx_neg_idx_3_9 + (mode - 3) * 16 bytes
    485     mov         x8, x12
    486 
    487     ldr         w9,  [x8]                   //first 32-bit entry of that row (the "least idx")
    488     sxtw        x9,w9
    489 
    490     dup         v26.8b,w9                   //least idx added to final idx values
    491     add         x6, x0, x4, lsl #1          //pu1_ref + 2nt
    492 
    493     xtn         v6.8b,  v22.8h              //low byte of each pos (masked to fract below)
    494     sub         x6, x6, #9                  //pu1_ref + 2nt - 9
    495     sub         x6, x6, x9                  //... - least idx: start of the reference bytes used by tbl
    496 
            // NOTE(review): this reads 16 bytes although only a 4x4 block is produced;
            // confirm the reference buffer layout makes the whole read safe.
    497     ld1         {v0.16b}, [x6]              //v0 = 16 reference bytes, the lookup table for tbl
    498 
    499     movi        v29.8b, #31                 //constant 31, mask for fract
    500 
    501     movi        v28.8b, #32                 //constant 32
    502 
    503     sshr        v22.8h, v22.8h,#5           //pos >> 5
    504     sqxtn       v1.8b,  v22.8h              //idx, narrowed to bytes with saturation
    505 
    506     and         v6.8b,  v6.8b ,  v29.8b     //fract = pos & 31
    507     sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
    508 
    509     movi        v27.8b, #7                  //constant 7
    510     sub         v1.8b,  v1.8b ,  v2.8b      //idx - 1
    511     sub         v1.8b,  v26.8b ,  v1.8b     //least idx - (idx - 1)
    512     add         v1.8b,  v1.8b ,  v27.8b     //+ 7: table index of sample A (row 0)
    513     sub         v19.8b,  v1.8b ,  v2.8b     //table index of sample B = A index - 1 (row 0)
    514 
    515     sub         v4.8b,  v1.8b ,  v2.8b      //A index (row 1)
    516     sub         v5.8b,  v19.8b ,  v2.8b     //B index (row 1)
    517 
    518     tbl         v12.8b, {v0.16b},v1.8b      //sample A (row 0)
    519     tbl         v13.8b, {v0.16b},v19.8b     //sample B (row 0)
    520 
    521 
    522     umull       v24.8h, v12.8b, v7.8b       //A * (32 - fract) (row 0)
    523     tbl         v16.8b, {v0.16b},v4.8b      //sample A (row 1)
    524     umlal       v24.8h, v13.8b, v6.8b       //+ B * fract (row 0)
    525 
    526     sub         v1.8b,  v1.8b ,  v3.8b      //A index (row 2)
    527     tbl         v17.8b, {v0.16b},v5.8b      //sample B (row 1)
    528     sub         v19.8b,  v19.8b ,  v3.8b    //B index (row 2)
    529 
    530     umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    531     tbl         v12.8b, {v0.16b},v1.8b      //sample A (row 2)
    532     umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
    533 
    534     rshrn       v24.8b, v24.8h,#5           //(sum + 16) >> 5 (row 0)
    535 
    536     sub         v4.8b,  v4.8b ,  v3.8b      //A index (row 3)
    537     tbl         v13.8b, {v0.16b},v19.8b     //sample B (row 2)
    538     sub         v5.8b,  v5.8b ,  v3.8b      //B index (row 3)
    539 
    540     umull       v20.8h, v12.8b, v7.8b       //mul (row 2)
    541     tbl         v16.8b, {v0.16b},v4.8b      //sample A (row 3)
    542     umlal       v20.8h, v13.8b, v6.8b       //mul (row 2)
    543 
    544     st1         {v24.s}[0],[x2], x3         //st row 0: low 4 bytes only, dst += dst_strd
    545     rshrn       v22.8b, v22.8h,#5           //round shift (row 1)
    546 
    547     tbl         v17.8b, {v0.16b},v5.8b      //sample B (row 3)
    548 
    549     umull       v18.8h, v16.8b, v7.8b       //mul (row 3)
    550     umlal       v18.8h, v17.8b, v6.8b       //mul (row 3)
    551 
    552     st1         {v22.s}[0],[x2], x3         //st row 1
    553     rshrn       v20.8b, v20.8h,#5           //round shift (row 2)
    554 
    555     st1         {v20.s}[0],[x2], x3         //st row 2
    556 
    557     rshrn       v18.8b, v18.8h,#5           //round shift (row 3)
    558 
    559     st1         {v18.s}[0],[x2], x3         //st (row 3); execution falls through into end_func
    560 
    561 end_func:
            // Common exit for every path: pop the three register pairs pushed at
            // entry, in reverse order of the pushes, then return to the caller.
    562     // ldmfd sp!,{x4-x12,x15}          //legacy arm32 epilogue, kept for reference only
    563     ldp         x19, x20,[sp],#16           //pushed last, popped first
    564     ldp         d14,d15,[sp],#16
    565     ldp         d12,d13,[sp],#16
    566     ret
    567 
    568 
    569 
    570 
    571