Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_sao_band_offset_chroma.s
     22 //*
     23 //* @brief
     24 //*  Contains the function definition for SAO band offset filtering of
     25 //*  interleaved chroma (UV) samples. The function is written in AArch64
     26 //*  (ARMv8) NEON assembly.
     27 //*
     28 //* @author
     29 //*  Parthiban V
     30 //*
     31 //* @par List of Functions:
     32 //*  - ihevc_sao_band_offset_chroma_av8()
     33 //*
     34 //* @remarks
     35 //*  None
     36 //*
     37 //*******************************************************************************
     38 //*/
     39 //void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
     40 //                           WORD32 src_strd,
     41 //                           UWORD8 *pu1_src_left,
     42 //                           UWORD8 *pu1_src_top,
     43 //                           UWORD8 *pu1_src_top_left,
     44 //                           WORD32 sao_band_pos_u,
     45 //                           WORD32 sao_band_pos_v,
     46 //                           WORD8 *pi1_sao_offset_u,
     47 //                           WORD8 *pi1_sao_offset_v,
     48 //                           WORD32 wd,
     49 //                           WORD32 ht)
     50 //
     51 //**************Variables Vs Registers*****************************************
     52 //x0 =>    *pu1_src
     53 //x1 =>    src_strd
     54 //x2 =>    *pu1_src_left
     55 //x3 =>    *pu1_src_top
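     56 //x4 =>    *pu1_src_top_left
     57 //x5 =>    sao_band_pos_u
     58 //x6 =>    sao_band_pos_v
     59 //x7 =>    *pi1_sao_offset_u
     60 //x8 =>    *pi1_sao_offset_v  (stack argument, loaded from [sp, #0] on entry)
     61 //x9 =>    wd                 (stack argument, loaded from [sp, #8] on entry)
     62 //x10 =>   ht                 (stack argument, loaded from [sp, #16] on entry)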
     63 
     64 .text
     65 .p2align 2
     66 .include "ihevc_neon_macros.s"
     67 
     68 .globl gu1_table_band_idx
     69 .globl ihevc_sao_band_offset_chroma_av8
     70 
     71 ihevc_sao_band_offset_chroma_av8:
     72     mov         x8,#0
     73     mov         x9,#0
     74     mov         x10,#0
     75 
     76     ldr         x8,[sp,#0]
     77     ldr         w9,[sp,#8]
     78     ldr         w10,[sp,#16]
     79 
     80     push_v_regs
     81     // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
     82     stp         x19, x20,[sp,#-16]!
     83     stp         x21, x22,[sp,#-16]!
     84     stp         x23, x24,[sp,#-16]!
     85 
     86     mov         x15,x4 // pu1_src_top_left 40
     87     mov         x16,x5 // sao_band_pos_u 44
     88     mov         x17,x6 // sao_band_pos_v 48
     89     mov         x19,x7 // pi1_sao_offset_u 52
     90     mov         x20,x8 // pi1_sao_offset_v 56
     91     mov         x21,x9 // wd 60
     92     mov         x22,x10 // ht 64
     93 
     94     MOV         x4, x15                     //Loads pu1_src_top_left
     95     MOV         x10, x22                    //Loads ht
     96 
     97     MOV         x9, x21                     //Loads wd
     98     MOV         x11,x10                     //Move the ht to x9 for loop counter
     99 
    100     ADD         x12,x0,x9                   //pu1_src[row * src_strd + (wd)]
    101     ADRP        x14, :got:gu1_table_band_idx
    102     LDR         x14, [x14, #:got_lo12:gu1_table_band_idx]
    103 
    104     SUB         x12,x12,#2                  //wd-2
    105 
    106 SRC_LEFT_LOOP:
    107     LDRH        w5,[x12]                    //Load the value
    108     ADD         x12,x12,x1
    109     SUBS        x11,x11,#1                  //Decrement the loop counter
    110     STRH        w5,[x2],#2                  //Store the value in pu1_src_left pointer
    111     BNE         SRC_LEFT_LOOP
    112 
    113     MOV         x5, x16                     //Loads sao_band_pos_u
    114     LD1         {v1.8b},[x14],#8            //band_table_u.val[0]
    115     ADD         x12,x3,x9                   //pu1_src_top[wd]
    116 
    117     sub         x23,x12,#2
    118     LDRH        w11,[x23]
    119     LD1         {v2.8b},[x14],#8            //band_table_u.val[1]
    120     LSL         x6,x5,#3                    //sao_band_pos_u
    121 
    122     STRH        w11,[x4]                    //store to pu1_src_top_left[0]
    123     LD1         {v3.8b},[x14],#8            //band_table_u.val[2]
    124     MOV         x7, x19                     //Loads pi1_sao_offset_u
    125 
    126     SUB         x4,x10,#1                   //ht-1
    127     dup         v31.8b,w6                   //band_pos_u
    128     mul         x4, x4, x1                  //ht-1 * src_strd
    129 
    130     ADD         x4,x4,x0                    //pu1_src[(ht - 1) * src_strd]
    131     LD1         {v4.8b},[x14],#8            //band_table_u.val[3]
    132     MOV         x11,x9                      //Move the wd to x9 for loop counter
    133 
    134 SRC_TOP_LOOP:                               //wd is always multiple of 8
    135     LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
    136     SUBS        x11,x11,#8                  //Decrement the loop counter by 8
    137     ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
    138     BNE         SRC_TOP_LOOP
    139 
    140     LD1         {v30.8b},[x7]               //pi1_sao_offset_u load
    141     ADD         v5.8b,  v1.8b ,  v31.8b     //band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)
    142 
    143     dup         v29.8b, v30.b[1]            //vdup_n_u8(pi1_sao_offset_u[1])
    144     ADD         v6.8b,  v2.8b ,  v31.8b     //band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)
    145 
    146     dup         v28.8b, v30.b[2]            //vdup_n_u8(pi1_sao_offset_u[2])
    147     ADD         v7.8b,  v3.8b ,  v31.8b     //band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)
    148 
    149     dup         v27.8b, v30.b[3]            //vdup_n_u8(pi1_sao_offset_u[3])
    150     ADD         v8.8b,  v4.8b ,  v31.8b     //band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)
    151 
    152     CMP         x5,#28
    153     dup         v26.8b, v30.b[4]            //vdup_n_u8(pi1_sao_offset_u[4])
    154     ADRP        x14, :got:gu1_table_band_idx
    155     LDR         x14, [x14, #:got_lo12:gu1_table_band_idx]
    156 
    157     movi        v30.8b, #16                 //vdup_n_u8(16)
    158     ADD         v1.8b,  v5.8b ,  v29.8b     //band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1]))
    159 
    160     LD1         {v9.8b},[x14],#8            //band_table_v.val[0]
    161     ADD         v2.8b,  v6.8b ,  v28.8b     //band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2]))
    162 
    163     LD1         {v10.8b},[x14],#8           //band_table_v.val[1]
    164     ADD         v3.8b,  v7.8b ,  v27.8b     //band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))
    165 
    166     MOV         x6, x17                     //Loads sao_band_pos_v
    167     ADD         v4.8b,  v8.8b ,  v26.8b     //band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
    168     LSL         x11,x6,#3                   //sao_band_pos_v
    169 
    170     BLT         SAO_BAND_POS_U_0
    171 
    172 SAO_BAND_POS_U_28:                          //case 28
    173     cmhs        v13.8b,  v30.8b ,  v4.8b    //vcle_u8(band_table.val[3], vdup_n_u8(16))
    174     BNE         SAO_BAND_POS_U_29
    175 
    176     ORR         v4.8b,  v4.8b ,  v13.8b     //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    177     B           SWITCH_BREAK_U
    178 
    179 SAO_BAND_POS_U_29:                          //case 29
    180     CMP         x5,#29
    181 
    182     cmhs        v14.8b,  v30.8b ,  v3.8b    //vcle_u8(band_table.val[2], vdup_n_u8(16))
    183     BNE         SAO_BAND_POS_U_30
    184     ORR         v3.8b,  v3.8b ,  v14.8b     //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
    185 
    186     AND         v4.8b,  v4.8b ,  v13.8b     //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    187     B           SWITCH_BREAK_U
    188 
    189 SAO_BAND_POS_U_30:                          //case 30
    190     CMP         x5,#30
    191 
    192     cmhs        v15.8b,  v30.8b ,  v2.8b    //vcle_u8(band_table.val[1], vdup_n_u8(16))
    193     BNE         SAO_BAND_POS_U_31
    194     ORR         v2.8b,  v2.8b ,  v15.8b     //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
    195 
    196     AND         v3.8b,  v3.8b ,  v14.8b     //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    197 
    198 SAO_BAND_POS_U_31:                          //case 31
    199     CMP         x5,#31
    200     BNE         SWITCH_BREAK_U
    201 
    202     cmhs        v16.8b,  v30.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    203     ORR         v1.8b,  v1.8b ,  v16.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
    204 
    205     AND         v2.8b,  v2.8b ,  v15.8b     //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    206     B           SWITCH_BREAK_U
    207 
    208 SAO_BAND_POS_U_0:
    209     CMP         x5,#0                       //case 0
    210     BNE         SWITCH_BREAK_U
    211 
    212     cmhs        v16.8b,  v30.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    213     AND         v1.8b,  v1.8b ,  v16.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
    214 
    215 SWITCH_BREAK_U:
    216     dup         v30.8b,w11                  //band_pos_v
    217     MOV         x8, x20                     //Loads pi1_sao_offset_v
    218 
    219     LD1         {v11.8b},[x14],#8           //band_table_v.val[2]
    220     ADD         v13.8b,  v9.8b ,  v30.8b    //band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)
    221 
    222     LD1         {v12.8b},[x14],#8           //band_table_v.val[3]
    223     ADD         v14.8b,  v10.8b ,  v30.8b   //band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v)
    224 
    225     LD1         {v25.8b},[x8]               //pi1_sao_offset_v load
    226     ADD         v15.8b,  v11.8b ,  v30.8b   //band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)
    227 
    228     dup         v29.8b, v25.b[1]            //vdup_n_u8(pi1_sao_offset_v[1])
    229     ADD         v16.8b,  v12.8b ,  v30.8b   //band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)
    230 
    231     dup         v28.8b, v25.b[2]            //vdup_n_u8(pi1_sao_offset_v[2])
    232     ADD         v9.8b,  v13.8b ,  v29.8b    //band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))
    233 
    234     dup         v27.8b, v25.b[3]            //vdup_n_u8(pi1_sao_offset_v[3])
    235     ADD         v10.8b,  v14.8b ,  v28.8b   //band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))
    236 
    237     dup         v26.8b, v25.b[4]            //vdup_n_u8(pi1_sao_offset_v[4])
    238     ADD         v11.8b,  v15.8b ,  v27.8b   //band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))
    239 
    240     movi        v29.8b, #16                 //vdup_n_u8(16)
    241     ADD         v12.8b,  v16.8b ,  v26.8b   //band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4]))
    242     AND         x12,x9,#0xf
    243 
    244     CMP         x6,#28
    245     BLT         SAO_BAND_POS_V_0
    246 
    247 SAO_BAND_POS_V_28:                          //case 28
    248     cmhs        v17.8b,  v29.8b ,  v12.8b   //vcle_u8(band_table.val[3], vdup_n_u8(16))
    249     BNE         SAO_BAND_POS_V_29
    250     ORR         v12.8b,  v12.8b ,  v17.8b   //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    251     B           SWITCH_BREAK_V
    252 
    253 SAO_BAND_POS_V_29:                          //case 29
    254     CMP         x6,#29
    255 
    256     cmhs        v18.8b,  v29.8b ,  v11.8b   //vcle_u8(band_table.val[2], vdup_n_u8(16))
    257     BNE         SAO_BAND_POS_V_30
    258     ORR         v11.8b,  v11.8b ,  v18.8b   //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
    259 
    260     AND         v12.8b,  v12.8b ,  v17.8b   //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    261     B           SWITCH_BREAK_V
    262 
    263 SAO_BAND_POS_V_30:                          //case 30
    264     CMP         x6,#30
    265 
    266     cmhs        v19.8b,  v29.8b ,  v10.8b   //vcle_u8(band_table.val[1], vdup_n_u8(16))
    267     BNE         SAO_BAND_POS_V_31
    268     ORR         v10.8b,  v10.8b ,  v19.8b   //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
    269 
    270     AND         v11.8b,  v11.8b ,  v18.8b   //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    271     B           SWITCH_BREAK_V
    272 
    273 SAO_BAND_POS_V_31:                          //case 31
    274     CMP         x6,#31
    275     BNE         SWITCH_BREAK_V
    276 
    277     cmhs        v20.8b,  v29.8b ,  v9.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    278     ORR         v9.8b,  v9.8b ,  v20.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
    279 
    280     AND         v10.8b,  v10.8b ,  v19.8b   //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    281     B           SWITCH_BREAK_V
    282 
    283 SAO_BAND_POS_V_0:
    284     CMP         x6,#0                       //case 0
    285     BNE         SWITCH_BREAK_V
    286 
    287     cmhs        v20.8b,  v29.8b ,  v9.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    288     AND         v9.8b,  v9.8b ,  v20.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
    289 
    290 SWITCH_BREAK_V:
    291     CMP         x9,#16
    292     MOV         x4,x0                       //pu1_src_cpy
    293     mov         v1.d[1],v2.d[0]
    294     mov         v2.d[0],v3.d[0]
    295     mov         v2.d[1],v4.d[0]
    296     mov         v9.d[1],v10.d[0]
    297     mov         v10.d[0],v11.d[0]
    298     mov         v10.d[1],v12.d[0]
    299     BLT         WIDTH_RESIDUE
    300 
    301 WIDTH_LOOP:                                 //Width is assigned to be multiple of 16
    302     MOV         x4,x0                       //pu1_src_cpy
    303     MOV         x11,x10                     //move ht
    304     ADD         x5,x4,x1
    305 
    306 HEIGHT_LOOP:                                //unrolled for 4 rows
    307 
    308     ADD         x6,x5,x1
    309     LD2         {v5.8b, v6.8b},[x4]         //vld1q_u8(pu1_src_cpy)
    310     ADD         x7,x6,x1
    311 
    312     LD2         {v13.8b, v14.8b},[x5]       //vld1q_u8(pu1_src_cpy)
    313     SUB         v7.8b,  v5.8b ,  v31.8b     //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    314 
    315     LD2         {v17.8b, v18.8b},[x6]       //vld1q_u8(pu1_src_cpy)
    316     SUB         v8.8b,  v6.8b ,  v30.8b     //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    317 
    318     LD2         {v21.8b, v22.8b},[x7]       //vld1q_u8(pu1_src_cpy)
    319     SUB         v15.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    320 
    321     TBX         v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    322     SUB         v16.8b,  v14.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    323 
    324     TBX         v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    325     SUB         v19.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    326 
    327     TBX         v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    328     SUB         v20.8b,  v18.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    329 
    330     TBX         v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    331     SUB         v23.8b,  v21.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    332 
    333     ST2         {v5.8b, v6.8b},[x4]         //vst1q_u8(pu1_src_cpy, au1_cur_row)
    334     SUB         v24.8b,  v22.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    335 
    336     SUBS        x11,x11,#4                  //Decrement the ht loop count by 4
    337     TBX         v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    338 
    339     ST2         {v13.8b, v14.8b},[x5]       //vst1q_u8(pu1_src_cpy, au1_cur_row)
    340 
    341     TBX         v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    342     TBX         v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    343     TBX         v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    344 
    345     ST2         {v17.8b, v18.8b},[x6],x1    //vst1q_u8(pu1_src_cpy, au1_cur_row)
    346 
    347     ADD         x4,x6,x1
    348     ST2         {v21.8b, v22.8b},[x7]       //vst1q_u8(pu1_src_cpy, au1_cur_row)
    349     ADD         x5,x4,x1
    350 
    351     BNE         HEIGHT_LOOP
    352 
    353     SUB         x9,x9,#16                   //Decrement the width loop by 16
    354     ADD         x0,x0,#16
    355     CMP         x9,#8
    356     BGT         WIDTH_LOOP
    357     BLT         END_LOOP
    358     MOV         x4,x0                       //pu1_src_cpy
    359 
    360 WIDTH_RESIDUE:                              //If width is not multiple of 16
    361 
    362     ADD         x5,x4,x1
    363     LD2         {v5.8b, v6.8b},[x4]         //vld1q_u8(pu1_src_cpy)
    364     ADD         x6,x5,x1
    365 
    366     ADD         x7,x6,x1
    367     LD2         {v13.8b, v14.8b},[x5]       //vld1q_u8(pu1_src_cpy)
    368     SUB         v7.8b,  v5.8b ,  v31.8b     //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    369 
    370     LD2         {v17.8b, v18.8b},[x6]       //vld1q_u8(pu1_src_cpy)
    371     SUB         v8.8b,  v6.8b ,  v30.8b     //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    372 
    373     TBX         v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    374     SUB         v15.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    375 
    376     TBX         v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    377     SUB         v16.8b,  v14.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    378 
    379     LD2         {v21.8b, v22.8b},[x7]       //vld1q_u8(pu1_src_cpy)
    380     SUB         v19.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    381 
    382     TBX         v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    383     SUB         v20.8b,  v18.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    384 
    385     TBX         v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    386     ZIP1        v28.8b, v5.8b, v6.8b
    387     ZIP2        v6.8b, v5.8b, v6.8b
    388     mov         v5.8b, v28.8b
    389 
    390     TBX         v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    391     SUB         v23.8b,  v21.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    392 
    393     ST1         {v5.8b},[x4]                //vst1q_u8(pu1_src_cpy, au1_cur_row)
    394     ZIP1        v28.8b, v13.8b, v14.8b
    395     ZIP2        v14.8b, v13.8b, v14.8b
    396     mov         v13.8b, v28.8b
    397 
    398     TBX         v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    399     SUB         v24.8b,  v22.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    400 
    401     ST1         {v13.8b},[x5]               //vst1q_u8(pu1_src_cpy, au1_cur_row)
    402     SUBS        x10,x10,#4                  //Decrement the ht loop count by 4
    403 
    404     TBX         v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    405     ZIP1        v28.8b, v17.8b, v18.8b
    406     ZIP2        v18.8b, v17.8b, v18.8b
    407     mov         v17.8b, v28.8b
    408 
    409     TBX         v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    410     ST1         {v17.8b},[x6],x1            //vst1q_u8(pu1_src_cpy, au1_cur_row)
    411     ZIP1        v28.8b, v21.8b, v22.8b
    412     ZIP2        v22.8b, v21.8b, v22.8b
    413     mov         v21.8b, v28.8b
    414 
    415     ADD         x4,x6,x1
    416     ST1         {v21.8b},[x7]               //vst1q_u8(pu1_src_cpy, au1_cur_row)
    417     ADD         x5,x4,x1
    418 
    419     BNE         WIDTH_RESIDUE
    420 
    421 END_LOOP:
    422     // LDMFD sp!,{x4-x12,x15}            //Reload the registers from SP
    423     ldp         x23, x24,[sp],#16
    424     ldp         x21, x22,[sp],#16
    425     ldp         x19, x20,[sp],#16
    426     pop_v_regs
    427     ret
    428 
    429 
    430 
    431