///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class1.s
//*
//* @brief
//*  Contains function definitions for SAO edge offset class 1 (vertical
//*  neighbours). Functions are coded in ARM64 NEON assembly and can be
//*  assembled with ARM RVCT.
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*  - ihevc_sao_edge_offset_class1_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
//                              WORD32 src_strd,
//                              UWORD8 *pu1_src_left,
//                              UWORD8 *pu1_src_top,
//                              UWORD8 *pu1_src_top_left,
//                              UWORD8 *pu1_src_top_right,
//                              UWORD8 *pu1_src_bot_left,
//                              UWORD8 *pu1_avail,
//                              WORD8 *pi1_sao_offset,
//                              WORD32 wd,
//                              WORD32 ht)
//**************Variables Vs Registers*****************************************
//x0 =>    *pu1_src
//x1 =>    src_strd
//x2 =>    *pu1_src_left
//x3 =>    *pu1_src_top
//x4 =>    *pu1_src_top_left
//x5 =>    *pu1_avail
//x6 =>    *pi1_sao_offset
//x7 =>    wd
//x8 =>    ht
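//**************Algorithm (illustrative)***************************************
//As a reading aid only, the assembly below vectorises, per pixel, roughly the
//following scalar C for SAO edge offset class 1 (vertical neighbours).
//SIGN() and CLIP3() are illustrative helper macros, not part of this file,
//and 8-bit samples are assumed (const_max_clip = 255):
//
//    for(row = 0; row < ht; row++, pu1_src += src_strd)
//        for(col = 0; col < wd; col++)
//        {
//            WORD32 edge_idx = 2 + SIGN(pu1_src[col] - pu1_src[col - src_strd])
//                                + SIGN(pu1_src[col] - pu1_src[col + src_strd]);
//            edge_idx = gi1_table_edge_idx[edge_idx]; //remapped; 0 => leave pixel
//            if(0 != edge_idx)
//                pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, 255);
//        }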

.text
.p2align 2

.include "ihevc_neon_macros.s"

.globl gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class1_av8

ihevc_sao_edge_offset_class1_av8:


    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
    MOV         x5,x7                       //Loads pu1_avail

    LDR         x6,[sp]                     //Loads pi1_sao_offset
    LDR         w7,[sp,#8]                  //Loads wd
    LDR         w8,[sp,#16]                 //Loads ht
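//Under AAPCS64 the first eight integer arguments arrive in x0-x7, so
//pu1_avail (the eighth argument) is copied from x7 above, while
//pi1_sao_offset, wd and ht (arguments nine to eleven) are fetched from the
//stack at sp, sp+8 and sp+16, before x19/x20 are pushed below.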


    stp         x19, x20,[sp,#-16]!

    SUB         x9,x7,#1                    //wd - 1
    LDRB        w10,[x3,x9]                 //pu1_src_top[wd - 1]
    STRB        w10,[x4]                    //*pu1_src_top_left = pu1_src_top[wd - 1]
    ADD         x10,x0,x9                   //pu1_src[row * src_strd + wd - 1]
    MOV         x11,x2                      //Move pu1_src_left pointer to x11
    MOV         x12,x8                      //Move ht to x12 for loop count
SRC_LEFT_LOOP:
    LDRB        w14,[x10]                   //Load pu1_src[row * src_strd + wd - 1]
    ADD         x10,x10,x1
    STRB        w14,[x11],#1                //pu1_src_left[row]
    SUBS        x12, x12,#1                 //Decrement the loop count
    BNE         SRC_LEFT_LOOP               //If not equal to 0 jump to the src_left_loop
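//The loop above snapshots the right-most column into pu1_src_left before the
//pixels are modified; roughly, in illustrative C:
//
//    for(row = 0; row < ht; row++)
//        pu1_src_left[row] = pu1_src[row * src_strd + wd - 1];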

    SUB         x12,x8,#1                   //ht - 1
    mul         x12, x12, x1                //(ht - 1) * src_strd
    ADD         x12,x12,x0                  //pu1_src[(ht - 1) * src_strd]

    LDRB        w4,[x5,#2]                  //pu1_avail[2]
    CMP         x4,#0                       //0 == pu1_avail[2]
    ADD         x20,x0,x1                   //pu1_src += src_strd
    csel        x0, x20, x0,EQ
    SUB         x20,x8,#1                   //ht--
    csel        x8, x20, x8,EQ

    LDRB        w4,[x5,#3]                  //pu1_avail[3]
    CMP         x4,#0                       //0 == pu1_avail[3]
    SUB         x20,x8,#1                   //ht--
    csel        x8, x20, x8,EQ
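//When the row above (pu1_avail[2] == 0) or below (pu1_avail[3] == 0) the
//block is unavailable, the boundary row has no vertical neighbour and is
//skipped; the csel sequences above trim the pointer and row count, roughly:
//
//    if(0 == pu1_avail[2]) { pu1_src += src_strd; ht--; }
//    if(0 == pu1_avail[3]) { ht--; }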

    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    ADRP        x14, :got:gi1_table_edge_idx //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    LD1         {v6.8b},[x14]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    LD1         {v7.8b},[x6]                //offset_tbl = vld1_s8(pi1_sao_offset)
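//Keeping both tables in vector registers turns the per-pixel classification
//into branch-free TBL byte permutes: the raw edge_idx (0..4) indexes v6 to
//get the remapped class, which in turn indexes v7 to get the offset, i.e. a
//vectorised form of pi1_sao_offset[gi1_table_edge_idx[2 + s_up + s_down]].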

    CMP         x7,#16                      //Compare wd with 16
    BLT         WIDTH_RESIDUE               //If wd < 16, jump to WIDTH_RESIDUE, where the loop handles the 8-column case

WIDTH_LOOP_16:
    LDRB        w4,[x5,#2]                  //pu1_avail[2]
    CMP         x4,#0                       //0 == pu1_avail[2]
    SUB         x20,x0,x1                   //pu1_src -= src_strd
    csel        x9, x20, x9,EQ
    csel        x9, x3, x9,NE               //*pu1_src_top

    MOV         x10,x0                      //*pu1_src

    LD1         {v1.16b},[x9],#16           //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    LD1         {v3.16b},[x0],#16           //pu1_cur_row = vld1q_u8(pu1_src)

    LD1         {v30.16b},[x12],#16         //vld1q_u8(pu1_src[(ht - 1) * src_strd])
    cmhi        v5.16b,  v3.16b ,  v1.16b   //vcgtq_u8(pu1_cur_row, pu1_top_row)

    ST1         {v30.16b},[x3],#16          //vst1q_u8(pu1_src_top[col])
    cmhi        v17.16b,  v1.16b ,  v3.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)

    SUB         v16.16b,  v17.16b ,  v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
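//cmhi produces all-ones lanes, so cmp_lt - cmp_gt wraps to a per-byte
//SIGN(cur - top): 0x00 - 0xFF gives +1 and 0xFF - 0x00 gives -1 once the
//result is reinterpreted as signed. A sketch with NEON intrinsics
//(illustrative names, assuming unsigned 8-bit rows):
//
//    uint8x16_t cmp_gt = vcgtq_u8(cur_row, top_row);   //0xFF where cur > top
//    uint8x16_t cmp_lt = vcltq_u8(cur_row, top_row);   //0xFF where cur < top
//    int8x16_t sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)); //-1/0/+1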
    MOV         x11,x8                      //move ht to x11 for loop count

PU1_SRC_LOOP:
    ADD         x10,x10,x1                  //*pu1_src + src_strd
    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    ADD         x6,x10,x1                   //II Iteration *pu1_src + src_strd

    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row)
    LD1         {v30.16b},[x6]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)

    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         x10,x10,x1

    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    Uxtl        v26.8h, v18.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    ADD         v5.16b,  v0.16b ,  v16.16b  //edge_idx = vaddq_s8(const_2, sign_up)
    Uxtl2       v28.8h, v18.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    ADD         v5.16b,  v5.16b ,  v20.16b  //edge_idx = vaddq_s8(edge_idx, sign_down)
    cmhi        v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)

    NEG         v16.16b, v20.16b            //sign_up = vnegq_s8(sign_down)
    TBL         v5.16b, {v6.16b},v5.16b     //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    cmhi        v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)

    SUB         v1.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
//  TBL v13.8b, {v6.16b},v13.8b             //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    ADD         v22.16b,  v0.16b ,  v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)


    NEG         v16.16b, v1.16b             //II sign_up = vnegq_s8(sign_down)
    TBL         v5.16b, {v7.16b},v5.16b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    ADD         v22.16b,  v22.16b ,  v1.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)


    Uxtl        v20.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    TBL         v22.16b, {v6.16b},v22.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    SADDW       v20.8h,  v20.8h ,  v5.8b    //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
//  TBL v23.8b, {v6.16b},v23.8b             //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))


    Uxtl2       v1.8h, v3.16b               //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
//  TBL v13.8b, {v7.16b},v13.8b             //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    mov         v3.16b, v30.16b             //II pu1_cur_row = pu1_next_row

    SADDW2      v1.8h,  v1.8h ,  v5.16b     //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    TBL         v24.16b, {v7.16b},v22.16b   //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    SMAX        v1.8h,  v1.8h ,  v2.8h      //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    UMIN        v1.8h,  v1.8h ,  v4.8h      //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
//  TBL v25.8b, {v7.16b},v23.8b             //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))

    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    xtn2        v20.16b,  v1.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])
    SADDW2      v28.8h,  v28.8h ,  v24.16b  //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)


    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    ST1         {v20.16b},[x10],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)

    xtn         v30.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
    SUBS        x11,x11,#2                  //II Decrement the ht loop count by 2
    xtn2        v30.16b,  v28.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])

    ST1         {v30.16b},[x10],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BEQ         PU1_SRC_LOOP_END            //If all rows are processed, exit the row loop
    CMP         x11,#1                      //Check whether a single residue row remains
    BGT         PU1_SRC_LOOP                //If more than one row remains, jump to PU1_SRC_LOOP
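//A single row remains when the pu1_avail adjustments made ht odd: fall
//through and process the last row on its own. v16 already holds the negated
//sign_down of the second unrolled row, which is exactly this row's sign_up.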

    ADD         x10,x10,x1                  //*pu1_src + src_strd
    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row)
    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         x10,x10,x1

    ADD         v22.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v22.16b,  v22.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL         v22.16b, {v6.16b},v22.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//  TBL v23.8b, {v6.16b},v23.8b             //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    TBL         v24.16b, {v7.16b},v22.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    Uxtl        v26.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW       v26.8h,  v26.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

//  TBL v25.8b, {v7.16b},v23.8b             //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    Uxtl2       v28.8h, v3.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW2      v28.8h,  v28.8h ,  v24.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn         v30.8b,  v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    xtn2        v30.16b,  v28.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])

    ST1         {v30.16b},[x10],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)

PU1_SRC_LOOP_END:
    mov         v3.16b, v18.16b             //pu1_cur_row = pu1_next_row
    SUBS        x7,x7,#16                   //Decrement the wd loop count by 16
    CMP         x7,#8                       //Check whether an 8-column residue remains
    BEQ         WIDTH_RESIDUE               //If an 8-column residue remains, jump to WIDTH_RESIDUE
    BGT         WIDTH_LOOP_16               //If another full 16-column strip remains, jump to WIDTH_LOOP_16
    BLT         END_LOOPS                   //Otherwise jump to the end of the function

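//Column control: each WIDTH_LOOP_16 pass consumes 16 columns. An exact
//8-column remainder drops into WIDTH_RESIDUE below, which runs the same
//two-row pipeline but widens, offsets and stores only the low 8 bytes.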
WIDTH_RESIDUE:
    LDRB        w4,[x5,#2]                  //pu1_avail[2]
    CMP         x4,#0                       //0 == pu1_avail[2]
    SUB         x20,x0,x1                   //pu1_src -= src_strd
    csel        x9, x20, x9,EQ
    csel        x9, x3, x9,NE               //*pu1_src_top
    MOV         x10,x0

    LD1         {v1.16b},[x9],#16           //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    LD1         {v3.16b},[x0],#16           //pu1_cur_row = vld1q_u8(pu1_src)

    LD1         {v30.8b},[x12]              //vld1_u8(pu1_src[(ht - 1) * src_strd])
    ST1         {v30.8b},[x3]               //vst1_u8(pu1_src_top[col])

    cmhi        v5.16b,  v3.16b ,  v1.16b   //vcgtq_u8(pu1_cur_row, pu1_top_row)
    cmhi        v17.16b,  v1.16b ,  v3.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB         v16.16b,  v17.16b ,  v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         x11,x8                      //move ht to x11 for loop count

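//As in WIDTH_LOOP_16, the last row of the strip is saved to pu1_src_top
//(8 bytes here) before filtering, so the block below can use the unfiltered
//pixels as its top neighbours.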
PU1_SRC_LOOP_RESIDUE:
    ADD         x10,x10,x1                  //*pu1_src + src_strd
    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    ADD         x6,x10,x1                   //II Iteration *pu1_src + src_strd

    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row)
    LD1         {v30.16b},[x6]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)

    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         x10,x10,x1

    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    Uxtl        v26.8h, v18.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    ADD         v5.16b,  v0.16b ,  v16.16b  //edge_idx = vaddq_s8(const_2, sign_up)
    cmhi        v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)

    ADD         v5.16b,  v5.16b ,  v20.16b  //edge_idx = vaddq_s8(edge_idx, sign_down)
    cmhi        v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)

    NEG         v16.16b, v20.16b            //sign_up = vnegq_s8(sign_down)
    TBL         v5.8b, {v6.16b},v5.8b       //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    SUB         v20.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v22.16b,  v0.16b ,  v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
    TBL         v5.8b, {v7.16b},v5.8b       //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    NEG         v16.16b, v20.16b            //II sign_up = vnegq_s8(sign_down)

    ADD         v22.16b,  v22.16b ,  v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
    Uxtl        v20.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    SADDW       v20.8h,  v20.8h ,  v5.8b    //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    TBL         v22.8b, {v6.16b},v22.8b     //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    TBL         v24.8b, {v7.16b},v22.8b     //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])

    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    mov         v3.16b, v30.16b             //II pu1_cur_row = pu1_next_row
    ST1         {v20.8b},[x10],x1           //vst1_u8(pu1_src_cpy, pu1_cur_row)
    xtn         v30.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])

    SUBS        x11,x11,#2                  //Decrement the ht loop count by 2
    ST1         {v30.8b},[x10],x1           //II vst1_u8(pu1_src_cpy, pu1_cur_row)

    BEQ         END_LOOPS                   //If all rows are processed, exit
    CMP         x11,#1                      //Check whether a single residue row remains
    BGT         PU1_SRC_LOOP_RESIDUE        //If more than one row remains, jump to PU1_SRC_LOOP_RESIDUE

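//Final odd row of the 8-column strip, mirroring the single-row tail of the
//16-column loop above.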
    ADD         x10,x10,x1                  //*pu1_src + src_strd
    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row)
    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         x10,x10,x1

    ADD         v22.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v22.16b,  v22.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL         v22.8b, {v6.16b},v22.8b     //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    TBL         v24.8b, {v7.16b},v22.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    Uxtl        v26.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW       v26.8h,  v26.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    xtn         v30.8b,  v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])

    ST1         {v30.8b},[x10],x1           //vst1_u8(pu1_src_cpy, pu1_cur_row)

END_LOOPS:
    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
    ldp         x19, x20,[sp], #16

    ret