// [web-viewer navigation residue: Home | History | Annotate | Download | only in arm64]
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
      20 //* @file
      21 //*  ihevc_sao_edge_offset_class2_chroma.s
      22 //*
      23 //* @brief
      24 //*  Contains the function definition for SAO edge offset class 2 (135-degree
      25 //* diagonal) filtering of chroma samples. The function is coded using NEON
      26 //* intrinsics and can be compiled using ARM RVCT
      27 //*
      28 //* @author
      29 //*  Parthiban V
      30 //*
      31 //* @par List of Functions:
      32 //*
      33 //*
      34 //* @remarks
      35 //*  None
     36 //*
     37 //*******************************************************************************
     38 //*/
     39 //void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
     40 //                              WORD32 src_strd,
     41 //                              UWORD8 *pu1_src_left,
     42 //                              UWORD8 *pu1_src_top,
     43 //                              UWORD8 *pu1_src_top_left,
     44 //                              UWORD8 *pu1_src_top_right,
     45 //                              UWORD8 *pu1_src_bot_left,
     46 //                              UWORD8 *pu1_avail,
     47 //                              WORD8 *pi1_sao_offset_u,
     48 //                              WORD8 *pi1_sao_offset_v,
     49 //                              WORD32 wd,
     50 //                              WORD32 ht)
     51 //**************Variables Vs Registers*****************************************
     52 //x0 =>    *pu1_src
     53 //x1 =>    src_strd
     54 //x2 =>    *pu1_src_left
     55 //x3 =>    *pu1_src_top
     56 //x4    =>    *pu1_src_top_left
     57 //x5    =>    *pu1_avail
     58 //x6    =>    *pi1_sao_offset_u
     59 //x9 =>  *pi1_sao_offset_v
     60 //x7    =>    wd
     61 //x8=>    ht
     62 
     63 .text
     64 .p2align 2
     65 .include "ihevc_neon_macros.s"
     66 
     67 .globl gi1_table_edge_idx
     68 .globl ihevc_sao_edge_offset_class2_chroma_av8
     69 
     70 ihevc_sao_edge_offset_class2_chroma_av8:
     71 
     72 
     73     // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
     74 
     75     ldr         x8,[sp,#0]
     76     ldr         x9,[sp,#8]
     77     ldr         w10,[sp,#16]
     78     ldr         w11,[sp,#24]
     79 
     80 
     81 
     82     // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
     83     stp         x19, x20,[sp,#-16]!
     84     stp         x21, x22,[sp,#-16]!
     85     stp         x23, x24,[sp,#-16]!
     86     stp         x25, x26,[sp,#-16]!
     87     stp         x27, x28,[sp,#-16]!
     88 
     89     mov         x15,x4 // *pu1_src_top_left 0x28
     90     //mov x16,x5    // *pu1_src_top_right 0x2c
     91     mov         x17,x6 // *pu1_src_bot_left 0x30
     92     mov         x21,x7 // *pu1_avail 0x34
     93     mov         x22,x8 // *pi1_sao_offset_u 0x38
     94     mov         x23,x9 // *pi1_sao_offset_v 0x3c
     95     mov         x24,x10 // wd 0x40
     96     mov         x25,x11 // ht 0x44
     97 
     98 
     99     mov         w7, w24                     //Loads wd
    100     mov         w8, w25                     //Loads ht
    101     SUB         x9,x7,#2                    //wd - 2
    102 
    103     mov         x4, x15                     //Loads pu1_src_top_left
    104     LDRH        w10,[x3,x9]                 //pu1_src_top[wd - 2]
    105 
    106     mov         x26, x0                     //Store pu1_src in sp
    107     MOV         x9,x7                       //Move width to x9 for loop count
    108 
    109     mov         x17, x2                     //Store pu1_src_bot_left in sp
    110     mov         x5, x21                     //Loads pu1_avail
    111     mov         x6, x22                     //Loads pi1_sao_offset_u
    112 
    113     mov         x22, x3                     //Store pu1_src_top in sp
    114     SUB         sp,sp,#0xE0                 //Decrement the stack pointer to store some temp arr values
    115 
    116     STRH        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 2]
    117     SUB         x10,x8,#1                   //ht-1
    118     madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
    119     ADD         x12,sp,#10                  //temp array
    120 
    121 AU1_SRC_TOP_LOOP:
    122     LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
    123     SUBS        x9,x9,#8                    //Decrement the loop count by 8
    124     ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    125     BNE         AU1_SRC_TOP_LOOP
    126 
    127 PU1_AVAIL_4_LOOP_U:
    128     LDRB        w9,[x5,#4]                  //pu1_avail[4]
    129     CMP         x9,#0
    130     LDRB        w9,[x0]                     //u1_pos_0_0_tmp_u = pu1_src[0]
    131     LDRB        w10,[x0,#1]                 //u1_pos_0_0_tmp_v = pu1_src[1]
    132     BEQ         PU1_AVAIL_7_LOOP_U
    133 
    134     LDRB        w11,[x4]                    //pu1_src_top_left[0]
    135     ADD         x14,x0,x1                   //pu1_src + src_strd
    136 
    137     SUB         x12,x9,x11                  //pu1_src[0] - pu1_src_top_left[0]
    138 
    139     LDRB        w14,[x14,#2]                //pu1_src[2 + src_strd]
    140     CMP         x12,#0
    141 
    142     movn        x20,#0
    143     csel        x12, x20, x12,LT
    144     SUB         x11,x9,x14                  //pu1_src[0] - pu1_src[2 + src_strd]
    145 
    146     MOV         x20,#1
    147     csel        x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])
    148 
    149     CMP         x11,#0
    150     movn        x20,#0
    151     csel        x11, x20, x11,LT
    152     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    153     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    154     MOV         x20,#1
    155     csel        x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[2 + src_strd])
    156 
    157     ADD         x11,x12,x11                 //SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[2 + src_strd])
    158     ADD         x11,x11,#2                  //edge_idx
    159 
    160     LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    161     CMP         x12,#0                      //0 != edge_idx
    162     BEQ         PU1_AVAIL_4_LOOP_V
    163     LDRSB       x11,[x6,x12]                //pi1_sao_offset_u[edge_idx]
    164     ADD         x9,x9,x11                   //pu1_src[0] + pi1_sao_offset_u[edge_idx]
    165     mov         x20,#255
    166     cmp         x9,x20
    167     csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    168 
    169 PU1_AVAIL_4_LOOP_V:
    170 
    171     LDRB        w11,[x4,#1]                 //pu1_src_top_left[1]
    172     ADD         x14,x0,x1                   //pu1_src + src_strd
    173 
    174     SUB         x12,x10,x11                 //pu1_src[1] - pu1_src_top_left[1]
    175     LDRB        w14,[x14,#3]                //pu1_src[3 + src_strd]
    176 
    177     CMP         x12,#0
    178     movn        x20,#0
    179     csel        x12, x20, x12,LT
    180     SUB         x11,x10,x14                 //pu1_src[1] - pu1_src[3 + src_strd]
    181     MOV         x20,#1
    182     csel        x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])
    183 
    184     CMP         x11,#0
    185     movn        x20,#0
    186     csel        x11, x20, x11,LT
    187     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    188     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    189     MOV         x20,#1
    190     csel        x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[3 + src_strd])
    191 
    192     ADD         x11,x12,x11                 //SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[3 + src_strd])
    193     ADD         x11,x11,#2                  //edge_idx
    194 
    195     LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    196     CMP         x12,#0                      //0 != edge_idx
    197     BEQ         PU1_AVAIL_7_LOOP_U
    198     mov         x11, x23                    //Loads pi1_sao_offset_v
    199     LDRSB       x11,[x11,x12]               //pi1_sao_offset_v[edge_idx]
    200     ADD         x10,x10,x11                 //pu1_src[0] + pi1_sao_offset_v[edge_idx]
    201     mov         x20,#255
    202     cmp         x10,x20
    203     csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    204 
    205 PU1_AVAIL_7_LOOP_U:
    206     STRB        w10,[sp,#7]
    207     STRB        w9,[sp,#6]
    208 
    209     LDRB        w10,[x5,#7]                 //pu1_avail[7]
    210     CMP         x10,#0
    211     SUB         x10,x7,#2                   //wd - 2
    212     SUB         x11,x8,#1                   //ht - 1
    213     madd        x12, x11, x1, x10           //wd - 2 + (ht - 1) * src_strd
    214     ADD         x12,x12,x0                  //pu1_src[wd - 2 + (ht - 1) * src_strd]
    215     LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
    216     LDRB        w9,[x12,#1]                 //u1_pos_wd_ht_tmp_v = pu1_src[wd - 2 + (ht - 1) * src_strd]
    217     BEQ         PU1_AVAIL_3_LOOP
    218 
    219     SUB         x11,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
    220     SUB         x11,x11,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    221     LDRB        w11,[x11]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    222     SUB         x11,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
    223     CMP         x11,#0
    224     movn        x20,#0
    225     csel        x11, x20, x11,LT
    226     MOV         x20,#1
    227     csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
    228 
    229     ADD         x14,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
    230     ADD         x14,x14,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    231     LDRB        w14,[x14]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    232     SUB         x14,x10,x14                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    233     CMP         x14,#0
    234     movn        x20,#0
    235     csel        x14, x20, x14,LT
    236     MOV         x20,#1
    237     csel        x14, x20, x14,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
    238 
    239     ADD         x11,x11,x14                 //Add 2 sign value
    240     ADD         x11,x11,#2                  //edge_idx
    241     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    242     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    243 
    244     LDRSB       x14,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    245     CMP         x14,#0
    246     BEQ         PU1_AVAIL_7_LOOP_V
    247     LDRSB       x11,[x6,x14]                //pi1_sao_offset_u[edge_idx]
    248     ADD         x10,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    249     mov         x20,#255
    250     cmp         x10,x20
    251     csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    252 
    253 PU1_AVAIL_7_LOOP_V:
    254     ADD         x12,x12,#1
    255     SUB         x11,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    256     SUB         x11,x11,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    257     LDRB        w11,[x11]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    258     SUB         x11,x9,x11                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
    259     CMP         x11,#0
    260     movn        x20,#0
    261     csel        x11, x20, x11,LT
    262     MOV         x20,#1
    263     csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
    264 
    265     ADD         x14,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
    266     ADD         x14,x14,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    267     LDRB        w14,[x14]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    268     SUB         x14,x9,x14                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    269     CMP         x14,#0
    270     movn        x20,#0
    271     csel        x14, x20, x14,LT
    272     MOV         x20,#1
    273     csel        x14, x20, x14,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
    274 
    275     ADD         x11,x11,x14                 //Add 2 sign value
    276     ADD         x11,x11,#2                  //edge_idx
    277     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    278     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    279 
    280     LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    281     CMP         x12,#0
    282     BEQ         PU1_AVAIL_3_LOOP
    283     mov         x14, x23                    //Loads pi1_sao_offset_v
    284     LDRSB       x11,[x14,x12]               //pi1_sao_offset_v[edge_idx]
    285     ADD         x9,x9,x11                   //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    286     mov         x20,#255
    287     cmp         x9,x20
    288     csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    289 
    290 PU1_AVAIL_3_LOOP:
    291     STRB        w10,[sp,#8]
    292     movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
    293     STRB        w9,[sp,#9]
    294 
    295     MOV         x12,x8                      //Move ht
    296     movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
    297     MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
    298 
    299     LDRB        w11,[x5,#3]                 //pu1_avail[3]
    300     movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    301     CMP         x11,#0
    302 
    303     SUB         x20,x12,#1                  //ht_tmp--
    304     csel        x12, x20, x12,EQ
    305     LDRB        w5,[x5,#2]                  //pu1_avail[2]
    306 
    307     CMP         x5,#0
    308 
    309     ADD         x20,x0,x1                   //pu1_src += src_strd
    310     csel        x0, x20, x0,EQ
    311     LD1         {v6.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    312     SUB         x20,x12,#1                  //ht_tmp--
    313     csel        x12, x20, x12,EQ
    314 
    315     mov         x6, x23                     //Loads pi1_sao_offset_v
    316     ADD         x20,x14,#2                  //pu1_src_left_cpy += 2
    317     csel        x14, x20, x14,EQ
    318 
    319     mov         x27, x0                     //Store pu1_src in sp
    320     LD1         {v7.8b},[x6]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    321     ADRP        x2, :got:gi1_table_edge_idx //table pointer
    322     LDR         x2, [x2, #:got_lo12:gi1_table_edge_idx]
    323 
    324     MOV         x6,x7                       //move wd to x6 loop_count
    325     movi        v1.16b, #0XFF               //au1_mask = vdupq_n_s8(-1)
    326     CMP         x7,#16                      //Compare wd with 16
    327 
    328     BLT         WIDTH_RESIDUE               //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    329     CMP         x8,#4                       //Compare ht with 4
    330     BLE         WD_16_HT_4_LOOP             //If jump to WD_16_HT_4_LOOP
    331 
    332 WIDTH_LOOP_16:
    333     mov         x5, x21                     //Loads pu1_avail
    334     mov         w7, w24                     //Loads wd
    335     CMP         x6,x7                       //col == wd
    336     LDRb        w20, [x5]                   //pu1_avail[0]
    337     csel        w8,w20,w8,EQ
    338 
    339     MOV         x20,#-1
    340     csel        x8, x20, x8,NE
    341     mov         v1.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    342 
    343     CMP         x6,#16                      //if(col == 16)
    344     mov         v1.8b[1], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    345 
    346     BNE         SKIP_AU1_MASK_VAL
    347     LDRB        w8,[x5,#1]                  //pu1_avail[1]
    348     mov         v1.16b[14], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    349     mov         v1.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    350 
    351 SKIP_AU1_MASK_VAL:
    352     LDRB        w9,[x5,#2]                  //pu1_avail[2]
    353     LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    354     //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    355     //SUB x0, x0,#8
    356     CMP         x9,#0
    357 
    358     mov         w4, w25                     //Loads ht
    359     SUB         x20,x0,x1                   //pu1_src - src_strd
    360     csel        x8, x20, x8,EQ
    361 
    362     mov         w7, w24                     //Loads wd
    363     csel        x8, x3, x8,NE               //pu1_src_top_cpy
    364 
    365     SUB         x8,x8,#2                    //pu1_src - src_strd - 2
    366     ADD         x3,x3,#16
    367 
    368     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    369     LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    370     //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    371     //SUB x8, x8,#8
    372     SUB         x7,x7,x6                    //(wd - col)
    373 
    374     ADD         x7,x7,#14                   //15 + (wd - col)
    375     cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    376     mov         x8, x26                     //Loads *pu1_src
    377 
    378     ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
    379     cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    380 
    381 AU1_SRC_LEFT_LOOP:
    382     LDRH        w8,[x7]                     //load the value and increment by src_strd
    383     SUBS        x4,x4,#1                    //decrement the loop count
    384 
    385     STRH        w8,[x5],#2                  //store it in the stack pointer
    386     ADD         x7,x7,x1
    387 
    388     BNE         AU1_SRC_LEFT_LOOP
    389 
    390     ADD         x8,x0,x1                    //I *pu1_src + src_strd
    391     SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    392     MOV         x7,x12                      //row count, move ht_tmp to x7
    393 
    394     LD1         {v16.16b},[x8]              //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    395     //LD1 {v17.8b},[x8]                        //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    396     //SUB x8, x8,#8
    397 
    398     ADD         x8,x8,#16                   //I
    399     movi        v18.16b, #0
    400     LDRH        w5,[x8]                     //I pu1_src_cpy[src_strd + 16]
    401 
    402     mov         x10, x21                    //I Loads pu1_avail
    403     mov         v18.4h[0], w5               //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    404     LDRB        w10,[x10,#2]                //I pu1_avail[2]
    405 
    406     CMP         x10,#0                      //I
    407     EXT         v18.16b,  v16.16b ,  v18.16b,#2 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    408     BNE         SIGN_UP_CHANGE_DONE         //I
    409 
    410     LDRB        w11,[x0]                    //I pu1_src_cpy[0]
    411     SUB         x4,x12,x7                   //I ht_tmp - row
    412 
    413     LDRB        w10,[x0,#1]                 //I pu1_src_cpy[0]
    414     LSL         x4,x4,#1                    //I (ht_tmp - row) * 2
    415 
    416     ADD         x9,x14,x4                   //I pu1_src_left_cpy[(ht_tmp - row) * 2]
    417     sub         x13,x9,#2
    418     LDRB        w5,[x13]                    //I load the value
    419 
    420     SUB         x8,x11,x5                   //I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    421     sub         x13,x9,#1
    422     LDRB        w5,[x13]                    //I load the value
    423 
    424     CMP         x8,#0                       //I
    425     SUB         x4,x10,x5                   //I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    426 
    427     movn        x20,#0
    428     csel        x8, x20, x8,LT              //I
    429     MOV         x20,#1
    430     csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    431 
    432     CMP         x4,#0                       //I
    433     mov         v17.8b[0], w8               //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    434     movn        x20,#0
    435     csel        x4, x20, x4,LT              //I
    436 
    437     MOV         x20,#1
    438     csel        x4, x20, x4,GT              //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    439     mov         v17.8b[1], w4               //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    440 
    441 SIGN_UP_CHANGE_DONE:
    442     LD1         {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    443     cmhi        v20.16b,  v5.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    444 
    445     cmhi        v22.16b,  v18.16b ,  v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    446     SUB         v22.16b,  v22.16b ,  v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    447 
    448     ADD         v18.16b,  v0.16b ,  v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
    449     ADD         v18.16b,  v18.16b ,  v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
    450 
    451     TBL         v18.16b, {v30.16b},v18.16b  //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    452     NEG         v17.16b, v22.16b            //I sign_up = vnegq_s8(sign_down)
    453 
    454     //TBL v19.8b, {v30.16b},v19.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    455     EXT         v17.16b,  v17.16b ,  v17.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14)
    456 
    457     Uxtl        v20.8h, v5.8b               //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    458     AND         v22.16b,  v18.16b ,  v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
    459     mov         v23.d[0],v22.d[1]
    460 
    461     Uxtl2       v18.8h, v5.16b              //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    462     UZP1        v31.8b, v22.8b, v23.8b
    463     UZP2        v23.8b, v22.8b, v23.8b      //I
    464     mov         v22.8b,v31.8b
    465 
    466     TBL         v22.8b, {v6.16b},v22.8b     //I
    467     TBL         v23.8b, {v7.16b},v23.8b     //I
    468     ZIP1        v31.8b, v22.8b, v23.8b
    469     ZIP2        v23.8b, v22.8b, v23.8b      //I
    470     mov         v22.8b,v31.8b
    471 
    472     mov         v5.16b, v16.16b             //I pu1_cur_row = pu1_next_row
    473     SADDW       v20.8h,  v20.8h ,  v22.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    474 
    475     SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    476     UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    477 
    478     SADDW       v18.8h,  v18.8h ,  v23.8b   //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    479     SMAX        v18.8h,  v18.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    480 
    481     UMIN        v18.8h,  v18.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    482     SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
    483 
    484 
    485 PU1_SRC_LOOP:
    486     ADD         x8,x0,x1,LSL #1             //II *pu1_src + src_strd
    487     xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
    488     ADD         x11,x8,x1                   //III *pu1_src + src_strd
    489 
    490     LD1         {v16.16b},[x8]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    491     //LD1 {v17.8b},[x8]                        //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    492     //SUB x8, x8,#8
    493     LD1         {v30.16b},[x11]             //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    494     //LD1 {v31.8b},[x11]                    //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    495     //SUB x11, x11,#8
    496 
    497     ADD         x8,x8,#16                   //II
    498     xtn2        v20.16b,  v18.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
    499     LDRH        w5,[x8]                     //II pu1_src_cpy[src_strd + 16]
    500 
    501     ADD         x11,x11,#16                 //III
    502     mov         v28.4h[0], w5               //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    503     LDRH        w4,[x11]                    //III pu1_src_cpy[src_strd + 16]
    504 
    505     LDRB        w8,[x0,x1]                  //II pu1_src_cpy[0]
    506     EXT         v28.16b,  v16.16b ,  v28.16b,#2 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    507     SUB         x5,x12,x7                   //II ht_tmp - row
    508 
    509     LSL         x5,x5,#1                    //II (ht_tmp - row) * 2
    510     mov         v18.4h[0], w4               //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    511     ADD         x9,x14,x5                   //II pu1_src_left_cpy[(ht_tmp - row) * 2]
    512 
    513     sub         x13,x9,#2
    514     LDRB        w11,[x13]                   //II load the value
    515     ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    516     SUB         x8,x8,x11                   //II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    517 
    518     CMP         x8,#0                       //II
    519     EXT         v18.16b,  v30.16b ,  v18.16b,#2 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    520     LDRB        w11,[x0,#1]                 //II pu1_src_cpy[0]
    521 
    522     movn        x20,#0
    523     csel        x8, x20, x8,LT              //II
    524     cmhi        v22.16b,  v5.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    525     MOV         x20,#1
    526     csel        x8, x20, x8,GT              //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    527 
    528     sub         x13,x9,#1
    529     LDRB        w5,[x13]                    //II load the value
    530     mov         v17.8b[0], w8               //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    531     SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
    532 
    533     SUB         x11,x11,x5                  //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    534     cmhi        v24.16b,  v28.16b ,  v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    535     CMP         x11,#0                      //II
    536 
    537     movn        x20,#0
    538     csel        x11, x20, x11,LT            //II
    539     SUB         v24.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    540     MOV         x20,#1
    541     csel        x11, x20, x11,GT            //II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    542 
    543     LDRB        w4,[x0,x1]                  //III pu1_src_cpy[0]
    544     LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    545     SUB         x5,x12,x7                   //III ht_tmp - row
    546 
    547     ADD         x10,x0,x1
    548     mov         v17.8b[1], w11              //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    549     LSL         x5,x5,#1                    //III (ht_tmp - row) * 2
    550 
    551     ADD         x9,x14,x5                   //III pu1_src_left_cpy[(ht_tmp - row) * 2]
    552     ADD         v26.16b,  v0.16b ,  v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
    553     LDRB        w10,[x10,#1]                //III pu1_src_cpy[0]
    554 
    555     sub         x13,x9,#2
    556     LDRB        w5,[x13]                    //III load the value
    557     ADD         v26.16b,  v26.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
    558     SUB         x4,x4,x5                    //III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    559 
    560     mov         v22.d[1],v22.d[0]
    561     CMP         x4,#0                       //III
    562     sub         x13,x9,#1
    563     LDRB        w9,[x13]                    //III load the value
    564     TBL         v26.16b, {v22.16b},v26.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    565     NEG         v17.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
    566 
    567     movn        x20,#0
    568     csel        x4, x20, x4,LT              //III
    569     SUB         x10,x10,x9                  //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    570     //TBL v27.8b, {v22.16b},v27.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    571     EXT         v17.16b,  v17.16b ,  v17.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14)
    572 
    573     MOV         x20,#1
    574     csel        x4, x20, x4,GT              //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    575     AND         v26.16b,  v26.16b ,  v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
    576     CMP         x10,#0                      //III
    577 
    578     mov         v27.d[0],v26.d[1]
    579     UZP1        v31.8b, v26.8b, v27.8b
    580     UZP2        v27.8b, v26.8b, v27.8b      //II
    581     mov         v26.8b,v31.8b
    582     mov         v17.8b[0], w4               //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    583 
    584     movn        x20,#0
    585     csel        x10, x20, x10,LT            //III
    586     MOV         x20,#1
    587     csel        x10, x20, x10,GT            //III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    588     TBL         v24.8b, {v6.16b},v26.8b     //II
    589     cmhi        v20.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    590 
    591     cmhi        v22.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    592     TBL         v25.8b, {v7.16b},v27.8b     //II
    593     SUB         v22.16b,  v22.16b ,  v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    594 
    595     mov         v17.8b[1], w10              //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    596     ZIP1        v31.8b, v24.8b, v25.8b
    597     ZIP2        v25.8b, v24.8b, v25.8b      //II
    598     mov         v24.8b,v31.8b
    599 
    600     Uxtl        v28.8h, v5.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    601     ADD         v18.16b,  v0.16b ,  v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
    602 
    603     LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    604     SADDW       v28.8h,  v28.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    605 
    606     ADD         v18.16b,  v18.16b ,  v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
    607     SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    608 
    609     UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    610     TBL         v18.16b, {v20.16b},v18.16b  //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    611     NEG         v17.16b, v22.16b            //III sign_up = vnegq_s8(sign_down)
    612 
    613     //TBL v19.8b, {v20.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    614     EXT         v17.16b,  v17.16b ,  v17.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14)
    615 
    616     Uxtl2       v26.8h, v5.16b              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    617     AND         v18.16b,  v18.16b ,  v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
    618 
    619     mov         v19.d[0],v18.d[1]
    620     UZP1        v31.8b, v18.8b, v19.8b
    621     UZP2        v19.8b, v18.8b, v19.8b      //III
    622     mov         v18.8b,v31.8b
    623     TBL         v22.8b, {v6.16b},v18.8b     //III
    624     SADDW       v26.8h,  v26.8h ,  v25.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    625 
    626     mov         v5.16b, v30.16b             //III pu1_cur_row = pu1_next_row
    627     TBL         v23.8b, {v7.16b},v19.8b     //III
    628     SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    629 
    630     Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    631     UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    632 
    633     ZIP1        v31.8b, v22.8b, v23.8b
    634     ZIP2        v23.8b, v22.8b, v23.8b      //III
    635     mov         v22.8b,v31.8b
    636     xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
    637 
    638     xtn2        v28.16b,  v26.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
    639     SADDW       v20.8h,  v20.8h ,  v22.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    640 
    641     Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    642     SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    643 
    644     UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    645     SADDW       v18.8h,  v18.8h ,  v23.8b   //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    646 
    647     SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
    648     SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    649     CMP         x7,#1
    650 
    651     ST1         { v28.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    652     UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    653 
    654     BGT         PU1_SRC_LOOP                //If not equal jump to PU1_SRC_LOOP
    655     BLT         INNER_LOOP_DONE
    656 
    657     ADD         x8,x0,x1,LSL #1             //*pu1_src + src_strd
    658     xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
    659 
    660     LDRB        w11,[x0,x1]                 //pu1_src_cpy[0]
    661     LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    662     //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    663     //SUB x8, x8,#8
    664     SUB         x4,x12,x7                   //ht_tmp - row
    665 
    666     ADD         x8,x8,#16
    667     xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
    668     LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
    669 
    670     LSL         x4,x4,#1                    //(ht_tmp - row) * 2
    671     mov         v18.4h[0], w5               //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    672     ADD         x9,x14,x4                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
    673 
    674     sub         x13,x9,#2
    675     LDRB        w5,[x13]                    //load the value
    676     EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    677     SUB         x8,x11,x5                   //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    678 
    679     CMP         x8,#0
    680     ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    681     movn        x20,#0
    682     csel        x8, x20, x8,LT
    683 
    684     MOV         x20,#1
    685     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    686     LD1         {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    687 
    688     LDRB        w11,[x0,#1]                 //pu1_src_cpy[0]
    689     mov         v17.8b[0], w8               //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    690     sub         x13,x9,#1
    691     LDRB        w5,[x13]                    //load the value
    692 
    693     SUB         x4,x11,x5                   //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    694     cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    695     CMP         x4,#0
    696 
    697     movn        x20,#0
    698     csel        x4, x20, x4,LT
    699     cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    700     MOV         x20,#1
    701     csel        x4, x20, x4,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    702 
    703     mov         v17.8b[1], w4               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    704     SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    705 
    706     ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    707     ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    708 
    709     mov         v30.d[1],v30.d[0]
    710     TBL         v26.16b, {v30.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    711     //TBL v27.8b, {v30.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    712 
    713     Uxtl        v20.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    714     AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    715     mov         v27.d[0],v26.d[1]
    716 
    717     Uxtl2       v18.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    718     UZP1        v31.8b, v26.8b, v27.8b
    719     UZP2        v27.8b, v26.8b, v27.8b
    720     mov         v26.8b,v31.8b
    721 
    722     TBL         v24.8b, {v6.16b},v26.8b
    723     TBL         v25.8b, {v7.16b},v27.8b
    724     ZIP1        v31.8b, v24.8b, v25.8b
    725     ZIP2        v25.8b, v24.8b, v25.8b
    726     mov         v24.8b,v31.8b
    727 
    728     SADDW       v20.8h,  v20.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    729     SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    730     UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    731 
    732     SADDW       v18.8h,  v18.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    733     SMAX        v18.8h,  v18.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    734     UMIN        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    735 
    736 
    737 INNER_LOOP_DONE:
    738     mov         w8, w25                     //Loads ht
    739     xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0]) - narrow low 8 lanes back to 8-bit
    740     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    741 
    742     mov         x11, x17                    //Loads *pu1_src_left
    743     xtn2        v20.16b,  v18.8h            //vmovn_s16(pi2_tmp_cur_row.val[1]) - narrow high 8 lanes back to 8-bit
    744 
    745 
    746 SRC_LEFT_LOOP:
    747     LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
    748     SUBS        x8,x8,#2                    //ht -= 2: each 4-byte word holds 2 rows of interleaved U/V left samples
    749     STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
    750     BNE         SRC_LEFT_LOOP
    751 
    752     SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    753     ST1         { v20.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    754     CMP         x6,#8                       //Check whether residue remains
    755 
    756     BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
    757     mov         w7, w24                     //Loads wd
    758     mov         x0, x27                     //Loads *pu1_src
    759     SUB         x7,x7,x6
    760     ADD         x0,x0,x7
    761     BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
    762     BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
    763 
    764 
    765 WD_16_HT_4_LOOP:
    766     mov         x5, x21                     //Loads pu1_avail
    767     mov         w7, w24                     //Loads wd
    768     CMP         x6,x7                       //col == wd
    769     LDRb        w20, [x5]                   //pu1_avail[0]
    770     csel        w8,w20,w8,EQ
    771 
    772     MOV         x20,#-1
    773     csel        x8, x20, x8,NE              //w8 = (col == wd) ? pu1_avail[0] : -1
    774     mov         v1.8b[0], w8                //au1_mask = vsetq_lane_s8(w8, au1_mask, 0)
    775     mov         v1.8b[1], w8                //au1_mask = vsetq_lane_s8(w8, au1_mask, 1) - mask both U and V of first pixel
    776 
    777     CMP         x6,#16                      //if(col == 16)
    778     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    779     LDRB        w8,[x5,#1]                  //pu1_avail[1]
    780     mov         v1.16b[14], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    781     mov         v1.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - mask U and V of last pixel
    782 
    783 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    784     LDRB        w8,[x5,#2]                  //pu1_avail[2]
    785     CMP         x8,#0
    786 
    787     SUB         x20,x0,x1                   //pu1_src - src_strd
    788     csel        x8, x20, x8,EQ
    789     csel        x8, x3, x8,NE               //pu1_src_top_cpy
    790     SUB         x8,x8,#2                    //pu1_src - src_strd - 2
    791     LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    792     //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    793     //SUB x8, x8,#8
    794 
    795     ADD         x3,x3,#16
    796     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    797     mov         w4, w25                     //Loads ht
    798     mov         x7, x24                     //Loads wd
    799     SUB         x7,x7,x6                    //(wd - col)
    800     ADD         x7,x7,#14                   //15 + (wd - col)
    801     mov         x8, x26                     //Loads *pu1_src
    802     ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
    803 
    804 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    805     LDRH        w8,[x7]                     //load the value and increment by src_strd
    806     STRH        w8,[x5],#2                  //store it in the stack pointer
    807     ADD         x7,x7,x1
    808 
    809     SUBS        x4,x4,#1                    //decrement the loop count
    810     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    811 
    812     LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    813     //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    814     //SUB x0, x0,#8
    815 
    816     cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    817     cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    818     SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    819     movi        v18.16b, #0
    820     MOV         x7,x12                      //row count, move ht_tmp to x7
    821 
    822 PU1_SRC_LOOP_WD_16_HT_4:
    823     movi        v18.16b, #0
    824     ADD         x8,x0,x1                    //*pu1_src + src_strd
    825     LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    826     //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    827     //SUB x8, x8,#8
    828 
    829     ADD         x8,x8,#16
    830     LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
    831     mov         v18.4h[0], w5               //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    832     EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    833 
    834     CMP         x7,x12
    835     BLT         SIGN_UP_CHANGE_WD_16_HT_4
    836     mov         x5, x21                     //Loads pu1_avail
    837     LDRB        w5,[x5,#2]                  //pu1_avail[2]
    838     CMP         x5,#0
    839     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    840 
    841 SIGN_UP_CHANGE_WD_16_HT_4:
    842     LDRB        w8,[x0]                     //pu1_src_cpy[0]
    843     SUB         x5,x12,x7                   //ht_tmp - row
    844     LSL         x5,x5,#1                    //(ht_tmp - row) * 2
    845     ADD         x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
    846     sub         x13,x9,#2
    847     LDRB        w5,[x13]                    //load the value
    848     SUB         x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    849     CMP         x8,#0
    850     movn        x20,#0                      //x20 = -1
    851     csel        x8, x20, x8,LT
    852     MOV         x20,#1
    853     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    854     mov         v17.8b[0], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    855 
    856     LDRB        w8,[x0,#1]                  //pu1_src_cpy[1]
    857     sub         x13,x9,#1
    858     LDRB        w5,[x13]                    //load the value
    859     SUB         x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    860     CMP         x8,#0
    861     movn        x20,#0                      //x20 = -1
    862     csel        x8, x20, x8,LT
    863     MOV         x20,#1
    864     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
    865     mov         v17.8b[1], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    866 
    867 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    868     cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    869     cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    870     SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    871 
    872     ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    873     ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    874 
    875     LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    876     TBL         v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    877     //TBL v27.8b, {v22.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    878 
    879     AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    880     mov         v27.d[0],v26.d[1]
    881 
    882     NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    883     EXT         v17.16b,  v17.16b ,  v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
    884 
    885     UZP1        v31.8b, v26.8b, v27.8b
    886     UZP2        v27.8b, v26.8b, v27.8b
    887     mov         v26.8b,v31.8b
    888     TBL         v24.8b, {v6.16b},v26.8b
    889     TBL         v25.8b, {v7.16b},v27.8b
    890     ZIP1        v31.8b, v24.8b, v25.8b
    891     ZIP2        v25.8b, v24.8b, v25.8b
    892     mov         v24.8b,v31.8b
    893 
    894     Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    895     SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    896     SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    897     UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    898 
    899     Uxtl2       v26.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    900     SADDW       v26.8h,  v26.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    901     SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    902     UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    903 
    904     xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    905     xtn2        v28.16b,  v26.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
    906 
    907     ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    908 
    909     mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    910     SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
    911     BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    912 
    913     mov         w8, w25                     //Loads ht
    914     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    915     mov         x11, x17                    //Loads *pu1_src_left
    916 
    917 SRC_LEFT_LOOP_WD_16_HT_4:
    918     LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
    919     STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
    920 
    921     SUBS        x8,x8,#2
    922     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    923 
    924 
    925     SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    926     BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
    927     BGT         WD_16_HT_4_LOOP
    928 
    929 
    930 WIDTH_RESIDUE:
    931     mov         w7, w24                     //Loads wd
    932     mov         x5, x21                     //Loads pu1_avail
    933     CMP         x6,x7                       //wd_residue == wd
    934     LDRb        w20, [x5]                   //pu1_avail[0]
    935     csel        w8,w20,w8,EQ
    936 
    937     MOV         x20,#-1
    938     csel        x8, x20, x8,NE              //w8 = (wd_residue == wd) ? pu1_avail[0] : -1
    939     mov         v1.8b[0], w8                //au1_mask = vsetq_lane_s8(w8, au1_mask, 0)
    940     mov         v1.8b[1], w8                //au1_mask = vsetq_lane_s8(w8, au1_mask, 1) - mask both U and V of first pixel
    941 
    942     LDRB        w8,[x5,#1]                  //pu1_avail[1]
    943     mov         v1.8b[6], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    944     mov         v1.8b[7], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7) - last pixel pair of the 8-byte residue
    945 
    946     LDRB        w8,[x5,#2]                  //pu1_avail[2]
    947     CMP         x8,#0
    948 
    949     SUB         x20,x0,x1                   //pu1_src - src_strd
    950     csel        x8, x20, x8,EQ
    951     csel        x8, x3, x8,NE
    952     SUB         x8,x8,#2                    //pu1_src - src_strd - 2
    953     LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    954     //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    955     //SUB x8, x8,#8
    956 
    957     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    958     mov         w4, w25                     //Loads ht
    959     mov         w7, w24                     //Loads wd
    960     mov         x8, x26                     //Loads *pu1_src
    961     SUB         x7,x7,#2                    //(wd - 2)
    962     ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 2)]
    963 
    964 AU1_SRC_LEFT_LOOP_RESIDUE:
    965     LDRH        w8,[x7]                     //load the value and increment by src_strd
    966     STRH        w8,[x5],#2                  //store it in the stack pointer
    967     ADD         x7,x7,x1
    968     SUBS        x4,x4,#1                    //decrement the loop count
    969     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    970 
    971     LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    972     //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    973     //SUB x0, x0,#8
    974 
    975     cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    976     cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    977     SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    978     MOV         x7,x12                      //row count, move ht_tmp to x7
    979 
    980 PU1_SRC_LOOP_RESIDUE:
    981     movi        v18.16b, #0
    982     ADD         x8,x0,x1                    //*pu1_src + src_strd
    983     LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    984     //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    985     //SUB x8, x8,#8
    986 
    987     ADD         x8,x8,#16
    988     LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
    989     mov         v18.4h[0], w5               //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    990     EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    991 
    992     CMP         x7,x12
    993     BLT         SIGN_UP_CHANGE_RESIDUE
    994     mov         x5, x21                     //Loads pu1_avail
    995     LDRB        w5,[x5,#2]                  //pu1_avail[2]
    996     CMP         x5,#0
    997     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
    998 
    999 SIGN_UP_CHANGE_RESIDUE:
   1000     LDRB        w8,[x0]                     //pu1_src_cpy[0]
   1001     SUB         x5,x12,x7                   //ht_tmp - row
   1002     LSL         x5,x5,#1                    //(ht_tmp - row) * 2
   1003     ADD         x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
   1004     sub         x13,x9,#2
   1005     LDRB        w5,[x13]                    //load the value
   1006     SUB         x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
   1007     CMP         x8,#0
   1008     movn        x20,#0                      //x20 = -1
   1009     csel        x8, x20, x8,LT
   1010     MOV         x20,#1
   1011     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
   1012     mov         v17.8b[0], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
   1013 
   1014     LDRB        w8,[x0,#1]                  //pu1_src_cpy[1]
   1015     sub         x13,x9,#1
   1016     LDRB        w5,[x13]                    //load the value
   1017     SUB         x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
   1018     CMP         x8,#0
   1019     movn        x20,#0                      //x20 = -1
   1020     csel        x8, x20, x8,LT
   1021     MOV         x20,#1
   1022     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
   1023     mov         v17.8b[1], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
   1024 
   1025 SIGN_UP_CHANGE_DONE_RESIDUE:
   1026     cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
   1027     cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
   1028     SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
   1029 
   1030     ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
   1031     ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
   1032 
   1033     LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
   1034     mov         v22.d[1],v22.d[0]
   1035     TBL         v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
   1036     //TBL v27.8b, {v22.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
   1037 
    //-----------------------------------------------------------------------
    // Tail of ihevc_sao_edge_offset_class2_chroma (AArch64 NEON).
    // Finishes the residue-column SAO loop on interleaved UV pixels, writes
    // the cached left/top columns back to the caller's buffers, restores the
    // two saved corner pixel pairs, then runs the epilogue.
    // NOTE(review): register/stack roles below are taken from the original
    // in-line comments and the visible code; the prologue that establishes
    // them is outside this chunk.
    //-----------------------------------------------------------------------
    1038     AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    1039     mov         v27.d[0],v26.d[1]           //v27 = high half of edge_idx (second 8 interleaved UV bytes)
    1040 
    1041     NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    1042     EXT         v17.16b,  v17.16b ,  v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14): rotate by 14 bytes, i.e. back one UV pair, to align the diagonal for the next row
    1043 
    1044     UZP1        v31.8b, v26.8b, v27.8b      //de-interleave edge_idx: even bytes -> U indices
    1045     UZP2        v27.8b, v26.8b, v27.8b      //de-interleave edge_idx: odd bytes -> V indices
    1046     mov         v26.8b,v31.8b
    1047     TBL         v24.8b, {v6.16b},v26.8b     //look up U offsets from the U offset table in v6
    1048     TBL         v25.8b, {v7.16b},v27.8b     //look up V offsets from the V offset table in v7
    1049     ZIP1        v31.8b, v24.8b, v25.8b      //re-interleave U/V offsets back to UVUV... order
    1050     ZIP2        v25.8b, v24.8b, v25.8b
    1051     mov         v24.8b,v31.8b
    1052 
    1053     Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    1054     SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    1055     SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    1056     UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    1057 
    1058     xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0]) — narrow back to 8-bit pixels
    1059 
    1060     ST1         {v28.8b},[x0],x1            //vst1q_u8(pu1_src_cpy, pu1_cur_row); advance by src_strd
    1061 
    1062     mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    1063     SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
    1064     BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP_RESIDUE
    1065 
    1066     mov         w8, w25                     //Loads ht
    1067     mov         x11, x17                    //Loads *pu1_src_left
    1068     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp (left column cached on the stack)
    1069 
    1070 //Copy the cached left column back to pu1_src_left, one 4-byte word per iteration.
    1071 SRC_LEFT_LOOP_RESIDUE:
    1072     LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
    1073     SUBS        x8,x8,#2                    //two rows consumed per word (2 bytes/row, UV interleaved)
    1074     STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
    1075 
    1076     BNE         SRC_LEFT_LOOP_RESIDUE
    1077 
    1078 
    1079 //Restore the saved corner pixels and the top row/top-left buffers.
    1080 //("RE_ASSINING" [sic] kept as-is: the label is a branch target elsewhere in this file.)
    1081 RE_ASSINING_LOOP:
    1082     mov         w8, w25                     //Loads ht
    1083 
    1084     mov         x0, x26                     //Loads *pu1_src
    1085     SUB         x8,x8,#1                    //ht - 1
    1086 
    1087     mov         w7, w24                     //Loads wd
    1088 
    1089     LDRH        w9,[sp,#6]                  //u1_pos_0_0_tmp: saved top-left UV pixel pair
    1090     madd        x6, x8, x1, x7              //wd + (ht - 1) * src_strd (the -2 is applied below)
    1091 
    1092     STRH        w9,[x0]                     //pu1_src_org[0] = u1_pos_0_0_tmp (stores both U and V bytes)
    1093     ADD         x6,x0,x6                    //pu1_src + wd + (ht - 1) * src_strd
    1094 
    1095     LDRH        w9,[sp,#8]                  //u1_pos_wd_ht_tmp: saved bottom-right UV pixel pair
    1096     ADD         x12,sp,#10                  //au1_src_top_tmp (top row cached on the stack)
    1097     sub         x13,x6,#2                   //step back one UV pair (2 bytes) to the last pixel of the row
    1098     STRH        w9,[x13]                    //pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp (U and V)
    1099 
    1100     mov         x4, x15                     //Loads pu1_src_top_left
    1101     LDRH        w10,[sp]                    //load u1_src_top_left_tmp from stack pointer
    1102     STRH        w10,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
    1103     mov         x3, x22                     //Loads pu1_src_top
    1104 
    1105 //Copy the cached top row back to pu1_src_top, 8 bytes (4 UV pairs) per iteration.
    1106 SRC_TOP_LOOP:
    1107     LD1         {v0.8b},[x12],#8            //load au1_src_top_tmp[col]
    1108     SUBS        x7,x7,#8                    //Decrement the width
    1109     ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
    1110     BNE         SRC_TOP_LOOP
    1111 
    1112 END_LOOPS:
    1113     ADD         sp,sp,#0xE0                 //free the 0xE0-byte scratch area allocated by the prologue
    1114     // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP (legacy ARMv7 epilogue, kept for reference)
    1115     ldp         x27, x28,[sp],#16           //restore callee-saved register pairs pushed in the prologue
    1116     ldp         x25, x26,[sp],#16
    1117     ldp         x23, x24,[sp],#16
    1118     ldp         x21, x22,[sp],#16
    1119     ldp         x19, x20,[sp],#16
    1120 
    1121     ret
   1118 
   1119 
   1120 
   1121