// Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
      18 ///**
      19 //*******************************************************************************
      20 //* @file
      21 //*  ihevc_sao_edge_offset_class2_chroma.s
      22 //*
      23 //* @brief
      24 //*  Contains function definitions for SAO (sample adaptive offset) edge
      25 //*  offset, class 2, for chroma. Functions are coded using NEON intrinsics
      26 //*  and can be compiled using ARM RVCT
      27 //*
      28 //* @author
      29 //*  Parthiban V
      30 //*
      31 //* @par List of Functions:
      32 //*
      33 //*
      34 //* @remarks
      35 //*  None
      36 //*
      37 //*******************************************************************************
      38 //*/
     39 //void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
     40 //                              WORD32 src_strd,
     41 //                              UWORD8 *pu1_src_left,
     42 //                              UWORD8 *pu1_src_top,
     43 //                              UWORD8 *pu1_src_top_left,
     44 //                              UWORD8 *pu1_src_top_right,
     45 //                              UWORD8 *pu1_src_bot_left,
     46 //                              UWORD8 *pu1_avail,
     47 //                              WORD8 *pi1_sao_offset_u,
     48 //                              WORD8 *pi1_sao_offset_v,
     49 //                              WORD32 wd,
     50 //                              WORD32 ht)
     51 //**************Variables Vs Registers*****************************************
     52 //x0 =>    *pu1_src
     53 //x1 =>    src_strd
     54 //x2 =>    *pu1_src_left
     55 //x3 =>    *pu1_src_top
     56 //x4    =>    *pu1_src_top_left
     57 //x5    =>    *pu1_avail
     58 //x6    =>    *pi1_sao_offset_u
     59 //x9 =>  *pi1_sao_offset_v
     60 //x7    =>    wd
     61 //x8=>    ht
     62 
     63 .text
     64 .p2align 2
     65 .include "ihevc_neon_macros.s"
     66 
     67 .globl gi1_table_edge_idx
     68 .globl ihevc_sao_edge_offset_class2_chroma_av8
     69 
     70 ihevc_sao_edge_offset_class2_chroma_av8:
     71 
     72 
     73     // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
     74 
     75     ldr         x8,[sp,#0]
     76     ldr         x9,[sp,#8]
     77     ldr         w10,[sp,#16]
     78     ldr         w11,[sp,#24]
     79 
     80 
     81 
     82     // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
     83     stp         x19, x20,[sp,#-16]!
     84     stp         x21, x22,[sp,#-16]!
     85     stp         x23, x24,[sp,#-16]!
     86     stp         x25, x26,[sp,#-16]!
     87     stp         x27, x28,[sp,#-16]!
     88 
     89     mov         x15,x4 // *pu1_src_top_left 0x28
     90     //mov x16,x5    // *pu1_src_top_right 0x2c
     91     mov         x17,x6 // *pu1_src_bot_left 0x30
     92     mov         x21,x7 // *pu1_avail 0x34
     93     mov         x22,x8 // *pi1_sao_offset_u 0x38
     94     mov         x23,x9 // *pi1_sao_offset_v 0x3c
     95     mov         x24,x10 // wd 0x40
     96     mov         x25,x11 // ht 0x44
     97 
     98 
     99     mov         w7, w24                     //Loads wd
    100     mov         w8, w25                     //Loads ht
    101     SUB         x9,x7,#2                    //wd - 2
    102 
    103     mov         x4, x15                     //Loads pu1_src_top_left
    104     LDRH        w10,[x3,x9]                 //pu1_src_top[wd - 2]
    105 
    106     mov         x26, x0                     //Store pu1_src in sp
    107     MOV         x9,x7                       //Move width to x9 for loop count
    108 
    109     mov         x17, x2                     //Store pu1_src_bot_left in sp
    110     mov         x5, x21                     //Loads pu1_avail
    111     mov         x6, x22                     //Loads pi1_sao_offset_u
    112 
    113     mov         x22, x3                     //Store pu1_src_top in sp
    114     SUB         sp,sp,#0xE0                 //Decrement the stack pointer to store some temp arr values
    115 
    116     STRH        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 2]
    117     SUB         x10,x8,#1                   //ht-1
    118     madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
    119     ADD         x12,sp,#10                  //temp array
    120 
    121 AU1_SRC_TOP_LOOP:
    122     LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
    123     SUBS        x9,x9,#8                    //Decrement the loop count by 8
    124     ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    125     BNE         AU1_SRC_TOP_LOOP
    126 
    127 PU1_AVAIL_4_LOOP_U:
    128     LDRB        w9,[x5,#4]                  //pu1_avail[4]
    129     CMP         x9,#0
    130     LDRB        w9,[x0]                     //u1_pos_0_0_tmp_u = pu1_src[0]
    131     LDRB        w10,[x0,#1]                 //u1_pos_0_0_tmp_v = pu1_src[1]
    132     BEQ         PU1_AVAIL_7_LOOP_U
    133 
    134     LDRB        w11,[x4]                    //pu1_src_top_left[0]
    135     ADD         x14,x0,x1                   //pu1_src + src_strd
    136 
    137     SUB         x12,x9,x11                  //pu1_src[0] - pu1_src_top_left[0]
    138 
    139     LDRB        w14,[x14,#2]                //pu1_src[2 + src_strd]
    140     CMP         x12,#0
    141 
    142     movn        x20,#0
    143     csel        x12, x20, x12,LT
    144     SUB         x11,x9,x14                  //pu1_src[0] - pu1_src[2 + src_strd]
    145 
    146     MOV         x20,#1
    147     csel        x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])
    148 
    149     CMP         x11,#0
    150     movn        x20,#0
    151     csel        x11, x20, x11,LT
    152     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    153     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    154     MOV         x20,#1
    155     csel        x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[2 + src_strd])
    156 
    157     ADD         x11,x12,x11                 //SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[2 + src_strd])
    158     ADD         x11,x11,#2                  //edge_idx
    159 
    160     LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    161     CMP         x12,#0                      //0 != edge_idx
    162     BEQ         PU1_AVAIL_4_LOOP_V
    163     LDRSB       x11,[x6,x12]                //pi1_sao_offset_u[edge_idx]
    164     ADD         x9,x9,x11                   //pu1_src[0] + pi1_sao_offset_u[edge_idx]
    165     mov         x20,#255
    166     cmp         x9,x20
    167     csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    168     mov         x20,#0
    169     cmp         x9,x20
    170     csel        x9, x20, x9, LT             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    171 
    172 PU1_AVAIL_4_LOOP_V:
    173 
    174     LDRB        w11,[x4,#1]                 //pu1_src_top_left[1]
    175     ADD         x14,x0,x1                   //pu1_src + src_strd
    176 
    177     SUB         x12,x10,x11                 //pu1_src[1] - pu1_src_top_left[1]
    178     LDRB        w14,[x14,#3]                //pu1_src[3 + src_strd]
    179 
    180     CMP         x12,#0
    181     movn        x20,#0
    182     csel        x12, x20, x12,LT
    183     SUB         x11,x10,x14                 //pu1_src[1] - pu1_src[3 + src_strd]
    184     MOV         x20,#1
    185     csel        x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])
    186 
    187     CMP         x11,#0
    188     movn        x20,#0
    189     csel        x11, x20, x11,LT
    190     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    191     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    192     MOV         x20,#1
    193     csel        x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[3 + src_strd])
    194 
    195     ADD         x11,x12,x11                 //SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[3 + src_strd])
    196     ADD         x11,x11,#2                  //edge_idx
    197 
    198     LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    199     CMP         x12,#0                      //0 != edge_idx
    200     BEQ         PU1_AVAIL_7_LOOP_U
    201     mov         x11, x23                    //Loads pi1_sao_offset_v
    202     LDRSB       x11,[x11,x12]               //pi1_sao_offset_v[edge_idx]
    203     ADD         x10,x10,x11                 //pu1_src[0] + pi1_sao_offset_v[edge_idx]
    204     mov         x20,#255
    205     cmp         x10,x20
    206     csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    207     mov         x20,#0
    208     cmp         x10,x20
    209     csel        x10, x20, x10, LT           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    210 
    211 PU1_AVAIL_7_LOOP_U:
    212     STRB        w10,[sp,#7]
    213     STRB        w9,[sp,#6]
    214 
    215     LDRB        w10,[x5,#7]                 //pu1_avail[7]
    216     CMP         x10,#0
    217     SUB         x10,x7,#2                   //wd - 2
    218     SUB         x11,x8,#1                   //ht - 1
    219     madd        x12, x11, x1, x10           //wd - 2 + (ht - 1) * src_strd
    220     ADD         x12,x12,x0                  //pu1_src[wd - 2 + (ht - 1) * src_strd]
    221     LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
    222     LDRB        w9,[x12,#1]                 //u1_pos_wd_ht_tmp_v = pu1_src[wd - 2 + (ht - 1) * src_strd]
    223     BEQ         PU1_AVAIL_3_LOOP
    224 
    225     SUB         x11,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
    226     SUB         x11,x11,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    227     LDRB        w11,[x11]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    228     SUB         x11,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
    229     CMP         x11,#0
    230     movn        x20,#0
    231     csel        x11, x20, x11,LT
    232     MOV         x20,#1
    233     csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
    234 
    235     ADD         x14,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
    236     ADD         x14,x14,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    237     LDRB        w14,[x14]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    238     SUB         x14,x10,x14                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    239     CMP         x14,#0
    240     movn        x20,#0
    241     csel        x14, x20, x14,LT
    242     MOV         x20,#1
    243     csel        x14, x20, x14,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
    244 
    245     ADD         x11,x11,x14                 //Add 2 sign value
    246     ADD         x11,x11,#2                  //edge_idx
    247     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    248     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    249 
    250     LDRSB       x14,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    251     CMP         x14,#0
    252     BEQ         PU1_AVAIL_7_LOOP_V
    253     LDRSB       x11,[x6,x14]                //pi1_sao_offset_u[edge_idx]
    254     ADD         x10,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    255     mov         x20,#255
    256     cmp         x10,x20
    257     csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    258     mov         x20,#0
    259     cmp         x10,x20
    260     csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    261 
    262 PU1_AVAIL_7_LOOP_V:
    263     ADD         x12,x12,#1
    264     SUB         x11,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    265     SUB         x11,x11,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    266     LDRB        w11,[x11]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    267     SUB         x11,x9,x11                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
    268     CMP         x11,#0
    269     movn        x20,#0
    270     csel        x11, x20, x11,LT
    271     MOV         x20,#1
    272     csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
    273 
    274     ADD         x14,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
    275     ADD         x14,x14,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    276     LDRB        w14,[x14]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    277     SUB         x14,x9,x14                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    278     CMP         x14,#0
    279     movn        x20,#0
    280     csel        x14, x20, x14,LT
    281     MOV         x20,#1
    282     csel        x14, x20, x14,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
    283 
    284     ADD         x11,x11,x14                 //Add 2 sign value
    285     ADD         x11,x11,#2                  //edge_idx
    286     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    287     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    288 
    289     LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    290     CMP         x12,#0
    291     BEQ         PU1_AVAIL_3_LOOP
    292     mov         x14, x23                    //Loads pi1_sao_offset_v
    293     LDRSB       x11,[x14,x12]               //pi1_sao_offset_v[edge_idx]
    294     ADD         x9,x9,x11                   //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    295     mov         x20,#255
    296     cmp         x9,x20
    297     csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    298     mov         x20,#0
    299     cmp         x9,x20
    300     csel        x9, x20, x9, LT             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    301 
    302 PU1_AVAIL_3_LOOP:
    303     STRB        w10,[sp,#8]
    304     movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
    305     STRB        w9,[sp,#9]
    306 
    307     MOV         x12,x8                      //Move ht
    308     movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
    309     MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
    310 
    311     LDRB        w11,[x5,#3]                 //pu1_avail[3]
    312     movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    313     CMP         x11,#0
    314 
    315     SUB         x20,x12,#1                  //ht_tmp--
    316     csel        x12, x20, x12,EQ
    317     LDRB        w5,[x5,#2]                  //pu1_avail[2]
    318 
    319     CMP         x5,#0
    320 
    321     ADD         x20,x0,x1                   //pu1_src += src_strd
    322     csel        x0, x20, x0,EQ
    323     LD1         {v6.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    324     SUB         x20,x12,#1                  //ht_tmp--
    325     csel        x12, x20, x12,EQ
    326 
    327     mov         x6, x23                     //Loads pi1_sao_offset_v
    328     ADD         x20,x14,#2                  //pu1_src_left_cpy += 2
    329     csel        x14, x20, x14,EQ
    330 
    331     mov         x27, x0                     //Store pu1_src in sp
    332     LD1         {v7.8b},[x6]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    333     ADRP        x2, :got:gi1_table_edge_idx //table pointer
    334     LDR         x2, [x2, #:got_lo12:gi1_table_edge_idx]
    335 
    336     MOV         x6,x7                       //move wd to x6 loop_count
    337     movi        v1.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
    338     CMP         x7,#16                      //Compare wd with 16
    339 
    340     BLT         WIDTH_RESIDUE               //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    341     CMP         x8,#4                       //Compare ht with 4
    342     BLE         WD_16_HT_4_LOOP             //If jump to WD_16_HT_4_LOOP
    343 
    344 WIDTH_LOOP_16:
    345     mov         x5, x21                     //Loads pu1_avail
    346     mov         w7, w24                     //Loads wd
    347     CMP         x6,x7                       //col == wd
    348     LDRb        w20, [x5]                   //pu1_avail[0]
    349     csel        w8,w20,w8,EQ
    350 
    351     MOV         x20,#-1
    352     csel        x8, x20, x8,NE
    353     mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    354 
    355     CMP         x6,#16                      //if(col == 16)
    356     mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    357 
    358     BNE         SKIP_AU1_MASK_VAL
    359     LDRB        w8,[x5,#1]                  //pu1_avail[1]
    360     mov         v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    361     mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    362 
    363 SKIP_AU1_MASK_VAL:
    364     LDRB        w9,[x5,#2]                  //pu1_avail[2]
    365     LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    366     //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    367     //SUB x0, x0,#8
    368     CMP         x9,#0
    369 
    370     mov         w4, w25                     //Loads ht
    371     SUB         x20,x0,x1                   //pu1_src - src_strd
    372     csel        x8, x20, x8,EQ
    373 
    374     mov         w7, w24                     //Loads wd
    375     csel        x8, x3, x8,NE               //pu1_src_top_cpy
    376 
    377     SUB         x8,x8,#2                    //pu1_src - src_strd - 2
    378     ADD         x3,x3,#16
    379 
    380     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    381     LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    382     //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    383     //SUB x8, x8,#8
    384     SUB         x7,x7,x6                    //(wd - col)
    385 
    386     ADD         x7,x7,#14                   //15 + (wd - col)
    387     cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    388     mov         x8, x26                     //Loads *pu1_src
    389 
    390     ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
    391     cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    392 
    393 AU1_SRC_LEFT_LOOP:
    394     LDRH        w8,[x7]                     //load the value and increment by src_strd
    395     SUBS        x4,x4,#1                    //decrement the loop count
    396 
    397     STRH        w8,[x5],#2                  //store it in the stack pointer
    398     ADD         x7,x7,x1
    399 
    400     BNE         AU1_SRC_LEFT_LOOP
    401 
    402     ADD         x8,x0,x1                    //I *pu1_src + src_strd
    403     SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    404     MOV         x7,x12                      //row count, move ht_tmp to x7
    405 
    406     LD1         {v16.16b},[x8]              //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    407     //LD1 {v17.8b},[x8]                        //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    408     //SUB x8, x8,#8
    409 
    410     ADD         x8,x8,#16                   //I
    411     movi        v18.16b, #0
    412     LDRH        w5,[x8]                     //I pu1_src_cpy[src_strd + 16]
    413 
    414     mov         x10, x21                    //I Loads pu1_avail
    415     mov         v18.h[0], w5                //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    416     LDRB        w10,[x10,#2]                //I pu1_avail[2]
    417 
    418     CMP         x10,#0                      //I
    419     EXT         v18.16b,  v16.16b ,  v18.16b,#2 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    420     BNE         SIGN_UP_CHANGE_DONE         //I
    421 
    422     LDRB        w11,[x0]                    //I pu1_src_cpy[0]
    423     SUB         x4,x12,x7                   //I ht_tmp - row
    424 
    425     LDRB        w10,[x0,#1]                 //I pu1_src_cpy[0]
    426     LSL         x4,x4,#1                    //I (ht_tmp - row) * 2
    427 
    428     ADD         x9,x14,x4                   //I pu1_src_left_cpy[(ht_tmp - row) * 2]
    429     sub         x13,x9,#2
    430     LDRB        w5,[x13]                    //I load the value
    431 
    432     SUB         x8,x11,x5                   //I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    433     sub         x13,x9,#1
    434     LDRB        w5,[x13]                    //I load the value
    435 
    436     CMP         x8,#0                       //I
    437     SUB         x4,x10,x5                   //I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    438 
    439     movn        x20,#0
    440     csel        x8, x20, x8,LT              //I
    441     MOV         x20,#1
    442     csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    443 
    444     CMP         x4,#0                       //I
    445     mov         v17.b[0], w8                //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    446     movn        x20,#0
    447     csel        x4, x20, x4,LT              //I
    448 
    449     MOV         x20,#1
    450     csel        x4, x20, x4,GT              //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    451     mov         v17.b[1], w4                //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    452 
    453 SIGN_UP_CHANGE_DONE:
    454     LD1         {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    455     cmhi        v20.16b,  v5.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    456 
    457     cmhi        v22.16b,  v18.16b ,  v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    458     SUB         v22.16b,  v22.16b ,  v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    459 
    460     ADD         v18.16b,  v0.16b ,  v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
    461     ADD         v18.16b,  v18.16b ,  v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
    462 
    463     TBL         v18.16b, {v30.16b},v18.16b  //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    464     NEG         v17.16b, v22.16b            //I sign_up = vnegq_s8(sign_down)
    465 
    466     //TBL v19.8b, {v30.16b},v19.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    467     EXT         v17.16b,  v17.16b ,  v17.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14)
    468 
    469     Uxtl        v20.8h, v5.8b               //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    470     AND         v22.16b,  v18.16b ,  v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
    471     mov         v23.d[0],v22.d[1]
    472 
    473     Uxtl2       v18.8h, v5.16b              //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    474     UZP1        v31.8b, v22.8b, v23.8b
    475     UZP2        v23.8b, v22.8b, v23.8b      //I
    476     mov         v22.8b,v31.8b
    477 
    478     TBL         v22.8b, {v6.16b},v22.8b     //I
    479     TBL         v23.8b, {v7.16b},v23.8b     //I
    480     ZIP1        v31.8b, v22.8b, v23.8b
    481     ZIP2        v23.8b, v22.8b, v23.8b      //I
    482     mov         v22.8b,v31.8b
    483 
    484     mov         v5.16b, v16.16b             //I pu1_cur_row = pu1_next_row
    485     SADDW       v20.8h,  v20.8h ,  v22.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    486 
    487     SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    488     UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    489 
    490     SADDW       v18.8h,  v18.8h ,  v23.8b   //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    491     SMAX        v18.8h,  v18.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    492 
    493     UMIN        v18.8h,  v18.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    494     SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
    495 
    496 
    497 PU1_SRC_LOOP:
    498     ADD         x8,x0,x1,LSL #1             //II *pu1_src + src_strd
    499     xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
    500     ADD         x11,x8,x1                   //III *pu1_src + src_strd
    501 
    502     LD1         {v16.16b},[x8]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    503     //LD1 {v17.8b},[x8]                        //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    504     //SUB x8, x8,#8
    505     LD1         {v30.16b},[x11]             //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    506     //LD1 {v31.8b},[x11]                    //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    507     //SUB x11, x11,#8
    508 
    509     ADD         x8,x8,#16                   //II
    510     xtn2        v20.16b,  v18.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
    511     LDRH        w5,[x8]                     //II pu1_src_cpy[src_strd + 16]
    512 
    513     ADD         x11,x11,#16                 //III
    514     mov         v28.h[0], w5                //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    515     LDRH        w4,[x11]                    //III pu1_src_cpy[src_strd + 16]
    516 
    517     LDRB        w8,[x0,x1]                  //II pu1_src_cpy[0]
    518     EXT         v28.16b,  v16.16b ,  v28.16b,#2 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    519     SUB         x5,x12,x7                   //II ht_tmp - row
    520 
    521     LSL         x5,x5,#1                    //II (ht_tmp - row) * 2
    522     mov         v18.h[0], w4                //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    523     ADD         x9,x14,x5                   //II pu1_src_left_cpy[(ht_tmp - row) * 2]
    524 
    525     sub         x13,x9,#2
    526     LDRB        w11,[x13]                   //II load the value
    527     ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    528     SUB         x8,x8,x11                   //II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    529 
    530     CMP         x8,#0                       //II
//-----------------------------------------------------------------------------
// Tail of PU1_SRC_LOOP: software-pipelined rows "II" and "III" interleaved.
// The loop entry and row "I" are above this view. Per iteration pair:
//   sign_down = SIGN(cur_row - next_row_tmp)   (vector, cmhi/cmhi/SUB)
//   edge_idx  = 2 + sign_up + sign_down, looked up in gi1_table_edge_idx (x2),
//   masked with au1_mask (v1); U/V offsets come from v6/v7 via TBL after
//   de-interleaving with UZP1/UZP2, re-interleaved with ZIP1/ZIP2.
//   Result is widened (Uxtl), offset-added (SADDW), clipped (SMAX/UMIN),
//   narrowed (xtn/xtn2) and stored.
// Scalar side recomputes the two left-column sign_up lanes (b[0]=U, b[1]=V)
// from pu1_src_left_cpy (x14 base — inferred from comments; prologue is
// outside this view, TODO confirm). x7 = remaining row count, x12 = ht_tmp.
//-----------------------------------------------------------------------------
    531     EXT         v18.16b,  v30.16b ,  v18.16b,#2 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    532     LDRB        w11,[x0,#1]                 //II pu1_src_cpy[1]
    533 
    534     movn        x20,#0
    535     csel        x8, x20, x8,LT              //II
    536     cmhi        v22.16b,  v5.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    537     MOV         x20,#1
    538     csel        x8, x20, x8,GT              //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    539 
    540     sub         x13,x9,#1
    541     LDRB        w5,[x13]                    //II load the value
    542     mov         v17.b[0], w8                //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    543     SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
    544 
    545     SUB         x11,x11,x5                  //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    546     cmhi        v24.16b,  v28.16b ,  v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    547     CMP         x11,#0                      //II
    548 
    549     movn        x20,#0
    550     csel        x11, x20, x11,LT            //II
    551     SUB         v24.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    552     MOV         x20,#1
    553     csel        x11, x20, x11,GT            //II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    554 
    555     LDRB        w4,[x0,x1]                  //III pu1_src_cpy[0]
    556     LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    557     SUB         x5,x12,x7                   //III ht_tmp - row
    558 
    559     ADD         x10,x0,x1
    560     mov         v17.b[1], w11               //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    561     LSL         x5,x5,#1                    //III (ht_tmp - row) * 2
    562 
    563     ADD         x9,x14,x5                   //III pu1_src_left_cpy[(ht_tmp - row) * 2]
    564     ADD         v26.16b,  v0.16b ,  v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
    565     LDRB        w10,[x10,#1]                //III pu1_src_cpy[1]
    566 
    567     sub         x13,x9,#2
    568     LDRB        w5,[x13]                    //III load the value
    569     ADD         v26.16b,  v26.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
    570     SUB         x4,x4,x5                    //III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    571 
    572     mov         v22.d[1],v22.d[0]           //duplicate 8-byte edge_idx table into both halves for 16-lane TBL
    573     CMP         x4,#0                       //III
    574     sub         x13,x9,#1
    575     LDRB        w9,[x13]                    //III load the value
    576     TBL         v26.16b, {v22.16b},v26.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    577     NEG         v17.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
    578 
    579     movn        x20,#0
    580     csel        x4, x20, x4,LT              //III
    581     SUB         x10,x10,x9                  //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    582     //TBL v27.8b, {v22.16b},v27.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    583     EXT         v17.16b,  v17.16b ,  v17.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14)
    584 
    585     MOV         x20,#1
    586     csel        x4, x20, x4,GT              //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    587     AND         v26.16b,  v26.16b ,  v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
    588     CMP         x10,#0                      //III
    589 
    590     mov         v27.d[0],v26.d[1]
    591     UZP1        v31.8b, v26.8b, v27.8b      //II split interleaved edge_idx into U lanes
    592     UZP2        v27.8b, v26.8b, v27.8b      //II ... and V lanes
    593     mov         v26.8b,v31.8b
    594     mov         v17.b[0], w4                //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    595 
    596     movn        x20,#0
    597     csel        x10, x20, x10,LT            //III
    598     MOV         x20,#1
    599     csel        x10, x20, x10,GT            //III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    600     TBL         v24.8b, {v6.16b},v26.8b     //II U offsets from sao_offset_tbl_u
    601     cmhi        v20.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    602 
    603     cmhi        v22.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    604     TBL         v25.8b, {v7.16b},v27.8b     //II V offsets from sao_offset_tbl_v
    605     SUB         v22.16b,  v22.16b ,  v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    606 
    607     mov         v17.b[1], w10               //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    608     ZIP1        v31.8b, v24.8b, v25.8b      //II re-interleave U/V offsets
    609     ZIP2        v25.8b, v24.8b, v25.8b      //II
    610     mov         v24.8b,v31.8b
    611 
    612     Uxtl        v28.8h, v5.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    613     ADD         v18.16b,  v0.16b ,  v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
    614 
    615     LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    616     SADDW       v28.8h,  v28.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    617 
    618     ADD         v18.16b,  v18.16b ,  v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
    619     SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    620 
    621     UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    622     TBL         v18.16b, {v20.16b},v18.16b  //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    623     NEG         v17.16b, v22.16b            //III sign_up = vnegq_s8(sign_down)
    624 
    625     //TBL v19.8b, {v20.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    626     EXT         v17.16b,  v17.16b ,  v17.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14)
    627 
    628     Uxtl2       v26.8h, v5.16b              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    629     AND         v18.16b,  v18.16b ,  v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
    630 
    631     mov         v19.d[0],v18.d[1]
    632     UZP1        v31.8b, v18.8b, v19.8b      //III split interleaved edge_idx into U lanes
    633     UZP2        v19.8b, v18.8b, v19.8b      //III ... and V lanes
    634     mov         v18.8b,v31.8b
    635     TBL         v22.8b, {v6.16b},v18.8b     //III U offsets
    636     SADDW       v26.8h,  v26.8h ,  v25.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    637 
    638     mov         v5.16b, v30.16b             //III pu1_cur_row = pu1_next_row
    639     TBL         v23.8b, {v7.16b},v19.8b     //III V offsets
    640     SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    641 
    642     Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    643     UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    644 
    645     ZIP1        v31.8b, v22.8b, v23.8b      //III re-interleave U/V offsets
    646     ZIP2        v23.8b, v22.8b, v23.8b      //III
    647     mov         v22.8b,v31.8b
    648     xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
    649 
    650     xtn2        v28.16b,  v26.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
    651     SADDW       v20.8h,  v20.8h ,  v22.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    652 
    653     Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    654     SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    655 
    656     UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    657     SADDW       v18.8h,  v18.8h ,  v23.8b   //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    658 
    659     SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
    660     SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    661     CMP         x7,#1                       //>1 rows left: loop; <1: done; ==1: fall through to drain the last row
    662 
    663     ST1         { v28.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    664     UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    665 
    666     BGT         PU1_SRC_LOOP                //If not equal jump to PU1_SRC_LOOP
    667     BLT         INNER_LOOP_DONE
    668 
// Pipeline drain: exactly one row remains (x7 == 1); process it without
// further unrolling, then fall through to INNER_LOOP_DONE.
    669     ADD         x8,x0,x1,LSL #1             //*pu1_src + src_strd
    670     xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
    671 
    672     LDRB        w11,[x0,x1]                 //pu1_src_cpy[0]
    673     LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    674     //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    675     //SUB x8, x8,#8
    676     SUB         x4,x12,x7                   //ht_tmp - row
    677 
    678     ADD         x8,x8,#16
    679     xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
    680     LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
    681 
    682     LSL         x4,x4,#1                    //(ht_tmp - row) * 2
    683     mov         v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u16(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    684     ADD         x9,x14,x4                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
    685 
    686     sub         x13,x9,#2
    687     LDRB        w5,[x13]                    //load the value
    688     EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    689     SUB         x8,x11,x5                   //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    690 
    691     CMP         x8,#0
    692     ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    693     movn        x20,#0
    694     csel        x8, x20, x8,LT
    695 
    696     MOV         x20,#1
    697     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    698     LD1         {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    699 
    700     LDRB        w11,[x0,#1]                 //pu1_src_cpy[1]
    701     mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    702     sub         x13,x9,#1
    703     LDRB        w5,[x13]                    //load the value
    704 
    705     SUB         x4,x11,x5                   //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    706     cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    707     CMP         x4,#0
    708 
    709     movn        x20,#0
    710     csel        x4, x20, x4,LT
    711     cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    712     MOV         x20,#1
    713     csel        x4, x20, x4,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    714 
    715     mov         v17.b[1], w4                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    716     SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    717 
    718     ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    719     ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    720 
    721     mov         v30.d[1],v30.d[0]           //duplicate 8-byte edge_idx table for 16-lane TBL
    722     TBL         v26.16b, {v30.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    723     //TBL v27.8b, {v30.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    724 
    725     Uxtl        v20.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    726     AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    727     mov         v27.d[0],v26.d[1]
    728 
    729     Uxtl2       v18.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    730     UZP1        v31.8b, v26.8b, v27.8b      //split interleaved edge_idx into U lanes
    731     UZP2        v27.8b, v26.8b, v27.8b      //... and V lanes
    732     mov         v26.8b,v31.8b
    733 
    734     TBL         v24.8b, {v6.16b},v26.8b    //U offsets
    735     TBL         v25.8b, {v7.16b},v27.8b    //V offsets
    736     ZIP1        v31.8b, v24.8b, v25.8b     //re-interleave U/V offsets
    737     ZIP2        v25.8b, v24.8b, v25.8b
    738     mov         v24.8b,v31.8b
    739 
    740     SADDW       v20.8h,  v20.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    741     SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    742     UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    743 
    744     SADDW       v18.8h,  v18.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    745     SMAX        v18.8h,  v18.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    746     UMIN        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    747 
    748 
// Column epilogue: store the drained row, copy the saved left-column samples
// (au1_src_left_tmp on the stack) back to pu1_src_left, then decide whether
// another full 16-wide column, an 8-wide residue, or nothing remains.
    749 INNER_LOOP_DONE:
    750     mov         w8, w25                     //Loads ht
    751     xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    752     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    753 
    754     mov         x11, x17                    //Loads *pu1_src_left
    755     xtn2        v20.16b,  v18.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
    756 
    757 
// Copies 4 bytes (2 chroma pairs = 2 rows) per iteration; ht is even.
    758 SRC_LEFT_LOOP:
    759     LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
    760     SUBS        x8,x8,#2
    761     STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
    762     BNE         SRC_LEFT_LOOP
    763 
    764     SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    765     ST1         { v20.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    766     CMP         x6,#8                       //Check whether residue remains
    767 
    768     BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
    769     mov         w7, w24                     //Loads wd
    770     mov         x0, x27                     //Loads *pu1_src
    771     SUB         x7,x7,x6                    //columns already processed
    772     ADD         x0,x0,x7                    //advance pu1_src to the next column strip
    773     BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
    774     BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
    775 
    776 
//-----------------------------------------------------------------------------
// 16-wide column path for small heights: same class-2 (135-degree) edge
// offset as the main loop but without software pipelining — one row per
// iteration. Sets up au1_mask lanes from pu1_avail, saves the column's
// right edge into au1_src_left_tmp, then filters row by row.
//-----------------------------------------------------------------------------
    777 WD_16_HT_4_LOOP:
    778     mov         x5, x21                     //Loads pu1_avail
    779     mov         w7, w24                     //Loads wd
    780     CMP         x6,x7                       //col == wd
    781     LDRb        w20, [x5]                   //pu1_avail[0]
    782     csel        w8,w20,w8,EQ
    783 
    784     MOV         x20,#-1
    785     csel        x8, x20, x8,NE
    786     mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    787     mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
    788 
    789     CMP         x6,#16                      //if(col == 16)
    790     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    791     LDRB        w8,[x5,#1]                  //pu1_avail[1]
    792     mov         v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    793     mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    794 
    795 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    796     LDRB        w8,[x5,#2]                  //pu1_avail[2]
    797     CMP         x8,#0
    798 
// Top row source: pu1_src - src_strd when the block above is available,
// otherwise pu1_src_top_cpy (x3); minus 2 for the top-left chroma pair.
    799     SUB         x20,x0,x1                   //pu1_src - src_strd
    800     csel        x8, x20, x8,EQ
    801     csel        x8, x3, x8,NE               //pu1_src_top_cpy
    802     SUB         x8,x8,#2                    //pu1_src - src_strd - 2
    803     LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    804     //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    805     //SUB x8, x8,#8
    806 
    807     ADD         x3,x3,#16
    808     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    809     mov         w4, w25                     //Loads ht
    810     mov         x7, x24                     //Loads wd
    811     SUB         x7,x7,x6                    //(wd - col)
    812     ADD         x7,x7,#14                   //15 + (wd - col)
    813     mov         x8, x26                     //Loads *pu1_src
    814     ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
    815 
// Save the column's rightmost chroma pair of every row before it is
// overwritten; consumed later as the left neighbour of the next column.
    816 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    817     LDRH        w8,[x7]                     //load the value and increment by src_strd
    818     STRH        w8,[x5],#2                  //store it in the stack pointer
    819     ADD         x7,x7,x1
    820 
    821     SUBS        x4,x4,#1                    //decrement the loop count
    822     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    823 
    824     LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    825     //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    826     //SUB x0, x0,#8
    827 
    828     cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    829     cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    830     SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    831     movi        v18.16b, #0
    832     MOV         x7,x12                      //row count, move ht_tmp to x7
    833 
// Per-row body: build next_row_tmp shifted left by one chroma pair (EXT #2),
// recompute sign_up lanes 0/1 from the left column except on the first row
// when the top block is available, then table-lookup and apply the offsets.
    834 PU1_SRC_LOOP_WD_16_HT_4:
    835     movi        v18.16b, #0
    836     ADD         x8,x0,x1                    //*pu1_src + src_strd
    837     LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    838     //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    839     //SUB x8, x8,#8
    840 
    841     ADD         x8,x8,#16
    842     LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
    843     mov         v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u16(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    844     EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    845 
    846     CMP         x7,x12                      //first row only (x7 == ht_tmp)
    847     BLT         SIGN_UP_CHANGE_WD_16_HT_4
    848     mov         x5, x21                     //Loads pu1_avail
    849     LDRB        w5,[x5,#2]                  //pu1_avail[2]
    850     CMP         x5,#0
    851     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    852 
    853 SIGN_UP_CHANGE_WD_16_HT_4:
    854     LDRB        w8,[x0]                     //pu1_src_cpy[0]
    855     SUB         x5,x12,x7                   //ht_tmp - row
    856     LSL         x5,x5,#1                    //(ht_tmp - row) * 2
    857     ADD         x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
    858     sub         x13,x9,#2
    859     LDRB        w5,[x13]                    //load the value
    860     SUB         x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    861     CMP         x8,#0
    862     movn        x20,#0
    863     csel        x8, x20, x8,LT
    864     MOV         x20,#1
    865     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    866     mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    867 
    868     LDRB        w8,[x0,#1]                  //pu1_src_cpy[1]
    869     sub         x13,x9,#1
    870     LDRB        w5,[x13]                    //load the value
    871     SUB         x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    872     CMP         x8,#0
    873     movn        x20,#0
    874     csel        x8, x20, x8,LT
    875     MOV         x20,#1
    876     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    877     mov         v17.b[1], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    878 
    879 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    880     cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    881     cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    882     SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    883 
    884     ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    885     ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    886 
    887     LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    888     TBL         v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    889     //TBL v27.8b, {v22.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    890 
    891     AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    892     mov         v27.d[0],v26.d[1]
    893 
    894     NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    895     EXT         v17.16b,  v17.16b ,  v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
    896 
    897     UZP1        v31.8b, v26.8b, v27.8b      //split interleaved edge_idx into U lanes
    898     UZP2        v27.8b, v26.8b, v27.8b      //... and V lanes
    899     mov         v26.8b,v31.8b
    900     TBL         v24.8b, {v6.16b},v26.8b    //U offsets
    901     TBL         v25.8b, {v7.16b},v27.8b    //V offsets
    902     ZIP1        v31.8b, v24.8b, v25.8b     //re-interleave U/V offsets
    903     ZIP2        v25.8b, v24.8b, v25.8b
    904     mov         v24.8b,v31.8b
    905 
    906     Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    907     SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    908     SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    909     UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    910 
    911     Uxtl2       v26.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    912     SADDW       v26.8h,  v26.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    913     SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    914     UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    915 
    916     xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    917     xtn2        v28.16b,  v26.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
    918 
    919     ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    920 
    921     mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    922     SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
    923     BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    924 
    925     mov         w8, w25                     //Loads ht
    926     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    927     mov         x11, x17                    //Loads *pu1_src_left
    928 
// Copy saved left-column samples back to pu1_src_left, 2 rows per iteration.
    929 SRC_LEFT_LOOP_WD_16_HT_4:
    930     LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
    931     STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
    932 
    933     SUBS        x8,x8,#2
    934     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    935 
    936 
    937     SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    938     BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
    939     mov         w7, w24                     //Loads wd
    940     mov         x0, x27                     //Loads *pu1_src
    941     SUB         x7,x7,x6
    942     ADD         x0,x0,x7
    943     BGT         WD_16_HT_4_LOOP
    944 
    945 
//-----------------------------------------------------------------------------
// Residue path: processes the final 8-byte-wide strip (4 chroma pairs) when
// wd is not a multiple of 16. Same algorithm as the main path, but only the
// low half of the result is stored (ST1 {v28.8b}); the mask lanes 6/7 take
// pu1_avail[1] since lane 7 is the strip's right edge.
//-----------------------------------------------------------------------------
    946 WIDTH_RESIDUE:
    947     mov         w7, w24                     //Loads wd
    948     mov         x5, x21                     //Loads pu1_avail
    949     CMP         x6,x7                       //wd_residue == wd
    950     LDRb        w20, [x5]                   //pu1_avail[0]
    951     csel        w8,w20,w8,EQ
    952 
    953     MOV         x20,#-1
    954     csel        x8, x20, x8,NE
    955     mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    956     mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
    957 
    958     LDRB        w8,[x5,#1]                  //pu1_avail[1]
    959     mov         v1.b[6], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    960     mov         v1.b[7], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
    961 
    962     LDRB        w8,[x5,#2]                  //pu1_avail[2]
    963     CMP         x8,#0
    964 
    965     SUB         x20,x0,x1                   //pu1_src - src_strd
    966     csel        x8, x20, x8,EQ
    967     csel        x8, x3, x8,NE               //pu1_src_top_cpy when top not available
    968     SUB         x8,x8,#2                    //pu1_src - src_strd - 2
    969     LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    970     //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    971     //SUB x8, x8,#8
    972 
    973     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    974     mov         w4, w25                     //Loads ht
    975     mov         w7, w24                     //Loads wd
    976     mov         x8, x26                     //Loads *pu1_src
    977     SUB         x7,x7,#2                    //(wd - 2)
    978     ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 2)]
    979 
// Save the last chroma pair of every row before it is overwritten.
    980 AU1_SRC_LEFT_LOOP_RESIDUE:
    981     LDRH        w8,[x7]                     //load the value and increment by src_strd
    982     STRH        w8,[x5],#2                  //store it in the stack pointer
    983     ADD         x7,x7,x1
    984     SUBS        x4,x4,#1                    //decrement the loop count
    985     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    986 
    987     LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    988     //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    989     //SUB x0, x0,#8
    990 
    991     cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    992     cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    993     SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    994     MOV         x7,x12                      //row count, move ht_tmp to x7
    995 
// Per-row body: identical structure to PU1_SRC_LOOP_WD_16_HT_4 above,
// except only the low 8 result bytes are computed to completion and stored.
    996 PU1_SRC_LOOP_RESIDUE:
    997     movi        v18.16b, #0
    998     ADD         x8,x0,x1                    //*pu1_src + src_strd
    999     LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
   1000     //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
   1001     //SUB x8, x8,#8
   1002 
   1003     ADD         x8,x8,#16
   1004     LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
   1005     mov         v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u16(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
   1006     EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
   1007 
   1008     CMP         x7,x12                      //first row only (x7 == ht_tmp)
   1009     BLT         SIGN_UP_CHANGE_RESIDUE
   1010     mov         x5, x21                     //Loads pu1_avail
   1011     LDRB        w5,[x5,#2]                  //pu1_avail[2]
   1012     CMP         x5,#0
   1013     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
   1014 
   1015 SIGN_UP_CHANGE_RESIDUE:
   1016     LDRB        w8,[x0]                     //pu1_src_cpy[0]
   1017     SUB         x5,x12,x7                   //ht_tmp - row
   1018     LSL         x5,x5,#1                    //(ht_tmp - row) * 2
   1019     ADD         x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
   1020     sub         x13,x9,#2
   1021     LDRB        w5,[x13]                    //load the value
   1022     SUB         x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
   1023     CMP         x8,#0
   1024     movn        x20,#0
   1025     csel        x8, x20, x8,LT
   1026     MOV         x20,#1
   1027     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
   1028     mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
   1029 
   1030     LDRB        w8,[x0,#1]                  //pu1_src_cpy[1]
   1031     sub         x13,x9,#1
   1032     LDRB        w5,[x13]                    //load the value
   1033     SUB         x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
   1034     CMP         x8,#0
   1035     movn        x20,#0
   1036     csel        x8, x20, x8,LT
   1037     MOV         x20,#1
   1038     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
   1039     mov         v17.b[1], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
   1040 
   1041 SIGN_UP_CHANGE_DONE_RESIDUE:
   1042     cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
   1043     cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
   1044     SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
   1045 
   1046     ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
   1047     ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
   1048 
   1049     LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
   1050     mov         v22.d[1],v22.d[0]           //duplicate 8-byte table for 16-lane TBL
   1051     TBL         v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
   1052     //TBL v27.8b, {v22.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
   1053 
   1054     AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
   1055     mov         v27.d[0],v26.d[1]
   1056 
   1057     NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
   1058     EXT         v17.16b,  v17.16b ,  v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
   1059 
   1060     UZP1        v31.8b, v26.8b, v27.8b      //split interleaved edge_idx into U lanes
   1061     UZP2        v27.8b, v26.8b, v27.8b      //... and V lanes
   1062     mov         v26.8b,v31.8b
   1063     TBL         v24.8b, {v6.16b},v26.8b    //U offsets
   1064     TBL         v25.8b, {v7.16b},v27.8b    //V offsets
   1065     ZIP1        v31.8b, v24.8b, v25.8b     //re-interleave U/V offsets
   1066     ZIP2        v25.8b, v24.8b, v25.8b
   1067     mov         v24.8b,v31.8b
   1068 
   1069     Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
   1070     SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
   1071     SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
   1072     UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
   1073 
   1074     xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
   1075 
   1076     ST1         {v28.8b},[x0],x1            //store only the low 8 bytes (residue strip)
   1077 
   1078     mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
   1079     SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
   1080     BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP
   1081 
   1082     mov         w8, w25                     //Loads ht
   1083     mov         x11, x17                    //Loads *pu1_src_left
   1084     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
   1085 
// Copy saved left-column samples back to pu1_src_left, 2 rows per iteration.
   1086 SRC_LEFT_LOOP_RESIDUE:
   1087     LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
   1088     SUBS        x8,x8,#2
   1089     STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
   1090 
   1091     BNE         SRC_LEFT_LOOP_RESIDUE
   1092 
   1093 
   1094 RE_ASSINING_LOOP:
   1095     mov         w8, w25                     //Loads ht
   1096 
   1097     mov         x0, x26                     //Loads *pu1_src
   1098     SUB         x8,x8,#1                    //ht - 1
   1099 
   1100     mov         w7, w24                     //Loads wd
   1101 
   1102     LDRH        w9,[sp,#6]
   1103     madd        x6, x8, x1, x7              //wd - 2 + (ht - 1) * src_strd
   1104 
   1105     STRH        w9,[x0]                     //pu1_src_org[0] = u1_pos_0_0_tmp
   1106     ADD         x6,x0,x6                    //pu1_src[wd - 2 + (ht - 1) * src_strd]
   1107 
   1108     LDRH        w9,[sp,#8]
   1109     ADD         x12,sp,#10
   1110     sub         x13,x6,#2
   1111     STRH        w9,[x13]                    //pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
   1112 
   1113     mov         x4, x15                     //Loads pu1_src_top_left
   1114     LDRH        w10,[sp]                    //load u1_src_top_left_tmp from stack pointer
   1115     STRH        w10,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
   1116     mov         x3, x22                     //Loads pu1_src_top
   1117 
   1118 SRC_TOP_LOOP:
   1119     LD1         {v0.8b},[x12],#8            //pu1_src_top[col] = au1_src_top_tmp[col]
   1120     SUBS        x7,x7,#8                    //Decrement the width
   1121     ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
   1122     BNE         SRC_TOP_LOOP
   1123 
   1124 END_LOOPS:
   1125     ADD         sp,sp,#0xE0
   1126     // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
   1127     ldp         x27, x28,[sp],#16
   1128     ldp         x25, x26,[sp],#16
   1129     ldp         x23, x24,[sp],#16
   1130     ldp         x21, x22,[sp],#16
   1131     ldp         x19, x20,[sp],#16
   1132 
   1133     ret
   1134 
   1135 
   1136 
   1137