// Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
      18 ///**
      19 //*******************************************************************************
      20 //* @file
      21 //*  ihevc_sao_edge_offset_class3_chroma.s
      22 //*
      23 //* @brief
      24 //*  Contains function definitions for SAO (sample adaptive offset) edge
      25 //* offset class 3 for chroma. Functions are coded using NEON intrinsics
      26 //* and can be compiled using ARM RVCT
      27 //*
      28 //* @author
      29 //*  Parthiban V
      30 //*
      31 //* @par List of Functions:
      32 //*
      33 //*
      34 //* @remarks
      35 //*  None
      36 //*
      37 //*******************************************************************************
      38 //*/
     39 //void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
     40 //                              WORD32 src_strd,
     41 //                              UWORD8 *pu1_src_left,
     42 //                              UWORD8 *pu1_src_top,
     43 //                              UWORD8 *pu1_src_top_left,
     44 //                              UWORD8 *pu1_src_top_right,
     45 //                              UWORD8 *pu1_src_bot_left,
     46 //                              UWORD8 *pu1_avail,
     47 //                              WORD8 *pi1_sao_offset_u,
     48 //                              WORD8 *pi1_sao_offset_v,
     49 //                              WORD32 wd,
     50 //                              WORD32 ht)
     51 //**************Variables Vs Registers*****************************************
     52 //x0 =>    *pu1_src
     53 //x1 =>    src_strd
     54 //x2 =>    *pu1_src_left
     55 //x3 =>    *pu1_src_top
     56 //x4    =>    *pu1_src_top_left
     57 //x5    =>    *pu1_avail
     58 //x6    =>    *pi1_sao_offset_u
     59 //x9 =>  *pi1_sao_offset_v
     60 //x7    =>    wd
     61 //x8=>    ht
     62 
     63 .text
     64 .p2align 2
     65 .include "ihevc_neon_macros.s"
     66 .globl gi1_table_edge_idx
     67 .globl ihevc_sao_edge_offset_class3_chroma_av8
     68 
     69 ihevc_sao_edge_offset_class3_chroma_av8:
     70 
     71 
     72     // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
     73 
     74 
     75     ldr         x8,[sp,#0]
     76     ldr         x9,[sp,#8]
     77     ldr         w10,[sp,#16]
     78     ldr         w11,[sp,#24]
     79 
     80 
     81     // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
     82     stp         x19, x20,[sp,#-16]!
     83     stp         x21, x22,[sp,#-16]!
     84     stp         x23, x24,[sp,#-16]!
     85     stp         x25, x26,[sp,#-16]!
     86     stp         x27, x28,[sp,#-16]!
     87 
     88     mov         x15,x4 // *pu1_src_top_left 0x28
     89     mov         x16,x5 // *pu1_src_top_right 0x2c
     90     mov         x17,x6 // *pu1_src_bot_left 0x30
     91     mov         x21,x7 // *pu1_avail 0x34
     92     mov         x22,x8 // *pi1_sao_offset_u 0x38
     93     mov         x23,x9 // *pi1_sao_offset_v 0x3c
     94     mov         x24,x10 // wd 0x40
     95     mov         x25,x11 // ht 0x44
     96 
     97 
     98     mov         w7, w24                     //Loads wd
     99     mov         w8, w25                     //Loads ht
    100     SUB         x9,x7,#2                    //wd - 2
    101 
    102     mov         x4, x15                     //Loads pu1_src_top_left
    103     LDRH        w10,[x3,x9]                 //pu1_src_top[wd - 2]
    104 
    105     MOV         x9,x7                       //Move width to x9 for loop count
    106 
    107     mov         x5, x21                     //Loads pu1_avail
    108     mov         x6, x22                     //Loads pi1_sao_offset_u
    109 
    110     mov         x22, x3                     //Store pu1_src_top in sp
    111     SUB         sp,sp,#0xE0                 //Decrement the stack pointer to store some temp arr values
    112 
    113     STRH        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 2]
    114     SUB         x10,x8,#1                   //ht-1
    115     madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
    116     ADD         x12,sp,#10                  //temp array
    117 
    118 AU1_SRC_TOP_LOOP:
    119     LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
    120     SUBS        x9,x9,#8                    //Decrement the loop count by 8
    121     ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    122     BNE         AU1_SRC_TOP_LOOP
    123 
    124 PU1_AVAIL_5_LOOP_U:
    125     LDRB        w9,[x5,#5]                  //pu1_avail[5]
    126     CMP         x9,#0
    127     SUB         x14,x7,#2                   //[wd - 2]
    128     LDRB        w9,[x0,x14]                 //u1_pos_0_0_tmp_u = pu1_src[wd - 2]
    129     SUB         x11,x7,#1                   //[wd - 1]
    130     LDRB        w10,[x0,x11]                //u1_pos_0_0_tmp_v = pu1_src[wd - 1]
    131     BEQ         PU1_AVAIL_6_LOOP_U
    132 
    133     mov         x11, x16                    //Load pu1_src_top_right from sp
    134     LDRB        w11,[x11]                   //pu1_src_top_right[0]
    135     SUB         x12,x9,x11                  //pu1_src[wd - 2] - pu1_src_top_right[0]
    136     CMP         x12,#0
    137     movn        x20,#0
    138     csel        x12, x20, x12,LT
    139     MOV         x20,#1
    140     csel        x12, x20, x12,GT            //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
    141     ADD         x11,x0,x1                   //pu1_src + src_strd
    142     SUB         x14,x14,#2                  //[wd - 2 - 2]
    143     LDRB        w14,[x11,x14]               //pu1_src[wd - 2 - 2 + src_strd]
    144     SUB         x11,x9,x14                  //pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
    145     CMP         x11,#0
    146     movn        x20,#0
    147     csel        x11, x20, x11,LT
    148     MOV         x20,#1
    149     csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    150     ADD         x11,x12,x11                 //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    151     ADD         x11,x11,#2                  //edge_idx
    152     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    153     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    154 
    155     LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    156     CMP         x12,#0                      //0 != edge_idx
    157     BEQ         PU1_AVAIL_5_LOOP_V
    158     LDRSB       x11,[x6,x12]                //pi1_sao_offset_u[edge_idx]
    159     ADD         x9,x9,x11                   //pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
    160     mov         x20,#255
    161     cmp         x9,x20
    162     csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    163 
    164 PU1_AVAIL_5_LOOP_V:
    165 
    166     mov         x11, x16                    //Load pu1_src_top_right from sp
    167     LDRB        w11,[x11,#1]                //pu1_src_top_right[1]
    168     SUB         x12,x10,x11                 //pu1_src[wd - 1] - pu1_src_top_right[1]
    169     CMP         x12,#0
    170     movn        x20,#0
    171     csel        x12, x20, x12,LT
    172     MOV         x20,#1
    173     csel        x12, x20, x12,GT            //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
    174     ADD         x11,x0,x1                   //pu1_src + src_strd
    175     SUB         x14,x7,#3                   //[wd - 1 - 2]
    176     LDRB        w14,[x11,x14]               //pu1_src[wd - 1 - 2 + src_strd]
    177     SUB         x11,x10,x14                 //pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
    178     CMP         x11,#0
    179     movn        x20,#0
    180     csel        x11, x20, x11,LT
    181     MOV         x20,#1
    182     csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    183     ADD         x11,x12,x11                 //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    184     ADD         x11,x11,#2                  //edge_idx
    185     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    186     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    187 
    188     LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    189     CMP         x12,#0                      //0 != edge_idx
    190     BEQ         PU1_AVAIL_6_LOOP_U
    191     mov         x11, x23                    //Loads pi1_sao_offset_v
    192     LDRSB       x11,[x11,x12]               //pi1_sao_offset_v[edge_idx]
    193     ADD         x10,x10,x11                 //pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
    194     mov         x20,#255
    195     cmp         x10,x20
    196     csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    197 
    198 PU1_AVAIL_6_LOOP_U:
    199     STRB        w9,[sp,#6]
    200     STRB        w10,[sp,#7]
    201     mov         x26, x0                     //Store pu1_src in sp
    202 
    203     LDRB        w10,[x5,#6]                 //pu1_avail[6]
    204     CMP         x10,#0
    205     SUB         x11,x8,#1                   //ht - 1
    206     madd        x12, x11, x1, x0            //pu1_src[(ht - 1) * src_strd]
    207     LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
    208     LDRB        w9,[x12,#1]                 //u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
    209     BEQ         PU1_AVAIL_3_LOOP
    210 
    211     SUB         x11,x12,x1                  //pu1_src[(ht - 1) * src_strd - src_strd]
    212     ADD         x11,x11,#2                  //pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    213     LDRB        w11,[x11]                   //Load pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    214     SUB         x11,x10,x11                 //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    215     CMP         x11,#0
    216     movn        x20,#0
    217     csel        x11, x20, x11,LT
    218     MOV         x20,#1
    219     csel        x11, x20, x11,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd])
    220 
    221     mov         x14, x17                    //Load pu1_src_bot_left from sp
    222     LDRB        w14,[x14]                   //Load pu1_src_bot_left[0]
    223     SUB         x14,x10,x14                 //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    224     CMP         x14,#0
    225     movn        x20,#0
    226     csel        x14, x20, x14,LT
    227     MOV         x20,#1
    228     csel        x14, x20, x14,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
    229 
    230     ADD         x11,x11,x14                 //Add 2 sign value
    231     ADD         x11,x11,#2                  //edge_idx
    232     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    233     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    234 
    235     LDRSB       x14,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    236     CMP         x14,#0
    237     BEQ         PU1_AVAIL_6_LOOP_V
    238     LDRSB       x11,[x6,x14]                //pi1_sao_offset_u[edge_idx]
    239     ADD         x10,x10,x11                 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    240     mov         x20,#255
    241     cmp         x10,x20
    242     csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    243 
    244 PU1_AVAIL_6_LOOP_V:
    245     ADD         x12,x12,#1                  //pu1_src[(ht - 1) * src_strd + 1]
    246     SUB         x11,x12,x1                  //pu1_src[(ht - 1) * src_strd + 1) - src_strd]
    247     ADD         x11,x11,#2                  //pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    248     LDRB        w11,[x11]                   //Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    249     SUB         x11,x9,x11                  //pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    250     CMP         x11,#0
    251     movn        x20,#0
    252     csel        x11, x20, x11,LT
    253     MOV         x20,#1
    254     csel        x11, x20, x11,GT            //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
    255 
    256     mov         x14, x17                    //Load pu1_src_bot_left from sp
    257     LDRB        w14,[x14,#1]                //Load pu1_src_bot_left[1]
    258     SUB         x14,x9,x14                  //pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
    259     CMP         x14,#0
    260     movn        x20,#0
    261     csel        x14, x20, x14,LT
    262     MOV         x20,#1
    263     csel        x14, x20, x14,GT            //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])
    264 
    265     ADD         x11,x11,x14                 //Add 2 sign value
    266     ADD         x11,x11,#2                  //edge_idx
    267     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    268     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    269 
    270     LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    271     CMP         x12,#0
    272     BEQ         PU1_AVAIL_3_LOOP
    273     mov         x14, x23                    //Loads pi1_sao_offset_v
    274     LDRSB       x11,[x14,x12]               //pi1_sao_offset_v[edge_idx]
    275     ADD         x9,x9,x11                   //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    276     mov         x20,#255
    277     cmp         x9,x20
    278     csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    279 
    280 PU1_AVAIL_3_LOOP:
    281     STRB        w10,[sp,#8]
    282     STRB        w9,[sp,#9]
    283     mov         x27, x2                     //Store pu1_src_left in sp
    284 
    285     MOV         x12,x8                      //Move ht
    286     MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
    287     LDRB        w11,[x5,#3]                 //pu1_avail[3]
    288     CMP         x11,#0
    289     BNE         PU1_AVAIL_2_LOOP
    290     SUB         x12,x12,#1                  //ht_tmp--
    291 
    292 PU1_AVAIL_2_LOOP:
    293     LDRB        w5,[x5,#2]                  //pu1_avail[2]
    294     CMP         x5,#0
    295     BNE         PU1_AVAIL_2_LOOP_END
    296 
    297     ADD         x0,x0,x1                    //pu1_src += src_strd
    298     SUB         x12,x12,#1                  //ht_tmp--
    299     ADD         x14,x14,#2                  //pu1_src_left_cpy += 2
    300 
    301 PU1_AVAIL_2_LOOP_END:
    302     mov         x28, x0                     //Store pu1_src in sp
    303     movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
    304     movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
    305     movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    306     LD1         {v6.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    307     mov         x6, x23                     //Loads pi1_sao_offset_v
    308     LD1         {v7.8b},[x6]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    309     ADRP        x2, :got:gi1_table_edge_idx //table pointer
    310     LDR         x2, [x2, #:got_lo12:gi1_table_edge_idx]
    311 
    312     //VLD1.8        D6,[x6]                        @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    313     movi        v1.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
    314     MOV         x6,x7                       //move wd to x6 loop_count
    315 
    316     CMP         x7,#16                      //Compare wd with 16
    317     BLT         WIDTH_RESIDUE               //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    318     CMP         x8,#4                       //Compare ht with 4
    319     BLE         WD_16_HT_4_LOOP             //If jump to WD_16_HT_4_LOOP
    320 
    321 WIDTH_LOOP_16:
    322     mov         w7, w24                     //Loads wd
    323     CMP         x6,x7                       //col == wd
    324     mov         x5, x21                     //Loads pu1_avail
    325 
    326     LDRb        w20, [x5]                   //pu1_avail[0]
    327     csel        w8,w20,w8,EQ
    328     MOV         x20,#-1
    329     csel        x8, x20, x8,NE
    330 
    331     mov         v1.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    332     LDRB        w11,[x5,#2]                 //pu1_avail[2]
    333 
    334     CMP         x6,#16                      //if(col == 16)
    335     mov         v1.8b[1], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    336 
    337     BNE         SKIP_AU1_MASK_VAL
    338     LDRB        w8,[x5,#1]                  //pu1_avail[1]
    339     mov         v1.16b[14], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    340     mov         v1.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    341 
    342 SKIP_AU1_MASK_VAL:
    343     CMP         x11,#0
    344     LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    345     //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    346     //SUB x0, x0,#8
    347     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    348 
    349     SUB         x20,x0,x1                   //pu1_src - src_strd
    350     csel        x8, x20, x8,EQ
    351     movi        v18.16b, #0
    352     csel        x8, x3, x8,NE
    353 
    354     ADD         x8,x8,#2                    //pu1_src - src_strd + 2
    355     LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    356     //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    357     //SUB x8, x8,#8
    358     ADD         x3,x3,#16
    359 
    360     mov         w4, w25                     //Loads ht
    361     cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    362     mov         w7, w24                     //Loads wd
    363 
    364     SUB         x7,x7,x6                    //(wd - col)
    365     cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    366     ADD         x7,x7,#14                   //15 + (wd - col)
    367 
    368     mov         x8, x26                     //Loads *pu1_src
    369     SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    370     ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
    371 
    372 AU1_SRC_LEFT_LOOP:
    373     LDRH        w8,[x7]                     //load the value and increment by src_strd
    374     SUBS        x4,x4,#1                    //decrement the loop count
    375 
    376     STRH        w8,[x5],#2                  //store it in the stack pointer
    377     ADD         x7,x7,x1
    378     BNE         AU1_SRC_LEFT_LOOP
    379 
    380 
    381     MOV         x7,x12                      //row count, move ht_tmp to x7
    382     movi        v18.16b, #0                 //I
    383     ADD         x11,x0,x1                   //I *pu1_src + src_strd
    384 
    385     SUB         x5,x12,x7                   //I ht_tmp - row
    386     LD1         {v16.16b},[x11]             //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    387     //LD1 {v17.8b},[x11]                    //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    388     //SUB x11, x11,#8
    389     ADD         x8,x14,x5,LSL #1            //I pu1_src_left_cpy[(ht_tmp - row) * 2]
    390 
    391     LDRH        w5,[x8,#2]                  //I
    392     mov         v18.4h[7], w5               //I vsetq_lane_u8
    393     mov         x11, x21                    //I Loads pu1_avail
    394 
    395     LDRB        w11,[x11,#2]                //I pu1_avail[2]
    396     EXT         v18.16b,  v18.16b ,  v16.16b,#14 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    397     CMP         x11,#0                      //I
    398     BNE         SIGN_UP_CHANGE_DONE         //I
    399 
    400     LDRB        w8,[x0,#14]                 //I pu1_src_cpy[14]
    401     SUB         x5,x0,x1                    //I
    402 
    403     LDRB        w11,[x5,#16]                //I load the value pu1_src_cpy[16 - src_strd]
    404 
    405     LDRB        w9,[x0,#15]                 //I pu1_src_cpy[15]
    406     SUB         x8,x8,x11                   //I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    407 
    408     LDRB        w10,[x5,#17]                //I load the value pu1_src_cpy[17 - src_strd]
    409     CMP         x8,#0                       //I
    410 
    411     movn        x20,#0
    412     csel        x8, x20, x8,LT              //I
    413     SUB         x9,x9,x10                   //I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    414 
    415     MOV         x20,#1
    416     csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    417     CMP         x9,#0                       //I
    418 
    419     movn        x20,#0
    420     csel        x9, x20, x9,LT              //I
    421     mov         v17.16b[14], w8             //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    422     MOV         x20,#1
    423     csel        x9, x20, x9,GT              //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    424 
    425     mov         v17.16b[15], w9             //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    426 
    427 SIGN_UP_CHANGE_DONE:
    428     LD1         {v28.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    429     cmhi        v20.16b,  v5.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    430 
    431     cmhi        v22.16b,  v18.16b ,  v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    432     SUB         v22.16b,  v22.16b ,  v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    433 
    434     ADD         v18.16b,  v0.16b ,  v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
    435     ADD         v18.16b,  v18.16b ,  v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
    436     TBL         v18.16b, {v28.16b},v18.16b  //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    437     NEG         v17.16b, v22.16b            //I sign_up = vnegq_s8(sign_down)
    438 
    439     //TBL v19.8b, {v28.16b},v19.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    440     EXT         v17.16b,  v17.16b ,  v17.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2)
    441 
    442     Uxtl        v20.8h, v5.8b               //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    443     AND         v18.16b,  v18.16b ,  v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
    444     mov         v19.d[0],v18.d[1]
    445 
    446     UZP1        v31.8b, v18.8b, v19.8b
    447     UZP2        v19.8b, v18.8b, v19.8b      //I
    448     mov         v18.8b,v31.8b
    449     TBL         v22.8b, {v6.16b},v18.8b     //I
    450     TBL         v23.8b, {v7.16b},v19.8b     //I
    451     ZIP1        v31.8b, v22.8b, v23.8b
    452     ZIP2        v23.8b, v22.8b, v23.8b      //I
    453     mov         v22.8b,v31.8b
    454 
    455     Uxtl2       v18.8h, v5.16b              //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    456     SADDW       v20.8h,  v20.8h ,  v22.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    457 
    458     SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    459     UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    460 
    461     mov         v5.16b, v16.16b             //I pu1_cur_row = pu1_next_row
    462     SADDW       v18.8h,  v18.8h ,  v23.8b   //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    463 
    464     SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
    465     SMAX        v18.8h,  v18.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    466 
    467     UMIN        v18.8h,  v18.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    468 
    469 
    470 PU1_SRC_LOOP:
    471     ADD         x11,x0,x1,LSL #1            //II *pu1_src + src_strd
    472     xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
    473     SUB         x5,x12,x7                   //II ht_tmp - row
    474 
    475     ADD         x4,x0,x1                    //III *pu1_src + src_strd
    476     xtn2        v20.16b,  v18.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
    477     ADD         x8,x14,x5,LSL #1            //II pu1_src_left_cpy[(ht_tmp - row) * 2]
    478 
    479     LDRH        w9,[x8,#2]
    480     LD1         {v16.16b},[x11]             //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    481     //LD1 {v17.8b},[x11]                    //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    482     //SUB x11, x11,#8
    483     LDRB        w10,[x4,#14]                //II pu1_src_cpy[14]
    484 
    485     LDRB        w8,[x4,#15]                 //II pu1_src_cpy[15]
    486     mov         v28.4h[7], w9               //II vsetq_lane_u8
    487     ADD         x4,x11,x1                   //III *pu1_src + src_strd
    488 
    489     LDRB        w5,[x0,#17]                 //II load the value pu1_src_cpy[17 - src_strd]
    490     LD1         {v30.16b},[x4]              //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    491     //LD1 {v31.8b},[x4]                    //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    492     //SUB x4, x4,#8
    493     LDRB        w11,[x0,#16]                //II load the value pu1_src_cpy[16 - src_strd]
    494 
    495     SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
    496     ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    497     SUB         x10,x10,x11                 //II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    498 
    499     CMP         x10,#0                      //II
    500     EXT         v28.16b,  v28.16b ,  v16.16b,#14 //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    501     SUB         x8,x8,x5                    //II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    502 
    503     movn        x20,#0
    504     csel        x10, x20, x10,LT            //II
    505     LD1         {v21.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    506     MOV         x20,#1
    507     csel        x10, x20, x10,GT            //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    508 
    509     CMP         x8,#0                       //II
    510     mov         v17.8b[14], w10             //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    511     movn        x20,#0
    512     csel        x8, x20, x8,LT              //II
    513 
    514     MOV         x20,#1
    515     csel        x8, x20, x8,GT              //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    516     SUB         x10,x12,x7                  //III ht_tmp - row
    517     mov         v17.8b[15], w8              //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    518     ADD         x11,x14,x10,LSL #1          //III pu1_src_left_cpy[(ht_tmp - row) * 2]
    519 
    520     CMP         x7,#1                       //III
    521     cmhi        v22.16b,  v5.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    522     BNE         NEXT_ROW_POINTER_ASSIGNED_2 //III
    523 
    524     mov         x5, x21                     //III Loads pu1_avail
    525     LDRB        w5,[x5,#3]                  //III pu1_avail[3]
    526     CMP         x5,#0                       //III
    527     SUB         x20,x4,#4                   //III pu1_src[src_strd - 2]
    528     csel        x11, x20, x11,NE
    529 
    530 NEXT_ROW_POINTER_ASSIGNED_2:
    531     LDRH        w5,[x11,#2]                 //III
    532     cmhi        v24.16b,  v28.16b ,  v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    533     ADD         x11,x0,x1                   //III
    534 
    535     LDRB        w9,[x11,#14]                //III pu1_src_cpy[14]
    536     mov         v18.4h[7], w5               //III vsetq_lane_u8
    537     LDRB        w8,[x11,#15]                //III pu1_src_cpy[15]
    538 
    539     LDRB        w11,[x0,#16]                //III load the value pu1_src_cpy[16 - src_strd]
    540     SUB         v24.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    541     LDRB        w10,[x0,#17]                //III load the value pu1_src_cpy[17 - src_strd]
    542 
    543     SUB         x9,x9,x11                   //III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    544     EXT         v18.16b,  v18.16b ,  v30.16b,#14 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    545     SUB         x10,x8,x10                  //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    546 
    547     CMP         x9,#0                       //III
    548     ADD         v26.16b,  v0.16b ,  v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
    549     movn        x20,#0
    550     csel        x9, x20, x9,LT              //III
    551 
    552     MOV         x20,#1
    553     csel        x9, x20, x9,GT              //III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    554     ADD         v26.16b,  v26.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
    555     CMP         x10,#0                      //III
    556 
    557     NEG         v17.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
    558     TBL         v26.16b, {v21.16b},v26.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    559     movn        x20,#0
    560     csel        x10, x20, x10,LT            //III
    561     MOV         x20,#1
    562     csel        x10, x20, x10,GT            //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    563 
    564     EXT         v17.16b,  v17.16b ,  v17.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2)
    565     //TBL v27.8b, {v21.16b},v27.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    566     cmhi        v22.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    567 
    568     mov         v17.16b[14], w9             //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    569     AND         v26.16b,  v26.16b ,  v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
    570     mov         v27.d[0],v26.d[1]
    571 
    572     mov         v17.16b[15], w10            //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    573     UZP1        v31.8b, v26.8b, v27.8b
    574     UZP2        v27.8b, v26.8b, v27.8b      //II
    575     mov         v26.8b,v31.8b
    576 
    577     cmhi        v20.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    578     TBL         v24.8b, {v6.16b},v26.8b     //II
    579     SUB         v22.16b,  v20.16b ,  v22.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    580 
    581     ADD         v18.16b,  v0.16b ,  v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
    582     TBL         v25.8b, {v7.16b},v27.8b     //II
    583     ADD         v18.16b,  v18.16b ,  v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
    584 
    585     LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    586     ZIP1        v31.8b, v24.8b, v25.8b
    587     ZIP2        v25.8b, v24.8b, v25.8b      //II
    588     mov         v24.8b,v31.8b
    589 
    590     Uxtl        v28.8h, v5.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    591     TBL         v18.16b, {v20.16b},v18.16b  //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    592     NEG         v17.16b, v22.16b            //III sign_up = vnegq_s8(sign_down)
    593 
    594     SADDW       v28.8h,  v28.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    595     //TBL v19.8b, {v20.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    596     EXT         v17.16b,  v17.16b ,  v17.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2)
    597 
    598     Uxtl2       v26.8h, v5.16b              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    599     AND         v18.16b,  v18.16b ,  v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
    600     mov         v19.d[0],v18.d[1]
    601 
    602     Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    603     UZP1        v31.8b, v18.8b, v19.8b
    604     UZP2        v19.8b, v18.8b, v19.8b      //III
    605     mov         v18.8b,v31.8b
    606 
    607     SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    608     TBL         v22.8b, {v6.16b},v18.8b     //III
    609     UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    610 
    611     SADDW       v26.8h,  v26.8h ,  v25.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    612     TBL         v23.8b, {v7.16b},v19.8b     //III
    613     SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    614 
    615     Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    616     ZIP1        v31.8b, v22.8b, v23.8b
    617     ZIP2        v23.8b, v22.8b, v23.8b      //III
    618     mov         v22.8b,v31.8b
    619 
    620     xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
    621     SADDW       v20.8h,  v20.8h ,  v22.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    622 
    623     mov         v5.16b, v30.16b             //III pu1_cur_row = pu1_next_row
    624     UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    625 
    626     SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
    627     SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    628     CMP         x7,#1                       //III
    629 
    630     xtn2        v28.16b,  v26.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
    631     UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    632 
    633     SADDW       v18.8h,  v18.8h ,  v23.8b   //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    634 
    635     SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    636 
    637     ST1         { v28.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    638     UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    639 
    640     BGT         PU1_SRC_LOOP                //If not equal jump to PU1_SRC_LOOP
    641     BLT         INNER_LOOP_DONE
    642 
    643 
    644     ADD         x11,x0,x1,LSL #1            //*pu1_src + src_strd
    645     xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
    646     SUB         x5,x12,x7                   //ht_tmp - row
    647 
    648     ADD         x8,x14,x5,LSL #1            //pu1_src_left_cpy[(ht_tmp - row) * 2]
    649     xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
    650     CMP         x7,#1
    651 
    652     LDRB        w4,[x0,#16]                 //load the value pu1_src_cpy[16 - src_strd]
    653     LD1         {v16.16b},[x11]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    654     //LD1 {v17.8b},[x11]                    //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    655     //SUB x11, x11,#8
    656     LDRB        w9,[x0,#17]                 //load the value pu1_src_cpy[17 - src_strd]
    657 
    658     BNE         NEXT_ROW_POINTER_ASSIGNED_3
    659     mov         x5, x21                     //Loads pu1_avail
    660     LDRB        w5,[x5,#3]                  //pu1_avail[3]
    661     CMP         x5,#0
    662     SUB         x20,x11,#4                  //pu1_src[src_strd - 2]
    663     csel        x8, x20, x8,NE
    664 
    665 NEXT_ROW_POINTER_ASSIGNED_3:
    666     LDRH        w5,[x8,#2]
    667     ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    668     LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
    669 
    670     SUB         x8,x8,x4                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    671     mov         v18.4h[7], w5               //vsetq_lane_u8
    672     LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
    673 
    674     CMP         x8,#0
    675     EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    676     SUB         x10,x10,x9                  //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    677 
    678     movn        x20,#0
    679     csel        x8, x20, x8,LT
    680     LD1         {v28.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    681     MOV         x20,#1
    682     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    683 
    684     CMP         x10,#0
    685     mov         v17.16b[14], w8             //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    686     movn        x20,#0
    687     csel        x10, x20, x10,LT
    688 
    689     MOV         x20,#1
    690     csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    691     mov         v17.16b[15], w10            //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    692     cmhi        v20.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    693 
    694     cmhi        v22.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    695     SUB         v22.16b,  v22.16b ,  v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    696 
    697     ADD         v18.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    698     ADD         v18.16b,  v18.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    699     TBL         v18.16b, {v28.16b},v18.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    700     //TBL v19.8b, {v28.16b},v19.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    701 
    702     AND         v18.16b,  v18.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    703     mov         v19.d[0],v18.d[1]
    704 
    705     Uxtl        v20.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    706     UZP1        v31.8b, v18.8b, v19.8b
    707     UZP2        v19.8b, v18.8b, v19.8b
    708     mov         v18.8b,v31.8b
    709 
    710     TBL         v22.8b, {v6.16b},v18.8b
    711     TBL         v23.8b, {v7.16b},v19.8b
    712 
    713     Uxtl2       v18.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    714     ZIP1        v31.8b, v22.8b, v23.8b
    715     ZIP2        v23.8b, v22.8b, v23.8b
    716     mov         v22.8b,v31.8b
    717 
    718     SADDW       v20.8h,  v20.8h ,  v22.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    719     SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    720     UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    721 
    722     SADDW       v18.8h,  v18.8h ,  v23.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    723     SMAX        v18.8h,  v18.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    724     UMIN        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    725 
    726 
    727 INNER_LOOP_DONE:
    728 
    729     mov         w8, w25                     //Loads ht
    730     xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
    731     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    732 
    733     LSL         x8,x8,#1
    734     xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
    735     mov         x11, x27                    //Loads *pu1_src_left
    736 
    737 SRC_LEFT_LOOP:
    738     LDR         w7, [x5],#4                 //au1_src_left_tmp[row] - 4 bytes = 2 rows of the 2-byte interleaved left column
    739     SUBS        x8,x8,#4                    //x8 = 2 * ht, the byte count of the left column; 4 bytes consumed per iteration
    740     STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
    741     BNE         SRC_LEFT_LOOP
    742 
    743     SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    744     ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    745     CMP         x6,#8                       //Check whether residue remains
    746 
    747     BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
    748     mov         w7, w24                     //Loads wd
    749     mov         x0, x28                     //Loads *pu1_src
    750     SUB         x7,x7,x6
    751     ADD         x0,x0,x7
    752     BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
    753     BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
    754 
    755 WD_16_HT_4_LOOP:
    756     mov         w7, w24                     //Loads wd
    757 
    758     mov         x5, x21                     //Loads pu1_avail
    759     CMP         x6,x7                       //col == wd
    760 
    761     LDRb        w20, [x5]                   //pu1_avail[0]
    762     csel        w8,w20,w8,EQ
    763     MOV         x20,#-1
    764     csel        x8, x20, x8,NE
    765     mov         v1.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    766 
    767     CMP         x6,#16                      //if(col == 16)
    768     mov         v1.8b[1], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    769 
    770     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    771     LDRB        w8,[x5,#1]                  //pu1_avail[1]
    772     mov         v1.16b[14], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    773     mov         v1.16b[15], w8              //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    774 
    775 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    776     LDRB        w11,[x5,#2]                 //pu1_avail[2]
    777     SUB         x20,x0,x1                   //pu1_src - src_strd
    778     csel        x8, x20, x8,EQ
    779 
    780     CMP         x11,#0
    781     csel        x8, x3, x8,NE
    782     LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    783     //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    784     //SUB x0, x0,#8
    785     ADD         x8,x8,#2                    //pu1_src - src_strd + 2
    786 
    787     ADD         x3,x3,#16
    788     LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    789     //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    790     //SUB x8, x8,#8
    791     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    792 
    793     mov         w4, w25                     //Loads ht
    794     cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    795     mov         w7, w24                     //Loads wd
    796 
    797     SUB         x7,x7,x6                    //(wd - col)
    798     cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    799     ADD         x7,x7,#14                   //15 + (wd - col)
    800 
    801     mov         x8, x26                     //Loads *pu1_src
    802     SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    803     ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
    804 
    805 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    //Back up the right-edge 2-byte pair of this 16-wide strip into the stack
    //buffer au1_src_left_tmp (x5), one row per iteration (x4 = ht, x1 = src_strd)
    806     LDRH        w8,[x7]                     //load the 2-byte right-edge pair for this row
    807     SUBS        x4,x4,#1                    //decrement the row count
    808 
    809     STRH        w8,[x5],#2                  //store it in the stack buffer au1_src_left_tmp
    810     ADD         x7,x7,x1                    //advance to the next row
    811     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    812 
    813     movi        v18.16b, #0
    814     MOV         x7,x12                      //row count, move ht_tmp to x7
    815 
    816 PU1_SRC_LOOP_WD_16_HT_4:
    817     ADD         x9,x0,x1                    //*pu1_src + src_strd
    818 
    819     mov         x5, x21                     //Loads pu1_avail
    820     LD1         {v16.16b},[x9]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    821     //LD1 {v17.8b},[x9]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    822     //SUB x9, x9,#8
    823     LDRB        w5,[x5,#3]                  //pu1_avail[3]
    824 
    825     SUB         x11,x12,x7                  //ht_tmp - row
    826     ADD         x8,x14,x11,LSL #1           //pu1_src_left_cpy[(ht_tmp - row) * 2]
    827     ADD         x8,x8,#2                    //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
    828 
    829     CMP         x5,#0
    830     BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    831     CMP         x7,#1
    832     SUB         x20,x9,#2                   //pu1_src[src_strd - 2]
    833     csel        x8, x20, x8,EQ
    834 
    835 NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    836     LDRH        w5,[x8]
    837     mov         v18.8h[7], w5               //vsetq_lane_u8
    838     EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    839 
    840     CMP         x7,x12
    841     BLT         SIGN_UP_CHANGE_WD_16_HT_4
    842     mov         x5, x21                     //Loads pu1_avail
    843     LDRB        w5,[x5,#2]                  //pu1_avail[2]
    844     CMP         x5,#0
    845     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    846 
    847 SIGN_UP_CHANGE_WD_16_HT_4:
    848     LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
    849     SUB         x9,x0,x1
    850 
    851     LDRB        w5,[x9,#16]                 //load the value pu1_src_cpy[16 - src_strd]
    852 
    853     LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
    854     SUB         x8,x8,x5                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    855 
    856     LDRB        w11,[x9,#17]                //load the value pu1_src_cpy[17 - src_strd]
    857     CMP         x8,#0
    858 
    859     movn        x20,#0
    860     csel        x8, x20, x8,LT
    861     SUB         x10,x10,x11                 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    862 
    863     MOV         x20,#1
    864     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    865 
    866     CMP         x10,#0
    867     mov         v17.16b[14], w8             //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    868     movn        x20,#0
    869     csel        x10, x20, x10,LT
    870 
    871     MOV         x20,#1
    872     csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    873     mov         v17.16b[15], w10            //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    874 
    875 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    876     LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    877     cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    878 
    879     cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    880     SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    881 
    882     ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    883     ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    884 
    885     mov         v20.d[1],v20.d[0]
    886     NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    887     TBL         v26.16b, {v20.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    888 
    889     //TBL v27.8b, {v20.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    890     EXT         v17.16b,  v17.16b ,  v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)
    891 
    892     Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    893     AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    894     mov         v27.d[0],v26.d[1]
    895 
    896     UZP1        v31.8b, v26.8b, v27.8b
    897     UZP2        v27.8b, v26.8b, v27.8b
    898     mov         v26.8b,v31.8b
    899     TBL         v24.8b, {v6.16b},v26.8b
    900     TBL         v25.8b, {v7.16b},v27.8b
    901     ZIP1        v31.8b, v24.8b, v25.8b
    902     ZIP2        v25.8b, v24.8b, v25.8b
    903     mov         v24.8b,v31.8b
    904 
    905     Uxtl2       v30.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    906     SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    907 
    908     SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    909     UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    910 
    911     mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    912     SADDW       v30.8h,  v30.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    913 
    914     SMAX        v30.8h,  v30.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    915     UMIN        v30.8h,  v30.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    916 
    917     xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    918     xtn2        v28.16b,  v30.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
    919 
    920     SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
    921     ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    922     BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    923 
    924     mov         w8, w25                     //Loads ht
    925     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    926     mov         x11, x27                    //Loads *pu1_src_left
    927 
    928 SRC_LEFT_LOOP_WD_16_HT_4:
    929     LDR         w7, [x5],#4                 //au1_src_left_tmp[row] - 4 bytes = 2 rows of the 2-byte interleaved left column
    930     SUBS        x8,x8,#2                    //x8 counts rows (x8 = ht); 2 rows consumed per iteration
    931     STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
    932     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    933 
    934     SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    935     BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
    936     BGT         WD_16_HT_4_LOOP             //If not equal jump to width_loop
    937 
    938 WIDTH_RESIDUE:
    939     mov         w7, w24                     //Loads wd
    940 
    941     mov         x5, x21                     //Loads pu1_avail
    942     CMP         x6,x7                       //wd_residue == wd
    943 
    944     LDRb        w20, [x5]                   //pu1_avail[0]
    945     csel        w8,w20,w8,EQ
    946 
    947     MOV         x20,#-1
    948     csel        x8, x20, x8,NE
    949     LDRB        w11,[x5,#1]                 //pu1_avail[1]
    950 
    951     LDRB        w9,[x5,#2]                  //pu1_avail[2]
    952     mov         v1.8b[0], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    953     CMP         x9,#0
    954 
    955     SUB         x20,x0,x1                   //pu1_src - src_strd
    956     csel        x10, x20, x10,EQ
    957     mov         v1.8b[1], w8                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    958     csel        x10, x3, x10,NE
    959 
    960     ADD         x10,x10,#2                  //pu1_src - src_strd + 2
    961     mov         v1.8b[6], w11               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    962     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    963 
    964     mov         w4, w25                     //Loads ht
    965     mov         v1.8b[7], w11               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    966     mov         w7, w24                     //Loads wd
    967 
    968     mov         x8, x26                     //Loads *pu1_src
    969     LD1         {v3.16b},[x10]              //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    970     //LD1 {v11.8b},[x10]                    //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    971     //SUB x10, x10,#8
    972     SUB         x7,x7,#2                    //(wd - 2)
    973 
    974     ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 2)]
    975 
    976 AU1_SRC_LEFT_LOOP_RESIDUE:
    //Back up the last 2-byte pair of every row (pu1_src[row * src_strd + wd - 2])
    //into the stack buffer au1_src_left_tmp (x5), one row per iteration (x4 = ht)
    977     LDRH        w8,[x7]                     //load the 2-byte pair at pu1_src[row * src_strd + (wd - 2)]
    978     ADD         x7,x7,x1                    //advance to the next row (x1 = src_strd)
    979     STRH        w8,[x5],#2                  //store it in the stack buffer au1_src_left_tmp
    980     SUBS        x4,x4,#1                    //decrement the row count
    981     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    982 
    983     LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    984     //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    985     //SUB x0, x0,#8
    986 
    987     movi        v18.16b, #0
    988     cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    989 
    990     cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    991     SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    992     MOV         x7,x12                      //row count, move ht_tmp to x7
    993 
    994 PU1_SRC_LOOP_RESIDUE:
    995     ADD         x9,x0,x1                    //*pu1_src + src_strd
    996 
    997     SUB         x11,x12,x7                  //ht_tmp - row
    998     LD1         {v16.16b},[x9]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    999     //LD1 {v17.8b},[x9]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
   1000     //SUB x9, x9,#8
   1001     mov         x5, x21                     //Loads pu1_avail
   1002 
   1003     LDRB        w5,[x5,#3]                  //pu1_avail[3]
   1004     ADD         x8,x14,x11,LSL #1           //pu1_src_left_cpy[(ht_tmp - row) * 2]
   1005 
   1006     CMP         x5,#0
   1007     ADD         x8,x8,#2                    //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
   1008 
   1009     BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
   1010     CMP         x7,#1
   1011     SUB         x20,x9,#2                   //pu1_src[src_strd - 2]
   1012     csel        x8, x20, x8,EQ
   1013 
   1014 NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
   1015     LDRB        w5,[x8]
   1016 
   1017     LDRB        w8,[x8,#1]
   1018     mov         v18.16b[14], w5             //vsetq_lane_u8
   1019     CMP         x7,x12
   1020 
   1021     mov         v18.16b[15], w8             //vsetq_lane_u8
   1022     EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
   1023 
   1024     BLT         SIGN_UP_CHANGE_RESIDUE
   1025     mov         x5, x21                     //Loads pu1_avail
   1026     LDRB        w5,[x5,#2]                  //pu1_avail[2]
   1027     CMP         x5,#0
   1028     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
   1029 
   1030 SIGN_UP_CHANGE_RESIDUE:
   1031     LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
   1032     SUB         x9,x0,x1
   1033 
   1034     LDRB        w5,[x9,#16]                 //load the value pu1_src_cpy[16 - src_strd]
   1035 
   1036     LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
   1037     SUB         x8,x8,x5                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
   1038 
   1039     LDRB        w11,[x9,#17]                //load the value pu1_src_cpy[17 - src_strd]
   1040     CMP         x8,#0
   1041 
   1042     movn        x20,#0
   1043     csel        x8, x20, x8,LT
   1044     SUB         x10,x10,x11                 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
   1045 
   1046     MOV         x20,#1
   1047     csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
   1048 
   1049     CMP         x10,#0
   1050     mov         v17.16b[14], w8             //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
   1051     movn        x20,#0
   1052     csel        x10, x20, x10,LT
   1053 
   1054     MOV         x20,#1
   1055     csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
   1056     mov         v17.16b[15], w10            //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
   1057 
    1058 SIGN_UP_CHANGE_DONE_RESIDUE:
    //Residue path: compute sign_down against the next row, form
    //edge_idx = 2 + sign_up + sign_down, and classify it through gi1_table_edge_idx
    1059     LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    1060     cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    1061 
    1062     cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    1063     SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    1064 
    1065     ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    1066     ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    1067 
    1068     mov         v20.d[1],v20.d[0]           //duplicate the 8-byte table into the upper half so the 16-byte TBL below can index it
    1069     NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    1070     TBL         v26.16b, {v20.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    1071 
    1072     //TBL v27.8b, {v20.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    1073     EXT         v17.16b,  v17.16b ,  v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2) - comment previously said 14, but the immediate is #2, matching the identical sequence in the other paths
   1074 
   1075     Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
   1076     AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
   1077     mov         v27.d[0],v26.d[1]
   1078 
   1079     UZP1        v31.8b, v26.8b, v27.8b
   1080     UZP2        v27.8b, v26.8b, v27.8b
   1081     mov         v26.8b,v31.8b
   1082     TBL         v24.8b, {v6.16b},v26.8b
   1083     TBL         v25.8b, {v7.16b},v27.8b
   1084     ZIP1        v31.8b, v24.8b, v25.8b
   1085     ZIP2        v25.8b, v24.8b, v25.8b
   1086     mov         v24.8b,v31.8b
   1087 
   1088     mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
   1089     SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
   1090 
   1091     SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
   1092     UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
   1093 
   1094     SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
   1095     xtn         v30.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
   1096 
   1097     ST1         {v30.8b},[x0],x1            //vst1q_u8(pu1_src_cpy, pu1_cur_row)
   1098 
   1099     BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP
   1100 
   1101     mov         w8, w25                     //Loads ht
   1102     ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
   1103 
   1104     mov         x11, x27                    //Loads *pu1_src_left
   1105 
   1106 SRC_LEFT_LOOP_RESIDUE:
   1107     LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
   1108     SUBS        x8,x8,#2
   1109     STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
   1110     BNE         SRC_LEFT_LOOP_RESIDUE
   1111 
   1112 
   1113 RE_ASSINING_LOOP:
   1114     mov         w7, w24                     //Loads wd
   1115     mov         w8, w25                     //Loads ht
   1116 
   1117     mov         x0, x26                     //Loads *pu1_src
   1118     SUB         x10,x7,#2                   //wd - 2
   1119 
   1120     LDRH        w9,[sp,#6]
   1121     SUB         x8,x8,#1                    //ht - 1
   1122 
   1123     STRH        w9,[x0,x10]                 //pu1_src_org[0] = u1_pos_0_0_tmp
   1124     madd        x6, x8, x1, x0              //pu1_src[(ht - 1) * src_strd]
   1125 
   1126     mov         x4, x15                     //Loads pu1_src_top_left
   1127 
   1128     LDRH        w9,[sp,#8]
   1129     ADD         x12,sp,#10
   1130 
   1131     STRH        w9,[x6]                     //pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
   1132 
   1133     LDRH        w10,[sp]                    //load u1_src_top_left_tmp from stack pointer
   1134     STRH        w10,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
   1135     mov         x3, x22                     //Loads pu1_src_top
   1136 
   1137 SRC_TOP_LOOP:
   1138     LD1         {v0.8b},[x12],#8            //pu1_src_top[col] = au1_src_top_tmp[col]
   1139     SUBS        x7,x7,#8                    //Decrement the width
   1140     ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
   1141     BNE         SRC_TOP_LOOP
   1142 
   1143 END_LOOPS:
   1144     ADD         sp,sp,#0xE0
   1145     // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
   1146     ldp         x27, x28,[sp],#16
   1147     ldp         x25, x26,[sp],#16
   1148     ldp         x23, x24,[sp],#16
   1149     ldp         x21, x22,[sp],#16
   1150     ldp         x19, x20,[sp],#16
   1151 
   1152     ret
   1153 
   1154 
   1155 
   1156