Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* ,:file
     21 //*  ihevc_sao_edge_offset_class0_chroma.s
     22 //*
     23 //* ,:brief
     24 //*  Contains the function definition for the SAO (sample adaptive offset)
     25 //*  edge offset class 0 filter on interleaved chroma (U/V) data.
     26 //*  The function is hand-written AArch64 NEON assembly.
     27 //*
     28 //* ,:author
     29 //*  Parthiban V
     30 //*
     31 //* ,:par List of Functions:
     32 //*  - ihevc_sao_edge_offset_class0_chroma_av8()
     33 //*
     34 //* ,:remarks
     35 //*  None
     36 //*
     37 //*******************************************************************************
     38 //*/
     39 //void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
     40 //                              WORD32 src_strd,
     41 //                              UWORD8 *pu1_src_left,
     42 //                              UWORD8 *pu1_src_top,
     43 //                              UWORD8 *pu1_src_top_left,
     44 //                              UWORD8 *pu1_src_top_right,
     45 //                              UWORD8 *pu1_src_bot_left,
     46 //                              UWORD8 *pu1_avail,
     47 //                              WORD8 *pi1_sao_offset_u,
     48 //                              WORD8 *pi1_sao_offset_v,
     49 //                              WORD32 wd,
     50 //                              WORD32 ht)
     51 //**************Variables Vs Registers*****************************************
     52 //x0 =>    *pu1_src
     53 //x1 =>    src_strd
     54 //x2 =>    *pu1_src_left
     55 //x3 =>    *pu1_src_top
     56 //x4    =>    *pu1_src_top_left
     57 //x7    =>    *pu1_avail
     58 //x8    =>    *pi1_sao_offset_u
     59 //x5    =>    *pi1_sao_offset_v
     60 //x9    =>    wd
     61 //x10=>    ht
     62 
     63 .text
     64 .p2align 2
     65 .include "ihevc_neon_macros.s"
     66 
     67 .globl gi1_table_edge_idx
     68 .globl ihevc_sao_edge_offset_class0_chroma_av8
     69 
     70 ihevc_sao_edge_offset_class0_chroma_av8:
     71 
     72     ldr         x8,[sp,#0]
     73     ldr         x9,[sp,#8]
     74     ldr         w10,[sp,#16]
     75     ldr         w11,[sp,#24]
     76 
     77 
     78 
     79     // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
     80     stp         x19, x20,[sp,#-16]!
     81     stp         x21, x22,[sp,#-16]!
     82     stp         x23, x24,[sp,#-16]!
     83     stp         x25, x26,[sp,#-16]!
     84 
     85     mov         x15,x4 // *pu1_src_top_left 40
     86     mov         x16,x5 // *pu1_src_top_right 44
     87     mov         x17,x6 // *pu1_src_bot_left 48
     88     mov         x21,x7 // *pu1_avail 52
     89     mov         x22,x8 // *pi1_sao_offset_u 56
     90     mov         x23,x9 // *pi1_sao_offset_v 60
     91     mov         x24,x10 // wd 64
     92     mov         x25,x11 // ht 68
     93 
     94     MOV         x9, x24                     //Loads wd
     95 
     96     MOV         x4, x15                     //Loads pu1_src_top_left
     97     ADD         x11,x3,x9                   //pu1_src_top[wd]
     98 
     99     MOV         x10, x25                    //Loads ht
    100     movi        v2.16b, #2                  //const_2 = vdupq_n_s8(2)
    101     SUB         x20,x11,#2
    102     LDRH        w12,[x20]                   //pu1_src_top[wd - 1]
    103 
    104     MOV         x7, x21                     //Loads pu1_avail
    105     movi        v4.8h, #0                   //const_min_clip = vdupq_n_s16(0)
    106     STRH        w12,[x4]                    //*pu1_src_top_left = pu1_src_top[wd - 1]
    107 
    108     MOV         x8, x22                     //Loads pi1_sao_offset_u
    109     movi        v6.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    110     SUB         x4,x10,#1                   //(ht - 1)
    111 
    112     ADRP        x14, :got:gi1_table_edge_idx //table pointer
    113     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    114     movi        v3.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
    115     mul         x4, x4, x1                  //(ht - 1) * src_strd
    116 
    117     MOV         x5, x23                     //Loads pi1_sao_offset_v
    118     LD1         {v7.8b},[x8]                //offset_tbl = vld1_s8(pi1_sao_offset_u)
    119     ADD         x4,x4,x0                    //pu1_src[(ht - 1) * src_strd]
    120 
    121     MOV         x6,x0                       //pu1_src_org
    122     LD1         {v5.8b},[x14]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    123     MOV         x12,x9                      //Move wd to x12 for loop count
    124 
    125 SRC_TOP_LOOP:                               //wd is always multiple of 8
    126     LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
    127     SUBS        x12,x12,#8                  //Decrement the loop counter by 8
    128     ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
    129     BNE         SRC_TOP_LOOP
    130     ADD         x6,x6,#14                   //pu1_src_org[14]
    131 
    132     MOV         x3,x2                       //pu1_src_left backup to reload later
    133     LD1         {v0.8b},[x5]                //offset_tbl = vld1_s8(pi1_sao_offset_v)
    134     CMP         x9,#16                      //Compare wd with 16
    135 
    136     BLT         WIDTH_RESIDUE               //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    137 
    138     MOV         x8,x9                       //move wd to x8 for loop count
    139 
    140 WIDTH_LOOP_16:
    141     CMP         x8,x9                       //if(col == wd)
    142     BNE         AU1_MASK_FF                 //jump to else part
    143     LDRB        w12,[x7]                    //pu1_avail[0]
    144     mov         v3.b[0], w12                //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    145     mov         v3.b[1], w12                //vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
    146     B           SKIP_AU1_MASK_FF            //Skip the else part
    147 
    148 AU1_MASK_FF:
    149     MOV         x12,#-1                     //move -1 to x12
    150     mov         v3.h[0], w12                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    151 
    152 SKIP_AU1_MASK_FF:
    153     CMP         x8,#16                      //If col == 16
    154     BNE         SKIP_MASKING_IF_NOT16       //If not skip masking
    155     LDRB        w12,[x7,#1]                 //pu1_avail[1]
    156     mov         v3.b[14], w12               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    157     mov         v3.b[15], w12               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    158 
    159 SKIP_MASKING_IF_NOT16:
    160     MOV         x12,x0                      //pu1_src_cpy = pu1_src
    161     MOV         x4,x10                      //move ht to x4 for loop count
    162 
    163 PU1_SRC_LOOP:
    164     LDRH        w11,[x2]                    //load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later
    165     LD1         {v19.16b},[x12],x1          //pu1_cur_row = vld1q_u8(pu1_src_cpy)
    166     //LD1 {v13.8b},[x12],x1                    //pu1_cur_row = vld1q_u8(pu1_src_cpy)
    167     //SUB x12, x12,#8
    168     SUB         x5,x9,x8                    //wd - col
    169 
    170     SUB         x14,x10,x4                  //ht - row
    171     mov         v21.h[7], w11               //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
    172     mul         x14, x14, x1                //(ht - row) * src_strd
    173 
    174     LD1         {v30.16b},[x12]             //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    175     //LD1 {v31.8b},[x12]                    //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    176     //SUB x12, x12,#8
    177     EXT         v21.16b,  v21.16b ,  v19.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
    178     SUB         x12,x12,x1
    179 
    180     LDRH        w11,[x2,#2]                 //II load pu1_src_left since ht - row =0
    181     cmhi        v16.16b,  v19.16b ,  v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    182     ADD         x5,x14,x5                   //(ht - row) * src_strd + (wd - col)
    183 
    184     mov         v28.h[7], w11               //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
    185     cmhi        v18.16b,  v21.16b ,  v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    186 
    187     LDRH        w14,[x6,x5]                 //pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
    188     SUB         v20.16b,  v18.16b ,  v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    189     SUB         x4,x4,#1
    190 
    191     LDRB        w11,[x12,#16]               //pu1_src_cpy[16]
    192     EXT         v28.16b,  v28.16b ,  v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
    193 
    194     mov         v21.b[0], w11               //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    195     cmhi        v26.16b,  v30.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    196 
    197     LDRB        w11,[x12,#17]               //pu1_src_cpy[17]
    198     cmhi        v24.16b,  v28.16b ,  v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    199     STRH        w14,[x2],#2                 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
    200 
    201     ADD         x12,x12,x1
    202     mov         v21.b[1], w11               //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    203     LDRB        w11,[x12,#16]               //II pu1_src_cpy[16]
    204 
    205     EXT         v21.16b,  v19.16b ,  v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
    206     mov         v28.b[0], w11               //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    207 
    208     LDRB        w11,[x12,#17]               //II pu1_src_cpy[17]
    209     cmhi        v16.16b,  v19.16b ,  v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    210     SUB         x12,x12,x1
    211 
    212     cmhi        v18.16b,  v21.16b ,  v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    213     mov         v28.b[1], w11               //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    214 
    215     SUB         v22.16b,  v18.16b ,  v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    216     EXT         v28.16b,  v30.16b ,  v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
    217 
    218     ADD         v21.16b,  v2.16b ,  v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
    219 
    220     mov         v5.d[1],v5.d[0]
    221     ADD         v21.16b,  v21.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
    222     TBL         v21.16b, {v5.16b},v21.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    223     SUB         v20.16b,  v24.16b ,  v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    224 
    225     cmhi        v26.16b,  v30.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    226 //    TBL v15.8b, {v10.16b},v15.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    227     cmhi        v24.16b,  v28.16b ,  v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    228 
    229     AND         v21.16b,  v21.16b ,  v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    230     mov         v23.d[0],v21.d[1]
    231     UZP1        v1.8b, v21.8b, v23.8b
    232     UZP2        v23.8b, v21.8b, v23.8b
    233     mov         v21.8b, v1.8b
    234 
    235     //mov v11.d[1],v0.d[0]
    236     //mov v14.d[1],v15.d[0]
    237     SUB         v22.16b,  v24.16b ,  v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    238     TBL         v16.8b, {v7.16b},v21.8b     //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    239     ADD         v24.16b,  v2.16b ,  v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
    240 
    241     Uxtl        v18.8h, v19.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    242     TBL         v17.8b, {v0.16b},v23.8b
    243     ADD         v24.16b,  v24.16b ,  v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
    244 
    245     //mov v17.d[0],v16.d[1]
    246     ZIP1        v1.8b, v16.8b, v17.8b
    247     ZIP2        v17.8b, v16.8b, v17.8b
    248     mov         v16.8b, v1.8b
    249     TBL         v24.16b, {v5.16b},v24.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    250     Uxtl2       v19.8h, v19.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    251 
    252     //mov v16.d[1],v17.d[0]
    253     SADDW       v18.8h,  v18.8h ,  v16.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    254     //TBL v25.8b, {v10.16b},v25.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    255     SMAX        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    256 
    257     AND         v24.16b,  v24.16b ,  v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
    258     mov         v25.d[0],v24.d[1]
    259     UMIN        v18.8h,  v18.8h ,  v6.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    260     UZP1        v1.8b, v24.8b, v25.8b
    261     UZP2        v25.8b, v24.8b, v25.8b      //II
    262     mov         v24.8b, v1.8b
    263 
    264     //mov v24.d[1],v25.d[0]
    265     SADDW       v19.8h,  v19.8h ,  v17.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    266     TBL         v26.8b, {v7.16b},v24.8b     //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    267     SMAX        v19.8h,  v19.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    268 
    269     UMIN        v19.8h,  v19.8h ,  v6.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    270     TBL         v27.8b, {v0.16b},v25.8b     //II
    271     xtn         v21.8b,  v18.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    272 
    273     //mov v27.d[0],v26.d[1]
    274     xtn         v23.8b,  v19.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])
    275     ZIP1        v1.8b, v26.8b, v27.8b
    276     ZIP2        v27.8b, v26.8b, v27.8b      //II
    277     mov         v26.8b, v1.8b
    278 
    279     //mov v26.d[1],v27.d[0]
    280     SUB         x5,x9,x8                    //II wd - col
    281     Uxtl        v28.8h, v30.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    282     SUB         x14,x10,x4                  //II ht - row
    283 
    284     mul         x14, x14, x1                //II (ht - row) * src_strd
    285     SADDW       v28.8h,  v28.8h ,  v26.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    286     ADD         x5,x14,x5                   //II (ht - row) * src_strd + (wd - col)
    287 
    288     LDRH        w14,[x6,x5]                 //II pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
    289     SMAX        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    290 
    291     STRH        w14,[x2],#2                 //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
    292     UMIN        v28.8h,  v28.8h ,  v6.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    293 
    294     //mov       v31.2d[0],v30.2d[1]
    295     Uxtl2       v30.8h, v30.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    296 
    297     SADDW       v30.8h,  v30.8h ,  v27.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    298     ST1         {v21.8b},[x12],#8           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    299     ST1         {v23.8b},[x12],x1
    300     SUB         x12,x12,#8
    301 
    302     SMAX        v30.8h,  v30.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    303     SUBS        x4,x4,#1                    //Decrement row by 1
    304     UMIN        v30.8h,  v30.8h ,  v6.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    305 
    306     xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
    307     xtn         v29.8b,  v30.8h             //II vmovn_s16(pi2_tmp_cur_row.val[1])
    308 
    309     ST1         {v28.8b, v29.8b},[x12],x1   //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    310 
    311     BNE         PU1_SRC_LOOP                //If not equal jump to the inner loop
    312 
    313     ADD         x0,x0,#16                   //pu1_src += 16
    314 
    315     SUBS        x8,x8,#16                   //Decrement column by 16
    316     CMP         x8,#8                       //Check whether residue remains
    317     MOV         x2,x3                       //Reload pu1_src_left
    318     BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
    319     BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
    320     BLT         END_LOOPS                   //Jump to end function
    321 
    322 WIDTH_RESIDUE:
    323     SUB         x6,x6,#14
    324     AND         x8,x9,#0xF                  //wd_rem = wd & 0xF
    325     CMP         x8,#0                       //Residue check
    326     BEQ         END_LOOPS                   //No Residue jump to end function
    327 
    328     CMP         x8,x9                       //if(wd_rem == wd)
    329     BNE         AU1_MASK_FF_RESIDUE         //jump to else part
    330     LDRB        w12,[x7]                    //pu1_avail[0]
    331     mov         v3.b[0], w12                //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    332     mov         v3.b[1], w12                //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    333     B           SKIP_AU1_MASK_FF_RESIDUE    //Skip the else part
    334 
    335 AU1_MASK_FF_RESIDUE:
    336     MOV         x12,#-1                     //move -1 to x12
    337     mov         v3.h[0], w12                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    338 
    339 SKIP_AU1_MASK_FF_RESIDUE:
    340     LDRB        w12,[x7,#1]                 //pu1_avail[1]
    341     mov         v3.b[6], w12                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    342     mov         v3.b[7], w12                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    343 
    344     MOV         x12,x0                      //pu1_src_cpy = pu1_src
    345     MOV         x4,x10                      //move ht to x4 for loop count
    346 
    347 PU1_SRC_LOOP_RESIDUE:
    348     LDRH        w11,[x2]                    //load pu1_src_left
    349     LD1         {v19.16b},[x12],x1          //pu1_cur_row = vld1q_u8(pu1_src_cpy)
    350     //LD1 {v13.8b},[x12],x1                    //pu1_cur_row = vld1q_u8(pu1_src_cpy)
    351     //SUB x12, x12,#8
    352     SUB         x5,x9,#2                    //wd - 2
    353 
    354     SUB         x14,x10,x4                  //(ht - row)
    355     mov         v21.h[7], w11               //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    356     LSL         x14,x14,#1                  //(ht - row) * 2
    357 
    358     LD1         {v30.16b},[x12]             //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
    359     //LD1 {v31.8b},[x12]                    //II pu1_cur_row = vld1q_u8(pu1_src_cpy)
    360     //SUB x12, x12,#8
    361     EXT         v21.16b,  v21.16b ,  v19.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    362     SUB         x12,x12,x1
    363 
    364     LDRH        w11,[x2,#2]                 //II load pu1_src_left
    365     cmhi        v16.16b,  v19.16b ,  v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    366     mul         x14, x14, x1                //(ht - row) * 2 * src_strd
    367 
    368     cmhi        v18.16b,  v21.16b ,  v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    369     mov         v28.h[7], w11               //II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    370 
    371     LDRB        w11,[x12,#16]               //pu1_src_cpy[16]
    372     SUB         v20.16b,  v18.16b ,  v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    373     ADD         x5,x14,x5                   //(ht - row) * 2 * src_strd + (wd - 2)
    374 
    375     mov         v21.b[0], w11               //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    376     EXT         v28.16b,  v28.16b ,  v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    377 
    378     LDRB        w11,[x12,#17]               //pu1_src_cpy[17]
    379     cmhi        v26.16b,  v30.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    380     LDRH        w14,[x6, x5]                //pu1_src_org[(ht - row)  * 2* src_strd + (wd - 2)]
    381 
    382     mov         v21.b[1], w11               //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    383     cmhi        v24.16b,  v28.16b ,  v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    384     ADD         x12,x12,x1
    385 
    386     STRH        w14,[x2],#2                 //pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
    387     EXT         v21.16b,  v19.16b ,  v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
    388     LDRB        w11,[x12,#16]               //II pu1_src_cpy[16]
    389 
    390     cmhi        v16.16b,  v19.16b ,  v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    391     mov         v28.b[0], w11               //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    392 
    393     LDRB        w11,[x12,#17]               //II pu1_src_cpy[17]
    394     cmhi        v18.16b,  v21.16b ,  v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    395     SUB         x4,x4,#1                    //II Decrement row by 1
    396 
    397     SUB         v22.16b,  v18.16b ,  v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    398     mov         v28.b[1], w11               //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    399     SUB         x12,x12,x1
    400 
    401     ADD         v21.16b,  v2.16b ,  v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
    402     EXT         v28.16b,  v30.16b ,  v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
    403 
    404     ADD         v21.16b,  v21.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
    405 
    406     SUB         v20.16b,  v24.16b ,  v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    407     TBL         v21.16b, {v5.16b},v21.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    408     cmhi        v26.16b,  v30.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    409 
    410     cmhi        v24.16b,  v28.16b ,  v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    411     //TBL v15.8b, {v10.16b},v15.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    412     SUB         v22.16b,  v24.16b ,  v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    413 
    414     AND         v21.16b,  v21.16b ,  v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    415     mov         v23.d[0],v21.d[1]
    416     UZP1        v1.8b, v21.8b, v23.8b
    417     UZP2        v23.8b, v21.8b, v23.8b
    418     mov         v21.8b, v1.8b
    419 
    420     ADD         v28.16b,  v2.16b ,  v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
    421     TBL         v16.8b, {v7.16b},v21.8b     //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    422     ADD         v28.16b,  v28.16b ,  v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
    423 
    424     Uxtl        v18.8h, v19.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    425     TBL         v17.8b, {v0.16b},v23.8b
    426     Uxtl        v24.8h, v30.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    427 
    428     ZIP1        v1.8b, v16.8b, v17.8b
    429     ZIP2        v17.8b, v16.8b, v17.8b
    430     mov         v16.8b, v1.8b
    431     TBL         v28.16b, {v5.16b},v28.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    432     SADDW       v18.8h,  v18.8h ,  v16.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    433 
    434     SMAX        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    435     //TBL v29.8b, {v10.16b},v29.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    436     UMIN        v18.8h,  v18.8h ,  v6.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    437 
    438     xtn         v18.8b,  v18.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    439     AND         v28.16b,  v28.16b ,  v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
    440     mov         v29.d[0],v28.d[1]
    441     SUB         x5,x9,#2                    //II wd - 2
    442     UZP1        v1.8b, v28.8b, v29.8b
    443     UZP2        v29.8b, v28.8b, v29.8b      //II
    444     mov         v28.8b, v1.8b
    445     SUB         x14,x10,x4                  //II (ht - row)
    446 
    447     LSL         x14,x14,#1                  //II (ht - row) * 2
    448     TBL         v26.8b, {v7.16b},v28.8b     //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    449     mul         x14, x14, x1                //II (ht - row) * 2 * src_strd
    450 
    451     ADD         x5,x14,x5                   //II (ht - row) * 2 * src_strd + (wd - 2)
    452     TBL         v27.8b, {v0.16b},v29.8b     //II
    453     LDRH        w14,[x6, x5]                //II pu1_src_org[(ht - row)  * 2* src_strd + (wd - 2)]
    454 
    455     ZIP1        v1.8b, v26.8b, v27.8b
    456     ZIP2        v27.8b, v26.8b, v27.8b      //II
    457     mov         v26.8b, v1.8b
    458     ST1         {v18.8b},[x12],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    459 
    460     STRH        w14,[x2],#2                 //II pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
    461     SADDW       v24.8h,  v24.8h ,  v26.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    462     SUBS        x4,x4,#1                    //Decrement row by 1
    463 
    464     SMAX        v24.8h,  v24.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    465     UMIN        v24.8h,  v24.8h ,  v6.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    466 
    467     xtn         v28.8b,  v24.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
    468 
    469     ST1         {v28.8b},[x12],x1           //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    470 
    471     BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to the pu1_src loop
    472 
    473 END_LOOPS:
    474     // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
    475     ldp         x25, x26,[sp],#16
    476     ldp         x23, x24,[sp],#16
    477     ldp         x21, x22,[sp],#16
    478     ldp         x19, x20,[sp],#16
    479 
    480     ret
    481 
    482 
    483 
    484 
    485 
    486