Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* ,:file
     21 //*  ihevc_sao_edge_offset_class0.s
     22 //*
     23 //* ,:brief
     24 //*  Contains the function definition for SAO edge offset class 0, in which
     25 //*  each pixel is compared with its left and right neighbours. The function
     26 //*  is coded in ARMv8 (AArch64) NEON assembly.
     27 //*
     28 //* ,:author
     29 //*  Parthiban V
     30 //*
     31 //* ,:par List of Functions:
     32 //*
     33 //*
     34 //* ,:remarks
     35 //*  None
     36 //*
     37 //*******************************************************************************
     38 //*/
     39 //void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
     40 //                              WORD32 src_strd,
     41 //                              UWORD8 *pu1_src_left,
     42 //                              UWORD8 *pu1_src_top,
     43 //                              UWORD8 *pu1_src_top_left,
     44 //                              UWORD8 *pu1_src_top_right,
     45 //                              UWORD8 *pu1_src_bot_left,
     46 //                              UWORD8 *pu1_avail,
     47 //                              WORD8 *pi1_sao_offset,
     48 //                              WORD32 wd,
     49 //                              WORD32 ht)
     50 //
     51 //**************Variables Vs Registers*****************************************
     52 //x0 =>    *pu1_src
     53 //x1 =>    src_strd
     54 //x2 =>    *pu1_src_left
     55 //x3 =>    *pu1_src_top
     56 //x4    =>    *pu1_src_top_left
     57 //x7    =>    *pu1_avail
     58 //x8    =>    *pi1_sao_offset
     59 //x9    =>    wd
     60 //x10=>    ht
     61 
     62 .text
     63 .p2align 2
     64 
     65 .include "ihevc_neon_macros.s"
     66 
     67 .globl gi1_table_edge_idx
     68 .globl ihevc_sao_edge_offset_class0_av8
     69 
     70 ihevc_sao_edge_offset_class0_av8:
     71 
     72 
     73     // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
     74 
     75     LDR         x8, [sp]                    // pi1_sao_offset
     76     LDR         x9,[sp,#8]                  //Loads wd
     77     AND         x9,x9,0xFFFFFFFF            // Since argument is passed as WORD32, Using only lower half of x9
     78     LDR         x10,[sp,#16]                //Loads ht
     79     AND         x10,x10,0xFFFFFFFF          // Since argument is passed as WORD32, Using only lower half of x10
     80 
     81 
     82     stp         x19, x20,[sp,#-16]!
     83 
     84     movi        v2.16b, #2                  //const_2 = vdupq_n_s8(2)
     85     ADD         x11,x3,x9                   //pu1_src_top[wd]
     86     SUB         x11,x11,#1
     87 
     88     movi        v4.8h, #0                   //const_min_clip = vdupq_n_s16(0)
     89     LDRB        w12,[x11]                   //pu1_src_top[wd - 1]
     90     ADD         x11,x11,#1
     91 
     92     movi        v6.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
     93     ADRP        x14, :got:gi1_table_edge_idx //table pointer
     94     LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
     95 
     96     movi        v3.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
     97     STRB        w12,[x4]                    //*pu1_src_top_left = pu1_src_top[wd - 1]
     98 
     99     MOV         x6,x0                       //pu1_src_org
    100     LD1         {v5.8b},[x14]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    101     SUB         x4,x10,#1                   //(ht - 1)
    102 
    103     MOV         x12,x9                      //Move wd to x12 for loop count
    104     LD1         {v7.8b},[x8]                //offset_tbl = vld1_s8(pi1_sao_offset)
    105     mul         x4, x4, x1                  //(ht - 1) * src_strd
    106 
    107     ADD         x4,x4,x0                    //pu1_src[(ht - 1) * src_strd]
    108 
    109 SRC_TOP_LOOP:                               //wd is always multiple of 8
    110     LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
    111     SUBS        x12,x12,#8                  //Decrement the loop counter by 8
    112     ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
    113     BNE         SRC_TOP_LOOP
    114     ADD         x6,x6,#15                   //pu1_src_org[16 - 1]
    115 
    116     CMP         x9,#16                      //Compare wd with 16
    117     MOV         x3,x2                       //pu1_src_left backup to reload later
    118     BLT         WIDTH_RESIDUE               //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    119 
    120     MOV         x8,x9                       //move wd to x8 for loop count
    121 
    122 WIDTH_LOOP_16:
    123     CMP         x8,x9                       //if(col == wd)
    124     BNE         AU1_MASK_FF                 //jump to else part
    125     LDRB        w12,[x7]                    //pu1_avail[0]
    126     mov         v3.b[0], w12                //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    127     B           SKIP_AU1_MASK_FF            //Skip the else part
    128 
    129 AU1_MASK_FF:
    130     MOV         x12,#0xFF                   //move -1 to x12
    131     mov         v3.b[0], w12                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    132 
    133 SKIP_AU1_MASK_FF:
    134     CMP         x8,#16                      //If col == 16
    135     BNE         SKIP_MASKING_IF_NOT16       //If not skip masking
    136     LDRB        w12,[x7,#1]                 //pu1_avail[1]
    137     mov         v3.b[15], w12               //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    138 
    139 SKIP_MASKING_IF_NOT16:
    140     MOV         x12,x0                      //pu1_src_cpy = pu1_src
    141     MOV         x4,x10                      //move ht to x4 for loop count
    142 
    143 PU1_SRC_LOOP:
    144     LDRB        w11,[x2]                    //load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later
    145     LD1         {v17.16b},[x12],x1          //pu1_cur_row = vld1q_u8(pu1_src_cpy)
    146     SUB         x5,x9,x8                    //wd - col
    147 
    148     SUB         x14,x10,x4                  //ht - row
    149     mov         v21.b[15], w11              //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    150     mul         x14, x14, x1                //(ht - row) * src_strd
    151 
    152     LD1         {v26.16b},[x12]             //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    153     EXT         v21.16b,  v21.16b ,  v17.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    154     ADD         x5,x14,x5                   //(ht - row) * src_strd + (wd - col)
    155 
    156     LDRB        w11,[x2, #1]                //II Iteration load pu1_src_left since ht - row + 1 =1
    157     cmhi        v16.16b,  v17.16b ,  v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    158     LDRB        w14,[x6,x5]                 //pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
    159 
    160     SUB         x4,x4,#1
    161     mov         v28.b[15], w11              //II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    162     cmhi        v18.16b,  v21.16b ,  v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    163 
    164     SUB         x12,x12,x1                  //Decrement the pu1_src pointer by src_strd
    165     SUB         v20.16b,  v18.16b ,  v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    166     STRB        w14,[x2],#1                 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
    167 
    168     LDRB        w11,[x12,#16]               //pu1_src_cpy[16]
    169     EXT         v28.16b,  v28.16b ,  v26.16b,#15 //II Iteration pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    170     SUB         x5,x9,x8                    //II wd - col
    171 
    172     ADD         x12,x12,x1                  //Increment the pu1_src pointer by src_strd
    173     mov         v21.b[0], w11               //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    174     cmhi        v30.16b,  v26.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    175 
    176     LDRB        w11,[x12,#16]               //II pu1_src_cpy[16]
    177     EXT         v21.16b,  v17.16b ,  v21.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
    178     SUB         x14,x10,x4                  //II ht - row
    179 
    180     cmhi        v0.16b,  v28.16b ,  v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    181     mov         v28.b[0], w11               //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    182     SUB         x12,x12,x1                  //Decrement the pu1_src pointer by src_strd
    183 
    184     mul         x14, x14, x1                //II (ht - row) * src_strd
    185     cmhi        v16.16b,  v17.16b ,  v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    186     ADD         x5,x14,x5                   //II (ht - row) * src_strd + (wd - col)
    187 
    188     cmhi        v18.16b,  v21.16b ,  v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    189     EXT         v28.16b,  v26.16b ,  v28.16b,#1 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
    190 
    191     LDRB        w14,[x6,x5]                 //II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
    192     SUB         v22.16b,  v18.16b ,  v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    193     SUBS        x4,x4,#1                    //Decrement row by 1
    194 
    195     ADD         v21.16b,  v2.16b ,  v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
    196     STRB        w14,[x2],#1                 //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
    197 
    198     ADD         v21.16b,  v21.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
    199     Uxtl        v18.8h, v17.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    200 
    201     SUB         v20.16b,  v0.16b ,  v30.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    202     TBL         v21.16b, {v5.16b},v21.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    203     cmhi        v30.16b,  v26.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    204 
    205     cmhi        v0.16b,  v28.16b ,  v26.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    206 //  TBL v15.8b, {v10.16b},v15.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    207     SUB         v22.16b,  v0.16b ,  v30.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    208 
    209     AND         v21.16b,  v21.16b ,  v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    210     TBL         v16.16b, {v7.16b},v21.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    211     Uxtl        v0.8h, v26.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    212 
    213     ADD         v28.16b,  v2.16b ,  v20.16b //II edge_idx = vaddq_s8(const_2, sign_left)
    214     ADD         v28.16b,  v28.16b ,  v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right)
    215 
    216     SADDW       v18.8h,  v18.8h ,  v16.8b
    217     TBL         v28.16b, {v5.16b},v28.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    218     SMAX        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    219 
    220 //  TBL v29.8b, {v10.16b},v29.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    221     UMIN        v18.8h,  v18.8h ,  v6.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    222 
    223     AND         v28.16b,  v28.16b ,  v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
    224 //  TBL v17.8b, {v11.16b},v15.8b                    //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    225 
    226     Uxtl2       v21.8h, v17.16b             //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    227     TBL         v30.16b, {v7.16b},v28.16b   //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    228     SADDW2      v21.8h,  v21.8h ,  v16.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    229 
    230     SMAX        v21.8h,  v21.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    231 //  TBL v31.8b, {v11.16b},v29.8b                    //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    232     UMIN        v21.8h,  v21.8h ,  v6.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    233 
    234     xtn         v18.8b,  v18.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    235     SADDW       v0.8h,  v0.8h ,  v30.8b
    236 
    237     xtn         v19.8b,  v21.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])
    238     SMAX        v0.8h,  v0.8h ,  v4.8h      //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    239 
    240     Uxtl2       v28.8h, v26.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    241     UMIN        v0.8h,  v0.8h ,  v6.8h      //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    242 
    243     xtn         v0.8b,  v0.8h               //II vmovn_s16(pi2_tmp_cur_row.val[0])
    244     SADDW2      v28.8h,  v28.8h ,  v30.16b  //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    245 
    246     SMAX        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    247     ST1         {v18.8b, v19.8b},[x12],x1   //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    248     UMIN        v28.8h,  v28.8h ,  v6.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    249 
    250     xtn         v1.8b,  v28.8h              //II vmovn_s16(pi2_tmp_cur_row.val[1])
    251 
    252     ST1         {v0.8b, v1.8b},[x12],x1     //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    253 
    254     BNE         PU1_SRC_LOOP                //If not equal jump to the inner loop
    255 
    256     ADD         x0,x0,#16                   //pu1_src += 16
    257 
    258     SUBS        x8,x8,#16                   //Decrement column by 16
    259     CMP         x8,#8                       //Check whether residue remains
    260     MOV         x2,x3                       //Reload pu1_src_left
    261     BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
    262     BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
    263     BLT         END_LOOPS                   //Jump to end function
    264 
    265 WIDTH_RESIDUE:
    266     SUB         x6,x6,#15
    267     AND         x8,x9,#0xF                  //wd_rem = wd & 0xF
    268     CMP         x8,#0                       //Residue check
    269     BEQ         END_LOOPS                   //No Residue jump to end function
    270 
    271     CMP         x8,x9                       //if(wd_rem == wd)
    272     BNE         AU1_MASK_FF_RESIDUE         //jump to else part
    273     LDRB        w12,[x7]                    //pu1_avail[0]
    274     mov         v3.b[0], w12                //vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    275     B           SKIP_AU1_MASK_FF_RESIDUE    //Skip the else part
    276 
    277 AU1_MASK_FF_RESIDUE:
    278     MOV         x12,#0xFF                   //move -s to x12
    279     mov         v3.b[0], w12                //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    280 
    281 SKIP_AU1_MASK_FF_RESIDUE:
    282     LDRB        w11,[x7,#1]                 //pu1_avail[1]
    283     SUB         x5,x9,#1                    //wd - 1
    284 
    285     MOV         x4,x10                      //move ht to x4 for loop count
    286     mov         v3.b[7], w11                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    287     MOV         x12,x0                      //pu1_src_cpy = pu1_src
    288 
    289 PU1_SRC_LOOP_RESIDUE:
    290     LD1         {v17.16b},[x12]             //pu1_cur_row = vld1q_u8(pu1_src_cpy)
    291     LDRB        w11,[x2]                    //load pu1_src_left
    292     mov         v21.b[15], w11              //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    293     EXT         v21.16b,  v21.16b ,  v17.16b,#15 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    294 
    295     cmhi        v16.16b,  v17.16b ,  v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    296     cmhi        v18.16b,  v21.16b ,  v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    297     SUB         v20.16b,  v18.16b ,  v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    298 
    299     LDRB        w11,[x12,#16]               //pu1_src_cpy[16]
    300     mov         v21.b[0], w11               //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    301     EXT         v21.16b,  v17.16b ,  v21.16b,#1 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
    302 
    303     cmhi        v16.16b,  v17.16b ,  v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    304     cmhi        v18.16b,  v21.16b ,  v17.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    305     SUB         v22.16b,  v18.16b ,  v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    306 
    307     ADD         v24.16b,  v2.16b ,  v20.16b //edge_idx = vaddq_s8(const_2, sign_left)
    308     ADD         v24.16b,  v24.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right)
    309 
    310     TBL         v24.16b, {v5.16b},v24.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    311 //  TBL v25.8b, {v10.16b},v25.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    312 
    313     AND         v24.16b,  v24.16b ,  v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    314 
    315     NEG         v20.16b, v22.16b            //sign_left = vnegq_s8(sign_right)
    316     EXT         v20.16b,  v20.16b ,  v22.16b,#15 //sign_left = vextq_s8(sign_left, sign_left, 15)
    317 
    318     TBL         v26.8b, {v7.16b},v24.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    319     Uxtl        v28.8h, v17.8b              //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    320     SADDW       v28.8h,  v28.8h ,  v26.8b
    321     SMAX        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    322     UMIN        v28.8h,  v28.8h ,  v6.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    323 
    324     xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    325 
    326     SUB         x14,x10,x4                  //ht - row
    327     mul         x14, x14, x1                //(ht - row) * src_strd
    328     ADD         x11,x14,x5                  //(ht - row) * src_strd + (wd - 1)
    329     LDRB        w14,[x6, x11]               //pu1_src_org[(ht - row) * src_strd + (wd - 1)]
    330     STRB        w14,[x2],#1                 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
    331 
    332     ST1         {v28.8b},[x12],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    333 
    334     SUBS        x4,x4,#1                    //Decrement row by 1
    335     BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to the pu1_src loop
    336 
    337 END_LOOPS:
    338     // LDMFD sp!,{x4-x12,x15}              //Reload the registers from SP
    339     ldp         x19, x20,[sp], #16
    340 
    341     ret
    342 
    343 
    344 
    345 
    346