@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class0.s
@*
@* @brief
@*  Contains the function definition for sample adaptive offset (SAO) edge
@*  offset of class 0 (horizontal direction). The function is coded using
@*  NEON instructions and can be compiled using ARM RVCT.
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class0_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r7 =>  *pu1_avail
@r8 =>  *pi1_sao_offset
@r9 =>  wd
@r10=>  ht
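@
@**************Reference model*************************************************
@A minimal scalar sketch (in C, as a comment) of the class-0 / horizontal SAO
@pass that the NEON code below vectorises. It is assembled from the intrinsic
@comments in this file; the 16-wide tiling, the pu1_src_left/pu1_src_top
@bookkeeping and the hypothetical au1_mask_of() helper (standing for the
@au1_mask lane built from pu1_avail[]) are simplified:
@
@   for(row = 0; row < ht; row++)
@   {
@       for(col = 0; col < wd; col++)
@       {
@           UWORD8 left  = (col == 0) ? pu1_src_left[row] : pu1_src[row * src_strd + col - 1];
@           UWORD8 cur   = pu1_src[row * src_strd + col];
@           UWORD8 right = pu1_src[row * src_strd + col + 1];
@           WORD32 sign_left  = (cur < left)  ? -1 : (cur > left);
@           WORD32 sign_right = (cur < right) ? -1 : (cur > right);
@           WORD32 edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right];
@           if(0 == au1_mask_of(col))       /* neighbouring block unavailable */
@               edge_idx = 0;
@           pu1_src[row * src_strd + col] =
@               (UWORD8)CLIP3(cur + pi1_sao_offset[edge_idx], 0, 255);
@       }
@   }
@
@The last processed row is also copied to pu1_src_top[] and the right-most
@column to pu1_src_left[], so that neighbouring blocks can be filtered later.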

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class0_a9q

gi1_table_edge_idx_addr:
.long gi1_table_edge_idx - ulbl1 - 8
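@Position-independent access to gi1_table_edge_idx: the literal above holds the
@table address relative to (ulbl1 + 8), and the "add r14,r14,pc" at ulbl1 adds
@the run-time PC (which in ARM state reads as that instruction's address + 8),
@leaving the absolute address of the table in r14.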

ihevc_sao_edge_offset_class0_a9q:


    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    LDR         r9,[sp,#60]                 @Loads wd

    LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
    VMOV.I8     Q1,#2                       @const_2 = vdupq_n_s8(2)
    ADD         r11,r3,r9                   @pu1_src_top[wd]

    LDR         r10,[sp,#64]                @Loads ht
    VMOV.I16    Q2,#0                       @const_min_clip = vdupq_n_s16(0)
    LDRB        r12,[r11,#-1]               @pu1_src_top[wd - 1]

    LDR         r7,[sp,#52]                 @Loads pu1_avail
    VMOV.I16    Q3,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    LDR         r14, gi1_table_edge_idx_addr @table pointer
ulbl1:
    add         r14,r14,pc

    LDR         r8,[sp,#56]                 @Loads pi1_sao_offset
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    STRB        r12,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]

    MOV         r6,r0                       @pu1_src_org
    VLD1.8      D10,[r14]                   @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SUB         r4,r10,#1                   @(ht - 1)

    MOV         r12,r9                      @Move wd to r12 for loop count
    VLD1.8      D11,[r8]                    @offset_tbl = vld1_s8(pi1_sao_offset)
    MUL         r4,r4,r1                    @(ht - 1) * src_strd

    ADD         r4,r4,r0                    @pu1_src[(ht - 1) * src_strd]

SRC_TOP_LOOP:                               @wd is always multiple of 8
    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
    SUBS        r12,r12,#8                  @Decrement the loop counter by 8
    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP
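@The loop above amounts to
@   for(col = 0; col < wd; col++)
@       pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
@i.e. the last row is saved as the "top" reference before it gets filtered.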
    ADD         r6,r6,#15                   @pu1_src_org[16 - 1]

    CMP         r9,#16                      @Compare wd with 16
    MOV         r3,r2                       @pu1_src_left backup to reload later
    BLT         WIDTH_RESIDUE               @If wd < 16 jump to WIDTH_RESIDUE, which handles the 8-wide case

    MOV         r8,r9                       @move wd to r8 for loop count

WIDTH_LOOP_16:
    CMP         r8,r9                       @if(col == wd)
    BNE         AU1_MASK_FF                 @jump to else part
    LDRB        r12,[r7]                    @pu1_avail[0]
    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    B           SKIP_AU1_MASK_FF            @Skip the else part

AU1_MASK_FF:
    MOV         r12,#0xFF                   @move -1 to r12
    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

SKIP_AU1_MASK_FF:
    CMP         r8,#16                      @If col == 16
    BNE         SKIP_MASKING_IF_NOT16       @If not skip masking
    LDRB        r12,[r7,#1]                 @pu1_avail[1]
    VMOV.8      D9[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
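@au1_mask (Q4) starts out as all 0xFF. Lane 0 is overwritten with pu1_avail[0]
@for the left-most 16-pixel column and lane 15 with pu1_avail[1] for the
@right-most one, so the later VAND with the edge indices zeroes edge_idx (no
@offset) for pixels whose neighbouring block is unavailable.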

SKIP_MASKING_IF_NOT16:
    MOV         r12,r0                      @pu1_src_cpy = pu1_src
    MOV         r4,r10                      @move ht to r4 for loop count

PU1_SRC_LOOP:
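@Each pass of this loop filters two 16-pixel rows: the first-row ("I") and
@second-row ("II") computations are interleaved so that the loads, compares
@and table lookups of one row hide the NEON latencies of the other.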
    LDRB        r11,[r2]                    @load pu1_src_left[ht - row]; r2 is post-incremented later when the new left value is stored
    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    VLD1.8      D13,[r12], r1               @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    SUB         r12,#8
    SUB         r5,r9,r8                    @wd - col

    SUB         r14,r10,r4                  @ht - row
    VMOV.8      D15[7],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    MUL         r14,r14,r1                  @(ht - row) * src_strd

    VLD1.8      D26,[r12]!                  @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    VLD1.8      D27,[r12]                   @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    SUB         r12,#8
    VEXT.8      Q7,Q7,Q6,#15                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    ADD         r5,r14,r5                   @(ht - row) * src_strd + (wd - col)

    LDRB        r11,[r2, #1]                @II Iteration load pu1_src_left[ht - row + 1] for the second row
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    LDRB        r14,[r6,r5]                 @pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]

    SUB         r4,r4,#1
    VMOV.8      D29[7],r11                  @II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)

    SUB         r12,r12,r1                  @Decrement the pu1_src pointer by src_strd
    VSUB.I8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    STRB        r14,[r2],#1                 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]

    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
    VEXT.8      Q14,Q14,Q13,#15             @II Iteration pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    SUB         r5,r9,r8                    @II wd - col

    ADD         r12,r12,r1                  @Increment the pu1_src pointer by src_strd
    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    VCGT.U8     Q15,Q13,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]
    VEXT.8      Q7,Q6,Q7,#1                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
    SUB         r14,r10,r4                  @II ht - row

    VCLT.U8     Q0,Q13,Q14                  @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    SUB         r12,r12,r1                  @Decrement the pu1_src pointer by src_strd

    MUL         r14,r14,r1                  @II (ht - row) * src_strd
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    ADD         r5,r14,r5                   @II (ht - row) * src_strd + (wd - col)

    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VEXT.8      Q14,Q13,Q14,#1              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)

    LDRB        r14,[r6,r5]                 @II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
    VSUB.I8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUBS        r4,r4,#1                    @Decrement row by 1

    VADD.I8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)
    STRB        r14,[r2],#1                 @II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]

    VADD.I8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)
    VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VSUB.I8     Q10,Q0,Q15                  @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCGT.U8     Q15,Q13,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    VCLT.U8     Q0,Q13,Q14                  @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VSUB.I8     Q11,Q0,Q15                  @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q0,D26                      @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VADD.I8     Q14,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)
    VADD.I8     Q14,Q14,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)

    VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D28,{D10},D28               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VTBL.8      D29,{D10},D29               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VAND        Q14,Q14,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    VTBL.8      D17,{D11},D15               @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))

    VMOVL.U8    Q7,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VTBL.8      D30,{D11},D28               @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADDW.S8    Q7,Q7,D17                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q7,Q7,Q2                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VTBL.8      D31,{D11},D29               @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q7,Q7,Q3                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D18,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q0,Q0,D30                   @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOVN.I16   D19,Q7                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    VMAX.S16    Q0,Q0,Q2                    @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8    Q14,D27                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16    Q0,Q0,Q3                    @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D0,Q0                       @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q14,Q14,D31                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VST1.8      {D18,D19},[r12],r1          @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMIN.U16    Q14,Q14,Q3                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D1,Q14                      @II vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {D0,D1},[r12],r1            @II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP                @If not equal jump to the inner loop
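@wd is a multiple of 8, so after the 16-wide columns either nothing or exactly
@8 pixels per row remain; the 8-pixel case is handled by WIDTH_RESIDUE below.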

    ADD         r0,r0,#16                   @pu1_src += 16

    SUBS        r8,r8,#16                   @Decrement column by 16
    CMP         r8,#8                       @Check whether residue remains
    MOV         r2,r3                       @Reload pu1_src_left
    BEQ         WIDTH_RESIDUE               @If an 8-pixel residue remains jump to the residue loop
    BGT         WIDTH_LOOP_16               @If more 16-pixel columns remain jump to WIDTH_LOOP_16
    BLT         END_LOOPS                   @Jump to end function

WIDTH_RESIDUE:
    SUB         r6,r6,#15
    AND         r8,r9,#0xF                  @wd_rem = wd & 0xF
    CMP         r8,#0                       @Residue check
    BEQ         END_LOOPS                   @No Residue jump to end function

    CMP         r8,r9                       @if(wd_rem == wd)
    BNE         AU1_MASK_FF_RESIDUE         @jump to else part
    LDRB        r12,[r7]                    @pu1_avail[0]
    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    B           SKIP_AU1_MASK_FF_RESIDUE    @Skip the else part

AU1_MASK_FF_RESIDUE:
    MOV         r12,#0xFF                   @move -1 to r12
    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

SKIP_AU1_MASK_FF_RESIDUE:
    LDRB        r11,[r7,#1]                 @pu1_avail[1]
    SUB         r5,r9,#1                    @wd - 1

    MOV         r4,r10                      @move ht to r4 for loop count
    VMOV.8      D8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7), last of the 8 residue lanes
    MOV         r12,r0                      @pu1_src_cpy = pu1_src

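@Residue loop: one 8-pixel row per iteration. Only the low 64 bits (D28) are
@stored back, and the right-most pixel of each row (column wd - 1) is saved
@to pu1_src_left[] for later use.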
PU1_SRC_LOOP_RESIDUE:
    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    VLD1.8      D13,[r12]                   @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    SUB         r12,#8
    LDRB        r11,[r2]                    @load pu1_src_left
    VMOV.8      D15[7],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    VEXT.8      Q7,Q7,Q6,#15                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)

    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VSUB.I8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    VEXT.8      Q7,Q6,Q7,#1                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)

    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VSUB.I8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q12,Q1,Q10                  @edge_idx = vaddq_s8(const_2, sign_left)
    VADD.I8     Q12,Q12,Q11                 @edge_idx = vaddq_s8(edge_idx, sign_right)

    VTBL.8      D24,{D10},D24               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D25,{D10},D25               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q12,Q12,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q10,Q11                     @sign_left = vnegq_s8(sign_right)
    VEXT.8      Q10,Q10,Q11,#15             @sign_left = vextq_s8(sign_left, sign_left, 15)

    VTBL.8      D26,{D11},D24               @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D26                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q3                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    SUB         r14,r10,r4                  @ht - row
    MUL         r14,r14,r1                  @(ht - row) * src_strd
    ADD         r11,r14,r5                  @(ht - row) * src_strd + (wd - 1)
    LDRB        r14,[r6, r11]               @pu1_src_org[(ht - row) * src_strd + (wd - 1)]
    STRB        r14,[r2],#1                 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]

    VST1.8      {D28},[r12],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    SUBS        r4,r4,#1                    @Decrement row by 1
    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to the pu1_src loop

END_LOOPS:
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
