      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
      20 @* @file
     21 @*  ihevc_sao_edge_offset_class2.s
     22 @*
      23 @* @brief
      24 @*  Contains function definitions for SAO edge offset class 2 (135 degree
      25 @*  diagonal) filtering. Functions are coded using NEON intrinsics and can
      26 @*  be compiled using ARM RVCT
     27 @*
      28 @* @author
     29 @*  Parthiban V
     30 @*
      31 @* @par List of Functions:
      32 @*  ihevc_sao_edge_offset_class2_a9q()
     33 @*
      34 @* @remarks
     35 @*  None
     36 @*
     37 @*******************************************************************************
     38 @*/
     39 @void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
     40 @                              WORD32 src_strd,
     41 @                              UWORD8 *pu1_src_left,
     42 @                              UWORD8 *pu1_src_top,
     43 @                              UWORD8 *pu1_src_top_left,
     44 @                              UWORD8 *pu1_src_top_right,
     45 @                              UWORD8 *pu1_src_bot_left,
     46 @                              UWORD8 *pu1_avail,
     47 @                              WORD8 *pi1_sao_offset,
     48 @                              WORD32 wd,
     49 @                              WORD32 ht)
     50 @**************Variables Vs Registers*****************************************
     51 @r0 =>  *pu1_src
     52 @r1 =>  src_strd
     53 @r2 =>  *pu1_src_left
     54 @r3 =>  *pu1_src_top
     55 @r4 =>  *pu1_src_top_left
     56 @r5 =>  *pu1_avail
     57 @r6 =>  *pi1_sao_offset
     58 @r7 =>  wd
     59 @r8=>   ht
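         @
         @ A rough scalar C reference of the per pixel class 2 (135 degree diagonal)
         @ edge offset that this kernel vectorizes is sketched below; variable names
         @ follow the comments in the code, boundary/availability handling is omitted,
         @ and it is illustrative only:
         @
         @   for(row = 0; row < ht; row++)
         @   {
         @       for(col = 0; col < wd; col++)
         @       {
         @           sign_up   = SIGN(pu1_src[col] - pu1_src[col - 1 - src_strd]);
         @           sign_down = SIGN(pu1_src[col] - pu1_src[col + 1 + src_strd]);
         @           edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
         @           if(edge_idx != 0)
         @               pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
         @                                    0, (1 << bit_depth) - 1);
         @       }
         @       pu1_src += src_strd;
         @   }
         @
         @ where SIGN(x) is 1 for x > 0, -1 for x < 0 and 0 otherwise.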
     60 
     61 .text
     62 .syntax unified
     63 .p2align 2
     64 
     65 .extern gi1_table_edge_idx
     66 .globl ihevc_sao_edge_offset_class2_a9q
     67 
     68 gi1_table_edge_idx_addr_1:
     69 .long gi1_table_edge_idx - ulbl1 - 8
     70 
     71 gi1_table_edge_idx_addr_2:
     72 .long gi1_table_edge_idx - ulbl2 - 8
     73 
     74 gi1_table_edge_idx_addr_3:
     75 .long gi1_table_edge_idx - ulbl3 - 8
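         @ Each literal above holds (gi1_table_edge_idx - ulblN - 8).  The LDR of a
         @ literal is followed by an "add rX,rX,pc" at label ulblN, and in ARM state
         @ pc reads as the address of that ADD plus 8, so the sum gives the absolute
         @ address of gi1_table_edge_idx while keeping the access position independent.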
     76 
     77 ihevc_sao_edge_offset_class2_a9q:
     78 
     79 
     80     STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
     81     LDR         r7,[sp,#0x3C]               @Loads wd
     82 
     83     LDR         r8,[sp,#0x40]               @Loads ht
     84     SUB         r9,r7,#1                    @wd - 1
     85 
     86     LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
     87     LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
     88 
     89     STR         r0,[sp,#0x2C]               @Store pu1_src in sp
     90     MOV         r9,r7                       @Move width to r9 for loop count
     91 
     92     STR         r2,[sp,#0x30]               @Store pu1_src_left in sp
     93     LDR         r5,[sp,#0x34]               @Loads pu1_avail
     94     LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
     95     STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
     96 
     97     SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values
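         @ Stack layout after this SUB, as used by the loads/stores below
         @ (offsets are from the new sp):
         @   [sp]          u1_src_top_left_tmp
         @   [sp,#0x02]    au1_src_top_tmp[]
         @   [sp,#0x42]    au1_src_left_tmp[]
         @   [sp,#0x90]    pu1_src pointer for the current column block
         @   [sp,#0xBC]    pu1_src_top_left      [sp,#0xC0]  pu1_src
         @   [sp,#0xC4]    pu1_src_left          [sp,#0xC8]  pu1_avail
         @   [sp,#0xCC]    pu1_src_top           [sp,#0xD0]  wd
         @   [sp,#0xD4]    ht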
     98 
     99     STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    100     SUB         r10,r8,#1                   @ht-1
    101     MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    102     ADD         r12,sp,#0x02                @temp array
    103 
    104 AU1_SRC_TOP_LOOP:
    105     VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    106     SUBS        r9,r9,#8                    @Decrement the loop count by 8
    107     VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    108     BNE         AU1_SRC_TOP_LOOP
    109 
    110 PU1_AVAIL_4_LOOP:
    111     LDRB        r10,[r5,#4]                 @pu1_avail[4]
    112     CMP         r10,#0
    113     LDRB        r9,[r0]                     @u1_pos_0_0_tmp = pu1_src[0]
    114     BEQ         PU1_AVAIL_7_LOOP
    115 
    116     LDRB        r11,[r4]                    @pu1_src_top_left[0]
    117     ADD         r14,r0,r1                   @pu1_src + src_strd
    118 
    119     SUBS        r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
    120     LDRB        r4,[r14,#1]                 @pu1_src[1 + src_strd]
    121 
    122     MVNLT       r12,#0
    123     MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])
    124 
    125     LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
    126 ulbl1:
    127     add         r14,r14,pc
    128     SUBS        r11,r9,r4                   @pu1_src[0] - pu1_src[1 + src_strd]
    129 
    130     MVNLT       r11,#0
    131     MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    132     ADD         r4,r12,r11                  @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    133     ADD         r4,r4,#2                    @edge_idx
    134 
    135     LDRSB       r12,[r14,r4]                @edge_idx = gi1_table_edge_idx[edge_idx]
    136     CMP         r12,#0                      @0 != edge_idx
    137     BEQ         PU1_AVAIL_7_LOOP
    138     LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    139     ADD         r9,r9,r10                   @pu1_src[0] + pi1_sao_offset[edge_idx]
    140     USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    141 
    142 PU1_AVAIL_7_LOOP:
    143     LDRB        r14,[r5,#7]                 @pu1_avail[7]
    144     CMP         r14,#0
    145     SUB         r10,r7,#1                   @wd - 1
    146     SUB         r11,r8,#1                   @ht - 1
    147     MLA         r12,r11,r1,r10              @wd - 1 + (ht - 1) * src_strd
    148     ADD         r12,r12,r0                  @pu1_src[wd - 1 + (ht - 1) * src_strd]
    149     LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
    150     BEQ         PU1_AVAIL
    151 
    152     SUB         r4,r12,r1                   @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    153     LDRB        r11,[r4,#-1]                @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    154     ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
    155 
    156     SUBS        r11,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd]
    157     LDRB        r4,[r14,#1]                 @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    158 
    159     MVNLT       r11,#0
    160     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd])
    161 
    162     SUBS        r4,r10,r4                   @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    163     MVNLT       r4,#0
    164     MOVGT       r4,#1                       @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
    165 
    166     ADD         r11,r11,r4                  @Add 2 sign value
    167     ADD         r11,r11,#2                  @edge_idx
    168     LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
    169 ulbl2:
    170     add         r14,r14,pc
    171 
    172     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    173     CMP         r12,#0
    174     BEQ         PU1_AVAIL
    175     LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    176     ADD         r10,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    177     USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    178 
    179 PU1_AVAIL:
    180     MOV         r12,r8                      @Move ht
    181     VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    182     LDRB        r11,[r5,#3]                 @pu1_avail[3]
    183 
    184     MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    185     VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    186     CMP         r11,#0
    187 
    188     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    189     VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    190     SUBEQ       r12,r12,#1                  @ht_tmp--
    191 
    192     CMP         r5,#0
    193     VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    194     LDR         r11, gi1_table_edge_idx_addr_3 @table pointer
    195 ulbl3:
    196     add         r11,r11,pc
    197 
    198     ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    199     VLD1.8      D6,[r11]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    200     SUBEQ       r12,r12,#1                  @ht_tmp--
    201 
    202     MOV         r6,r7                       @move wd to r6 loop_count
    203     VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    204     ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1
    205 
    206     STR         r0,[sp,#0x90]               @Store pu1_src in sp
    207     CMP         r7,#16                      @Compare wd with 16
    208 
     209     BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE where the loop handles the 8-pixel residue
    210     CMP         r8,#4                       @Compare ht with 4
     211     BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
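         @ Three paths follow: WIDTH_LOOP_16 processes full 16-pixel columns when
         @ ht > 4, WD_16_HT_4_LOOP processes 16-pixel columns for short blocks
         @ (ht <= 4), and WIDTH_RESIDUE handles the remaining (< 16) columns eight
         @ pixels at a time.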
    212 
    213 WIDTH_LOOP_16:
    214     LDR         r7,[sp,#0xD0]               @Loads wd
    215 
    216     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    217     CMP         r6,r7                       @col == wd
    218     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    219     MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    220 
    221     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    222     CMP         r6,#16                      @if(col == 16)
    223     BNE         SKIP_AU1_MASK_VAL
    224     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    225     VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    226 
    227 SKIP_AU1_MASK_VAL:
    228     LDRB        r11,[r5,#2]                 @pu1_avail[2]
    229     CMP         r11,#0
    230 
    231     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    232     MOVNE       r8,r3                       @pu1_src_top_cpy
    233     SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
    234 
    235     LDR         r7,[sp,#0xD0]               @Loads wd
    236     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    237     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    238     SUB         r8,#8
    239     ADD         r3,r3,#16
    240 
    241     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    242     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    243     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    244     SUB         r0,#8
    245     LDR         r4,[sp,#0xD4]               @Loads ht
    246 
    247     SUB         r7,r7,r6                    @(wd - col)
    248     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    249     LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    250 
    251     ADD         r7,r7,#15                   @15 + (wd - col)
    252     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    253     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    254 
    255     SUB         r5,r5,#1
    256     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    257 
    258 AU1_SRC_LEFT_LOOP:
    259     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    260     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    261     SUBS        r4,r4,#1                    @decrement the loop count
    262     BNE         AU1_SRC_LEFT_LOOP
    263 
    264     ADD         r8,r0,r1                    @I Iteration *pu1_src + src_strd
    265     VMOV.I8     Q9,#0
    266     LDR         r4,[sp,#0xC8]               @I Loads pu1_avail
    267 
    268     MOV         r7,r12                      @row count, move ht_tmp to r7
    269     VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    270     VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    271     SUB         r8,#8
    272     LDRB        r4,[r4,#2]                  @I pu1_avail[2]
    273 
    274     LDRB        r5,[r8,#16]                 @I pu1_src_cpy[src_strd + 16]
    275     VMOV.8      D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    276 
    277     VEXT.8      Q9,Q8,Q9,#1                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    278     CMP         r4,#0                       @I
    279     BNE         SIGN_UP_CHANGE_DONE         @I
    280 
    281 SIGN_UP_CHANGE:
    282     SUB         r2,r12,r7                   @I ht_tmp - row
    283     LDRB        r11,[r0]                    @I pu1_src_cpy[0]
    284     ADD         r2,r14,r2                   @I pu1_src_left_cpy[ht_tmp - row]
    285 
    286     LDRB        r5,[r2,#-1]                 @I load the value
    287     SUBS        r4,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    288     MVNLT       r4,#0                       @I
    289     MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
     290     VMOV.8      D14[0],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    291 
    292 SIGN_UP_CHANGE_DONE:
    293     VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    294     VADD.I8     Q12,Q0,Q7                   @I edge_idx = vaddq_s8(const_2, sign_up)
    295 
    296     VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    297     VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    298 
    299     VADD.I8     Q12,Q12,Q5                  @I edge_idx = vaddq_s8(edge_idx, sign_down)
    300     VTBL.8      D18,{D6},D24                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    301     VTBL.8      D19,{D6},D25                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    302 
    303     VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
    304 
    305     VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
    306     VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    307     VEXT.8      Q7,Q7,Q7,#15                @I sign_up = vextq_s8(sign_up, sign_up, 15)
    308 
    309     VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    310     VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    311     VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    312 
    313     VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    314     VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    315 
    316     VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    317     VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
    318 
    319     VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    320     VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    321 
    322     VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    323     SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
    324 
    325     VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    326 
    327     VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
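         @ The code above peels the first ('I') row to prime sign_up and pu1_cur_row.
         @ Each pass of PU1_SRC_LOOP below then loads and filters two further rows
         @ (tagged II and III in the comments), storing the row computed on the
         @ previous pass at the top of the loop and the II row near its end.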
    328 
    329 PU1_SRC_LOOP:
    330 
    331     VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    332     ADD         r8,r0,r1                    @II iteration *pu1_src + src_strd
    333 
    334     VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    335     VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    336     SUB         r8,#8
    337     ADD         r11,r8,r1                   @III iteration *pu1_src + src_strd
    338 
    339     LDRB        r5,[r8,#16]                 @II pu1_src_cpy[src_strd + 16]
    340     VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    341     VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    342     SUB         r11,#8
    343     LDRB        r4,[r0]                     @II pu1_src_cpy[0]
    344 
    345     LDRB        r8,[r11,#16]                @III pu1_src_cpy[src_strd + 16]
    346     VMOV.8      D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    347 
    348     SUB         r5,r12,r7                   @II ht_tmp - row
    349     VEXT.8      Q11,Q8,Q14,#1               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    350     ADD         r5,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
    351 
    352     LDRB        r5,[r5,#-1]                 @II load the value
    353     VMOV.8      D18[0],r8                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    354     SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    355 
    356     SUBS        r4,r4,r5                    @II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    357     VEXT.8      Q9,Q15,Q9,#1                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    358     LDRB        r2,[r0,r1]                  @III pu1_src_cpy[0]
    359 
    360     VCGT.U8     Q12,Q6,Q11                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    361     SUB         r5,r12,r7                   @III ht_tmp - row
    362 
    363     MVNLT       r4,#0                       @II
    364     VCLT.U8     Q11,Q6,Q11                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    365     ADD         r5,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
    366 
    367     MOVGT       r4,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    368     VSUB.U8     Q12,Q11,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    369     LDRB        r5,[r5,#-1]                 @III load the value
    370 
    371     SUBS        r2,r2,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
     372     VMOV.8      D14[0],r4                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    373 
    374     MVNLT       r2,#0                       @III
    375     VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    376     MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    377 
    378     VADD.I8     Q11,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    379     VADD.I8     Q11,Q11,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    380 
    381     VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    382     VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    383     VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    384 
    385     VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    386     VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    387     VEXT.8      Q7,Q7,Q7,#15                @II sign_up = vextq_s8(sign_up, sign_up, 15)
    388 
    389     VAND        Q11,Q11,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
     390     VMOV.8      D14[0],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    391 
    392     VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    393     VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    394     VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
    395 
    396     VMOVL.U8    Q13,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    397     VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    398     VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)
    399 
    400     VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    401     VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    402     VEXT.8      Q7,Q7,Q7,#15                @III sign_up = vextq_s8(sign_up, sign_up, 15)
    403 
    404     VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    405     VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    406 
    407     VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    408     VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    409     VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    410 
    411     VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    412     VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    413     VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    414 
    415     VMOVL.U8    Q14,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    416     VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    417 
    418     VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    419     VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    420     VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    421 
    422     VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    423     VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    424 
    425     VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    426     VMOVN.I16   D26,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    427 
    428     VMOVN.I16   D27,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    429     VADDW.S8    Q9,Q9,D11                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    430 
    431     VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    432     VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    433 
    434     SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    435     VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    436     CMP         r7,#1                       @III
    437 
    438     VST1.8      {Q13},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    439     VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    440 
     441     BGT         PU1_SRC_LOOP                @III If more than one row remains, jump back to PU1_SRC_LOOP
    442     BLT         INNER_LOOP_DONE
    443 
    444     VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    445     ADD         r8,r0,r1                    @*pu1_src + src_strd
    446 
    447     LDRB        r2,[r0]                     @pu1_src_cpy[0]
    448     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    449     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    450     SUB         r8,#8
    451     LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    452 
    453     SUB         r11,r12,r7                  @ht_tmp - row
    454     VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    455     ADD         r11,r14,r11                 @pu1_src_left_cpy[ht_tmp - row]
    456 
    457     LDRB        r5,[r11,#-1]                @load the value
    458     VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    459     SUBS        r4,r2,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    460 
    461     VCGT.U8     Q5,Q6,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    462     MVNLT       r4,#0
    463 
    464     MOVGT       r4,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    465     VCLT.U8     Q9,Q6,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    466 
     467     VMOV.8      D14[0],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    468     VSUB.U8     Q5,Q9,Q5                    @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    469 
    470     VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    471     VADD.I8     Q9,Q9,Q5                    @edge_idx = vaddq_s8(edge_idx, sign_down)
    472 
    473     VTBL.8      D18,{D6},D18                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    474     VNEG.S8     Q7,Q5                       @sign_up = vnegq_s8(sign_down)
    475 
    476     VTBL.8      D19,{D6},D19                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    477     VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
    478 
    479     VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    480 
    481     VTBL.8      D10,{D7},D18                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    482 
    483     VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    484     VTBL.8      D11,{D7},D19                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    485     VADDW.S8    Q10,Q10,D10                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    486 
    487     VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    488     VMOVL.U8    Q6,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    489 
    490     VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    491     VADDW.S8    Q6,Q6,D11                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    492 
    493     VMAX.S16    Q6,Q6,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    494     VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    495 
    496     VMIN.U16    Q6,Q6,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    497     VMOVN.I16   D21,Q6                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    498 
    499 
    500 INNER_LOOP_DONE:
    501     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    502     VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    503     LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    504 
    505     LDR         r8,[sp,#0xD4]               @Loads ht
    506     SUB         r5,r5,#1
    507 
    508     SUB         r2,r2,#1
    509 SRC_LEFT_LOOP:
    510     LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    511     SUBS        r8,r8,#1
    512     STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    513     BNE         SRC_LEFT_LOOP
    514 
    515     SUB         r6,r6,#16                   @Decrement the wd loop count by 16
    516     CMP         r6,#8                       @Check whether residue remains
    517     BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    518     LDR         r7,[sp,#0xD0]               @Loads wd
    519     LDR         r0,[sp,#0x90]               @Loads *pu1_src
    520     SUB         r7,r7,r6
    521     ADD         r0,r0,r7
     522     BGT         WIDTH_LOOP_16               @If more than 8 columns remain, jump to WIDTH_LOOP_16
     523     BEQ         WIDTH_RESIDUE               @If exactly 8 columns remain, jump to WIDTH_RESIDUE
    524 
    525 
    526 WD_16_HT_4_LOOP:
    527     LDR         r7,[sp,#0xD0]               @Loads wd
    528     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    529     CMP         r6,r7                       @col == wd
    530     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    531     MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    532 
    533     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    534     CMP         r6,#16                      @if(col == 16)
    535     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    536     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    537     VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    538 
    539 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    540     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    541     CMP         r8,#0
    542 
    543     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    544     MOVNE       r8,r3
    545     SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
    546 
    547     LDR         r7,[sp,#0xD0]               @Loads wd
    548     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    549     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    550     SUB         r8,#8
    551     ADD         r3,r3,#16
    552 
    553     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    554     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    555     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    556     SUB         r0,#8
    557     LDR         r4,[sp,#0xD4]               @Loads ht
    558 
    559     SUB         r7,r7,r6                    @(wd - col)
    560     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    561     LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    562 
    563     ADD         r7,r7,#15                   @15 + (wd - col)
    564     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    565     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    566 
    567     SUB         r5,r5,#1
    568     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    569 
    570 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    571     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    572     SUBS        r4,r4,#1                    @decrement the loop count
    573     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    574     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    575 
    576     VMOV.I8     Q9,#0
    577     MOV         r7,r12                      @row count, move ht_tmp to r7
    578 
    579 PU1_SRC_LOOP_WD_16_HT_4:
    580     ADD         r8,r0,r1                    @*pu1_src + src_strd
    581     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    582     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    583     SUB         r8,#8
    584 
    585     LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    586     VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    587     VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    588 
    589     CMP         r7,r12
    590     BLT         SIGN_UP_CHANGE_WD_16_HT_4
    591     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    592     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    593     CMP         r5,#0
    594     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    595 
    596 SIGN_UP_CHANGE_WD_16_HT_4:
    597     LDRB        r8,[r0]                     @pu1_src_cpy[0]
    598     SUB         r5,r12,r7                   @ht_tmp - row
    599     ADD         r5,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    600     LDRB        r5,[r5,#-1]                 @load the value
    601     SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    602     MVNLT       r8,#0
    603     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
     604     VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    605 
    606 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    607     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    608     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    609     VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    610 
    611     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    612     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    613     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    614     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    615 
    616     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    617 
    618     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    619     VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
    620 
    621     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    622     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    623     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    624     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    625     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    626 
    627     VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    628     VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    629     VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    630     VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    631     VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    632 
    633     VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    634     VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    635 
    636     VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    637 
    638     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    639     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    640     BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    641 
    642     LDR         r8,[sp,#0xD4]               @Loads ht
    643     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    644     LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    645     SUB         r5,r5,#1
    646     SUB         r2,r2,#1
    647 
    648 SRC_LEFT_LOOP_WD_16_HT_4:
    649     LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    650     STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    651     SUBS        r8,r8,#1
    652     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    653 
    654     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    655     BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    656 
    657 
    658 WIDTH_RESIDUE:
    659     LDR         r7,[sp,#0xD0]               @Loads wd
    660     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    661     CMP         r6,r7                       @wd_residue == wd
    662     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    663 
    664     MOVNE       r8,#-1
    665     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    666 
    667     LDRB        r8,[r5,#1]                  @pu1_avail[1]
     668     VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
    669 
    670 PU1_AVAIL_2_RESIDUE:
    671     LDRB        r11,[r5,#2]                 @pu1_avail[2]
    672     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    673     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    674     SUB         r0,#8
    675     CMP         r11,#0
    676 
    677     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    678     MOVNE       r8,r3
    679 
    680     SUB         r8,r8,#1
    681 
    682     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    683     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    684     VLD1.8      D11,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    685     LDR         r7,[sp,#0xD0]               @Loads wd
    686 
    687     LDR         r4,[sp,#0xD4]               @Loads ht
    688     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    689     SUB         r7,r7,#1                    @(wd - 1)
    690 
    691     LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    692     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    693     SUB         r5,r5,#1
    694 
    695     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    696     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    697 
    698 
    699 AU1_SRC_LEFT_LOOP_RESIDUE:
    700     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    701     SUBS        r4,r4,#1                    @decrement the loop count
    702     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    703     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    704 
    705 
    706     MOV         r7,r12                      @row count, move ht_tmp to r7
    707 
    708 PU1_SRC_LOOP_RESIDUE:
    709     VMOV.I8     Q9,#0
    710     ADD         r8,r0,r1                    @*pu1_src + src_strd
    711     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    712     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    713     SUB         r8,#8
    714 
    715     LDRB        r8,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    716     VMOV.8      d18[0],r8                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    717     VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    718 
    719     CMP         r7,r12
    720     BLT         SIGN_UP_CHANGE_RESIDUE
    721     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    722     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    723     CMP         r5,#0
    724     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
    725 
    726 SIGN_UP_CHANGE_RESIDUE:
    727     LDRB        r8,[r0]                     @pu1_src_cpy[0]
    728     SUB         r5,r12,r7                   @ht_tmp - row
    729 
    730     ADD         r5,r14,r5
    731     LDRB        r5,[r5,#-1]                 @load the value
    732     SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    733     MVNLT       r8,#0
    734     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
     735     VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    736 
    737 SIGN_UP_CHANGE_DONE_RESIDUE:
    738     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    739     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    740     VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    741 
    742     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    743     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    744     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    745     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    746 
    747     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    748 
    749     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    750     VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
    751 
    752     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    753     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    754     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    755     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    756     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    757 
    758     VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    759 
    760     VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    761     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    762     SUBS        r7,r7,#1
    763     BNE         PU1_SRC_LOOP_RESIDUE
    764 
    765     LDR         r8,[sp,#0xD4]               @Loads ht
    766     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    767 
    768     LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    769     SUB         r5,r5,#1
    770 
    771     SUB         r2,r2,#1
    772 
    773 SRC_LEFT_LOOP_RESIDUE:
    774     LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    775     SUBS        r8,r8,#1
    776     STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    777     BNE         SRC_LEFT_LOOP_RESIDUE
    778 
    779 
    780 RE_ASSINING_LOOP:
    781     LDR         r8,[sp,#0xD4]               @Loads ht
    782     LDR         r7,[sp,#0xD0]               @Loads wd
    783 
    784     LDR         r0,[sp,#0xC0]               @Loads *pu1_src
    785     SUB         r8,r8,#1                    @ht - 1
    786 
     787     MLA         r6,r8,r1,r7                 @wd + (ht - 1) * src_strd
    788     STRB        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp
    789 
    790     LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
     791     ADD         r6,r0,r6                    @pu1_src[wd + (ht - 1) * src_strd]
    792 
    793     ADD         r12,sp,#0x02
    794     STRB        r10,[r6,#-1]                @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    795 
    796     LDRB        r11,[sp]                    @load u1_src_top_left_tmp from stack pointer
    797     LDR         r3,[sp,#0xCC]               @Loads pu1_src_top
    798 
    799     STRB        r11,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
    800 
    801 SRC_TOP_LOOP:
    802     VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
    803     SUBS        r7,r7,#8                    @Decrement the width
    804     VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    805     BNE         SRC_TOP_LOOP
    806 
    807 END_LOOPS:
    808     ADD         sp,sp,#0x94
    809     LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
    810 
    811 
    812 
    813