      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
      20 @* @file
      21 @*  ihevc_sao_edge_offset_class2.s
      22 @*
      23 @* @brief
      24 @*  Contains function definitions for SAO (sample adaptive offset) edge
      25 @* offset filtering of class 2 (135 degree diagonal). Functions are coded
      26 @* using NEON instructions and can be compiled using ARM RVCT
     27 @*
      28 @* @author
      29 @*  Parthiban V
      30 @*
      31 @* @par List of Functions:
      32 @*
      33 @*
      34 @* @remarks
     35 @*  None
     36 @*
     37 @*******************************************************************************
     38 @*/
     39 @void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
     40 @                              WORD32 src_strd,
     41 @                              UWORD8 *pu1_src_left,
     42 @                              UWORD8 *pu1_src_top,
     43 @                              UWORD8 *pu1_src_top_left,
     44 @                              UWORD8 *pu1_src_top_right,
     45 @                              UWORD8 *pu1_src_bot_left,
     46 @                              UWORD8 *pu1_avail,
     47 @                              WORD8 *pi1_sao_offset,
     48 @                              WORD32 wd,
     49 @                              WORD32 ht)
     50 @**************Variables Vs Registers*****************************************
     51 @r0 =>  *pu1_src
     52 @r1 =>  src_strd
     53 @r2 =>  *pu1_src_left
     54 @r3 =>  *pu1_src_top
     55 @r4 =>  *pu1_src_top_left
     56 @r5 =>  *pu1_avail
     57 @r6 =>  *pi1_sao_offset
     58 @r7 =>  wd
     59 @r8=>   ht
     60 
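@/**
@*******************************************************************************
@* Reference sketch (illustrative C only, assuming 8-bit samples and interior
@* pixels; availability flags and block borders are handled separately below)
@* of the per-pixel class 2 / 135-degree edge offset that this routine
@* vectorises:
@*
@*     for(row = 0; row < ht; row++)
@*     {
@*         for(col = 0; col < wd; col++)
@*         {
@*             WORD32 sign_up   = SIGN(pu1_src[col] - pu1_src[col - 1 - src_strd]);
@*             WORD32 sign_down = SIGN(pu1_src[col] - pu1_src[col + 1 + src_strd]);
@*             WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
@*             if(0 != edge_idx)
@*                 pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, 255);
@*         }
@*         pu1_src += src_strd;
@*     }
@*
@* SIGN() returns -1, 0 or +1 and CLIP3(x, lo, hi) clamps x to [lo, hi].
@* Because the filtering is done in place, the unmodified top row, left
@* column and top-left sample are preserved (pu1_src_top, pu1_src_left,
@* pu1_src_top_left and temporary copies on the stack) so that the
@* comparisons always use original neighbour values.
@*******************************************************************************
@*/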
     61 .text
     62 .p2align 2
     63 
     64 .extern gi1_table_edge_idx
     65 .globl ihevc_sao_edge_offset_class2_a9q
     66 
     67 gi1_table_edge_idx_addr_1:
     68 .long gi1_table_edge_idx - ulbl1 - 8
     69 
     70 gi1_table_edge_idx_addr_2:
     71 .long gi1_table_edge_idx - ulbl2 - 8
     72 
     73 gi1_table_edge_idx_addr_3:
     74 .long gi1_table_edge_idx - ulbl3 - 8
     75 
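@ The three .long constants above hold the offset of gi1_table_edge_idx
@ relative to the "add rX,rX,pc" instructions at ulbl1/ulbl2/ulbl3 below. In
@ ARM state the PC reads as the address of the current instruction + 8, which
@ is why 8 is subtracted; loading the constant and then adding PC at ulblN
@ therefore yields the absolute address of the table in a position-independent
@ way.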
     76 ihevc_sao_edge_offset_class2_a9q:
     77 
     78 
     79     STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
     80     LDR         r7,[sp,#0x3C]               @Loads wd
     81 
     82     LDR         r8,[sp,#0x40]               @Loads ht
     83     SUB         r9,r7,#1                    @wd - 1
     84 
     85     LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
     86     LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
     87 
     88     STR         r0,[sp,#0x2C]               @Store pu1_src in sp
     89     MOV         r9,r7                       @Move width to r9 for loop count
     90 
     91     STR         r2,[sp,#0x30]               @Store pu1_src_left in sp
     92     LDR         r5,[sp,#0x34]               @Loads pu1_avail
     93     LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
     94     STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
     95 
     96     SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values
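@ Scratch area created by the SUB above; offsets from the new sp as used by
@ the loads/stores that follow:
@   sp + 0x00          u1_src_top_left_tmp
@   sp + 0x02          au1_src_top_tmp[]  (copy of the bottom row, written back to pu1_src_top at the end)
@   sp + 0x42          au1_src_left_tmp[] (rightmost column of the strip currently being filtered)
@   sp + 0x90          pu1_src pointer for the current 16-wide strip
@   sp + 0x94 onwards  the registers pushed on entry, followed by the stack
@                      arguments and the pointers stored above (pu1_src at
@                      0xC0, pu1_src_left at 0xC4, pu1_avail at 0xC8,
@                      pu1_src_top at 0xCC, wd at 0xD0, ht at 0xD4)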
     97 
     98     STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
     99     SUB         r10,r8,#1                   @ht-1
    100     MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    101     ADD         r12,sp,#0x02                @temp array
    102 
    103 AU1_SRC_TOP_LOOP:
    104     VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    105     SUBS        r9,r9,#8                    @Decrement the loop count by 8
    106     VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    107     BNE         AU1_SRC_TOP_LOOP
    108 
    109 PU1_AVAIL_4_LOOP:
    110     LDRB        r10,[r5,#4]                 @pu1_avail[4]
    111     CMP         r10,#0
    112     LDRB        r9,[r0]                     @u1_pos_0_0_tmp = pu1_src[0]
    113     BEQ         PU1_AVAIL_7_LOOP
    114 
    115     LDRB        r11,[r4]                    @pu1_src_top_left[0]
    116     ADD         r14,r0,r1                   @pu1_src + src_strd
    117 
    118     SUBS        r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
    119     LDRB        r4,[r14,#1]                 @pu1_src[1 + src_strd]
    120 
    121     MVNLT       r12,#0
    122     MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])
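@ SUBS followed by MVNLT/MOVGT is the SIGN() idiom used throughout this file:
@ the register holds the difference after SUBS, is overwritten with
@ 0xFFFFFFFF (-1) if the difference is negative and with +1 if it is positive,
@ and is left as 0 when the two operands are equal.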
    123 
    124     LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
    125 ulbl1:
    126     add         r14,r14,pc
    127     SUBS        r11,r9,r4                   @pu1_src[0] - pu1_src[1 + src_strd]
    128 
    129     MVNLT       r11,#0
    130     MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    131     ADD         r4,r12,r11                  @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    132     ADD         r4,r4,#2                    @edge_idx
    133 
    134     LDRSB       r12,[r14,r4]                @edge_idx = gi1_table_edge_idx[edge_idx]
    135     CMP         r12,#0                      @0 != edge_idx
    136     BEQ         PU1_AVAIL_7_LOOP
    137     LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    138     ADD         r9,r9,r10                   @pu1_src[0] + pi1_sao_offset[edge_idx]
    139     USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    140 
    141 PU1_AVAIL_7_LOOP:
    142     LDRB        r14,[r5,#7]                 @pu1_avail[7]
    143     CMP         r14,#0
    144     SUB         r10,r7,#1                   @wd - 1
    145     SUB         r11,r8,#1                   @ht - 1
    146     MLA         r12,r11,r1,r10              @wd - 1 + (ht - 1) * src_strd
    147     ADD         r12,r12,r0                  @pu1_src[wd - 1 + (ht - 1) * src_strd]
    148     LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
    149     BEQ         PU1_AVAIL
    150 
    151     SUB         r4,r12,r1                   @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    152     LDRB        r11,[r4,#-1]                @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    153     ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
    154 
    155     SUBS        r11,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd]
    156     LDRB        r4,[r14,#1]                 @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    157 
    158     MVNLT       r11,#0
    159     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd])
    160 
    161     SUBS        r4,r10,r4                   @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    162     MVNLT       r4,#0
    163     MOVGT       r4,#1                       @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
    164 
    165     ADD         r11,r11,r4                  @Add 2 sign value
    166     ADD         r11,r11,#2                  @edge_idx
    167     LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
    168 ulbl2:
    169     add         r14,r14,pc
    170 
    171     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    172     CMP         r12,#0
    173     BEQ         PU1_AVAIL
    174     LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    175     ADD         r10,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    176     USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    177 
    178 PU1_AVAIL:
    179     MOV         r12,r8                      @Move ht
    180     VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    181     LDRB        r11,[r5,#3]                 @pu1_avail[3]
    182 
    183     MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    184     VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    185     CMP         r11,#0
    186 
    187     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    188     VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    189     SUBEQ       r12,r12,#1                  @ht_tmp--
    190 
    191     CMP         r5,#0
    192     VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    193     LDR         r11, gi1_table_edge_idx_addr_3 @table pointer
    194 ulbl3:
    195     add         r11,r11,pc
    196 
    197     ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    198     VLD1.8      D6,[r11]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    199     SUBEQ       r12,r12,#1                  @ht_tmp--
    200 
    201     MOV         r6,r7                       @move wd to r6 loop_count
    202     VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    203     ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1
    204 
    205     STR         r0,[sp,#0x90]               @Store pu1_src in sp
    206     CMP         r7,#16                      @Compare wd with 16
    207 
    208     BLT         WIDTH_RESIDUE               @If wd < 16 jump to WIDTH_RESIDUE, which processes the remaining columns 8 at a time
    209     CMP         r8,#4                       @Compare ht with 4
    210     BLE         WD_16_HT_4_LOOP             @If ht <= 4 jump to WD_16_HT_4_LOOP
    211 
    212 WIDTH_LOOP_16:
    213     LDR         r7,[sp,#0xD0]               @Loads wd
    214 
    215     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    216     CMP         r6,r7                       @col == wd
    217     LDREQB      r8,[r5]                     @pu1_avail[0]
    218     MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    219 
    220     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    221     CMP         r6,#16                      @if(col == 16)
    222     BNE         SKIP_AU1_MASK_VAL
    223     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    224     VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    225 
    226 SKIP_AU1_MASK_VAL:
    227     LDRB        r11,[r5,#2]                 @pu1_avail[2]
    228     CMP         r11,#0
    229 
    230     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    231     MOVNE       r8,r3                       @pu1_src_top_cpy
    232     SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
    233 
    234     LDR         r7,[sp,#0xD0]               @Loads wd
    235     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    236     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    237     SUB         r8,#8
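@ The two VLD1.8 loads above fill Q5 (D10:D11) with the 16 bytes at r8; the
@ first load post-increments r8 by 8, so the SUB r8,#8 restores r8 to its
@ value before the loads. The same idiom recurs below wherever a full Q
@ register is loaded from a possibly unaligned address.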
    238     ADD         r3,r3,#16
    239 
    240     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    241     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    242     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    243     SUB         r0,#8
    244     LDR         r4,[sp,#0xD4]               @Loads ht
    245 
    246     SUB         r7,r7,r6                    @(wd - col)
    247     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    248     LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    249 
    250     ADD         r7,r7,#15                   @15 + (wd - col)
    251     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    252     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    253 
    254     SUB         r5,r5,#1
    255     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    256 
    257 AU1_SRC_LEFT_LOOP:
    258     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    259     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    260     SUBS        r4,r4,#1                    @decrement the loop count
    261     BNE         AU1_SRC_LEFT_LOOP
    262 
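@ The row loop that follows is software pipelined: the code down to
@ PU1_SRC_LOOP computes the first row (comments tagged @I), each pass of
@ PU1_SRC_LOOP then overlaps two further rows (@II and @III), and the code
@ between the loop and INNER_LOOP_DONE handles a single left-over row when
@ one remains. The @I/@II/@III tags mark which in-flight row an instruction
@ belongs to.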
    263     ADD         r8,r0,r1                    @I Iteration *pu1_src + src_strd
    264     VMOV.I8     Q9,#0
    265     LDR         r4,[sp,#0xC8]               @I Loads pu1_avail
    266 
    267     MOV         r7,r12                      @row count, move ht_tmp to r7
    268     VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    269     VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    270     SUB         r8,#8
    271     LDRB        r4,[r4,#2]                  @I pu1_avail[2]
    272 
    273     LDRB        r5,[r8,#16]                 @I pu1_src_cpy[src_strd + 16]
    274     VMOV.8      D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    275 
    276     VEXT.8      Q9,Q8,Q9,#1                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    277     CMP         r4,#0                       @I
    278     BNE         SIGN_UP_CHANGE_DONE         @I
    279 
    280 SIGN_UP_CHANGE:
    281     SUB         r2,r12,r7                   @I ht_tmp - row
    282     LDRB        r11,[r0]                    @I pu1_src_cpy[0]
    283     ADD         r2,r14,r2                   @I pu1_src_left_cpy[ht_tmp - row]
    284 
    285     LDRB        r5,[r2,#-1]                 @I load the value
    286     SUBS        r4,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    287     MVNLT       r4,#0                       @I
    288     MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
     289     VMOV.8      D14[0],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    290 
    291 SIGN_UP_CHANGE_DONE:
    292     VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    293     VADD.I8     Q12,Q0,Q7                   @I edge_idx = vaddq_s8(const_2, sign_up)
    294 
    295     VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    296     VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    297 
    298     VADD.I8     Q12,Q12,Q5                  @I edge_idx = vaddq_s8(edge_idx, sign_down)
    299     VTBL.8      D18,{D6},D24                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    300     VTBL.8      D19,{D6},D25                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    301 
    302     VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
    303 
    304     VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
    305     VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    306     VEXT.8      Q7,Q7,Q7,#15                @I sign_up = vextq_s8(sign_up, sign_up, 15)
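@ For the 135-degree diagonal, the sign_up of the next row is the negated
@ sign_down of the current row shifted by one pixel: the VNEG/VEXT #15 pair
@ above rotates the vector so that lane n receives the value computed for
@ lane n-1, and lane 0 (which has no valid predecessor in the register) is
@ re-derived from pu1_src_left_cpy at the SIGN_UP_CHANGE step of the next row.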
    307 
    308     VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    309     VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    310     VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    311 
    312     VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    313     VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    314 
    315     VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    316     VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
    317 
    318     VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    319     VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    320 
    321     VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    322     SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
    323 
    324     VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    325 
    326     VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
    327 
    328 PU1_SRC_LOOP:
    329 
    330     VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    331     ADD         r8,r0,r1                    @II iteration *pu1_src + src_strd
    332 
    333     VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    334     VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    335     SUB         r8,#8
    336     ADD         r11,r8,r1                   @III iteration *pu1_src + src_strd
    337 
    338     LDRB        r5,[r8,#16]                 @II pu1_src_cpy[src_strd + 16]
    339     VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    340     VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    341     SUB         r11,#8
    342     LDRB        r4,[r0]                     @II pu1_src_cpy[0]
    343 
    344     LDRB        r8,[r11,#16]                @III pu1_src_cpy[src_strd + 16]
    345     VMOV.8      D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    346 
    347     SUB         r5,r12,r7                   @II ht_tmp - row
    348     VEXT.8      Q11,Q8,Q14,#1               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    349     ADD         r5,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
    350 
    351     LDRB        r5,[r5,#-1]                 @II load the value
    352     VMOV.8      D18[0],r8                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    353     SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    354 
    355     SUBS        r4,r4,r5                    @II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    356     VEXT.8      Q9,Q15,Q9,#1                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    357     LDRB        r2,[r0,r1]                  @III pu1_src_cpy[0]
    358 
    359     VCGT.U8     Q12,Q6,Q11                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    360     SUB         r5,r12,r7                   @III ht_tmp - row
    361 
    362     MVNLT       r4,#0                       @II
    363     VCLT.U8     Q11,Q6,Q11                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    364     ADD         r5,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
    365 
    366     MOVGT       r4,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    367     VSUB.U8     Q12,Q11,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    368     LDRB        r5,[r5,#-1]                 @III load the value
    369 
    370     SUBS        r2,r2,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
     371     VMOV.8      D14[0],r4                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    372 
    373     MVNLT       r2,#0                       @III
    374     VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    375     MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    376 
    377     VADD.I8     Q11,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    378     VADD.I8     Q11,Q11,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    379 
    380     VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    381     VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    382     VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    383 
    384     VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    385     VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    386     VEXT.8      Q7,Q7,Q7,#15                @II sign_up = vextq_s8(sign_up, sign_up, 15)
    387 
    388     VAND        Q11,Q11,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
     389     VMOV.8      D14[0],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    390 
    391     VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    392     VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    393     VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
    394 
    395     VMOVL.U8    Q13,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    396     VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    397     VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)
    398 
    399     VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    400     VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    401     VEXT.8      Q7,Q7,Q7,#15                @III sign_up = vextq_s8(sign_up, sign_up, 15)
    402 
    403     VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    404     VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    405 
    406     VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    407     VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    408     VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    409 
    410     VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    411     VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    412     VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    413 
    414     VMOVL.U8    Q14,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    415     VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    416 
    417     VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    418     VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    419     VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    420 
    421     VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    422     VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    423 
    424     VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    425     VMOVN.I16   D26,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    426 
    427     VMOVN.I16   D27,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    428     VADDW.S8    Q9,Q9,D11                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    429 
    430     VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    431     VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    432 
    433     SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    434     VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    435     CMP         r7,#1                       @III
    436 
    437     VST1.8      {Q13},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    438     VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    439 
     440     BGT         PU1_SRC_LOOP                @III If more than one row is left, jump to PU1_SRC_LOOP
    441     BLT         INNER_LOOP_DONE
    442 
    443     VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    444     ADD         r8,r0,r1                    @*pu1_src + src_strd
    445 
    446     LDRB        r2,[r0]                     @pu1_src_cpy[0]
    447     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    448     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    449     SUB         r8,#8
    450     LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    451 
    452     SUB         r11,r12,r7                  @ht_tmp - row
    453     VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    454     ADD         r11,r14,r11                 @pu1_src_left_cpy[ht_tmp - row]
    455 
    456     LDRB        r5,[r11,#-1]                @load the value
    457     VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    458     SUBS        r4,r2,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    459 
    460     VCGT.U8     Q5,Q6,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    461     MVNLT       r4,#0
    462 
    463     MOVGT       r4,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    464     VCLT.U8     Q9,Q6,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    465 
     466     VMOV.8      D14[0],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    467     VSUB.U8     Q5,Q9,Q5                    @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    468 
    469     VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    470     VADD.I8     Q9,Q9,Q5                    @edge_idx = vaddq_s8(edge_idx, sign_down)
    471 
    472     VTBL.8      D18,{D6},D18                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    473     VNEG.S8     Q7,Q5                       @sign_up = vnegq_s8(sign_down)
    474 
    475     VTBL.8      D19,{D6},D19                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    476     VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
    477 
    478     VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    479 
    480     VTBL.8      D10,{D7},D18                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    481 
    482     VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    483     VTBL.8      D11,{D7},D19                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    484     VADDW.S8    Q10,Q10,D10                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    485 
    486     VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    487     VMOVL.U8    Q6,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    488 
    489     VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    490     VADDW.S8    Q6,Q6,D11                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    491 
    492     VMAX.S16    Q6,Q6,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    493     VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    494 
    495     VMIN.U16    Q6,Q6,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    496     VMOVN.I16   D21,Q6                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    497 
    498 
    499 INNER_LOOP_DONE:
    500     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    501     VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    502     LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    503 
    504     LDR         r8,[sp,#0xD4]               @Loads ht
    505     SUB         r5,r5,#1
    506 
    507     SUB         r2,r2,#1
    508 SRC_LEFT_LOOP:
    509     LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    510     SUBS        r8,r8,#1
    511     STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    512     BNE         SRC_LEFT_LOOP
    513 
    514     SUB         r6,r6,#16                   @Decrement the wd loop count by 16
    515     CMP         r6,#8                       @Check whether residue remains
    516     BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    517     LDR         r7,[sp,#0xD0]               @Loads wd
    518     LDR         r0,[sp,#0x90]               @Loads *pu1_src
    519     SUB         r7,r7,r6
    520     ADD         r0,r0,r7
     521     BGT         WIDTH_LOOP_16               @If more than 8 columns remain, jump to WIDTH_LOOP_16
     522     BEQ         WIDTH_RESIDUE               @If exactly 8 columns remain, jump to WIDTH_RESIDUE
    523 
    524 
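@ WD_16_HT_4_LOOP is the variant of the 16-wide column loop used when
@ ht <= 4: rows are processed one at a time (no @I/@II/@III pipelining),
@ otherwise the per-pixel computation is identical to WIDTH_LOOP_16.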
    525 WD_16_HT_4_LOOP:
    526     LDR         r7,[sp,#0xD0]               @Loads wd
    527     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    528     CMP         r6,r7                       @col == wd
    529     LDREQB      r8,[r5]                     @pu1_avail[0]
    530     MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    531 
    532     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    533     CMP         r6,#16                      @if(col == 16)
    534     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    535     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    536     VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    537 
    538 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    539     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    540     CMP         r8,#0
    541 
    542     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    543     MOVNE       r8,r3
    544     SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
    545 
    546     LDR         r7,[sp,#0xD0]               @Loads wd
    547     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    548     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    549     SUB         r8,#8
    550     ADD         r3,r3,#16
    551 
    552     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    553     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    554     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    555     SUB         r0,#8
    556     LDR         r4,[sp,#0xD4]               @Loads ht
    557 
    558     SUB         r7,r7,r6                    @(wd - col)
    559     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    560     LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    561 
    562     ADD         r7,r7,#15                   @15 + (wd - col)
    563     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    564     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    565 
    566     SUB         r5,r5,#1
    567     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    568 
    569 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    570     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    571     SUBS        r4,r4,#1                    @decrement the loop count
    572     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    573     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    574 
    575     VMOV.I8     Q9,#0
    576     MOV         r7,r12                      @row count, move ht_tmp to r7
    577 
    578 PU1_SRC_LOOP_WD_16_HT_4:
    579     ADD         r8,r0,r1                    @*pu1_src + src_strd
    580     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    581     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    582     SUB         r8,#8
    583 
    584     LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    585     VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    586     VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    587 
    588     CMP         r7,r12
    589     BLT         SIGN_UP_CHANGE_WD_16_HT_4
    590     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    591     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    592     CMP         r5,#0
    593     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    594 
    595 SIGN_UP_CHANGE_WD_16_HT_4:
    596     LDRB        r8,[r0]                     @pu1_src_cpy[0]
    597     SUB         r5,r12,r7                   @ht_tmp - row
    598     ADD         r5,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    599     LDRB        r5,[r5,#-1]                 @load the value
    600     SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    601     MVNLT       r8,#0
    602     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
     603     VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    604 
    605 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    606     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    607     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    608     VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    609 
    610     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    611     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    612     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    613     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    614 
    615     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    616 
    617     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    618     VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
    619 
    620     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    621     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    622     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    623     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    624     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    625 
    626     VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    627     VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    628     VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    629     VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    630     VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    631 
    632     VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    633     VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    634 
    635     VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    636 
    637     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    638     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    639     BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    640 
    641     LDR         r8,[sp,#0xD4]               @Loads ht
    642     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    643     LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    644     SUB         r5,r5,#1
    645     SUB         r2,r2,#1
    646 
    647 SRC_LEFT_LOOP_WD_16_HT_4:
    648     LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    649     STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    650     SUBS        r8,r8,#1
    651     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    652 
    653     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    654     BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    655 
    656 
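@ WIDTH_RESIDUE handles the last strip of fewer than 16 columns (entered
@ directly when wd < 16, or from WIDTH_LOOP_16 when exactly 8 columns are
@ left): only the low 8 result bytes are stored per row, so pu1_avail[1] is
@ applied to lane 7 of au1_mask instead of lane 15.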
    657 WIDTH_RESIDUE:
    658     LDR         r7,[sp,#0xD0]               @Loads wd
    659     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    660     CMP         r6,r7                       @wd_residue == wd
    661     LDREQB      r8,[r5]                     @pu1_avail[0]
    662 
    663     MOVNE       r8,#-1
    664     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    665 
    666     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    667     VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    668 
    669 PU1_AVAIL_2_RESIDUE:
    670     LDRB        r11,[r5,#2]                 @pu1_avail[2]
    671     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    672     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    673     SUB         r0,#8
    674     CMP         r11,#0
    675 
    676     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    677     MOVNE       r8,r3
    678 
    679     SUB         r8,r8,#1
    680 
    681     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    682     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    683     VLD1.8      D11,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    684     LDR         r7,[sp,#0xD0]               @Loads wd
    685 
    686     LDR         r4,[sp,#0xD4]               @Loads ht
    687     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    688     SUB         r7,r7,#1                    @(wd - 1)
    689 
    690     LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    691     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    692     SUB         r5,r5,#1
    693 
    694     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    695     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    696 
    697 
    698 AU1_SRC_LEFT_LOOP_RESIDUE:
    699     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    700     SUBS        r4,r4,#1                    @decrement the loop count
    701     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    702     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    703 
    704 
    705     MOV         r7,r12                      @row count, move ht_tmp to r7
    706 
    707 PU1_SRC_LOOP_RESIDUE:
    708     VMOV.I8     Q9,#0
    709     ADD         r8,r0,r1                    @*pu1_src + src_strd
    710     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    711     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    712     SUB         r8,#8
    713 
    714     LDRB        r8,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    715     VMOV.8      d18[0],r8                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    716     VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    717 
    718     CMP         r7,r12
    719     BLT         SIGN_UP_CHANGE_RESIDUE
    720     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    721     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    722     CMP         r5,#0
    723     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
    724 
    725 SIGN_UP_CHANGE_RESIDUE:
    726     LDRB        r8,[r0]                     @pu1_src_cpy[0]
    727     SUB         r5,r12,r7                   @ht_tmp - row
    728 
    729     ADD         r5,r14,r5
    730     LDRB        r5,[r5,#-1]                 @load the value
    731     SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    732     MVNLT       r8,#0
    733     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
     734     VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    735 
    736 SIGN_UP_CHANGE_DONE_RESIDUE:
    737     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    738     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    739     VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    740 
    741     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    742     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    743     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    744     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    745 
    746     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    747 
    748     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    749     VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
    750 
    751     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    752     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    753     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    754     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    755     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    756 
    757     VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    758 
    759     VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    760     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    761     SUBS        r7,r7,#1
    762     BNE         PU1_SRC_LOOP_RESIDUE
    763 
    764     LDR         r8,[sp,#0xD4]               @Loads ht
    765     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    766 
    767     LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    768     SUB         r5,r5,#1
    769 
    770     SUB         r2,r2,#1
    771 
    772 SRC_LEFT_LOOP_RESIDUE:
    773     LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    774     SUBS        r8,r8,#1
    775     STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    776     BNE         SRC_LEFT_LOOP_RESIDUE
    777 
    778 
    779 RE_ASSINING_LOOP:
    780     LDR         r8,[sp,#0xD4]               @Loads ht
    781     LDR         r7,[sp,#0xD0]               @Loads wd
    782 
    783     LDR         r0,[sp,#0xC0]               @Loads *pu1_src
    784     SUB         r8,r8,#1                    @ht - 1
    785 
     786     MLA         r6,r8,r1,r7                 @wd + (ht - 1) * src_strd
    787     STRB        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp
    788 
    789     LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
     790     ADD         r6,r0,r6                    @pu1_src[wd + (ht - 1) * src_strd]
    791 
    792     ADD         r12,sp,#0x02
    793     STRB        r10,[r6,#-1]                @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    794 
    795     LDRB        r11,[sp]                    @load u1_src_top_left_tmp from stack pointer
    796     LDR         r3,[sp,#0xCC]               @Loads pu1_src_top
    797 
    798     STRB        r11,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
    799 
    800 SRC_TOP_LOOP:
    801     VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
    802     SUBS        r7,r7,#8                    @Decrement the width
    803     VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    804     BNE         SRC_TOP_LOOP
    805 
    806 END_LOOPS:
    807     ADD         sp,sp,#0x94
    808     LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
    809 
    810 
    811 
    812