@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class3.s
@*
@* @brief
@*  Contains function definitions for the SAO edge offset class 3 (45 degree
@*  diagonal) filter. Functions are coded using NEON intrinsics and can be
@*  compiled with ARM RVCT.
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class3_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht

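@/**
@* For reference, a rough C model of the per-pixel class 3 (45 degree) edge
@* offset that this routine vectorizes, assuming 8-bit samples. The helper
@* name below is illustrative and does not exist in the C sources; UWORD8,
@* WORD8, WORD32, SIGN(), CLIP3() and gi1_table_edge_idx[] are the types,
@* macros and table used by the C code.
@*
@*  static UWORD8 sao_eo_class3_pixel(UWORD8 u1_cur,       /* pu1_src[col]                */
@*                                    UWORD8 u1_top_right, /* pu1_src[col + 1 - src_strd] */
@*                                    UWORD8 u1_bot_left,  /* pu1_src[col - 1 + src_strd] */
@*                                    WORD8 *pi1_sao_offset)
@*  {
@*      WORD32 edge_idx = 2 + SIGN(u1_cur - u1_top_right)
@*                          + SIGN(u1_cur - u1_bot_left);
@*      edge_idx = gi1_table_edge_idx[edge_idx]; /* remapped 0 => leave pixel as is */
@*      if(0 == edge_idx)
@*          return u1_cur;
@*      /* (1 << bit_depth) - 1 == 255 for this 8-bit a9q variant */
@*      return (UWORD8)CLIP3(u1_cur + pi1_sao_offset[edge_idx], 0, 255);
@*  }
@*/
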
     61 .text
     62 .syntax unified
     63 .p2align 2
     64 
     65 .extern gi1_table_edge_idx
     66 .globl ihevc_sao_edge_offset_class3_a9q
     67 
     68 gi1_table_edge_idx_addr_1:
     69 .long gi1_table_edge_idx - ulbl1 - 8
     70 
     71 gi1_table_edge_idx_addr_2:
     72 .long gi1_table_edge_idx - ulbl2 - 8
     73 
     74 gi1_table_edge_idx_addr_3:
     75 .long gi1_table_edge_idx - ulbl3 - 8
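@ Each literal above holds (gi1_table_edge_idx - ulblN - 8); the matching
@ "add rX,rX,pc" at label ulblN then yields the absolute address of
@ gi1_table_edge_idx, since the PC reads as the address of that ADD plus 8 in
@ ARM state. This keeps the table access position independent.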
     76 
     77 ihevc_sao_edge_offset_class3_a9q:
     78 
     79 
     80     STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
     81     LDR         r7,[sp,#0x3C]               @Loads wd
     82 
     83     LDR         r8,[sp,#0x40]               @Loads ht
     84     SUB         r9,r7,#1                    @wd - 1
     85 
     86     LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
     87     LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
     88 
     89     MOV         r9,r7                       @Move width to r9 for loop count
     90 
     91     LDR         r5,[sp,#0x34]               @Loads pu1_avail
     92     LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
     93     STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
     94 
     95     SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values
     96 
     97     STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
     98     SUB         r10,r8,#1                   @ht-1
     99     MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    100     ADD         r12,sp,#0x02                @temp array
    101 
    102 AU1_SRC_TOP_LOOP:
    103     VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    104     SUBS        r9,r9,#8                    @Decrement the loop count by 8
    105     VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    106     BNE         AU1_SRC_TOP_LOOP
    107 
    108 PU1_AVAIL_5_LOOP:
    109     LDRB        r9,[r5,#5]                  @pu1_avail[5]
    110     CMP         r9,#0
    111     SUB         r10,r7,#1                   @[wd - 1]
    LDRB        r9,[r0,r10]                 @u1_pos_wd_0_tmp = pu1_src[wd - 1]
    113     BEQ         PU1_AVAIL_6_LOOP
    114 
    115     LDR         r11,[sp,#0xC0]              @Load pu1_src_top_right from sp
    116     SUB         r10,r10,#1                  @[wd - 1 - 1]
    117 
    118     LDRB        r11,[r11]                   @pu1_src_top_right[0]
    119     SUB         r12,r9,r11                  @pu1_src[wd - 1] - pu1_src_top_right[0]
    120 
    121     ADD         r11,r0,r1                   @pu1_src + src_strd
    122 
    123     LDRB        r14,[r11,r10]               @pu1_src[wd - 1 - 1 + src_strd]
    124     CMP         r12,#0
    125     MVNLT       r12,#0
    126     SUB         r11,r9,r14                  @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]
    127 
    128     MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
    129     CMP         r11,#0
    130     MVNLT       r11,#0
    131     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    132     LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
    133 ulbl1:
    134     add         r14,r14,pc
    135     ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    136     ADD         r11,r11,#2                  @edge_idx
    137 
    138     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    139     CMP         r12,#0                      @0 != edge_idx
    140     BEQ         PU1_AVAIL_6_LOOP
    141     LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r9,r9,r10                   @pu1_src[wd - 1] + pi1_sao_offset[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
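@ Note: the CMP / MVNLT / MOVGT triplets above are the scalar SIGN() idiom:
@ the register holding the difference becomes -1 (MVN of 0) when it is
@ negative, 1 when it is positive, and stays 0 when the two pixels are equal.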
    144 
    145 PU1_AVAIL_6_LOOP:
    146     LDRB        r10,[r5,#6]                 @pu1_avail[6]
    147     SUB         r11,r8,#1                   @ht - 1
    148 
    149     CMP         r10,#0
    150     STR         r0,[sp,#0xC0]               @Store pu1_src in sp
    151     MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
    152 
    153     LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
    154     BEQ         PU1_AVAIL_3_LOOP
    155 
    156     LDR         r14,[sp,#0xC4]              @Load pu1_src_bot_left from sp
    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]
    158 
    159     LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
    160     ADD         r11,r11,#1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    161 
    162     LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    163     SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    164 
    165     SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    166     CMP         r11,#0
    167     MVNLT       r11,#0
    168     MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])
    169 
    170     CMP         r14,#0
    171     MVNLT       r14,#0
    172     MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
    173 
    174     ADD         r11,r11,r14                 @Add 2 sign value
    175 
    176     LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
    177 ulbl2:
    178     add         r14,r14,pc
    179     ADD         r11,r11,#2                  @edge_idx
    180 
    181     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    182     CMP         r12,#0
    183     BEQ         PU1_AVAIL_3_LOOP
    184     LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    185     ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    186     USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    187 
    188 PU1_AVAIL_3_LOOP:
    189     STR         r2,[sp,#0xC4]               @Store pu1_src_left in sp
    190     MOV         r12,r8                      @Move ht
    191 
    192     MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    193     VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    194     LDRB        r11,[r5,#3]                 @pu1_avail[3]
    195 
    196     CMP         r11,#0
    197     VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    198     SUBEQ       r12,r12,#1                  @ht_tmp--
    199 
    200     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    201     VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    202     CMP         r5,#0
    203 
    204     ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    205     VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    206     SUBEQ       r12,r12,#1                  @ht_tmp--
    207 
    208     LDR         r6, gi1_table_edge_idx_addr_3 @table pointer
    209 ulbl3:
    210     add         r6,r6,pc
    211     VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    212     ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1
    213 
    214     STR         r0,[sp,#0x90]               @Store pu1_src in sp
    215     VLD1.8      D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    216     MOV         r6,r7                       @move wd to r6 loop_count
    217 
    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @If wd < 16 jump to WIDTH_RESIDUE, which handles the remaining 8 columns
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If ht <= 4 jump to WD_16_HT_4_LOOP
    222 
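@ 16-pixel-wide main path. Per row, sign_down compares the current row against
@ the next row offset by one column to the left (the bottom-left neighbours,
@ with the missing edge byte taken from the pu1_src_left copy), sign_up carries
@ the top-right comparison from the previous row, and edge_idx = 2 + sign_up +
@ sign_down is mapped through gi1_table_edge_idx and pi1_sao_offset with VTBL.
@ Offsets are applied at 16-bit precision, clamped to [0, 255] and narrowed
@ back to 8 bit before the store.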
    223 WIDTH_LOOP_16:
    224     LDR         r7,[sp,#0xD0]               @Loads wd
    225 
    226     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    227     CMP         r6,r7                       @col == wd
    228     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    229     MOVNE       r8,#-1
    230     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    231 
    232     CMP         r6,#16                      @if(col == 16)
    233     BNE         SKIP_AU1_MASK_VAL
    234     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    235     VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    236 
    237 SKIP_AU1_MASK_VAL:
    238     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    239     CMP         r8,#0
    240 
    241     LDR         r4,[sp,#0xD4]               @Loads ht
    242     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    243 
    244     MOVNE       r8,r3
    245     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    246 
    247     LDR         r7,[sp,#0xD0]               @Loads wd
    248     ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    249 
    250     SUB         r7,r7,r6                    @(wd - col)
    251     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    252     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    253     SUB         r8,#8
    254     ADD         r3,r3,#16
    255 
    256     LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    257     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    258     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    259     SUB         r0,#8
    260     ADD         r7,r7,#15                   @15 + (wd - col)
    261 
    262     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    263     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    264     SUB         r5,r5,#1
    265 
    266 AU1_SRC_LEFT_LOOP:
    267     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    268     SUBS        r4,r4,#1                    @decrement the loop count
    269     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    270     BNE         AU1_SRC_LEFT_LOOP
    271 
    272     VMOV.I8     Q9,#0
    273     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    274 
    275     ADD         r8,r0,r1                    @I *pu1_src + src_strd
    276     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    277     MOV         r7,r12                      @row count, move ht_tmp to r7
    278 
    279     SUB         r5,r12,r7                   @I ht_tmp - row
    280     VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    281     VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    282     SUB         r8,#8
    283     ADD         r8,r14,r5                   @I pu1_src_left_cpy[ht_tmp - row]
    284 
    285     ADD         r8,r8,#1                    @I pu1_src_left_cpy[ht_tmp - row + 1]
    286     LDRB        r8,[r8]
    287 
    288     LDR         r5,[sp,#0xC8]               @I Loads pu1_avail
    289     VMOV.8      D19[7],r8                   @I vsetq_lane_u8
    290     LDRB        r5,[r5,#2]                  @I pu1_avail[2]
    291 
    292     VEXT.8      Q9,Q9,Q8,#15                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    293     CMP         r5,#0                       @I
    294     BNE         SIGN_UP_CHANGE_DONE         @I
    295 
    296 SIGN_UP_CHANGE:
    297     LDRB        r8,[r0,#15]                 @I pu1_src_cpy[15]
    298     SUB         r5,r0,r1                    @I pu1_src_cpy[16 - src_strd]
    299 
    300     LDRB        r5,[r5,#16]                 @I load the value
    301     SUB         r8,r8,r5                    @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    302     CMP         r8,#0                       @I
    303     MVNLT       r8,#0                       @I
    304     MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    305     VMOV.8      D15[7],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    306 
    307 SIGN_UP_CHANGE_DONE:
    308     VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    309     VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    310     VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    311 
    312     VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    313     VADD.I8     Q9,Q9,Q5                    @I edge_idx = vaddq_s8(edge_idx, sign_down)
    314     VTBL.8      D18,{D6},D18                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    315     VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
    316 
    317     VEXT.8      Q7,Q7,Q7,#1                 @I sign_up = vextq_s8(sign_up, sign_up, 1)
    318     VTBL.8      D19,{D6},D19                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    319 
    320     VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    321     VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
    322 
    323     VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    324 
    325     VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    326     VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    327 
    328     VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    329     VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    330     VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    331 
    332     VMOV        Q6,Q8
    333     VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    334 
    335     VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    336     VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    337 
    338     SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
    339 
    340 PU1_SRC_LOOP:
    341     ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
    342     VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    343     SUB         r5,r12,r7                   @II ht_tmp - row
    344 
    345     ADD         r4,r0,r1                    @II pu1_src_cpy[16 - src_strd]
    346     VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
    347     ADD         r2,r8,r1                    @III *pu1_src + src_strd
    348 
    349     LDRB        r11,[r4,#15]                @II pu1_src_cpy[15]
    350     VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    351     VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    352     SUB         r8,#8
    353     SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    354 
    355     ADD         r8,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
    356     VLD1.8      D30,[r2]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    357     VLD1.8      D31,[r2]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    358     SUB         r2,#8
    359     LDRB        r8,[r8,#1]
    360 
    361     LDRB        r4,[r0,#16]                 @II load the value
    362     VMOV.8      D19[7],r8                   @II vsetq_lane_u8
    363     SUB         r11,r11,r4                  @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    364 
    365     CMP         r11,#0                      @II
    366     VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    367     SUB         r5,r12,r7                   @III ht_tmp - row
    368 
    369     MVNLT       r11,#0                      @II
    370     VEXT.8      Q9,Q9,Q8,#15                @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    371     MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    372 
    373     ADD         r8,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
    374     VMOV.8      D15[7],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    375     CMP         r7,#1                       @III
    376 
    377     BNE         NEXT_ROW_ELSE_2             @III
    378     LDR         r5,[sp,#0xC8]               @III Loads pu1_avail
    379     LDRB        r5,[r5,#3]                  @III pu1_avail[3]
    380     CMP         r5,#0                       @III
    381     SUBNE       r8,r2,#2                    @III pu1_src_cpy[src_strd - 1]
    382 
    383 NEXT_ROW_ELSE_2:
    384     LDRB        r8,[r8,#1]                  @III
    385     VCGT.U8     Q12,Q6,Q9                   @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    386     ADD         r5,r0,r1
    387 
    388     LDRB        r2,[r5,#15]                 @III pu1_src_cpy[15]
    389     VCLT.U8     Q13,Q6,Q9                   @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    390     LDRB        r5,[r0,#16]                 @III load the value
    391 
    392     SUB         r2,r2,r5                    @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    393     VSUB.U8     Q12,Q13,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    394     CMP         r2,#0                       @III
    395 
    396     MVNLT       r2,#0                       @III
    397     VMOV.8      D19[7],r8                   @III vsetq_lane_u8
    398     MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    399 
    400     SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    401     VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    402 
    403     VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    404     VEXT.8      Q9,Q9,Q15,#15               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    405 
    406     VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    407 
    408     VEXT.8      Q7,Q7,Q7,#1                 @II sign_up = vextq_s8(sign_up, sign_up, 1)
    409     VTBL.8      D26,{D6},D26                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    410     VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    411 
    412     VMOV.8      D15[7],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    413     VTBL.8      D27,{D6},D27                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    414     VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    415 
    416     VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    417     VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    418 
    419     VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    420     VTBL.8      D24,{D7},D26                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    421     VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    422 
    423     VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
    424     VTBL.8      D25,{D7},D27                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    425     VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)
    426 
    427     VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    428     VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    429     VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    430 
    431     VEXT.8      Q7,Q7,Q7,#1                 @III sign_up = vextq_s8(sign_up, sign_up, 1)
    432     VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    433     VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    434 
    435     VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    436     VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    437 
    438     VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    439     VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    440     VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    441 
    442     VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    443     VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    444 
    445     VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    446     VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    447     VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    448 
    449     VMOVL.U8    Q11,D17                     @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    450     VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    451 
    452     VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    453     VADDW.S8    Q11,Q11,D11                 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    454 
    455     VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    456     VMAX.S16    Q11,Q11,Q1                  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    457 
    458     VMOV        Q6,Q15                      @II pu1_cur_row = pu1_next_row
    459     VMIN.U16    Q11,Q11,Q2                  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    460 
    461     CMP         r7,#1                       @III
    462     VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    463     BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
    464     BLT         INNER_LOOP_DONE
    465 
    466     ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
    467     VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    468     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    469 
    470     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    471     VMOVN.I16   D21,Q11                     @III vmovn_s16(pi2_tmp_cur_row.val[1])
    472     CMP         r5,#0
    473 
    474     ADD         r4,r0,r1                    @pu1_src_cpy[16 - src_strd]
    475     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    476     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    477     SUB         r8,#8
    478     LDRB        r5,[r0,#16]                 @load the value
    479 
    480     BEQ         NEXT_ROW_ELSE_3
    481     LDRB        r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    482     B           NEXT_ROW_POINTER_ASSIGNED_3
    483 NEXT_ROW_ELSE_3:
    484     SUB         r11,r12,r7                  @ht_tmp - row
    485     ADD         r8,r14,r11                  @pu1_src_left_cpy[ht_tmp - row]
    486     ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    487     LDRB        r8,[r8]
    488 
    489 NEXT_ROW_POINTER_ASSIGNED_3:
    490     LDRB        r11,[r4,#15]                @pu1_src_cpy[15]
    491     VMOV.8      D19[7],r8                   @vsetq_lane_u8
    492     SUB         r8,r11,r5                   @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    493 
    494     CMP         r8,#0
    495     VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    496     MVNLT       r8,#0
    497 
    498     VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    499     VCGT.U8     Q12,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    500 
    501     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    502     VCLT.U8     Q13,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    503 
    504     VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    505     VSUB.U8     Q12,Q13,Q12                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    506 
    507     VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    508     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    509 
    510     VMOVL.U8    Q11,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    511     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    512 
    513     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    514     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    515 
    516     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    517 
    518     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    519 
    520     VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    521     VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    522     VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    523 
    524     VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    525 
    526     VADDW.S8    Q11,Q11,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    527     VMAX.S16    Q11,Q11,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    528     VMIN.U16    Q11,Q11,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    529 
    530 INNER_LOOP_DONE:
    531     VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    532     LDR         r8,[sp,#0xD4]               @Loads ht
    533 
    534     VMOVN.I16   D21,Q11                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    535     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    536 
    537     VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    538     LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    539 SRC_LEFT_LOOP:
    540     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    541     SUBS        r8,r8,#4
    542     STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    543     BNE         SRC_LEFT_LOOP
    544 
    545     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    546     CMP         r6,#8                       @Check whether residue remains
    547     BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    548     LDR         r7,[sp,#0xD0]               @Loads wd
    549     LDR         r0,[sp,#0x90]               @Loads *pu1_src
    550     SUB         r7,r7,r6
    551     ADD         r0,r0,r7
    552     BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    553     BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
    554 
    555 
    556 
    557 WD_16_HT_4_LOOP:
    558     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    559     LDR         r7,[sp,#0xD0]               @Loads wd
    560     CMP         r6,r7                       @col == wd
    561     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    562     MOVNE       r8,#-1
    563     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    564 
    565     CMP         r6,#16                      @if(col == 16)
    566     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    567     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    568     VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    569 
    570 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    571     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    572     CMP         r8,#0
    573 
    574     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    575     MOVNE       r8,r3
    576     ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    577     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    578     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    579     SUB         r8,#8
    580 
    581     ADD         r3,r3,#16
    582     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    583     LDR         r4,[sp,#0xD4]               @Loads ht
    584     LDR         r7,[sp,#0xD0]               @Loads wd
    585     SUB         r7,r7,r6                    @(wd - col)
    586     ADD         r7,r7,#15                   @15 + (wd - col)
    587     LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    588     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    589     SUB         r5,r5,#1
    590 
    591 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    592     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    593     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    594     SUBS        r4,r4,#1                    @decrement the loop count
    595     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    596 
    597     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    598     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    599     SUB         r0,#8
    600 
    601     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    602     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    603     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    604     VMOV.I8     Q9,#0
    605     MOV         r7,r12                      @row count, move ht_tmp to r7
    606 
    607 PU1_SRC_LOOP_WD_16_HT_4:
    608     ADD         r8,r0,r1                    @*pu1_src + src_strd
    609     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    610     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    611     SUB         r8,#8
    612     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    613     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    614     CMP         r5,#0
    615     BEQ         NEXT_ROW_ELSE_WD_16_HT_4
    616     CMP         r7,#1
    617     LDRBEQ      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    618     BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    619 NEXT_ROW_ELSE_WD_16_HT_4:
    620     SUB         r5,r12,r7                   @ht_tmp - row
    621     ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    622     ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    623     LDRB        r8,[r8]
    624 
    625 NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    626     VMOV.8      D19[7],r8                   @vsetq_lane_u8
    627     VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    628 
    629     CMP         r7,r12
    630     BNE         SIGN_UP_CHANGE_WD_16_HT_4
    631     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    632     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    633     CMP         r5,#0
    634     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    635 
    636 SIGN_UP_CHANGE_WD_16_HT_4:
    637     LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
    638     ADD         r5,r0,#16                   @pu1_src_cpy[16]
    639     SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
    640     LDRB        r5,[r5]                     @load the value
    641     SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    642     CMP         r8,#0
    643     MVNLT       r8,#0
    644     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    645     VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    646 
    647 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    648     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    649     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    650     VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    651 
    652     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    653     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    654     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    655     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    656 
    657     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    658 
    659     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    660     VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)
    661 
    662     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    663     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    664     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    665     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    666     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    667 
    668     VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    669     VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    670     VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    671     VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    672     VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    673 
    674     VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    675     VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    676 
    677     VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    678 
    679     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    680     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    681     BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    682 
    683     LDR         r8,[sp,#0xD4]               @Loads ht
    684     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    685     LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    686 SRC_LEFT_LOOP_WD_16_HT_4:
    687     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    688     STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    689     SUBS        r8,r8,#4
    690     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    691 
    692     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    693     BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    694     BGT         WD_16_HT_4_LOOP             @If not equal jump to width_loop
    695 
    696 
    697 WIDTH_RESIDUE:
    698     LDR         r7,[sp,#0xD0]               @Loads wd
    699     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    700     CMP         r6,r7                       @wd_residue == wd
    701     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    702 
    703     MOVNE       r8,#-1
    704     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    705 
    706     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    707     VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    708 
    709 PU1_AVAIL_2_RESIDUE:
    710     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    711     CMP         r8,#0
    712 
    713     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    714     MOVNE       r8,r3
    715     ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    716     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    717     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    718     SUB         r8,#8
    719 
    720 
    721     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    722     LDR         r4,[sp,#0xD4]               @Loads ht
    723     LDR         r7,[sp,#0xD0]               @Loads wd
    724     LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    725     SUB         r7,r7,#1                    @(wd - 1)
    726     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    727     SUB         r5,r5,#1
    728 
    729 AU1_SRC_LEFT_LOOP_RESIDUE:
    730     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    731     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    732     SUBS        r4,r4,#1                    @decrement the loop count
    733     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    734 
    735     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    736     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    737     SUB         r0,#8
    738 
    739     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    740     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    741     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    742     MOV         r7,r12                      @row count, move ht_tmp to r7
    743 
    744 PU1_SRC_LOOP_RESIDUE:
    745     VMOV.I8     Q9,#0
    746     ADD         r8,r0,r1                    @*pu1_src + src_strd
    747     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    748     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    749     SUB         r8,#8
    750     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    751     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    752     CMP         r5,#0
    753     BEQ         NEXT_ROW_ELSE_RESIDUE
    754     CMP         r7,#1
    755     LDRBEQ      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    756     BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    757 NEXT_ROW_ELSE_RESIDUE:
    758     SUB         r5,r12,r7                   @ht_tmp - row
    759     ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    760     ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    761     LDRB        r8,[r8]
    762 
    763 NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    764     VMOV.8      D19[7],r8                   @vsetq_lane_u8
    765     VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    766 
    767     CMP         r7,r12
    768     BNE         SIGN_UP_CHANGE_RESIDUE
    769     LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    770     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    771     CMP         r5,#0
    772     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
    773 
    774 SIGN_UP_CHANGE_RESIDUE:
    775     LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
    776     ADD         r5,r0,#16                   @pu1_src_cpy[16]
    777     SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
    778     LDRB        r5,[r5]                     @load the value
    779     SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    780     CMP         r8,#0
    781     MVNLT       r8,#0
    782     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    783     VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    784 
    785 SIGN_UP_CHANGE_DONE_RESIDUE:
    786     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    787     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    788     VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    789 
    790     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    791     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    792     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    793     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    794 
    795     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    796 
    797     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    798     VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)
    799 
    800     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    801     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    802     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    803     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    804     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    805 
    806     VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    807 
    808     VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    809     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    810     SUBS        r7,r7,#1
    811     BNE         PU1_SRC_LOOP_RESIDUE
    812 
    813     LDR         r8,[sp,#0xD4]               @Loads ht
    814     LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    815     ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    816 
    817 SRC_LEFT_LOOP_RESIDUE:
    818     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    819     SUBS        r8,r8,#4
    820     STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    821     BNE         SRC_LEFT_LOOP_RESIDUE
    822 
    823 
    824 RE_ASSINING_LOOP:
    825     LDR         r7,[sp,#0xD0]               @Loads wd
    826     LDR         r0,[sp,#0xC0]               @Loads *pu1_src
    827 
    828     LDR         r11,[sp,#0xD4]              @Loads ht
    829     ADD         r8,r0,r7                    @pu1_src[wd]
    830 
    831     LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
    832     SUB         r11,r11,#1                  @ht - 1
    833 
    834     STRB        r9,[r8,#-1]                 @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
    835     MLA         r6,r11,r1,r0                @pu1_src_org[(ht - 1) * src_strd]
    836 
    837     LDRB        r8,[sp]                     @load u1_src_top_left_tmp from stack pointer
    838     ADD         r12,sp,#0x02
    839 
    STRB        r10,[r6]                    @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    841     STRB        r8,[r4]                     @*pu1_src_top_left = u1_src_top_left_tmp
    842     LDR         r3,[sp,#0xCC]               @Loads pu1_src_top
    843 
    844 SRC_TOP_LOOP:
    845     VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
    846     SUBS        r7,r7,#8                    @Decrement the width
    847     VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    848     BNE         SRC_TOP_LOOP
    849 
    850 END_LOOPS:
    851     ADD         sp,sp,#0x94
    852     LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
    853 
    854 
    855 
    856