@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class3.s
@*
@* @brief
@*  Contains the function definition for the SAO edge offset class 3
@*  (135 degree diagonal) filter. The function is written in ARM NEON
@*  assembly (commented with the equivalent NEON intrinsics) and can be
@*  assembled using ARM RVCT.
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class3_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht
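@*****************************************************************************
@* Reference sketch (editor's note, not copied verbatim from the C model):
@* for edge offset class 3 the two neighbours of a pixel lie on the
@* 135 degree diagonal, i.e. top-right and bottom-left. Per pixel the
@* operation is roughly:
@*
@*     sign_up   = SIGN(pu1_src[x] - pu1_src[x - src_strd + 1])
@*     sign_down = SIGN(pu1_src[x] - pu1_src[x + src_strd - 1])
@*     edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down]
@*     if(edge_idx)
@*         pu1_src[x] = CLIP3(pu1_src[x] + pi1_sao_offset[edge_idx],
@*                            0, (1 << bit_depth) - 1)
@*
@* where SIGN(a) is -1, 0 or +1. The code below evaluates this 16 pixels at
@* a time using VCGT/VCLT comparisons and VTBL lookups.
@*****************************************************************************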

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

ihevc_sao_edge_offset_class3_a9q:


    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
    LDR         r7,[sp,#0x3C]               @Loads wd

    LDR         r8,[sp,#0x40]               @Loads ht
    SUB         r9,r7,#1                    @wd - 1

    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]

    MOV         r9,r7                       @Move width to r9 for loop count

    LDR         r5,[sp,#0x34]               @Loads pu1_avail
    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp

    SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values

    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB         r10,r8,#1                   @ht-1
    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#0x02                @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                    @Decrement the loop count by 8
    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP
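
@The two corner pixels that class 3 treats specially are filtered up front
@with scalar code, using the pu1_src_top_right / pu1_src_bot_left neighbour
@buffers, and are written back only at the end (RE_ASSINING_LOOP):
@  - top-right pixel pu1_src[wd - 1]              (guarded by pu1_avail[5]), kept in r9
@  - bottom-left pixel pu1_src[(ht - 1)*src_strd] (guarded by pu1_avail[6]), kept in r10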

PU1_AVAIL_5_LOOP:
    LDRB        r9,[r5,#5]                  @pu1_avail[5]
    CMP         r9,#0
    SUB         r10,r7,#1                   @[wd - 1]
    LDRB        r9,[r0,r10]                 @u1_pos_wd_0_tmp = pu1_src[wd - 1]
    BEQ         PU1_AVAIL_6_LOOP

    LDR         r11,[sp,#0xC0]              @Load pu1_src_top_right from sp
    SUB         r10,r10,#1                  @[wd - 1 - 1]

    LDRB        r11,[r11]                   @pu1_src_top_right[0]
    SUB         r12,r9,r11                  @pu1_src[wd - 1] - pu1_src_top_right[0]

    ADD         r11,r0,r1                   @pu1_src + src_strd

    LDRB        r14,[r11,r10]               @pu1_src[wd - 1 - 1 + src_strd]
    CMP         r12,#0
    MVNLT       r12,#0
    SUB         r11,r9,r14                  @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]

    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add         r14,r14,pc
    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    ADD         r11,r11,#2                  @edge_idx

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_6_LOOP
    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r9,r9,r10                   @pu1_src[wd - 1] + pi1_sao_offset[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP:
    LDRB        r10,[r5,#6]                 @pu1_avail[6]
    SUB         r11,r8,#1                   @ht - 1

    CMP         r10,#0
    STR         r0,[sp,#0xC0]               @Store pu1_src in sp
    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]

    LDRB        r10,[r12]                   @u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd]
    BEQ         PU1_AVAIL_3_LOOP

    LDR         r14,[sp,#0xC4]              @Load pu1_src_bot_left from sp
    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]

    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
    ADD         r11,r11,#1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]

    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]

    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])

    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD         r11,r11,r14                 @Add the 2 sign values

    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add         r14,r14,pc
    ADD         r11,r11,#2                  @edge_idx

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL_3_LOOP
    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STR         r2,[sp,#0xC4]               @Store pu1_src_left in sp
    MOV         r12,r8                      @Move ht

    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    LDRB        r11,[r5,#3]                 @pu1_avail[3]

    CMP         r11,#0
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    CMP         r5,#0

    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    LDR         r6, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add         r6,r6,pc
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1

    STR         r0,[sp,#0x90]               @Store pu1_src in sp
    VLD1.8      D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV         r6,r7                       @move wd to r6 loop_count

    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, which handles the 8-pixel case
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
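
@WIDTH_LOOP_16 processes the block in strips of 16 columns.  For every strip
@the rightmost column is first snapshotted into au1_src_left_tmp on the stack
@(pre-SAO values, copied to pu1_src_left in SRC_LEFT_LOOP afterwards), sign_up
@is seeded from the row above (pu1_src_top or pu1_src - src_strd, shifted by
@one since the upper neighbour is the top-right pixel), and the strip is then
@filtered row by row in PU1_SRC_LOOP.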

WIDTH_LOOP_16:
    LDR         r7,[sp,#0xD0]               @Loads wd

    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    CMP         r6,r7                       @col == wd
    LDREQB      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    LDR         r4,[sp,#0xD4]               @Loads ht
    SUBEQ       r8,r0,r1                    @pu1_src - src_strd

    MOVNE       r8,r3
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp

    LDR         r7,[sp,#0xD0]               @Loads wd
    ADD         r8,r8,#1                    @pu1_src - src_strd + 1

    SUB         r7,r7,r6                    @(wd - col)
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8
    ADD         r3,r3,#16

    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r7,r7,#15                   @15 + (wd - col)

    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP

    VMOV.I8     Q9,#0
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)

    ADD         r8,r0,r1                    @I *pu1_src + src_strd
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7

    SUB         r5,r12,r7                   @I ht_tmp - row
    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    ADD         r8,r14,r5                   @I pu1_src_left_cpy[ht_tmp - row]

    ADD         r8,r8,#1                    @I pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

    LDR         r5,[sp,#0xC8]               @I Loads pu1_avail
    VMOV.8      D19[7],r8                   @I vsetq_lane_u8
    LDRB        r5,[r5,#2]                  @I pu1_avail[2]

    VEXT.8      Q9,Q9,Q8,#15                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    CMP         r5,#0                       @I
    BNE         SIGN_UP_CHANGE_DONE         @I

SIGN_UP_CHANGE:
    LDRB        r8,[r0,#15]                 @I pu1_src_cpy[15]
    SUB         r5,r0,r1                    @I pu1_src_cpy[16 - src_strd]

    LDRB        r5,[r5,#16]                 @I load the value
    SUB         r8,r8,r5                    @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0                       @I
    MVNLT       r8,#0                       @I
    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q5                    @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D6},D18                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)

    VEXT.8      Q7,Q7,Q7,#1                 @I sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D19,{D6},D19                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8
    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1

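@The main row loop below is software pipelined: the @I, @II and @III tags on
@the comments mark work belonging to three consecutive rows kept in flight,
@so the loads and sign computations of a later row overlap the table lookups,
@add/clip and store of the earlier ones.
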
PU1_SRC_LOOP:
    ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7                   @II ht_tmp - row

    ADD         r4,r0,r1                    @II pu1_src_cpy[16 - src_strd]
    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r2,r8,r1                    @III *pu1_src + src_strd

    LDRB        r11,[r4,#15]                @II pu1_src_cpy[15]
    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1

    ADD         r8,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
    VLD1.8      D30,[r2]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r2]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r2,#8
    LDRB        r8,[r8,#1]

    LDRB        r4,[r0,#16]                 @II load the value
    VMOV.8      D19[7],r8                   @II vsetq_lane_u8
    SUB         r11,r11,r4                  @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP         r11,#0                      @II
    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB         r5,r12,r7                   @III ht_tmp - row

    MVNLT       r11,#0                      @II
    VEXT.8      Q9,Q9,Q8,#15                @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    ADD         r8,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
    VMOV.8      D15[7],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    CMP         r7,#1                       @III

    BNE         NEXT_ROW_ELSE_2             @III
    LDR         r5,[sp,#0xC8]               @III Loads pu1_avail
    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
    CMP         r5,#0                       @III
    SUBNE       r8,r2,#2                    @III pu1_src_cpy[src_strd - 1]

NEXT_ROW_ELSE_2:
    LDRB        r8,[r8,#1]                  @III
    VCGT.U8     Q12,Q6,Q9                   @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r5,r0,r1

    LDRB        r2,[r5,#15]                 @III pu1_src_cpy[15]
    VCLT.U8     Q13,Q6,Q9                   @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    LDRB        r5,[r0,#16]                 @III load the value

    SUB         r2,r2,r5                    @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    VSUB.U8     Q12,Q13,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    CMP         r2,#0                       @III

    MVNLT       r2,#0                       @III
    VMOV.8      D19[7],r8                   @III vsetq_lane_u8
    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)

    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    VEXT.8      Q9,Q9,Q15,#15               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)

    VEXT.8      Q7,Q7,Q7,#1                 @II sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D26,{D6},D26                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[7],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    VTBL.8      D27,{D6},D27                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)

    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D24,{D7},D26                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)

    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D25,{D7},D27                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VEXT.8      Q7,Q7,Q7,#1                 @III sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8    Q11,D17                     @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q11,Q11,D11                 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VMAX.S16    Q11,Q11,Q1                  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOV        Q6,Q15                      @II pu1_cur_row = pu1_next_row
    VMIN.U16    Q11,Q11,Q2                  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    CMP         r7,#1                       @III
    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BGT         PU1_SRC_LOOP                @If more than one row remains, jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE             @If no rows remain, skip the final-row epilogue

    ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail

    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    VMOVN.I16   D21,Q11                     @III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP         r5,#0

    ADD         r4,r0,r1                    @pu1_src_cpy[16 - src_strd]
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDRB        r5,[r0,#16]                 @load the value

    BEQ         NEXT_ROW_ELSE_3
    LDRB        r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    B           NEXT_ROW_POINTER_ASSIGNED_3
NEXT_ROW_ELSE_3:
    SUB         r11,r12,r7                  @ht_tmp - row
    ADD         r8,r14,r11                  @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRB        r11,[r4,#15]                @pu1_src_cpy[15]
    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    SUB         r8,r11,r5                   @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP         r8,#0
    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MVNLT       r8,#0

    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VCGT.U8     Q12,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VCLT.U8     Q13,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    VSUB.U8     Q12,Q13,Q12                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)

    VMOVL.U8    Q11,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q11,Q11,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q11,Q11,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q11,Q11,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

INNER_LOOP_DONE:
    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    LDR         r8,[sp,#0xD4]               @Loads ht

    VMOVN.I16   D21,Q11                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp

    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
SRC_LEFT_LOOP:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    CMP         r6,#8                       @Check whether residue remains
    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r0,[sp,#0x90]               @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If more than 8 columns remain, jump to WIDTH_LOOP_16
    BEQ         WIDTH_RESIDUE               @If exactly 8 columns remain, jump to WIDTH_RESIDUE


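@WD_16_HT_4_LOOP covers the wd >= 16 case when ht is 4 or less: the same
@16-wide processing as WIDTH_LOOP_16, but rows are handled one at a time
@(no I/II/III pipelining), presumably because there are too few rows to keep
@three in flight.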
WD_16_HT_4_LOOP:
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDR         r7,[sp,#0xD0]               @Loads wd
    CMP         r6,r7                       @col == wd
    LDREQB      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3
    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8

    ADD         r3,r3,#16
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    LDR         r4,[sp,#0xD4]               @Loads ht
    LDR         r7,[sp,#0xD0]               @Loads wd
    SUB         r7,r7,r6                    @(wd - col)
    ADD         r7,r7,#15                   @15 + (wd - col)
    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    CMP         r5,#0
    BEQ         NEXT_ROW_ELSE_WD_16_HT_4
    CMP         r7,#1
    LDREQB      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
NEXT_ROW_ELSE_WD_16_HT_4:
    SUB         r5,r12,r7                   @ht_tmp - row
    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP         r7,r12
    BNE         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
    ADD         r5,r0,#16                   @pu1_src_cpy[16]
    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
    LDRB        r5,[r5]                     @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#0xD4]               @Loads ht
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    BGT         WD_16_HT_4_LOOP             @If columns remain, jump to WD_16_HT_4_LOOP


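@WIDTH_RESIDUE handles the last strip when fewer than 16 columns remain
@(an 8-pixel remainder): only the low halves (D registers) of the vectors
@carry valid pixels, and only 8 bytes are stored per row.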
WIDTH_RESIDUE:
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd
    LDREQB      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

PU1_AVAIL_2_RESIDUE:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3
    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8


    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    LDR         r4,[sp,#0xD4]               @Loads ht
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    SUB         r7,r7,#1                    @(wd - 1)
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8     Q9,#0
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    CMP         r5,#0
    BEQ         NEXT_ROW_ELSE_RESIDUE
    CMP         r7,#1
    LDREQB      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
NEXT_ROW_ELSE_RESIDUE:
    SUB         r5,r12,r7                   @ht_tmp - row
    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP         r7,r12
    BNE         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
    ADD         r5,r0,#16                   @pu1_src_cpy[16]
    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
    LDRB        r5,[r5]                     @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1
    BNE         PU1_SRC_LOOP_RESIDUE

    LDR         r8,[sp,#0xD4]               @Loads ht
    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE


RE_ASSINING_LOOP:
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r0,[sp,#0xC0]               @Loads *pu1_src

    LDR         r11,[sp,#0xD4]              @Loads ht
    ADD         r8,r0,r7                    @pu1_src[wd]

    LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
    SUB         r11,r11,#1                  @ht - 1

    STRB        r9,[r8,#-1]                 @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
    MLA         r6,r11,r1,r0                @pu1_src_org[(ht - 1) * src_strd]

    LDRB        r8,[sp]                     @load u1_src_top_left_tmp from stack pointer
    ADD         r12,sp,#0x02

    STRB        r10,[r6]                    @pu1_src_org[(ht - 1) * src_strd] = u1_pos_0_ht_tmp
    STRB        r8,[r4]                     @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#0xCC]               @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @load au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0x94
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP and return