      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
      20 @* @file
      21 @*  ihevc_sao_edge_offset_class2.s
      22 @*
      23 @* @brief
      24 @*  Contains the function definition for the class 2 (135 degree diagonal)
      25 @*  SAO edge offset filter. The function is coded in ARM NEON assembly
      26 @*  and can be compiled using ARM RVCT.
      27 @*
      28 @* @author
      29 @*  Parthiban V
      30 @*
      31 @* @par List of Functions:
      32 @*  - ihevc_sao_edge_offset_class2_a9q()
      33 @*
      34 @* @remarks
      35 @*  None
     36 @*
     37 @*******************************************************************************
     38 @*/
     39 @void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
     40 @                              WORD32 src_strd,
     41 @                              UWORD8 *pu1_src_left,
     42 @                              UWORD8 *pu1_src_top,
     43 @                              UWORD8 *pu1_src_top_left,
     44 @                              UWORD8 *pu1_src_top_right,
     45 @                              UWORD8 *pu1_src_bot_left,
     46 @                              UWORD8 *pu1_avail,
     47 @                              WORD8 *pi1_sao_offset,
     48 @                              WORD32 wd,
     49 @                              WORD32 ht)
     50 @**************Variables Vs Registers*****************************************
     51 @r0 =>  *pu1_src
     52 @r1 =>  src_strd
     53 @r2 =>  *pu1_src_left
     54 @r3 =>  *pu1_src_top
     55 @r4 =>  *pu1_src_top_left
     56 @r5 =>  *pu1_avail
     57 @r6 =>  *pi1_sao_offset
     58 @r7 =>  wd
      59 @r8 =>  ht
     60 
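@ A rough C-level sketch (an illustrative outline only, with illustrative
@ variable names, not the exact reference code) of the per-pixel operation
@ implemented below. Class 2 compares each pixel with its top-left and
@ bottom-right neighbours (the 135 degree diagonal); the assembly additionally
@ handles the corner and border pixels selected through pu1_avail:
@
@   edge_idx = 2 + SIGN(cur_pel - top_left_pel) + SIGN(cur_pel - bot_right_pel);
@   edge_idx = gi1_table_edge_idx[edge_idx];
@   if(edge_idx != 0)
@       cur_pel = CLIP3(cur_pel + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
@
@ where SIGN(x) is 1 for x > 0, -1 for x < 0 and 0 otherwise.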
     61 .equ    pu1_src_top_left_offset,    264
     62 .equ    pu1_src_top_right_offset,   268
     63 .equ    pu1_src_bot_left_offset,    272
     64 .equ    pu1_avail_offset,           276
     65 .equ    pi1_sao_offset,             280
     66 .equ    wd_offset,                  284
     67 .equ    ht_offset,                  288
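@ Note on the offsets above: the prologue below pushes r4-r12,r14 (40 bytes)
@ and d8-d15 (64 bytes) and reserves a 160 byte scratch area, so the caller's
@ stack arguments sit 264 bytes above sp. pu1_src_top_left (the fifth
@ argument) is therefore read from sp + 264 and the remaining arguments
@ follow at 4 byte steps.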
     68 
     69 .text
     70 .syntax unified
     71 .p2align 2
     72 
     73 .extern gi1_table_edge_idx
     74 .globl ihevc_sao_edge_offset_class2_a9q
     75 
     76 gi1_table_edge_idx_addr_1:
     77 .long gi1_table_edge_idx - ulbl1 - 8
     78 
     79 gi1_table_edge_idx_addr_2:
     80 .long gi1_table_edge_idx - ulbl2 - 8
     81 
     82 gi1_table_edge_idx_addr_3:
     83 .long gi1_table_edge_idx - ulbl3 - 8
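@ Each constant above stores the distance from the matching ulblN label to
@ gi1_table_edge_idx, minus 8: in ARM state the pc value read by the
@ "add rX,rX,pc" at ulblN is that instruction's address plus 8, so the add
@ reconstructs the absolute address of gi1_table_edge_idx.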
     84 
     85 ihevc_sao_edge_offset_class2_a9q:
     86 
     87 
     88     STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
     89     vpush       {d8  -  d15}
     90     SUB         sp,sp,#160                  @Decrement the stack pointer to store some temp arr values
     91 
     92     LDR         r7,[sp,#wd_offset]          @Loads wd
     93     LDR         r8,[sp,#ht_offset]          @Loads ht
     94     SUB         r9,r7,#1                    @wd - 1
     95 
     96     LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
     97     LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
     98 
     99     STR         r0,[sp,#152]                @Store pu1_src in sp
    100     MOV         r9,r7                       @Move width to r9 for loop count
    101 
    102     STR         r2,[sp,#156]                @Store pu1_src_left in sp
    103     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    104     LDR         r6,[sp,#pi1_sao_offset]     @Loads pi1_sao_offset
    105     STR         r3,[sp,#148]                @Store pu1_src_top in sp
    106 
    107 
    108     STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    109     SUB         r10,r8,#1                   @ht-1
    110     MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    111     ADD         r12,sp,#2                   @temp array
    112 
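@ Copy the bottom row of the block into au1_src_top_tmp on the stack before
@ the block is filtered in place; SRC_TOP_LOOP at the end of the function
@ writes this saved row back out as the new pu1_src_top.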
    113 AU1_SRC_TOP_LOOP:
    114     VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    115     SUBS        r9,r9,#8                    @Decrement the loop count by 8
    116     VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    117     BNE         AU1_SRC_TOP_LOOP
    118 
    119 PU1_AVAIL_4_LOOP:
    120     LDRB        r10,[r5,#4]                 @pu1_avail[4]
    121     CMP         r10,#0
    122     LDRB        r9,[r0]                     @u1_pos_0_0_tmp = pu1_src[0]
    123     BEQ         PU1_AVAIL_7_LOOP
    124 
    125     LDRB        r11,[r4]                    @pu1_src_top_left[0]
    126     ADD         r14,r0,r1                   @pu1_src + src_strd
    127 
    128     SUBS        r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
    129     LDRB        r4,[r14,#1]                 @pu1_src[1 + src_strd]
    130 
    131     MVNLT       r12,#0
    132     MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])
    133 
    134     LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
    135 ulbl1:
    136     add         r14,r14,pc
    137     SUBS        r11,r9,r4                   @pu1_src[0] - pu1_src[1 + src_strd]
    138 
    139     MVNLT       r11,#0
    140     MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    141     ADD         r4,r12,r11                  @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    142     ADD         r4,r4,#2                    @edge_idx
    143 
    144     LDRSB       r12,[r14,r4]                @edge_idx = gi1_table_edge_idx[edge_idx]
    145     CMP         r12,#0                      @0 != edge_idx
    146     BEQ         PU1_AVAIL_7_LOOP
    147     LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    148     ADD         r9,r9,r10                   @pu1_src[0] + pi1_sao_offset[edge_idx]
    149     USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    150 
    151 PU1_AVAIL_7_LOOP:
    152     LDRB        r14,[r5,#7]                 @pu1_avail[7]
    153     CMP         r14,#0
    154     SUB         r10,r7,#1                   @wd - 1
    155     SUB         r11,r8,#1                   @ht - 1
    156     MLA         r12,r11,r1,r10              @wd - 1 + (ht - 1) * src_strd
    157     ADD         r12,r12,r0                  @pu1_src[wd - 1 + (ht - 1) * src_strd]
    158     LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
    159     BEQ         PU1_AVAIL
    160 
    161     SUB         r4,r12,r1                   @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    162     LDRB        r11,[r4,#-1]                @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    163     ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
    164 
    165     SUBS        r11,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd]
    166     LDRB        r4,[r14,#1]                 @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    167 
    168     MVNLT       r11,#0
    169     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 1 - src_strd])
    170 
    171     SUBS        r4,r10,r4                   @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    172     MVNLT       r4,#0
    173     MOVGT       r4,#1                       @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
    174 
    175     ADD         r11,r11,r4                  @Add 2 sign value
    176     ADD         r11,r11,#2                  @edge_idx
    177     LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
    178 ulbl2:
    179     add         r14,r14,pc
    180 
    181     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    182     CMP         r12,#0
    183     BEQ         PU1_AVAIL
    184     LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    185     ADD         r10,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    186     USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    187 
    188 PU1_AVAIL:
    189     MOV         r12,r8                      @Move ht
    190     VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    191     LDRB        r11,[r5,#3]                 @pu1_avail[3]
    192 
    193     MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    194     VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    195     CMP         r11,#0
    196 
    197     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    198     VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    199     SUBEQ       r12,r12,#1                  @ht_tmp--
    200 
    201     CMP         r5,#0
    202     VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    203     LDR         r11, gi1_table_edge_idx_addr_3 @table pointer
    204 ulbl3:
    205     add         r11,r11,pc
    206 
    207     ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    208     VLD1.8      D6,[r11]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    209     SUBEQ       r12,r12,#1                  @ht_tmp--
    210 
    211     MOV         r6,r7                       @move wd to r6 loop_count
    212     VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    213     ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1
    214 
    215     STR         r0,[sp,#144]                @Store pu1_src in sp
    216     CMP         r7,#16                      @Compare wd with 16
    217 
    218     BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE where the loop is unrolled for the 8 pixel case
    219     CMP         r8,#4                       @Compare ht with 4
    220     BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
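@ Three processing paths follow: WIDTH_LOOP_16 handles 16 pixel wide columns
@ when ht > 4, WD_16_HT_4_LOOP handles 16 pixel wide columns when ht <= 4,
@ and WIDTH_RESIDUE handles the remaining 8 pixel wide column.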
    221 
    222 WIDTH_LOOP_16:
    223     LDR         r7,[sp,#wd_offset]          @Loads wd
    224 
    225     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    226     CMP         r6,r7                       @col == wd
    227     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    228     MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    229 
    230     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    231     CMP         r6,#16                      @if(col == 16)
    232     BNE         SKIP_AU1_MASK_VAL
    233     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    234     VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    235 
    236 SKIP_AU1_MASK_VAL:
    237     LDRB        r11,[r5,#2]                 @pu1_avail[2]
    238     CMP         r11,#0
    239 
    240     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    241     MOVNE       r8,r3                       @pu1_src_top_cpy
    242     SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
    243 
    244     LDR         r7,[sp,#wd_offset]          @Loads wd
    245     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    246     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    247     SUB         r8,#8
    248     ADD         r3,r3,#16
    249 
    250     ADD         r5,sp,#66                   @*au1_src_left_tmp
    251     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    252     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    253     SUB         r0,#8
    254     LDR         r4,[sp,#ht_offset]          @Loads ht
    255 
    256     SUB         r7,r7,r6                    @(wd - col)
    257     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    258     LDR         r8,[sp,#152]                @Loads *pu1_src
    259 
    260     ADD         r7,r7,#15                   @15 + (wd - col)
    261     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    262     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    263 
    264     SUB         r5,r5,#1
    265     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    266 
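@ The loop below saves the right-most column of the current 16 pixel block
@ (pu1_src[15 + (wd - col)] down the rows) into au1_src_left_tmp on the
@ stack; SRC_LEFT_LOOP later copies it to pu1_src_left once the block has
@ been filtered.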
    267 AU1_SRC_LEFT_LOOP:
    268     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    269     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    270     SUBS        r4,r4,#1                    @decrement the loop count
    271     BNE         AU1_SRC_LEFT_LOOP
    272 
    273     ADD         r8,r0,r1                    @I Iteration *pu1_src + src_strd
    274     VMOV.I8     Q9,#0
    275     LDR         r4,[sp,#pu1_avail_offset]   @I Loads pu1_avail
    276 
    277     MOV         r7,r12                      @row count, move ht_tmp to r7
    278     VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    279     VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    280     SUB         r8,#8
    281     LDRB        r4,[r4,#2]                  @I pu1_avail[2]
    282 
    283     LDRB        r5,[r8,#16]                 @I pu1_src_cpy[src_strd + 16]
    284     VMOV.8      D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    285 
    286     VEXT.8      Q9,Q8,Q9,#1                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    287     CMP         r4,#0                       @I
    288     BNE         SIGN_UP_CHANGE_DONE         @I
    289 
    290 SIGN_UP_CHANGE:
    291     SUB         r2,r12,r7                   @I ht_tmp - row
    292     LDRB        r11,[r0]                    @I pu1_src_cpy[0]
    293     ADD         r2,r14,r2                   @I pu1_src_left_cpy[ht_tmp - row]
    294 
    295     LDRB        r5,[r2,#-1]                 @I load the value
    296     SUBS        r4,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    297     MVNLT       r4,#0                       @I
    298     MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    299     VMOV.8      D14[0],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    300 
    301 SIGN_UP_CHANGE_DONE:
    302     VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    303     VADD.I8     Q12,Q0,Q7                   @I edge_idx = vaddq_s8(const_2, sign_up)
    304 
    305     VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    306     VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    307 
    308     VADD.I8     Q12,Q12,Q5                  @I edge_idx = vaddq_s8(edge_idx, sign_down)
    309     VTBL.8      D18,{D6},D24                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    310     VTBL.8      D19,{D6},D25                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    311 
    312     VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
    313 
    314     VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
    315     VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    316     VEXT.8      Q7,Q7,Q7,#15                @I sign_up = vextq_s8(sign_up, sign_up, 15)
    317 
    318     VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    319     VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    320     VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    321 
    322     VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    323     VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    324 
    325     VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    326     VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
    327 
    328     VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    329     VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    330 
    331     VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    332     SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
    333 
    334     VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    335 
    336     VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
    337 
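@ The main loop below is software pipelined over two rows per iteration:
@ statements tagged II finish the current row while statements tagged III
@ already load and classify the next row, overlapping scalar and NEON work.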
    338 PU1_SRC_LOOP:
    339 
    340     VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    341     ADD         r8,r0,r1                    @II iteration *pu1_src + src_strd
    342 
    343     VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    344     VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    345     SUB         r8,#8
    346     ADD         r11,r8,r1                   @III iteration *pu1_src + src_strd
    347 
    348     LDRB        r5,[r8,#16]                 @II pu1_src_cpy[src_strd + 16]
    349     VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    350     VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    351     SUB         r11,#8
    352     LDRB        r4,[r0]                     @II pu1_src_cpy[0]
    353 
    354     LDRB        r8,[r11,#16]                @III pu1_src_cpy[src_strd + 16]
    355     VMOV.8      D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    356 
    357     SUB         r5,r12,r7                   @II ht_tmp - row
    358     VEXT.8      Q11,Q8,Q14,#1               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    359     ADD         r5,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
    360 
    361     LDRB        r5,[r5,#-1]                 @II load the value
    362     VMOV.8      D18[0],r8                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    363     SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    364 
    365     SUBS        r4,r4,r5                    @II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    366     VEXT.8      Q9,Q15,Q9,#1                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    367     LDRB        r2,[r0,r1]                  @III pu1_src_cpy[0]
    368 
    369     VCGT.U8     Q12,Q6,Q11                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    370     SUB         r5,r12,r7                   @III ht_tmp - row
    371 
    372     MVNLT       r4,#0                       @II
    373     VCLT.U8     Q11,Q6,Q11                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    374     ADD         r5,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
    375 
    376     MOVGT       r4,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    377     VSUB.U8     Q12,Q11,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    378     LDRB        r5,[r5,#-1]                 @III load the value
    379 
    380     SUBS        r2,r2,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    381     VMOV.8      D14[0],r4                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    382 
    383     MVNLT       r2,#0                       @III
    384     VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    385     MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    386 
    387     VADD.I8     Q11,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    388     VADD.I8     Q11,Q11,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    389 
    390     VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    391     VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    392     VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    393 
    394     VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    395     VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    396     VEXT.8      Q7,Q7,Q7,#15                @II sign_up = vextq_s8(sign_up, sign_up, 15)
    397 
    398     VAND        Q11,Q11,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    399     VMOV.8      D14[0],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    400 
    401     VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    402     VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    403     VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
    404 
    405     VMOVL.U8    Q13,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    406     VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    407     VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)
    408 
    409     VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    410     VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    411     VEXT.8      Q7,Q7,Q7,#15                @III sign_up = vextq_s8(sign_up, sign_up, 15)
    412 
    413     VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    414     VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    415 
    416     VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    417     VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    418     VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    419 
    420     VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    421     VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    422     VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    423 
    424     VMOVL.U8    Q14,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    425     VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    426 
    427     VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    428     VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    429     VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    430 
    431     VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    432     VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    433 
    434     VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    435     VMOVN.I16   D26,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    436 
    437     VMOVN.I16   D27,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    438     VADDW.S8    Q9,Q9,D11                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    439 
    440     VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    441     VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    442 
    443     SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    444     VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    445     CMP         r7,#1                       @III
    446 
    447     VST1.8      {Q13},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    448     VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    449 
    450     BGT         PU1_SRC_LOOP                @III If not equal jump to PU1_SRC_LOOP
    451     BLT         INNER_LOOP_DONE
    452 
    453     VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    454     ADD         r8,r0,r1                    @*pu1_src + src_strd
    455 
    456     LDRB        r2,[r0]                     @pu1_src_cpy[0]
    457     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    458     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    459     SUB         r8,#8
    460     LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    461 
    462     SUB         r11,r12,r7                  @ht_tmp - row
    463     VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    464     ADD         r11,r14,r11                 @pu1_src_left_cpy[ht_tmp - row]
    465 
    466     LDRB        r5,[r11,#-1]                @load the value
    467     VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    468     SUBS        r4,r2,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    469 
    470     VCGT.U8     Q5,Q6,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    471     MVNLT       r4,#0
    472 
    473     MOVGT       r4,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    474     VCLT.U8     Q9,Q6,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    475 
    476     VMOV.8      D14[0],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    477     VSUB.U8     Q5,Q9,Q5                    @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    478 
    479     VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    480     VADD.I8     Q9,Q9,Q5                    @edge_idx = vaddq_s8(edge_idx, sign_down)
    481 
    482     VTBL.8      D18,{D6},D18                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    483     VNEG.S8     Q7,Q5                       @sign_up = vnegq_s8(sign_down)
    484 
    485     VTBL.8      D19,{D6},D19                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    486     VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
    487 
    488     VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    489 
    490     VTBL.8      D10,{D7},D18                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    491 
    492     VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    493     VTBL.8      D11,{D7},D19                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    494     VADDW.S8    Q10,Q10,D10                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    495 
    496     VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    497     VMOVL.U8    Q6,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    498 
    499     VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    500     VADDW.S8    Q6,Q6,D11                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    501 
    502     VMAX.S16    Q6,Q6,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    503     VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    504 
    505     VMIN.U16    Q6,Q6,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    506     VMOVN.I16   D21,Q6                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    507 
    508 
    509 INNER_LOOP_DONE:
    510     ADD         r5,sp,#66                   @*au1_src_left_tmp
    511     VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    512     LDR         r2,[sp,#156]                @Loads *pu1_src_left
    513 
    514     LDR         r8,[sp,#ht_offset]          @Loads ht
    515     SUB         r5,r5,#1
    516 
    517     SUB         r2,r2,#1
    518 SRC_LEFT_LOOP:
    519     LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    520     SUBS        r8,r8,#1
    521     STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    522     BNE         SRC_LEFT_LOOP
    523 
    524     SUB         r6,r6,#16                   @Decrement the wd loop count by 16
    525     CMP         r6,#8                       @Check whether residue remains
    526     BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    527     LDR         r7,[sp,#wd_offset]          @Loads wd
    528     LDR         r0,[sp,#144]                @Loads *pu1_src
    529     SUB         r7,r7,r6
    530     ADD         r0,r0,r7
    531     BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    532     BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
    533 
    534 
    535 WD_16_HT_4_LOOP:
    536     LDR         r7,[sp,#wd_offset]          @Loads wd
    537     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    538     CMP         r6,r7                       @col == wd
    539     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    540     MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    541 
    542     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    543     CMP         r6,#16                      @if(col == 16)
    544     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    545     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    546     VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    547 
    548 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    549     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    550     CMP         r8,#0
    551 
    552     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    553     MOVNE       r8,r3
    554     SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1
    555 
    556     LDR         r7,[sp,#wd_offset]          @Loads wd
    557     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    558     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    559     SUB         r8,#8
    560     ADD         r3,r3,#16
    561 
    562     ADD         r5,sp,#66                   @*au1_src_left_tmp
    563     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    564     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    565     SUB         r0,#8
    566     LDR         r4,[sp,#ht_offset]          @Loads ht
    567 
    568     SUB         r7,r7,r6                    @(wd - col)
    569     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    570     LDR         r8,[sp,#152]                @Loads *pu1_src
    571 
    572     ADD         r7,r7,#15                   @15 + (wd - col)
    573     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    574     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    575 
    576     SUB         r5,r5,#1
    577     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    578 
    579 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    580     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    581     SUBS        r4,r4,#1                    @decrement the loop count
    582     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    583     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    584 
    585     VMOV.I8     Q9,#0
    586     MOV         r7,r12                      @row count, move ht_tmp to r7
    587 
    588 PU1_SRC_LOOP_WD_16_HT_4:
    589     ADD         r8,r0,r1                    @*pu1_src + src_strd
    590     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    591     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    592     SUB         r8,#8
    593 
    594     LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    595     VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    596     VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    597 
    598     CMP         r7,r12
    599     BLT         SIGN_UP_CHANGE_WD_16_HT_4
    600     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    601     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    602     CMP         r5,#0
    603     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    604 
    605 SIGN_UP_CHANGE_WD_16_HT_4:
    606     LDRB        r8,[r0]                     @pu1_src_cpy[0]
    607     SUB         r5,r12,r7                   @ht_tmp - row
    608     ADD         r5,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    609     LDRB        r5,[r5,#-1]                 @load the value
    610     SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    611     MVNLT       r8,#0
    612     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    613     VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    614 
    615 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    616     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    617     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    618     VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    619 
    620     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    621     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    622     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    623     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    624 
    625     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    626 
    627     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    628     VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
    629 
    630     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    631     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    632     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    633     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    634     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    635 
    636     VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    637     VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    638     VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    639     VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    640     VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    641 
    642     VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    643     VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    644 
    645     VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    646 
    647     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    648     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    649     BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    650 
    651     LDR         r8,[sp,#ht_offset]          @Loads ht
    652     ADD         r5,sp,#66                   @*au1_src_left_tmp
    653     LDR         r2,[sp,#156]                @Loads *pu1_src_left
    654     SUB         r5,r5,#1
    655     SUB         r2,r2,#1
    656 
    657 SRC_LEFT_LOOP_WD_16_HT_4:
    658     LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    659     STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    660     SUBS        r8,r8,#1
    661     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    662 
    663     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    664     BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    665 
    666 
    667 WIDTH_RESIDUE:
    668     LDR         r7,[sp,#wd_offset]          @Loads wd
    669     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    670     CMP         r6,r7                       @wd_residue == wd
    671     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    672 
    673     MOVNE       r8,#-1
    674     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    675 
    676     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    677     VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    678 
    679 PU1_AVAIL_2_RESIDUE:
    680     LDRB        r11,[r5,#2]                 @pu1_avail[2]
    681     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    682     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    683     SUB         r0,#8
    684     CMP         r11,#0
    685 
    686     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    687     MOVNE       r8,r3
    688 
    689     SUB         r8,r8,#1
    690 
    691     ADD         r5,sp,#66                   @*au1_src_left_tmp
    692     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    693     VLD1.8      D11,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    694     LDR         r7,[sp,#wd_offset]          @Loads wd
    695 
    696     LDR         r4,[sp,#ht_offset]          @Loads ht
    697     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    698     SUB         r7,r7,#1                    @(wd - 1)
    699 
    700     LDR         r8,[sp,#152]                @Loads *pu1_src
    701     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    702     SUB         r5,r5,#1
    703 
    704     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    705     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    706 
    707 
    708 AU1_SRC_LEFT_LOOP_RESIDUE:
    709     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    710     SUBS        r4,r4,#1                    @decrement the loop count
    711     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    712     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    713 
    714 
    715     MOV         r7,r12                      @row count, move ht_tmp to r7
    716 
    717 PU1_SRC_LOOP_RESIDUE:
    718     VMOV.I8     Q9,#0
    719     ADD         r8,r0,r1                    @*pu1_src + src_strd
    720     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    721     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    722     SUB         r8,#8
    723 
    724     LDRB        r8,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    725     VMOV.8      d18[0],r8                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    726     VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    727 
    728     CMP         r7,r12
    729     BLT         SIGN_UP_CHANGE_RESIDUE
    730     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    731     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    732     CMP         r5,#0
    733     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
    734 
    735 SIGN_UP_CHANGE_RESIDUE:
    736     LDRB        r8,[r0]                     @pu1_src_cpy[0]
    737     SUB         r5,r12,r7                   @ht_tmp - row
    738 
    739     ADD         r5,r14,r5
    740     LDRB        r5,[r5,#-1]                 @load the value
    741     SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    742     MVNLT       r8,#0
    743     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    744     VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    745 
    746 SIGN_UP_CHANGE_DONE_RESIDUE:
    747     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    748     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    749     VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    750 
    751     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    752     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    753     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    754     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    755 
    756     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    757 
    758     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    759     VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)
    760 
    761     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    762     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    763     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    764     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    765     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    766 
    767     VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    768 
    769     VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    770     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    771     SUBS        r7,r7,#1
    772     BNE         PU1_SRC_LOOP_RESIDUE
    773 
    774     LDR         r8,[sp,#ht_offset]          @Loads ht
    775     ADD         r5,sp,#66                   @*au1_src_left_tmp
    776 
    777     LDR         r2,[sp,#156]                @Loads *pu1_src_left
    778     SUB         r5,r5,#1
    779 
    780     SUB         r2,r2,#1
    781 
    782 SRC_LEFT_LOOP_RESIDUE:
    783     LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    784     SUBS        r8,r8,#1
    785     STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    786     BNE         SRC_LEFT_LOOP_RESIDUE
    787 
    788 
    789 RE_ASSINING_LOOP:
    790     LDR         r8,[sp,#ht_offset]          @Loads ht
    791     LDR         r7,[sp,#wd_offset]          @Loads wd
    792 
    793     LDR         r0,[sp,#152]                @Loads *pu1_src
    794     SUB         r8,r8,#1                    @ht - 1
    795 
    796     MLA         r6,r8,r1,r7                 @(ht - 1) * src_strd + wd
    797     STRB        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp
    798 
    799     LDR         r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
    800     ADD         r6,r0,r6                    @pu1_src[(ht - 1) * src_strd + wd]
    801 
    802     ADD         r12,sp,#2
    803     STRB        r10,[r6,#-1]                @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    804 
    805     LDRB        r11,[sp]                    @load u1_src_top_left_tmp from stack pointer
    806     LDR         r3,[sp,#148]                @Loads pu1_src_top
    807 
    808     STRB        r11,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
    809 
    810 SRC_TOP_LOOP:
    811     VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
    812     SUBS        r7,r7,#8                    @Decrement the width
    813     VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    814     BNE         SRC_TOP_LOOP
    815 
    816 END_LOOPS:
    817     ADD         sp,sp,#160
    818     vpop        {d8  -  d15}
    819     LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
    820 
    821 
    822 
    823