      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
      20 @* @file
      21 @*  ihevc_sao_edge_offset_class3.s
      22 @*
      23 @* @brief
      24 @*  Contains the function definition for the HEVC SAO edge offset class 3
      25 @*  filter (diagonal neighbours: top-right and bottom-left). The function
      26 @*  is coded in ARM NEON assembly and can be compiled using ARM RVCT.
      27 @*
      28 @* @author
      29 @*  Parthiban V
      30 @*
      31 @* @par List of Functions:
      32 @*  - ihevc_sao_edge_offset_class3_a9q()
      33 @*
      34 @* @remarks
      35 @*  None
     36 @*
     37 @*******************************************************************************
     38 @*/
     39 @void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
     40 @                              WORD32 src_strd,
     41 @                              UWORD8 *pu1_src_left,
     42 @                              UWORD8 *pu1_src_top,
     43 @                              UWORD8 *pu1_src_top_left,
     44 @                              UWORD8 *pu1_src_top_right,
     45 @                              UWORD8 *pu1_src_bot_left,
     46 @                              UWORD8 *pu1_avail,
     47 @                              WORD8 *pi1_sao_offset,
     48 @                              WORD32 wd,
     49 @                              WORD32 ht)
     50 @**************Variables Vs Registers*****************************************
     51 @r0 =>  *pu1_src
     52 @r1 =>  src_strd
     53 @r2 =>  *pu1_src_left
     54 @r3 =>  *pu1_src_top
     55 @r4 =>  *pu1_src_top_left
     56 @r5 =>  *pu1_avail
     57 @r6 =>  *pi1_sao_offset
     58 @r7 =>  wd
      59 @r8 =>  ht
     60 
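@ For reference, a scalar sketch of the class 3 edge offset applied to one
@ interior pixel (hedged pseudo-C; SIGN and CLIP3 follow the comments in the
@ code below and are not symbols defined in this file):
@
@   sign_up   = SIGN(pu1_src[x] - pu1_src[x + 1 - src_strd]);  // top-right
@   sign_down = SIGN(pu1_src[x] - pu1_src[x - 1 + src_strd]);  // bottom-left
@   edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
@   if (0 != edge_idx)
@       pu1_src[x] = CLIP3(pu1_src[x] + pi1_sao_offset[edge_idx], 0, 255);
@
@ The NEON code below vectorises this over 16 pixels per row, with special
@ handling of the block borders controlled by pu1_avail[].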
     61 .equ    pu1_src_top_left_offset,    264
     62 .equ    pu1_src_top_right_offset,   268
     63 .equ    pu1_src_bot_left_offset,    272
     64 .equ    pu1_avail_offset,           276
     65 .equ    pi1_sao_offset,             280
     66 .equ    wd_offset,                  284
     67 .equ    ht_offset,                  288
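@ The offsets above locate the arguments passed on the stack: the prologue
@ below pushes r4-r12,r14 (10 words = 40 bytes) and d8-d15 (64 bytes) and
@ then reserves 160 bytes of scratch, so the first stacked argument
@ (pu1_src_top_left) is found at sp + 40 + 64 + 160 = 264.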
     68 
     69 .text
     70 .syntax unified
     71 .p2align 2
     72 
     73 .extern gi1_table_edge_idx
     74 .globl ihevc_sao_edge_offset_class3_a9q
     75 
     76 gi1_table_edge_idx_addr_1:
     77 .long gi1_table_edge_idx - ulbl1 - 8
     78 
     79 gi1_table_edge_idx_addr_2:
     80 .long gi1_table_edge_idx - ulbl2 - 8
     81 
     82 gi1_table_edge_idx_addr_3:
     83 .long gi1_table_edge_idx - ulbl3 - 8
     84 
     85 ihevc_sao_edge_offset_class3_a9q:
     86 
     87 
     88     STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
     89     vpush       {d8  -  d15}
     90     SUB         sp,sp,#160                  @Decrement the stack pointer to store some temp arr values
     91     LDR         r7,[sp,#wd_offset]          @Loads wd
     92 
     93     LDR         r8,[sp,#ht_offset]          @Loads ht
     94     SUB         r9,r7,#1                    @wd - 1
     95 
     96     LDR         r4,[sp,#pu1_src_top_left_offset]               @Loads pu1_src_top_left
     97     LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
     98 
     99     MOV         r9,r7                       @Move width to r9 for loop count
    100 
    101     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    102     LDR         r6,[sp,#pi1_sao_offset]     @Loads pi1_sao_offset
    103     STR         r3,[sp,#156]                @Store pu1_src_top in sp
    104 
    105 
    106     STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    107     SUB         r10,r8,#1                   @ht-1
    108     MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    109     ADD         r12,sp,#2                   @temp array
    110 
    111 AU1_SRC_TOP_LOOP:
    112     VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    113     SUBS        r9,r9,#8                    @Decrement the loop count by 8
    114     VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    115     BNE         AU1_SRC_TOP_LOOP
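@ The loop above saves the last row of the block before it is modified,
@ roughly (au1_src_top_tmp is the scratch array at sp + 2, a name taken from
@ the comments rather than a symbol in this file):
@
@   for (col = 0; col < wd; col += 8)
@       memcpy(au1_src_top_tmp + col, pu1_src + (ht - 1) * src_strd + col, 8);
@
@ SRC_TOP_LOOP copies it back into pu1_src_top[] once filtering is complete.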
    116 
    117 PU1_AVAIL_5_LOOP:
    118     LDRB        r9,[r5,#5]                  @pu1_avail[5]
    119     CMP         r9,#0
    120     SUB         r10,r7,#1                   @[wd - 1]
     121     LDRB        r9,[r0,r10]                 @u1_pos_wd_0_tmp = pu1_src[wd - 1]
    122     BEQ         PU1_AVAIL_6_LOOP
    123 
    124     LDR         r11,[sp,#pu1_src_top_right_offset]  @Load pu1_src_top_right from sp
    125     SUB         r10,r10,#1                  @[wd - 1 - 1]
    126 
    127     LDRB        r11,[r11]                   @pu1_src_top_right[0]
    128     SUB         r12,r9,r11                  @pu1_src[wd - 1] - pu1_src_top_right[0]
    129 
    130     ADD         r11,r0,r1                   @pu1_src + src_strd
    131 
    132     LDRB        r14,[r11,r10]               @pu1_src[wd - 1 - 1 + src_strd]
    133     CMP         r12,#0
    134     MVNLT       r12,#0
    135     SUB         r11,r9,r14                  @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]
    136 
    137     MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
    138     CMP         r11,#0
    139     MVNLT       r11,#0
    140     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    141     LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
    142 ulbl1:
    143     add         r14,r14,pc
    144     ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    145     ADD         r11,r11,#2                  @edge_idx
    146 
    147     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    148     CMP         r12,#0                      @0 != edge_idx
    149     BEQ         PU1_AVAIL_6_LOOP
    150     LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
     151     ADD         r9,r9,r10                   @pu1_src[wd - 1] + pi1_sao_offset[edge_idx]
     152     USAT        r9,#8,r9                    @u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
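@ Summary of the block above as a hedged C sketch: when pu1_avail[5] is set,
@ the top-right pixel pu1_src[wd - 1] is filtered on its own:
@
@   edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
@                + SIGN(pu1_src[wd - 1] - pu1_src[wd - 2 + src_strd]);
@   edge_idx = gi1_table_edge_idx[edge_idx];
@   if (0 != edge_idx)
@       u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx],
@                               0, (1 << bit_depth) - 1);
@
@ The result is written back only in RE_ASSINING_LOOP, after the main pass,
@ so the pass itself still reads the unmodified sample.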
    153 
    154 PU1_AVAIL_6_LOOP:
    155     LDRB        r10,[r5,#6]                 @pu1_avail[6]
    156     SUB         r11,r8,#1                   @ht - 1
    157 
    158     CMP         r10,#0
    159     STR         r0,[sp,#148]                @Store pu1_src in sp
    160     MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
    161 
    162     LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
    163     BEQ         PU1_AVAIL_3_LOOP
    164 
    165     LDR         r14,[sp,#pu1_src_bot_left_offset]   @Load pu1_src_bot_left from sp
    166     SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd) - src_strd]
    167 
    168     LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
    169     ADD         r11,r11,#1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    170 
    171     LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    172     SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    173 
    174     SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    175     CMP         r11,#0
    176     MVNLT       r11,#0
    177     MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])
    178 
    179     CMP         r14,#0
    180     MVNLT       r14,#0
    181     MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
    182 
    183     ADD         r11,r11,r14                 @Add 2 sign value
    184 
    185     LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
    186 ulbl2:
    187     add         r14,r14,pc
    188     ADD         r11,r11,#2                  @edge_idx
    189 
    190     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    191     CMP         r12,#0
    192     BEQ         PU1_AVAIL_3_LOOP
    193     LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    194     ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    195     USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
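@ The block above mirrors the top-right handling for the bottom-left pixel
@ pu1_src[(ht - 1) * src_strd] when pu1_avail[6] is set, using
@ pu1_src_bot_left[0] and pu1_src[(ht - 1) * src_strd + 1 - src_strd] as the
@ diagonal neighbours; the clipped result u1_pos_wd_ht_tmp is likewise
@ written back only in RE_ASSINING_LOOP.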
    196 
    197 PU1_AVAIL_3_LOOP:
    198     STR         r2,[sp,#152]                @Store pu1_src_left in sp
    199     MOV         r12,r8                      @Move ht
    200 
    201     MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    202     VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    203     LDRB        r11,[r5,#3]                 @pu1_avail[3]
    204 
    205     CMP         r11,#0
    206     VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    207     SUBEQ       r12,r12,#1                  @ht_tmp--
    208 
    209     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    210     VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    211     CMP         r5,#0
    212 
    213     ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    214     VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    215     SUBEQ       r12,r12,#1                  @ht_tmp--
    216 
    217     LDR         r6, gi1_table_edge_idx_addr_3 @table pointer
    218 ulbl3:
    219     add         r6,r6,pc
    220     VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    221     ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1
    222 
    223     STR         r0,[sp,#144]                @Store pu1_src in sp
    224     VLD1.8      D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    225     MOV         r6,r7                       @move wd to r6 loop_count
    226 
    227     CMP         r7,#16                      @Compare wd with 16
     228     BLT         WIDTH_RESIDUE               @If wd < 16 jump to WIDTH_RESIDUE, where the loop handles the 8-pixel case
     229     CMP         r8,#4                       @Compare ht with 4
     230     BLE         WD_16_HT_4_LOOP             @If ht <= 4 jump to WD_16_HT_4_LOOP
    231 
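@ Three paths follow: WIDTH_LOOP_16 filters full 16-pixel columns when
@ ht > 4, WD_16_HT_4_LOOP filters 16-pixel columns of short blocks
@ (ht <= 4), and WIDTH_RESIDUE handles the trailing 8 columns or blocks
@ narrower than 16.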
    232 WIDTH_LOOP_16:
    233     LDR         r7,[sp,#wd_offset]          @Loads wd
    234 
    235     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    236     CMP         r6,r7                       @col == wd
    237     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    238     MOVNE       r8,#-1
    239     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    240 
    241     CMP         r6,#16                      @if(col == 16)
    242     BNE         SKIP_AU1_MASK_VAL
    243     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    244     VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    245 
    246 SKIP_AU1_MASK_VAL:
    247     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    248     CMP         r8,#0
    249 
    250     LDR         r4,[sp,#ht_offset]          @Loads ht
    251     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    252 
    253     MOVNE       r8,r3
    254     ADD         r5,sp,#66                   @*au1_src_left_tmp
    255 
    256     LDR         r7,[sp,#wd_offset]          @Loads wd
    257     ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    258 
    259     SUB         r7,r7,r6                    @(wd - col)
    260     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    261     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    262     SUB         r8,#8
    263     ADD         r3,r3,#16
    264 
    265     LDR         r8,[sp,#148]                @Loads *pu1_src
    266     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    267     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    268     SUB         r0,#8
    269     ADD         r7,r7,#15                   @15 + (wd - col)
    270 
    271     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    272     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    273     SUB         r5,r5,#1
    274 
    275 AU1_SRC_LEFT_LOOP:
    276     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    277     SUBS        r4,r4,#1                    @decrement the loop count
    278     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    279     BNE         AU1_SRC_LEFT_LOOP
    280 
    281     VMOV.I8     Q9,#0
    282     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    283 
    284     ADD         r8,r0,r1                    @I *pu1_src + src_strd
    285     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    286     MOV         r7,r12                      @row count, move ht_tmp to r7
    287 
    288     SUB         r5,r12,r7                   @I ht_tmp - row
    289     VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    290     VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    291     SUB         r8,#8
    292     ADD         r8,r14,r5                   @I pu1_src_left_cpy[ht_tmp - row]
    293 
    294     ADD         r8,r8,#1                    @I pu1_src_left_cpy[ht_tmp - row + 1]
    295     LDRB        r8,[r8]
    296 
    297     LDR         r5,[sp,#pu1_avail_offset]   @I Loads pu1_avail
    298     VMOV.8      D19[7],r8                   @I vsetq_lane_u8
    299     LDRB        r5,[r5,#2]                  @I pu1_avail[2]
    300 
    301     VEXT.8      Q9,Q9,Q8,#15                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    302     CMP         r5,#0                       @I
    303     BNE         SIGN_UP_CHANGE_DONE         @I
    304 
    305 SIGN_UP_CHANGE:
    306     LDRB        r8,[r0,#15]                 @I pu1_src_cpy[15]
    307     SUB         r5,r0,r1                    @I pu1_src_cpy[16 - src_strd]
    308 
    309     LDRB        r5,[r5,#16]                 @I load the value
    310     SUB         r8,r8,r5                    @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    311     CMP         r8,#0                       @I
    312     MVNLT       r8,#0                       @I
    313     MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    314     VMOV.8      D15[7],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
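@ When the row above is not available, lane 15 of sign_up is refreshed in
@ scalar code: SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]) is computed
@ with ordinary loads and inserted into D15[7]. The same lane-15 patch is
@ repeated for every row in the pipelined loop below, where sign_up is
@ derived by rotating sign_down and its top lane would otherwise be stale.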
    315 
    316 SIGN_UP_CHANGE_DONE:
    317     VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    318     VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    319     VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    320 
    321     VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    322     VADD.I8     Q9,Q9,Q5                    @I edge_idx = vaddq_s8(edge_idx, sign_down)
    323     VTBL.8      D18,{D6},D18                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    324     VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
    325 
    326     VEXT.8      Q7,Q7,Q7,#1                 @I sign_up = vextq_s8(sign_up, sign_up, 1)
    327     VTBL.8      D19,{D6},D19                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    328 
    329     VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    330     VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
    331 
    332     VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    333 
    334     VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    335     VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    336 
    337     VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    338     VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    339     VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    340 
    341     VMOV        Q6,Q8
    342     VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    343 
    344     VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    345     VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    346 
    347     SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
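@ The loop below is software-pipelined across three rows: results computed
@ for one row are narrowed and stored an iteration later, while the next two
@ rows are being loaded and filtered. The @I/@II/@III tags on the comments
@ indicate which logical row each instruction belongs to.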
    348 
    349 PU1_SRC_LOOP:
    350     ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
    351     VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    352     SUB         r5,r12,r7                   @II ht_tmp - row
    353 
    354     ADD         r4,r0,r1                    @II pu1_src_cpy[16 - src_strd]
    355     VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
    356     ADD         r2,r8,r1                    @III *pu1_src + src_strd
    357 
    358     LDRB        r11,[r4,#15]                @II pu1_src_cpy[15]
    359     VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    360     VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    361     SUB         r8,#8
    362     SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    363 
    364     ADD         r8,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
    365     VLD1.8      D30,[r2]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    366     VLD1.8      D31,[r2]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    367     SUB         r2,#8
    368     LDRB        r8,[r8,#1]
    369 
    370     LDRB        r4,[r0,#16]                 @II load the value
    371     VMOV.8      D19[7],r8                   @II vsetq_lane_u8
    372     SUB         r11,r11,r4                  @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    373 
    374     CMP         r11,#0                      @II
    375     VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    376     SUB         r5,r12,r7                   @III ht_tmp - row
    377 
    378     MVNLT       r11,#0                      @II
    379     VEXT.8      Q9,Q9,Q8,#15                @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    380     MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    381 
    382     ADD         r8,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
    383     VMOV.8      D15[7],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    384     CMP         r7,#1                       @III
    385 
    386     BNE         NEXT_ROW_ELSE_2             @III
    387     LDR         r5,[sp,#pu1_avail_offset]   @III Loads pu1_avail
    388     LDRB        r5,[r5,#3]                  @III pu1_avail[3]
    389     CMP         r5,#0                       @III
    390     SUBNE       r8,r2,#2                    @III pu1_src_cpy[src_strd - 1]
    391 
    392 NEXT_ROW_ELSE_2:
    393     LDRB        r8,[r8,#1]                  @III
    394     VCGT.U8     Q12,Q6,Q9                   @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    395     ADD         r5,r0,r1
    396 
    397     LDRB        r2,[r5,#15]                 @III pu1_src_cpy[15]
    398     VCLT.U8     Q13,Q6,Q9                   @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    399     LDRB        r5,[r0,#16]                 @III load the value
    400 
    401     SUB         r2,r2,r5                    @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    402     VSUB.U8     Q12,Q13,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    403     CMP         r2,#0                       @III
    404 
    405     MVNLT       r2,#0                       @III
    406     VMOV.8      D19[7],r8                   @III vsetq_lane_u8
    407     MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    408 
    409     SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    410     VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    411 
    412     VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    413     VEXT.8      Q9,Q9,Q15,#15               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    414 
    415     VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    416 
    417     VEXT.8      Q7,Q7,Q7,#1                 @II sign_up = vextq_s8(sign_up, sign_up, 1)
    418     VTBL.8      D26,{D6},D26                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    419     VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    420 
    421     VMOV.8      D15[7],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    422     VTBL.8      D27,{D6},D27                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    423     VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    424 
    425     VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    426     VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    427 
    428     VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    429     VTBL.8      D24,{D7},D26                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    430     VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    431 
    432     VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
    433     VTBL.8      D25,{D7},D27                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    434     VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)
    435 
    436     VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    437     VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    438     VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    439 
    440     VEXT.8      Q7,Q7,Q7,#1                 @III sign_up = vextq_s8(sign_up, sign_up, 1)
    441     VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    442     VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    443 
    444     VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    445     VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    446 
    447     VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    448     VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    449     VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    450 
    451     VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    452     VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    453 
    454     VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    455     VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    456     VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    457 
    458     VMOVL.U8    Q11,D17                     @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    459     VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    460 
    461     VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    462     VADDW.S8    Q11,Q11,D11                 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    463 
    464     VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    465     VMAX.S16    Q11,Q11,Q1                  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    466 
    467     VMOV        Q6,Q15                      @II pu1_cur_row = pu1_next_row
    468     VMIN.U16    Q11,Q11,Q2                  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    469 
    470     CMP         r7,#1                       @III
    471     VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
     472     BGT         PU1_SRC_LOOP                @If more than one row remains, continue the pipelined loop
    473     BLT         INNER_LOOP_DONE
    474 
    475     ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
    476     VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    477     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    478 
    479     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    480     VMOVN.I16   D21,Q11                     @III vmovn_s16(pi2_tmp_cur_row.val[1])
    481     CMP         r5,#0
    482 
    483     ADD         r4,r0,r1                    @pu1_src_cpy[16 - src_strd]
    484     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    485     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    486     SUB         r8,#8
    487     LDRB        r5,[r0,#16]                 @load the value
    488 
    489     BEQ         NEXT_ROW_ELSE_3
    490     LDRB        r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    491     B           NEXT_ROW_POINTER_ASSIGNED_3
    492 NEXT_ROW_ELSE_3:
    493     SUB         r11,r12,r7                  @ht_tmp - row
    494     ADD         r8,r14,r11                  @pu1_src_left_cpy[ht_tmp - row]
    495     ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    496     LDRB        r8,[r8]
    497 
    498 NEXT_ROW_POINTER_ASSIGNED_3:
    499     LDRB        r11,[r4,#15]                @pu1_src_cpy[15]
    500     VMOV.8      D19[7],r8                   @vsetq_lane_u8
    501     SUB         r8,r11,r5                   @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    502 
    503     CMP         r8,#0
    504     VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    505     MVNLT       r8,#0
    506 
    507     VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    508     VCGT.U8     Q12,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    509 
    510     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    511     VCLT.U8     Q13,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    512 
    513     VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    514     VSUB.U8     Q12,Q13,Q12                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    515 
    516     VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    517     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    518 
    519     VMOVL.U8    Q11,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    520     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    521 
    522     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    523     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    524 
    525     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    526 
    527     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    528 
    529     VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    530     VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    531     VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    532 
    533     VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    534 
    535     VADDW.S8    Q11,Q11,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    536     VMAX.S16    Q11,Q11,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    537     VMIN.U16    Q11,Q11,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    538 
    539 INNER_LOOP_DONE:
    540     VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    541     LDR         r8,[sp,#ht_offset]          @Loads ht
    542 
    543     VMOVN.I16   D21,Q11                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    544     ADD         r5,sp,#66                   @*au1_src_left_tmp
    545 
    546     VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    547     LDR         r2,[sp,#152]                @Loads *pu1_src_left
    548 SRC_LEFT_LOOP:
    549     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    550     SUBS        r8,r8,#4
    551     STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    552     BNE         SRC_LEFT_LOOP
    553 
    554     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    555     CMP         r6,#8                       @Check whether residue remains
    556     BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    557     LDR         r7,[sp,#wd_offset]          @Loads wd
    558     LDR         r0,[sp,#144]                @Loads *pu1_src
    559     SUB         r7,r7,r6
    560     ADD         r0,r0,r7
     561     BGT         WIDTH_LOOP_16               @If 16 or more columns remain jump to WIDTH_LOOP_16
    562     BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
    563 
    564 
    565 
    566 WD_16_HT_4_LOOP:
    567     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    568     LDR         r7,[sp,#wd_offset]          @Loads wd
    569     CMP         r6,r7                       @col == wd
    570     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    571     MOVNE       r8,#-1
    572     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    573 
    574     CMP         r6,#16                      @if(col == 16)
    575     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    576     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    577     VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    578 
    579 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    580     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    581     CMP         r8,#0
    582 
    583     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    584     MOVNE       r8,r3
    585     ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    586     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    587     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    588     SUB         r8,#8
    589 
    590     ADD         r3,r3,#16
    591     ADD         r5,sp,#66                   @*au1_src_left_tmp
    592     LDR         r4,[sp,#ht_offset]          @Loads ht
    593     LDR         r7,[sp,#wd_offset]          @Loads wd
    594     SUB         r7,r7,r6                    @(wd - col)
    595     ADD         r7,r7,#15                   @15 + (wd - col)
    596     LDR         r8,[sp,#148]                @Loads *pu1_src
    597     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    598     SUB         r5,r5,#1
    599 
    600 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    601     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    602     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    603     SUBS        r4,r4,#1                    @decrement the loop count
    604     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    605 
    606     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    607     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    608     SUB         r0,#8
    609 
    610     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    611     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    612     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    613     VMOV.I8     Q9,#0
    614     MOV         r7,r12                      @row count, move ht_tmp to r7
    615 
    616 PU1_SRC_LOOP_WD_16_HT_4:
    617     ADD         r8,r0,r1                    @*pu1_src + src_strd
    618     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    619     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    620     SUB         r8,#8
    621     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    622     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    623     CMP         r5,#0
    624     BEQ         NEXT_ROW_ELSE_WD_16_HT_4
    625     CMP         r7,#1
    626     LDRBEQ      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    627     BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    628 NEXT_ROW_ELSE_WD_16_HT_4:
    629     SUB         r5,r12,r7                   @ht_tmp - row
    630     ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    631     ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    632     LDRB        r8,[r8]
    633 
    634 NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    635     VMOV.8      D19[7],r8                   @vsetq_lane_u8
    636     VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    637 
    638     CMP         r7,r12
    639     BNE         SIGN_UP_CHANGE_WD_16_HT_4
    640     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    641     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    642     CMP         r5,#0
    643     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    644 
    645 SIGN_UP_CHANGE_WD_16_HT_4:
    646     LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
    647     ADD         r5,r0,#16                   @pu1_src_cpy[16]
    648     SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
    649     LDRB        r5,[r5]                     @load the value
    650     SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    651     CMP         r8,#0
    652     MVNLT       r8,#0
    653     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    654     VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    655 
    656 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    657     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    658     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    659     VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    660 
    661     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    662     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    663     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    664     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    665 
    666     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    667 
    668     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    669     VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)
    670 
    671     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    672     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    673     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    674     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    675     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    676 
    677     VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    678     VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    679     VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    680     VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    681     VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    682 
    683     VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    684     VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    685 
    686     VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    687 
    688     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    689     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    690     BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    691 
    692     LDR         r8,[sp,#ht_offset]          @Loads ht
    693     ADD         r5,sp,#66                   @*au1_src_left_tmp
    694     LDR         r2,[sp,#152]                @Loads *pu1_src_left
    695 SRC_LEFT_LOOP_WD_16_HT_4:
    696     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    697     STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    698     SUBS        r8,r8,#4
    699     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    700 
    701     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    702     BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    703     LDR         r7,[sp,#wd_offset]          @Loads wd
    704     LDR         r0,[sp,#144]                @Loads *pu1_src
    705     SUB         r7,r7,r6
    706     ADD         r0,r0,r7
     707     BGT         WD_16_HT_4_LOOP             @If columns remain jump to WD_16_HT_4_LOOP
    708 
    709 
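@ WIDTH_RESIDUE filters the last 8 columns (or the whole block when wd < 16).
@ The per-row arithmetic still uses 16-byte vectors, but only the low 8
@ result bytes are stored back (VST1.8 {D30} below).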
    710 WIDTH_RESIDUE:
    711     LDR         r7,[sp,#wd_offset]          @Loads wd
    712     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    713     CMP         r6,r7                       @wd_residue == wd
    714     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    715 
    716     MOVNE       r8,#-1
    717     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    718 
    719     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    720     VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    721 
    722 PU1_AVAIL_2_RESIDUE:
    723     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    724     CMP         r8,#0
    725 
    726     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    727     MOVNE       r8,r3
    728     ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    729     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    730     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    731     SUB         r8,#8
    732 
    733 
    734     ADD         r5,sp,#66                   @*au1_src_left_tmp
    735     LDR         r4,[sp,#ht_offset]          @Loads ht
    736     LDR         r7,[sp,#wd_offset]          @Loads wd
    737     LDR         r8,[sp,#148]                @Loads *pu1_src
    738     SUB         r7,r7,#1                    @(wd - 1)
    739     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    740     SUB         r5,r5,#1
    741 
    742 AU1_SRC_LEFT_LOOP_RESIDUE:
    743     LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    744     STRB        r8,[r5,#1]!                 @store it in the stack pointer
    745     SUBS        r4,r4,#1                    @decrement the loop count
    746     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    747 
    748     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    749     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    750     SUB         r0,#8
    751 
    752     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    753     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    754     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    755     MOV         r7,r12                      @row count, move ht_tmp to r7
    756 
    757 PU1_SRC_LOOP_RESIDUE:
    758     VMOV.I8     Q9,#0
    759     ADD         r8,r0,r1                    @*pu1_src + src_strd
    760     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    761     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    762     SUB         r8,#8
    763     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    764     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    765     CMP         r5,#0
    766     BEQ         NEXT_ROW_ELSE_RESIDUE
    767     CMP         r7,#1
    768     LDRBEQ      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    769     BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    770 NEXT_ROW_ELSE_RESIDUE:
    771     SUB         r5,r12,r7                   @ht_tmp - row
    772     ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    773     ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    774     LDRB        r8,[r8]
    775 
    776 NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    777     VMOV.8      D19[7],r8                   @vsetq_lane_u8
    778     VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    779 
    780     CMP         r7,r12
    781     BNE         SIGN_UP_CHANGE_RESIDUE
    782     LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    783     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    784     CMP         r5,#0
    785     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
    786 
    787 SIGN_UP_CHANGE_RESIDUE:
    788     LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
    789     ADD         r5,r0,#16                   @pu1_src_cpy[16]
    790     SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
    791     LDRB        r5,[r5]                     @load the value
    792     SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    793     CMP         r8,#0
    794     MVNLT       r8,#0
    795     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    796     VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    797 
    798 SIGN_UP_CHANGE_DONE_RESIDUE:
    799     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    800     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    801     VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    802 
    803     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    804     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    805     VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    806     VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    807 
    808     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    809 
    810     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    811     VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)
    812 
    813     VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    814     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    815     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    816     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    817     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    818 
    819     VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    820 
    821     VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    822     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    823     SUBS        r7,r7,#1
    824     BNE         PU1_SRC_LOOP_RESIDUE
    825 
    826     LDR         r8,[sp,#ht_offset]          @Loads ht
    827     LDR         r2,[sp,#152]                @Loads *pu1_src_left
    828     ADD         r5,sp,#66                   @*au1_src_left_tmp
    829 
    830 SRC_LEFT_LOOP_RESIDUE:
    831     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    832     SUBS        r8,r8,#4
    833     STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    834     BNE         SRC_LEFT_LOOP_RESIDUE
    835 
    836 
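@ Final re-assignment: store the two deferred corner results (u1_pos_wd_0_tmp
@ and u1_pos_wd_ht_tmp), then restore *pu1_src_top_left and pu1_src_top[]
@ from the copies saved on the stack before filtering.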
    837 RE_ASSINING_LOOP:
    838     LDR         r7,[sp,#wd_offset]          @Loads wd
    839     LDR         r0,[sp,#148]                @Loads *pu1_src
    840 
    841     LDR         r11,[sp,#ht_offset]         @Loads ht
    842     ADD         r8,r0,r7                    @pu1_src[wd]
    843 
    844     LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
    845     SUB         r11,r11,#1                  @ht - 1
    846 
    847     STRB        r9,[r8,#-1]                 @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
    848     MLA         r6,r11,r1,r0                @pu1_src_org[(ht - 1) * src_strd]
    849 
    850     LDRB        r8,[sp]                     @load u1_src_top_left_tmp from stack pointer
    851     ADD         r12,sp,#2
    852 
     853     STRB        r10,[r6]                    @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    854     STRB        r8,[r4]                     @*pu1_src_top_left = u1_src_top_left_tmp
    855     LDR         r3,[sp,#156]                @Loads pu1_src_top
    856 
    857 SRC_TOP_LOOP:
    858     VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
    859     SUBS        r7,r7,#8                    @Decrement the width
    860     VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    861     BNE         SRC_TOP_LOOP
    862 
    863 END_LOOPS:
    864     ADD         sp,sp,#160
    865     vpop        {d8  -  d15}
    866     LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
    867 
    868 
    869 
    870