      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
      20 @* @file
     21 @*  ihevc_sao_edge_offset_class3_chroma.s
     22 @*
      23 @* @brief
      24 @*  Contains function definitions for the sample adaptive offset (SAO)
      25 @*  edge offset filter of class 3 for chroma. Functions are coded using
      26 @*  NEON intrinsics and can be compiled using ARM RVCT
     27 @*
      28 @* @author
     29 @*  Parthiban V
     30 @*
      31 @* @par List of Functions:
     32 @*
     33 @*
      34 @* @remarks
     35 @*  None
     36 @*
     37 @*******************************************************************************
     38 @*/
     39 @void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
     40 @                              WORD32 src_strd,
     41 @                              UWORD8 *pu1_src_left,
     42 @                              UWORD8 *pu1_src_top,
     43 @                              UWORD8 *pu1_src_top_left,
     44 @                              UWORD8 *pu1_src_top_right,
     45 @                              UWORD8 *pu1_src_bot_left,
     46 @                              UWORD8 *pu1_avail,
     47 @                              WORD8 *pi1_sao_offset_u,
     48 @                              WORD8 *pi1_sao_offset_v,
     49 @                              WORD32 wd,
     50 @                              WORD32 ht)
     51 @**************Variables Vs Registers*****************************************
     52 @r0 =>  *pu1_src
     53 @r1 =>  src_strd
     54 @r2 =>  *pu1_src_left
     55 @r3 =>  *pu1_src_top
     56 @r4 =>  *pu1_src_top_left
     57 @r5 =>  *pu1_avail
     58 @r6 =>  *pi1_sao_offset_u
     59 @r9 =>  *pi1_sao_offset_v
     60 @r7 =>  wd
     61 @r8=>   ht
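         @
         @ A rough scalar C sketch of the per-sample decision that the NEON code
         @ below vectorises (illustrative only: sign3(), cur, top_right, bot_left
         @ and out are assumed names, not symbols from this file). Class 3 is the
         @ 45-degree edge offset: the neighbours of the chroma sample at (row, col)
         @ sit at (row - 1, col + 2) and (row + 1, col - 2), the +/-2 being due to
         @ U/V interleaving; U samples use pi1_sao_offset_u, V samples pi1_sao_offset_v.
         @
         @   static inline int sign3(int x) { return (x > 0) - (x < 0); }
         @
         @   int edge_idx = 2 + sign3(cur - top_right) + sign3(cur - bot_left);
         @   edge_idx = gi1_table_edge_idx[edge_idx];
         @   if (edge_idx != 0)
         @       out = CLIP3(cur + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);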
     62 
     63 .text
     64 .syntax unified
     65 .p2align 2
     66 
     67 .extern gi1_table_edge_idx
     68 .globl ihevc_sao_edge_offset_class3_chroma_a9q
     69 
     70 gi1_table_edge_idx_addr_1:
     71 .long gi1_table_edge_idx - ulbl1 - 8
     72 
     73 gi1_table_edge_idx_addr_2:
     74 .long gi1_table_edge_idx - ulbl2 - 8
     75 
     76 gi1_table_edge_idx_addr_3:
     77 .long gi1_table_edge_idx - ulbl3 - 8
     78 
     79 gi1_table_edge_idx_addr_4:
     80 .long gi1_table_edge_idx - ulbl4 - 8
     81 
     82 gi1_table_edge_idx_addr_5:
     83 .long gi1_table_edge_idx - ulbl5 - 8
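         @ The constants above hold "gi1_table_edge_idx - ulblN - 8" so that the pair
         @ "LDR rX, gi1_table_edge_idx_addr_N" followed by "add rX,rX,pc" at label
         @ ulblN reconstructs the absolute address of gi1_table_edge_idx; the -8
         @ accounts for the ARM-state PC reading as the current instruction plus 8.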
     84 
     85 ihevc_sao_edge_offset_class3_chroma_a9q:
     86 
     87 
     88     STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
     89 
     90     LDR         r7,[sp,#0x40]               @Loads wd
     91     LDR         r8,[sp,#0x44]               @Loads ht
     92     SUB         r9,r7,#2                    @wd - 2
     93 
     94     LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
     95     LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]
     96 
     97     MOV         r9,r7                       @Move width to r9 for loop count
     98 
     99     LDR         r5,[sp,#0x34]               @Loads pu1_avail
    100     LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset_u
    101 
    102     STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
    103     SUB         sp,sp,#0xD4                 @Decrement the stack pointer to store some temp arr values
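         @ After this SUB, the 0xD4-byte scratch area holds (offsets from the new sp):
         @ #0 u1_src_top_left_tmp, #6..#9 the four saved corner samples, #10 onwards
         @ au1_src_top_tmp and #0x4B onwards au1_src_left_tmp. Offsets #0xFC and above
         @ reach back past the scratch area and the ten pushed registers into the
         @ caller-pushed arguments (e.g. #0x114 = wd, #0x118 = ht); a few of those
         @ slots are reused below to park pu1_src and pu1_src_left.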
    104 
    105     STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
    106     SUB         r10,r8,#1                   @ht-1
    107     MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    108     ADD         r12,sp,#10                  @temp array
    109 
    110 AU1_SRC_TOP_LOOP:
    111     VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    112     SUBS        r9,r9,#8                    @Decrement the loop count by 8
    113     VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    114     BNE         AU1_SRC_TOP_LOOP
    115 
    116 PU1_AVAIL_5_LOOP_U:
    117     LDRB        r9,[r5,#5]                  @pu1_avail[5]
    118     CMP         r9,#0
    119     SUB         r14,r7,#2                   @[wd - 2]
    120     LDRB        r9,[r0,r14]                 @u1_pos_0_0_tmp_u = pu1_src[wd - 2]
    121     SUB         r11,r7,#1                   @[wd - 1]
    122     LDRB        r10,[r0,r11]                @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
    123     BEQ         PU1_AVAIL_6_LOOP_U
    124 
    125     LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
    126     LDRB        r11,[r11]                   @pu1_src_top_right[0]
    127     SUB         r12,r9,r11                  @pu1_src[wd - 2] - pu1_src_top_right[0]
    128     CMP         r12,#0
    129     MVNLT       r12,#0
    130     MOVGT       r12,#1                      @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
    131     ADD         r11,r0,r1                   @pu1_src + src_strd
    132     SUB         r14,r14,#2                  @[wd - 2 - 2]
    133     LDRB        r14,[r11,r14]               @pu1_src[wd - 2 - 2 + src_strd]
    134     SUB         r11,r9,r14                  @pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
    135     CMP         r11,#0
    136     MVNLT       r11,#0
    137     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    138     ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    139     ADD         r11,r11,#2                  @edge_idx
    140     LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
    141 ulbl1:
    142     add         r14,r14,pc
    143 
    144     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    145     CMP         r12,#0                      @0 != edge_idx
    146     BEQ         PU1_AVAIL_5_LOOP_V
    147     LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
    148     ADD         r9,r9,r11                   @pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
    149     USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    150 
    151 PU1_AVAIL_5_LOOP_V:
    152 
    153     LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
    154     LDRB        r11,[r11,#1]                @pu1_src_top_right[1]
    155     SUB         r12,r10,r11                 @pu1_src[wd - 1] - pu1_src_top_right[1]
    156     CMP         r12,#0
    157     MVNLT       r12,#0
    158     MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
    159     ADD         r11,r0,r1                   @pu1_src + src_strd
    160     SUB         r14,r7,#3                   @[wd - 1 - 2]
    161     LDRB        r14,[r11,r14]               @pu1_src[wd - 1 - 2 + src_strd]
    162     SUB         r11,r10,r14                 @pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
    163     CMP         r11,#0
    164     MVNLT       r11,#0
    165     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    166     ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    167     ADD         r11,r11,#2                  @edge_idx
    168     LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
    169 ulbl2:
    170     add         r14,r14,pc
    171 
    172     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    173     CMP         r12,#0                      @0 != edge_idx
    174     BEQ         PU1_AVAIL_6_LOOP_U
    175     LDR         r11,[sp,#0x110]             @Loads pi1_sao_offset_v
    176     LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
    177     ADD         r10,r10,r11                 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
    178     USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    179 
    180 PU1_AVAIL_6_LOOP_U:
    181     STRB        r9,[sp,#6]
    182     STRB        r10,[sp,#7]
    183     STR         r0,[sp,#0x100]              @Store pu1_src in sp
    184 
    185     LDRB        r10,[r5,#6]                 @pu1_avail[6]
    186     CMP         r10,#0
    187     SUB         r11,r8,#1                   @ht - 1
    188     MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
    189     LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
    190     LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
    191     BEQ         PU1_AVAIL_3_LOOP
    192 
    193     SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]
    194     ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    195     LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    196     SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    197     CMP         r11,#0
    198     MVNLT       r11,#0
    199     MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd])
    200 
    201     LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
    202     LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
    203     SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    204     CMP         r14,#0
    205     MVNLT       r14,#0
    206     MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
    207 
    208     ADD         r11,r11,r14                 @Add 2 sign value
    209     ADD         r11,r11,#2                  @edge_idx
    210     LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
    211 ulbl3:
    212     add         r14,r14,pc
    213 
    214     LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    215     CMP         r14,#0
    216     BEQ         PU1_AVAIL_6_LOOP_V
    217     LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
    218     ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    219     USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    220 
    221 PU1_AVAIL_6_LOOP_V:
    222     ADD         r12,r12,#1                  @pu1_src[(ht - 1) * src_strd + 1]
     223     SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    224     ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    225     LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    226     SUB         r11,r9,r11                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    227     CMP         r11,#0
    228     MVNLT       r11,#0
    229     MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
    230 
    231     LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
    232     LDRB        r14,[r14,#1]                @Load pu1_src_bot_left[1]
    233     SUB         r14,r9,r14                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
    234     CMP         r14,#0
    235     MVNLT       r14,#0
    236     MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])
    237 
    238     ADD         r11,r11,r14                 @Add 2 sign value
    239     ADD         r11,r11,#2                  @edge_idx
    240     LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
    241 ulbl4:
    242     add         r14,r14,pc
    243 
    244     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    245     CMP         r12,#0
    246     BEQ         PU1_AVAIL_3_LOOP
    247     LDR         r14,[sp,#0x110]             @Loads pi1_sao_offset_v
    248     LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
    249     ADD         r9,r9,r11                   @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    250     USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    251 
    252 PU1_AVAIL_3_LOOP:
    253     STRB        r10,[sp,#8]
    254     STRB        r9,[sp,#9]
    255     STR         r2,[sp,#0x104]              @Store pu1_src_left in sp
    256 
    257     MOV         r12,r8                      @Move ht
    258     MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    259     LDRB        r11,[r5,#3]                 @pu1_avail[3]
    260     CMP         r11,#0
    261     BNE         PU1_AVAIL_2_LOOP
    262     SUB         r12,r12,#1                  @ht_tmp--
    263 
    264 PU1_AVAIL_2_LOOP:
    265     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    266     CMP         r5,#0
    267     BNE         PU1_AVAIL_2_LOOP_END
    268 
    269     ADD         r0,r0,r1                    @pu1_src += src_strd
    270     SUB         r12,r12,#1                  @ht_tmp--
    271     ADD         r14,r14,#2                  @pu1_src_left_cpy += 2
    272 
    273 PU1_AVAIL_2_LOOP_END:
    274     STR         r0,[sp,#2]                  @Store pu1_src in sp
    275     VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    276     VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    277     VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    278     VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    279     LDR         r6,[sp,#0x110]              @Loads pi1_sao_offset_v
    280     VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    281     LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
    282 ulbl5:
    283     add         r2,r2,pc
    284     @VLD1.8     D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    285     VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    286     MOV         r6,r7                       @move wd to r6 loop_count
    287 
    288     CMP         r7,#16                      @Compare wd with 16
    289     BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    290     CMP         r8,#4                       @Compare ht with 4
    291     BLE         WD_16_HT_4_LOOP             @If jump to WD_16_HT_4_LOOP
    292 
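         @ Rough sketch (orientation only, using the names from the inline comments)
         @ of the per-16-byte flow in the loops below:
         @   - pu1_next_row_tmp: next row shifted right by one U/V pair (VEXT #14),
         @     so each lane holds the bottom-left diagonal neighbour of its sample
         @   - sign_down = SIGN(pu1_cur_row - pu1_next_row_tmp)
         @   - edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down], ANDed with au1_mask
         @   - offsets: VUZP splits the U/V bytes, VTBL indexes pi1_sao_offset_u / _v,
         @     VZIP re-interleaves; the offset is added in 16 bits and clipped to [0, 255]
         @   - sign_up for the next row is the negated sign_down shifted by one pair
         @     (VEXT #2), with the two right-edge lanes patched from scalar loads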
    293 WIDTH_LOOP_16:
    294     LDR         r7,[sp,#0x114]              @Loads wd
    295     CMP         r6,r7                       @col == wd
    296     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    297 
    298     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    299     MOVNE       r8,#-1
    300 
    301     VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    302     LDRB        r11,[r5,#2]                 @pu1_avail[2]
    303 
    304     CMP         r6,#16                      @if(col == 16)
    305     VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    306 
    307     BNE         SKIP_AU1_MASK_VAL
    308     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    309     VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    310     VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    311 
    312 SKIP_AU1_MASK_VAL:
    313     CMP         r11,#0
    314     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    315     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    316     SUB         r0,#8
    317     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    318 
    319     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    320     VMOV.I8     Q9,#0
    321     MOVNE       r8,r3
    322 
    323     ADD         r8,r8,#2                    @pu1_src - src_strd + 2
    324     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    325     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    326     SUB         r8,#8
    327     ADD         r3,r3,#16
    328 
    329     LDR         r4,[sp,#0x118]              @Loads ht
    330     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    331     LDR         r7,[sp,#0x114]              @Loads wd
    332 
    333     SUB         r7,r7,r6                    @(wd - col)
    334     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    335     ADD         r7,r7,#14                   @15 + (wd - col)
    336 
    337     LDR         r8,[sp,#0x100]              @Loads *pu1_src
    338     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    339     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    340 
    341 AU1_SRC_LEFT_LOOP:
    342     LDRH        r8,[r7]                     @load the value and increment by src_strd
    343     SUBS        r4,r4,#1                    @decrement the loop count
    344 
    345     STRH        r8,[r5],#2                  @store it in the stack pointer
    346     ADD         r7,r7,r1
    347     BNE         AU1_SRC_LEFT_LOOP
    348 
    349 
    350     MOV         r7,r12                      @row count, move ht_tmp to r7
    351     VMOV.I8     Q9,#0                       @I
    352     ADD         r11,r0,r1                   @I *pu1_src + src_strd
    353 
    354     SUB         r5,r12,r7                   @I ht_tmp - row
    355     VLD1.8      D16,[r11]!                  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    356     VLD1.8      D17,[r11]                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    357     SUB         r11,#8
    358     ADD         r8,r14,r5,LSL #1            @I pu1_src_left_cpy[(ht_tmp - row) * 2]
    359 
    360     LDRH        r5,[r8,#2]                  @I
    361     VMOV.16     D19[3],r5                   @I vsetq_lane_u8
    362     LDR         r11,[sp,#0x108]             @I Loads pu1_avail
    363 
    364     LDRB        r11,[r11,#2]                @I pu1_avail[2]
    365     VEXT.8      Q9,Q9,Q8,#14                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    366     CMP         r11,#0                      @I
    367     BNE         SIGN_UP_CHANGE_DONE         @I
    368 
    369     LDRB        r8,[r0,#14]                 @I pu1_src_cpy[14]
    370     SUB         r5,r0,r1                    @I
    371 
    372     LDRB        r11,[r5,#16]                @I load the value pu1_src_cpy[16 - src_strd]
    373 
    374     LDRB        r9,[r0,#15]                 @I pu1_src_cpy[15]
    375     SUB         r8,r8,r11                   @I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    376 
    377     LDRB        r10,[r5,#17]                @I load the value pu1_src_cpy[17 - src_strd]
    378     CMP         r8,#0                       @I
    379 
    380     MVNLT       r8,#0                       @I
    381     SUB         r9,r9,r10                   @I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    382 
    383     MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    384     CMP         r9,#0                       @I
    385 
    386     MVNLT       r9,#0                       @I
     387     VMOV.8      D15[6],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    388     MOVGT       r9,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    389 
    390     VMOV.8      D15[7],r9                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    391 
    392 SIGN_UP_CHANGE_DONE:
    393     VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    394     VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    395 
    396     VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    397     VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    398 
    399     VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    400     VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
    401     VTBL.8      D18,{D28},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    402     VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)
    403 
    404     VTBL.8      D19,{D28},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    405     VEXT.8      Q7,Q7,Q7,#2                 @I sign_up = vextq_s8(sign_up, sign_up, 2)
    406 
    407     VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    408     VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
    409 
    410     VUZP.8      D18,D19                     @I
    411     VTBL.8      D22,{D6},D18                @I
    412     VTBL.8      D23,{D7},D19                @I
    413     VZIP.8      D22,D23                     @I
    414 
    415     VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    416     VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    417 
    418     VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    419     VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    420 
    421     VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
    422     VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    423 
    424     SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
    425     VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    426 
    427     VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    428 
    429 
    430 PU1_SRC_LOOP:
    431     ADD         r11,r0,r1,LSL #1            @II *pu1_src + src_strd
    432     VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    433     SUB         r5,r12,r7                   @II ht_tmp - row
    434 
    435     ADD         r4,r0,r1                    @III *pu1_src + src_strd
    436     VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
    437     ADD         r8,r14,r5,LSL #1            @II pu1_src_left_cpy[(ht_tmp - row) * 2]
    438 
    439     LDRH        r9,[r8,#2]
    440     VLD1.8      D16,[r11]!                  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    441     VLD1.8      D17,[r11]                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    442     SUB         r11,#8
    443     LDRB        r10,[r4,#14]                @II pu1_src_cpy[14]
    444 
    445     LDRB        r8,[r4,#15]                 @II pu1_src_cpy[15]
    446     VMOV.16     D29[3],r9                   @II vsetq_lane_u8
    447     ADD         r4,r11,r1                   @III *pu1_src + src_strd
    448 
    449     LDRB        r5,[r0,#17]                 @II load the value pu1_src_cpy[17 - src_strd]
    450     VLD1.8      D30,[r4]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    451     VLD1.8      D31,[r4]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    452     SUB         r4,#8
    453     LDRB        r11,[r0,#16]                @II load the value pu1_src_cpy[16 - src_strd]
    454 
    455     SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    456     VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    457     SUB         r10,r10,r11                 @II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    458 
    459     CMP         r10,#0                      @II
    460     VEXT.8      Q14,Q14,Q8,#14              @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    461     SUB         r8,r8,r5                    @II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    462 
    463     MVNLT       r10,#0                      @II
    464     VLD1.8      D21,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    465     MOVGT       r10,#1                      @II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    466 
    467     CMP         r8,#0                       @II
     468     VMOV.8      D15[6],r10                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    469     MVNLT       r8,#0                       @II
    470 
    471     MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    472     SUB         r10,r12,r7                  @III ht_tmp - row
    473     VMOV.8      D15[7],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    474     ADD         r11,r14,r10,LSL #1          @III pu1_src_left_cpy[(ht_tmp - row) * 2]
    475 
    476     CMP         r7,#1                       @III
    477     VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    478     BNE         NEXT_ROW_POINTER_ASSIGNED_2 @III
    479 
    480     LDR         r5,[sp,#0x108]              @III Loads pu1_avail
    481     LDRB        r5,[r5,#3]                  @III pu1_avail[3]
    482     CMP         r5,#0                       @III
    483     SUBNE       r11,r4,#4                   @III pu1_src[src_strd - 2]
    484 
    485 NEXT_ROW_POINTER_ASSIGNED_2:
    486     LDRH        r5,[r11,#2]                 @III
    487     VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    488     ADD         r11,r0,r1                   @III
    489 
    490     LDRB        r9,[r11,#14]                @III pu1_src_cpy[14]
    491     VMOV.16     D19[3],r5                   @III vsetq_lane_u8
    492     LDRB        r8,[r11,#15]                @III pu1_src_cpy[15]
    493 
    494     LDRB        r11,[r0,#16]                @III load the value pu1_src_cpy[16 - src_strd]
    495     VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    496     LDRB        r10,[r0,#17]                @III load the value pu1_src_cpy[17 - src_strd]
    497 
    498     SUB         r9,r9,r11                   @III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    499     VEXT.8      Q9,Q9,Q15,#14               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    500     SUB         r10,r8,r10                  @III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    501 
    502     CMP         r9,#0                       @III
    503     VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    504     MVNLT       r9,#0                       @III
    505 
    506     MOVGT       r9,#1                       @III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    507     VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    508     CMP         r10,#0                      @III
    509 
    510     VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    511     VTBL.8      D26,{D21},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    512     MVNLT       r10,#0                      @III
    513     MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    514 
    515     VEXT.8      Q7,Q7,Q7,#2                 @II sign_up = vextq_s8(sign_up, sign_up, 2)
    516     VTBL.8      D27,{D21},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    517     VCGT.U8     Q11,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    518 
     519     VMOV.8      D15[6],r9                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    520     VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    521 
    522     VMOV.8      D15[7],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    523     VUZP.8      D26,D27                     @II
    524 
    525     VCLT.U8     Q10,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    526     VTBL.8      D24,{D6},D26                @II
    527     VSUB.U8     Q11,Q10,Q11                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    528 
    529     VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    530     VTBL.8      D25,{D7},D27                @II
    531     VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)
    532 
    533     VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    534     VZIP.8      D24,D25                     @II
    535 
    536     VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    537     VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    538     VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)
    539 
    540     VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    541     VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    542     VEXT.8      Q7,Q7,Q7,#2                 @III sign_up = vextq_s8(sign_up, sign_up, 2)
    543 
    544     VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    545     VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    546 
    547     VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    548     VUZP.8      D18,D19                     @III
    549 
    550     VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    551     VTBL.8      D22,{D6},D18                @III
    552     VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    553 
    554     VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    555     VTBL.8      D23,{D7},D19                @III
    556     VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    557 
    558     VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    559     VZIP.8      D22,D23                     @III
    560 
    561     VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    562     VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    563 
    564     VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    565     VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    566 
    567     SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    568     VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    569     CMP         r7,#1                       @III
    570 
    571     VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    572     VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    573 
    574     VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    575 
    576     VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    577 
    578     VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    579     VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    580 
    581     BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
    582     BLT         INNER_LOOP_DONE
    583 
    584 
    585     ADD         r11,r0,r1,LSL #1            @*pu1_src + src_strd
    586     VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    587     SUB         r5,r12,r7                   @ht_tmp - row
    588 
    589     ADD         r8,r14,r5,LSL #1            @pu1_src_left_cpy[(ht_tmp - row) * 2]
    590     VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    591     CMP         r7,#1
    592 
    593     LDRB        r4,[r0,#16]                 @load the value pu1_src_cpy[16 - src_strd]
    594     VLD1.8      D16,[r11]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    595     VLD1.8      D17,[r11]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    596     SUB         r11,#8
    597     LDRB        r9,[r0,#17]                 @load the value pu1_src_cpy[17 - src_strd]
    598 
    599     BNE         NEXT_ROW_POINTER_ASSIGNED_3
    600     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    601     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    602     CMP         r5,#0
    603     SUBNE       r8,r11,#4                   @pu1_src[src_strd - 2]
    604 
    605 NEXT_ROW_POINTER_ASSIGNED_3:
    606     LDRH        r5,[r8,#2]
    607     VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    608     LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    609 
    610     SUB         r8,r8,r4                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    611     VMOV.16     D19[3],r5                   @vsetq_lane_u8
    612     LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    613 
    614     CMP         r8,#0
    615     VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    616     SUB         r10,r10,r9                  @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    617 
    618     MVNLT       r8,#0
    619     VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    620     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    621 
    622     CMP         r10,#0
     623     VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    624     MVNLT       r10,#0
    625 
    626     MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    627     VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    628     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    629 
    630     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    631     VSUB.U8     Q11,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    632 
    633     VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    634     VADD.I8     Q9,Q9,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    635     VTBL.8      D18,{D28},D18               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    636     VTBL.8      D19,{D28},D19               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    637 
    638     VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    639 
    640     VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    641     VUZP.8      D18,D19
    642 
    643     VTBL.8      D22,{D6},D18
    644     VTBL.8      D23,{D7},D19
    645 
    646     VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    647     VZIP.8      D22,D23
    648 
    649     VADDW.S8    Q10,Q10,D22                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    650     VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    651     VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    652 
    653     VADDW.S8    Q9,Q9,D23                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    654     VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    655     VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    656 
    657 
    658 INNER_LOOP_DONE:
    659 
    660     LDR         r8,[sp,#0x118]              @Loads ht
    661     VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    662     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    663 
    664     LSL         r8,r8,#1
    665     VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    666     LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
    667 
    668 SRC_LEFT_LOOP:
    669     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    670     SUBS        r8,r8,#4
    671     STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    672     BNE         SRC_LEFT_LOOP
    673 
    674     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    675     VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    676     CMP         r6,#8                       @Check whether residue remains
    677 
    678     BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    679     LDR         r7,[sp,#0x114]              @Loads wd
    680     LDR         r0,[sp,#0x02]               @Loads *pu1_src
    681     SUB         r7,r7,r6
    682     ADD         r0,r0,r7
    683     BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    684     BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
    685 
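         @ WD_16_HT_4_LOOP below is the wd >= 16, ht <= 4 variant: the same edge-offset
         @ flow as WIDTH_LOOP_16 / PU1_SRC_LOOP above, but processed one row per
         @ iteration instead of the two/three-row software-pipelined schedule.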
    686 WD_16_HT_4_LOOP:
    687     LDR         r7,[sp,#0x114]              @Loads wd
    688 
    689     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    690     CMP         r6,r7                       @col == wd
    691 
    692     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    693     MOVNE       r8,#-1
    694     VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    695 
    696     CMP         r6,#16                      @if(col == 16)
    697     VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    698 
    699     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    700     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    701     VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    702     VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    703 
    704 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    705     LDRB        r11,[r5,#2]                 @pu1_avail[2]
    706     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    707 
    708     CMP         r11,#0
    709     MOVNE       r8,r3
    710     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    711     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    712     SUB         r0,#8
    713     ADD         r8,r8,#2                    @pu1_src - src_strd + 2
    714 
    715     ADD         r3,r3,#16
    716     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    717     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    718     SUB         r8,#8
    719     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    720 
    721     LDR         r4,[sp,#0x118]              @Loads ht
    722     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    723     LDR         r7,[sp,#0x114]              @Loads wd
    724 
    725     SUB         r7,r7,r6                    @(wd - col)
    726     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    727     ADD         r7,r7,#14                   @15 + (wd - col)
    728 
    729     LDR         r8,[sp,#0x100]              @Loads *pu1_src
    730     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    731     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    732 
    733 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    734     LDRH        r8,[r7]                     @load the value and increment by src_strd
    735     SUBS        r4,r4,#1                    @decrement the loop count
    736 
    737     STRH        r8,[r5],#2                  @store it in the stack pointer
    738     ADD         r7,r7,r1
    739     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    740 
    741     VMOV.I8     Q9,#0
    742     MOV         r7,r12                      @row count, move ht_tmp to r7
    743 
    744 PU1_SRC_LOOP_WD_16_HT_4:
    745     ADD         r9,r0,r1                    @*pu1_src + src_strd
    746 
    747     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    748     VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    749     VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    750     SUB         r9,#8
    751     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    752 
    753     SUB         r11,r12,r7                  @ht_tmp - row
    754     ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]
    755     ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
    756 
    757     CMP         r5,#0
    758     BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    759     CMP         r7,#1
    760     SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]
    761 
    762 NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    763     LDRH        r5,[r8]
    764     VMOV.16     D19[3],r5                   @vsetq_lane_u8
    765     VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    766 
    767     CMP         r7,r12
    768     BLT         SIGN_UP_CHANGE_WD_16_HT_4
    769     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    770     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    771     CMP         r5,#0
    772     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    773 
    774 SIGN_UP_CHANGE_WD_16_HT_4:
    775     LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    776     SUB         r9,r0,r1
    777 
    778     LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]
    779 
    780     LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    781     SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    782 
    783     LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    784     CMP         r8,#0
    785 
    786     MVNLT       r8,#0
    787     SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    788 
    789     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    790 
    791     CMP         r10,#0
     792     VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    793     MVNLT       r10,#0
    794 
    795     MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    796     VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    797 
    798 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    799     VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    800     VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    801 
    802     VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    803     VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    804 
    805     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    806     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    807 
    808     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    809     VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    810 
    811     VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    812     VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)
    813 
    814     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    815     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    816 
    817 
    818     VUZP.8      D26,D27
    819     VTBL.8      D24,{D6},D26
    820     VTBL.8      D25,{D7},D27
    821     VZIP.8      D24,D25
    822 
    823     VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    824     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    825 
    826     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    827     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    828 
    829     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    830     VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    831 
    832     VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    833     VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    834 
    835     VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    836     VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    837 
    838     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    839     VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    840     BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    841 
    842     LDR         r8,[sp,#0x118]              @Loads ht
    843     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    844     LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
    845 
    846 SRC_LEFT_LOOP_WD_16_HT_4:
    847     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    848     SUBS        r8,r8,#2
    849     STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    850     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    851 
    852     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    853     BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    854     BGT         WD_16_HT_4_LOOP             @If not equal jump to width_loop
    855 
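         @ WIDTH_RESIDUE below handles the last 8-byte column (four interleaved U/V
         @ pairs) left when wd is not a multiple of 16; it follows the same
         @ sign_up / sign_down / edge_idx / offset flow as the loops above, but only
         @ the low 8 bytes of the result vector are stored per row.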
    856 WIDTH_RESIDUE:
    857     LDR         r7,[sp,#0x114]              @Loads wd
    858 
    859     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    860     CMP         r6,r7                       @wd_residue == wd
    861 
    862     LDRBEQ      r8,[r5]                     @pu1_avail[0]
    863 
    864     MOVNE       r8,#-1
    865     LDRB        r11,[r5,#1]                 @pu1_avail[1]
    866 
    867     LDRB        r9,[r5,#2]                  @pu1_avail[2]
    868     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    869     CMP         r9,#0
    870 
    871     SUBEQ       r10,r0,r1                   @pu1_src - src_strd
    872     VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    873     MOVNE       r10,r3
    874 
    875     ADD         r10,r10,#2                  @pu1_src - src_strd + 2
    876     VMOV.8      d8[6],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    877     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    878 
    879     LDR         r4,[sp,#0x118]              @Loads ht
    880     VMOV.8      d8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    881     LDR         r7,[sp,#0x114]              @Loads wd
    882 
    883     LDR         r8,[sp,#0x100]              @Loads *pu1_src
    884     VLD1.8      D10,[r10]!                  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    885     VLD1.8      D11,[r10]                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    886     SUB         r10,#8
    887     SUB         r7,r7,#2                    @(wd - 2)
    888 
    889     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]
    890 
    891 AU1_SRC_LEFT_LOOP_RESIDUE:
    892     LDRH        r8,[r7]                     @load the value and increment by src_strd
    893     ADD         r7,r7,r1
    894     STRH        r8,[r5],#2                  @store it in the stack pointer
    895     SUBS        r4,r4,#1                    @decrement the loop count
    896     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    897 
    898     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    899     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    900     SUB         r0,#8
    901 
    902     VMOV.I8     Q9,#0
    903     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    904 
    905     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    906     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    907     MOV         r7,r12                      @row count, move ht_tmp to r7
    908 
    909 PU1_SRC_LOOP_RESIDUE:
    910     ADD         r9,r0,r1                    @*pu1_src + src_strd
    911 
    912     SUB         r11,r12,r7                  @ht_tmp - row
    913     VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    914     VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    915     SUB         r9,#8
    916     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    917 
    918     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    919     ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]
    920 
    921     CMP         r5,#0
    922     ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
    923 
    924     BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    925     CMP         r7,#1
    926     SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]
    927 
    928 NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    929     LDRB        r5,[r8]
    930 
    931     LDRB        r8,[r8,#1]
    932     VMOV.8      D19[6],r5                   @vsetq_lane_u8
    933     CMP         r7,r12
    934 
    935     VMOV.8      D19[7],r8                   @vsetq_lane_u8
    936     VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    937 
    938     BLT         SIGN_UP_CHANGE_RESIDUE
    939     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    940     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    941     CMP         r5,#0
    942     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
    943 
    944 SIGN_UP_CHANGE_RESIDUE:
    945     LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    946     SUB         r9,r0,r1
    947 
    948     LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]
    949 
    950     LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    951     SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    952 
    953     LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    954     CMP         r8,#0
    955 
    956     MVNLT       r8,#0
    957     SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    958 
    959     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    960 
    961     CMP         r10,#0
     962     VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    963     MVNLT       r10,#0
    964 
    965     MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    966     VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    967 
    968 SIGN_UP_CHANGE_DONE_RESIDUE:
    969     VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    970     VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    971 
    972     VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    973     VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    974 
    975     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    976     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    977 
    978     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    979     VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    980 
    981     VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    982     VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 14)
    983 
    984     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    985     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    986 
    987 
    988     VUZP.8      D26,D27
    989     VTBL.8      D24,{D6},D26
    990     VTBL.8      D25,{D7},D27
    991     VZIP.8      D24,D25
    992 
    993     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    994     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    995 
    996     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    997     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    998 
    999     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
   1000     VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
   1001 
   1002     VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
   1003 
   1004     BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP
   1005 
   1006     LDR         r8,[sp,#0x118]              @Loads ht
   1007     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
   1008 
   1009     LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
   1010 
   1011 SRC_LEFT_LOOP_RESIDUE:
   1012     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
   1013     SUBS        r8,r8,#2
   1014     STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
   1015     BNE         SRC_LEFT_LOOP_RESIDUE
   1016 
   1017 
   1018 RE_ASSINING_LOOP:
   1019     LDR         r7,[sp,#0x114]              @Loads wd
   1020     LDR         r8,[sp,#0x118]              @Loads ht
   1021 
   1022     LDR         r0,[sp,#0x100]              @Loads *pu1_src
   1023     SUB         r10,r7,#2                   @wd - 2
   1024 
   1025     LDRH        r9,[sp,#6]
   1026     SUB         r8,r8,#1                    @ht - 1
   1027 
   1028     STRH        r9,[r0,r10]                 @pu1_src_org[0] = u1_pos_0_0_tmp
   1029     MLA         r6,r8,r1,r0                 @pu1_src[(ht - 1) * src_strd]
   1030 
   1031     LDR         r4,[sp,#0xFC]               @Loads pu1_src_top_left
   1032 
   1033     LDRH        r9,[sp,#8]
   1034     ADD         r12,sp,#10
   1035 
   1036     STRH        r9,[r6]                     @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
   1037 
   1038     LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
   1039     STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
   1040     LDR         r3,[sp,#0x10C]              @Loads pu1_src_top
   1041 
   1042 SRC_TOP_LOOP:
   1043     VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
   1044     SUBS        r7,r7,#8                    @Decrement the width
   1045     VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
   1046     BNE         SRC_TOP_LOOP
   1047 
   1048 END_LOOPS:
   1049     ADD         sp,sp,#0xD4
   1050     LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
   1051 
   1052 
   1053 
   1054