      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* ,:file
     21 @*  ihevc_sao_edge_offset_class3_chroma.s
     22 @*
     23 @* ,:brief
      24 @*  Contains function definitions for sample adaptive offset (SAO) edge
      25 @*  offset class 3 for chroma. Functions are coded using NEON intrinsics
      26 @*  and can be compiled using ARM RVCT
     27 @*
     28 @* ,:author
     29 @*  Parthiban V
     30 @*
     31 @* ,:par List of Functions:
      32 @*  ihevc_sao_edge_offset_class3_chroma_a9q()
     33 @*
     34 @* ,:remarks
     35 @*  None
     36 @*
     37 @*******************************************************************************
     38 @*/
     39 @void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
     40 @                              WORD32 src_strd,
     41 @                              UWORD8 *pu1_src_left,
     42 @                              UWORD8 *pu1_src_top,
     43 @                              UWORD8 *pu1_src_top_left,
     44 @                              UWORD8 *pu1_src_top_right,
     45 @                              UWORD8 *pu1_src_bot_left,
     46 @                              UWORD8 *pu1_avail,
     47 @                              WORD8 *pi1_sao_offset_u,
     48 @                              WORD8 *pi1_sao_offset_v,
     49 @                              WORD32 wd,
     50 @                              WORD32 ht)
     51 @**************Variables Vs Registers*****************************************
     52 @r0 =>  *pu1_src
     53 @r1 =>  src_strd
     54 @r2 =>  *pu1_src_left
     55 @r3 =>  *pu1_src_top
     56 @r4 =>  *pu1_src_top_left
     57 @r5 =>  *pu1_avail
     58 @r6 =>  *pi1_sao_offset_u
     59 @r9 =>  *pi1_sao_offset_v
     60 @r7 =>  wd
     61 @r8=>   ht
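@
@**************Reference sketch (not part of the assembly)*********************
@ A minimal scalar C sketch of the per-pixel operation the NEON code below
@ vectorises, reconstructed from the instruction comments; the helper names
@ sao_sign() and sao_eo_class3_chroma_px() are hypothetical, and the table
@ contents {1, 2, 0, 3, 4} assumed for gi1_table_edge_idx are not taken from
@ this file. Edge offset class 3 compares each sample against its top-right
@ and bottom-left neighbours (135 degree diagonal); U and V are interleaved,
@ so the horizontal step within a row is 2 bytes and the offset table is
@ chosen per plane.
@
@   static WORD32 sao_sign(WORD32 x) { return (x > 0) - (x < 0); }
@
@   /* c = 0 for U, 1 for V; pi1_sao_offset is pi1_sao_offset_u or _v */
@   static UWORD8 sao_eo_class3_chroma_px(UWORD8 *pu1_src, WORD32 src_strd,
@                                         WORD32 row, WORD32 col, WORD32 c,
@                                         WORD8 *pi1_sao_offset)
@   {
@       UWORD8 *p = pu1_src + row * src_strd + col + c;
@       WORD32 edge_idx = 2
@           + sao_sign(*p - p[-src_strd + 2])   /* top-right neighbour   */
@           + sao_sign(*p - p[ src_strd - 2]);  /* bottom-left neighbour */
@       edge_idx = gi1_table_edge_idx[edge_idx];
@       return (UWORD8)CLIP3(*p + pi1_sao_offset[edge_idx], 0, (1 << 8) - 1);
@   }
@*******************************************************************************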
     62 
     63 .text
     64 .p2align 2
     65 
     66 .extern gi1_table_edge_idx
     67 .globl ihevc_sao_edge_offset_class3_chroma_a9q
     68 
     69 gi1_table_edge_idx_addr_1:
     70 .long gi1_table_edge_idx - ulbl1 - 8
     71 
     72 gi1_table_edge_idx_addr_2:
     73 .long gi1_table_edge_idx - ulbl2 - 8
     74 
     75 gi1_table_edge_idx_addr_3:
     76 .long gi1_table_edge_idx - ulbl3 - 8
     77 
     78 gi1_table_edge_idx_addr_4:
     79 .long gi1_table_edge_idx - ulbl4 - 8
     80 
     81 gi1_table_edge_idx_addr_5:
     82 .long gi1_table_edge_idx - ulbl5 - 8
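
@ A brief note on the literals above (a reading of the addressing scheme, kept
@ here for clarity): they hold PC-relative offsets so the table address can be
@ formed position-independently. In ARM state PC reads as the address of the
@ current instruction plus 8, so at each "ADD rX,rX,pc" placed at ulblN:
@   rX = (gi1_table_edge_idx - ulblN - 8) + (ulblN + 8) = gi1_table_edge_idx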
     83 
     84 ihevc_sao_edge_offset_class3_chroma_a9q:
     85 
     86 
     87     STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
     88 
     89     LDR         r7,[sp,#0x40]               @Loads wd
     90     LDR         r8,[sp,#0x44]               @Loads ht
     91     SUB         r9,r7,#2                    @wd - 2
     92 
     93     LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
     94     LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]
     95 
     96     MOV         r9,r7                       @Move width to r9 for loop count
     97 
     98     LDR         r5,[sp,#0x34]               @Loads pu1_avail
     99     LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset_u
    100 
    101     STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
    102     SUB         sp,sp,#0xD4                 @Decrement the stack pointer to store some temp arr values
    103 
    104     STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
    105     SUB         r10,r8,#1                   @ht-1
    106     MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    107     ADD         r12,sp,#10                  @temp array
    108 
    109 AU1_SRC_TOP_LOOP:
    110     VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    111     SUBS        r9,r9,#8                    @Decrement the loop count by 8
    112     VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    113     BNE         AU1_SRC_TOP_LOOP
    114 
    115 PU1_AVAIL_5_LOOP_U:
    116     LDRB        r9,[r5,#5]                  @pu1_avail[5]
    117     CMP         r9,#0
    118     SUB         r14,r7,#2                   @[wd - 2]
    119     LDRB        r9,[r0,r14]                 @u1_pos_0_0_tmp_u = pu1_src[wd - 2]
    120     SUB         r11,r7,#1                   @[wd - 1]
    121     LDRB        r10,[r0,r11]                @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
    122     BEQ         PU1_AVAIL_6_LOOP_U
    123 
    124     LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
    125     LDRB        r11,[r11]                   @pu1_src_top_right[0]
    126     SUB         r12,r9,r11                  @pu1_src[wd - 2] - pu1_src_top_right[0]
    127     CMP         r12,#0
    128     MVNLT       r12,#0
    129     MOVGT       r12,#1                      @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
    130     ADD         r11,r0,r1                   @pu1_src + src_strd
    131     SUB         r14,r14,#2                  @[wd - 2 - 2]
    132     LDRB        r14,[r11,r14]               @pu1_src[wd - 2 - 2 + src_strd]
    133     SUB         r11,r9,r14                  @pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
    134     CMP         r11,#0
    135     MVNLT       r11,#0
    136     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    137     ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    138     ADD         r11,r11,#2                  @edge_idx
    139     LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
    140 ulbl1:
    141     add         r14,r14,pc
    142 
    143     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    144     CMP         r12,#0                      @0 != edge_idx
    145     BEQ         PU1_AVAIL_5_LOOP_V
    146     LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
    147     ADD         r9,r9,r11                   @pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
    148     USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    149 
    150 PU1_AVAIL_5_LOOP_V:
    151 
    152     LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
    153     LDRB        r11,[r11,#1]                @pu1_src_top_right[1]
    154     SUB         r12,r10,r11                 @pu1_src[wd - 1] - pu1_src_top_right[1]
    155     CMP         r12,#0
    156     MVNLT       r12,#0
    157     MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
    158     ADD         r11,r0,r1                   @pu1_src + src_strd
    159     SUB         r14,r7,#3                   @[wd - 1 - 2]
    160     LDRB        r14,[r11,r14]               @pu1_src[wd - 1 - 2 + src_strd]
    161     SUB         r11,r10,r14                 @pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
    162     CMP         r11,#0
    163     MVNLT       r11,#0
    164     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    165     ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    166     ADD         r11,r11,#2                  @edge_idx
    167     LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
    168 ulbl2:
    169     add         r14,r14,pc
    170 
    171     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    172     CMP         r12,#0                      @0 != edge_idx
    173     BEQ         PU1_AVAIL_6_LOOP_U
    174     LDR         r11,[sp,#0x110]             @Loads pi1_sao_offset_v
    175     LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
    176     ADD         r10,r10,r11                 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
    177     USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    178 
    179 PU1_AVAIL_6_LOOP_U:
    180     STRB        r9,[sp,#6]
    181     STRB        r10,[sp,#7]
    182     STR         r0,[sp,#0x100]              @Store pu1_src in sp
    183 
    184     LDRB        r10,[r5,#6]                 @pu1_avail[6]
    185     CMP         r10,#0
    186     SUB         r11,r8,#1                   @ht - 1
    187     MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
    188     LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
    189     LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
    190     BEQ         PU1_AVAIL_3_LOOP
    191 
    192     SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]
    193     ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    194     LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    195     SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    196     CMP         r11,#0
    197     MVNLT       r11,#0
    198     MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd])
    199 
    200     LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
    201     LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
    202     SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    203     CMP         r14,#0
    204     MVNLT       r14,#0
    205     MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
    206 
    207     ADD         r11,r11,r14                 @Add 2 sign value
    208     ADD         r11,r11,#2                  @edge_idx
    209     LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
    210 ulbl3:
    211     add         r14,r14,pc
    212 
    213     LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    214     CMP         r14,#0
    215     BEQ         PU1_AVAIL_6_LOOP_V
    216     LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
    217     ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    218     USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    219 
    220 PU1_AVAIL_6_LOOP_V:
    221     ADD         r12,r12,#1                  @pu1_src[(ht - 1) * src_strd + 1]
     222     SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    223     ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    224     LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    225     SUB         r11,r9,r11                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    226     CMP         r11,#0
    227     MVNLT       r11,#0
    228     MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
    229 
    230     LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
    231     LDRB        r14,[r14,#1]                @Load pu1_src_bot_left[1]
    232     SUB         r14,r9,r14                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
    233     CMP         r14,#0
    234     MVNLT       r14,#0
    235     MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])
    236 
    237     ADD         r11,r11,r14                 @Add 2 sign value
    238     ADD         r11,r11,#2                  @edge_idx
    239     LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
    240 ulbl4:
    241     add         r14,r14,pc
    242 
    243     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    244     CMP         r12,#0
    245     BEQ         PU1_AVAIL_3_LOOP
    246     LDR         r14,[sp,#0x110]             @Loads pi1_sao_offset_v
    247     LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
    248     ADD         r9,r9,r11                   @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    249     USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    250 
    251 PU1_AVAIL_3_LOOP:
    252     STRB        r10,[sp,#8]
    253     STRB        r9,[sp,#9]
    254     STR         r2,[sp,#0x104]              @Store pu1_src_left in sp
    255 
    256     MOV         r12,r8                      @Move ht
    257     MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    258     LDRB        r11,[r5,#3]                 @pu1_avail[3]
    259     CMP         r11,#0
    260     BNE         PU1_AVAIL_2_LOOP
    261     SUB         r12,r12,#1                  @ht_tmp--
    262 
    263 PU1_AVAIL_2_LOOP:
    264     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    265     CMP         r5,#0
    266     BNE         PU1_AVAIL_2_LOOP_END
    267 
    268     ADD         r0,r0,r1                    @pu1_src += src_strd
    269     SUB         r12,r12,#1                  @ht_tmp--
    270     ADD         r14,r14,#2                  @pu1_src_left_cpy += 2
    271 
    272 PU1_AVAIL_2_LOOP_END:
    273     STR         r0,[sp,#2]                  @Store pu1_src in sp
    274     VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    275     VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    276     VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    277     VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    278     LDR         r6,[sp,#0x110]              @Loads pi1_sao_offset_v
    279     VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    280     LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
    281 ulbl5:
    282     add         r2,r2,pc
    283     @VLD1.8     D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    284     VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    285     MOV         r6,r7                       @move wd to r6 loop_count
    286 
    287     CMP         r7,#16                      @Compare wd with 16
    288     BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    289     CMP         r8,#4                       @Compare ht with 4
    290     BLE         WD_16_HT_4_LOOP             @If jump to WD_16_HT_4_LOOP
    291 
    292 WIDTH_LOOP_16:
    293     LDR         r7,[sp,#0x114]              @Loads wd
    294     CMP         r6,r7                       @col == wd
    295     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    296 
    297     LDREQB      r8,[r5]                     @pu1_avail[0]
    298     MOVNE       r8,#-1
    299 
    300     VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    301     LDRB        r11,[r5,#2]                 @pu1_avail[2]
    302 
    303     CMP         r6,#16                      @if(col == 16)
    304     VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    305 
    306     BNE         SKIP_AU1_MASK_VAL
    307     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    308     VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    309     VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    310 
    311 SKIP_AU1_MASK_VAL:
    312     CMP         r11,#0
    313     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    314     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    315     SUB         r0,#8
    316     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    317 
    318     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    319     VMOV.I8     Q9,#0
    320     MOVNE       r8,r3
    321 
    322     ADD         r8,r8,#2                    @pu1_src - src_strd + 2
    323     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    324     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    325     SUB         r8,#8
    326     ADD         r3,r3,#16
    327 
    328     LDR         r4,[sp,#0x118]              @Loads ht
    329     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    330     LDR         r7,[sp,#0x114]              @Loads wd
    331 
    332     SUB         r7,r7,r6                    @(wd - col)
    333     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    334     ADD         r7,r7,#14                   @15 + (wd - col)
    335 
    336     LDR         r8,[sp,#0x100]              @Loads *pu1_src
    337     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    338     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    339 
    340 AU1_SRC_LEFT_LOOP:
    341     LDRH        r8,[r7]                     @load the value and increment by src_strd
    342     SUBS        r4,r4,#1                    @decrement the loop count
    343 
    344     STRH        r8,[r5],#2                  @store it in the stack pointer
    345     ADD         r7,r7,r1
    346     BNE         AU1_SRC_LEFT_LOOP
    347 
    348 
    349     MOV         r7,r12                      @row count, move ht_tmp to r7
    350     VMOV.I8     Q9,#0                       @I
    351     ADD         r11,r0,r1                   @I *pu1_src + src_strd
    352 
    353     SUB         r5,r12,r7                   @I ht_tmp - row
    354     VLD1.8      D16,[r11]!                  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    355     VLD1.8      D17,[r11]                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    356     SUB         r11,#8
    357     ADD         r8,r14,r5,LSL #1            @I pu1_src_left_cpy[(ht_tmp - row) * 2]
    358 
    359     LDRH        r5,[r8,#2]                  @I
    360     VMOV.16     D19[3],r5                   @I vsetq_lane_u8
    361     LDR         r11,[sp,#0x108]             @I Loads pu1_avail
    362 
    363     LDRB        r11,[r11,#2]                @I pu1_avail[2]
    364     VEXT.8      Q9,Q9,Q8,#14                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    365     CMP         r11,#0                      @I
    366     BNE         SIGN_UP_CHANGE_DONE         @I
    367 
    368     LDRB        r8,[r0,#14]                 @I pu1_src_cpy[14]
    369     SUB         r5,r0,r1                    @I
    370 
    371     LDRB        r11,[r5,#16]                @I load the value pu1_src_cpy[16 - src_strd]
    372 
    373     LDRB        r9,[r0,#15]                 @I pu1_src_cpy[15]
    374     SUB         r8,r8,r11                   @I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    375 
    376     LDRB        r10,[r5,#17]                @I load the value pu1_src_cpy[17 - src_strd]
    377     CMP         r8,#0                       @I
    378 
    379     MVNLT       r8,#0                       @I
    380     SUB         r9,r9,r10                   @I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    381 
    382     MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    383     CMP         r9,#0                       @I
    384 
    385     MVNLT       r9,#0                       @I
     386     VMOV.8      D15[6],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 0)
    387     MOVGT       r9,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    388 
    389     VMOV.8      D15[7],r9                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    390 
    391 SIGN_UP_CHANGE_DONE:
    392     VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    393     VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    394 
    395     VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    396     VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    397 
    398     VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    399     VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
    400     VTBL.8      D18,{D28},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    401     VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)
    402 
    403     VTBL.8      D19,{D28},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    404     VEXT.8      Q7,Q7,Q7,#2                 @I sign_up = vextq_s8(sign_up, sign_up, 2)
    405 
    406     VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    407     VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
    408 
    409     VUZP.8      D18,D19                     @I
    410     VTBL.8      D22,{D6},D18                @I
    411     VTBL.8      D23,{D7},D19                @I
    412     VZIP.8      D22,D23                     @I
    413 
    414     VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    415     VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    416 
    417     VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    418     VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    419 
    420     VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
    421     VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    422 
    423     SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
    424     VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    425 
    426     VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    427 
    428 
    429 PU1_SRC_LOOP:
    430     ADD         r11,r0,r1,LSL #1            @II *pu1_src + src_strd
    431     VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    432     SUB         r5,r12,r7                   @II ht_tmp - row
    433 
    434     ADD         r4,r0,r1                    @III *pu1_src + src_strd
    435     VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
    436     ADD         r8,r14,r5,LSL #1            @II pu1_src_left_cpy[(ht_tmp - row) * 2]
    437 
    438     LDRH        r9,[r8,#2]
    439     VLD1.8      D16,[r11]!                  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    440     VLD1.8      D17,[r11]                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    441     SUB         r11,#8
    442     LDRB        r10,[r4,#14]                @II pu1_src_cpy[14]
    443 
    444     LDRB        r8,[r4,#15]                 @II pu1_src_cpy[15]
    445     VMOV.16     D29[3],r9                   @II vsetq_lane_u8
    446     ADD         r4,r11,r1                   @III *pu1_src + src_strd
    447 
    448     LDRB        r5,[r0,#17]                 @II load the value pu1_src_cpy[17 - src_strd]
    449     VLD1.8      D30,[r4]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    450     VLD1.8      D31,[r4]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    451     SUB         r4,#8
    452     LDRB        r11,[r0,#16]                @II load the value pu1_src_cpy[16 - src_strd]
    453 
    454     SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    455     VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    456     SUB         r10,r10,r11                 @II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    457 
    458     CMP         r10,#0                      @II
    459     VEXT.8      Q14,Q14,Q8,#14              @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    460     SUB         r8,r8,r5                    @II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    461 
    462     MVNLT       r10,#0                      @II
    463     VLD1.8      D21,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    464     MOVGT       r10,#1                      @II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    465 
    466     CMP         r8,#0                       @II
     467     VMOV.8      D15[6],r10                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 0)
    468     MVNLT       r8,#0                       @II
    469 
    470     MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    471     SUB         r10,r12,r7                  @III ht_tmp - row
    472     VMOV.8      D15[7],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    473     ADD         r11,r14,r10,LSL #1          @III pu1_src_left_cpy[(ht_tmp - row) * 2]
    474 
    475     CMP         r7,#1                       @III
    476     VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    477     BNE         NEXT_ROW_POINTER_ASSIGNED_2 @III
    478 
    479     LDR         r5,[sp,#0x108]              @III Loads pu1_avail
    480     LDRB        r5,[r5,#3]                  @III pu1_avail[3]
    481     CMP         r5,#0                       @III
    482     SUBNE       r11,r4,#4                   @III pu1_src[src_strd - 2]
    483 
    484 NEXT_ROW_POINTER_ASSIGNED_2:
    485     LDRH        r5,[r11,#2]                 @III
    486     VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    487     ADD         r11,r0,r1                   @III
    488 
    489     LDRB        r9,[r11,#14]                @III pu1_src_cpy[14]
    490     VMOV.16     D19[3],r5                   @III vsetq_lane_u8
    491     LDRB        r8,[r11,#15]                @III pu1_src_cpy[15]
    492 
    493     LDRB        r11,[r0,#16]                @III load the value pu1_src_cpy[16 - src_strd]
    494     VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    495     LDRB        r10,[r0,#17]                @III load the value pu1_src_cpy[17 - src_strd]
    496 
    497     SUB         r9,r9,r11                   @III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    498     VEXT.8      Q9,Q9,Q15,#14               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    499     SUB         r10,r8,r10                  @III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    500 
    501     CMP         r9,#0                       @III
    502     VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    503     MVNLT       r9,#0                       @III
    504 
    505     MOVGT       r9,#1                       @III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    506     VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    507     CMP         r10,#0                      @III
    508 
    509     VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    510     VTBL.8      D26,{D21},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    511     MVNLT       r10,#0                      @III
    512     MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    513 
    514     VEXT.8      Q7,Q7,Q7,#2                 @II sign_up = vextq_s8(sign_up, sign_up, 2)
    515     VTBL.8      D27,{D21},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    516     VCGT.U8     Q11,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    517 
     518     VMOV.8      D15[6],r9                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 0)
    519     VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    520 
    521     VMOV.8      D15[7],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    522     VUZP.8      D26,D27                     @II
    523 
    524     VCLT.U8     Q10,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    525     VTBL.8      D24,{D6},D26                @II
    526     VSUB.U8     Q11,Q10,Q11                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    527 
    528     VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    529     VTBL.8      D25,{D7},D27                @II
    530     VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)
    531 
    532     VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    533     VZIP.8      D24,D25                     @II
    534 
    535     VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    536     VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    537     VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)
    538 
    539     VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    540     VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    541     VEXT.8      Q7,Q7,Q7,#2                 @III sign_up = vextq_s8(sign_up, sign_up, 2)
    542 
    543     VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    544     VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    545 
    546     VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    547     VUZP.8      D18,D19                     @III
    548 
    549     VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    550     VTBL.8      D22,{D6},D18                @III
    551     VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    552 
    553     VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    554     VTBL.8      D23,{D7},D19                @III
    555     VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    556 
    557     VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    558     VZIP.8      D22,D23                     @III
    559 
    560     VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    561     VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    562 
    563     VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    564     VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    565 
    566     SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    567     VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    568     CMP         r7,#1                       @III
    569 
    570     VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    571     VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    572 
    573     VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    574 
    575     VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    576 
    577     VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    578     VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    579 
    580     BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
    581     BLT         INNER_LOOP_DONE
    582 
    583 
    584     ADD         r11,r0,r1,LSL #1            @*pu1_src + src_strd
    585     VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    586     SUB         r5,r12,r7                   @ht_tmp - row
    587 
    588     ADD         r8,r14,r5,LSL #1            @pu1_src_left_cpy[(ht_tmp - row) * 2]
    589     VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    590     CMP         r7,#1
    591 
    592     LDRB        r4,[r0,#16]                 @load the value pu1_src_cpy[16 - src_strd]
    593     VLD1.8      D16,[r11]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    594     VLD1.8      D17,[r11]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    595     SUB         r11,#8
    596     LDRB        r9,[r0,#17]                 @load the value pu1_src_cpy[17 - src_strd]
    597 
    598     BNE         NEXT_ROW_POINTER_ASSIGNED_3
    599     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    600     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    601     CMP         r5,#0
    602     SUBNE       r8,r11,#4                   @pu1_src[src_strd - 2]
    603 
    604 NEXT_ROW_POINTER_ASSIGNED_3:
    605     LDRH        r5,[r8,#2]
    606     VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    607     LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    608 
    609     SUB         r8,r8,r4                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    610     VMOV.16     D19[3],r5                   @vsetq_lane_u8
    611     LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    612 
    613     CMP         r8,#0
    614     VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    615     SUB         r10,r10,r9                  @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    616 
    617     MVNLT       r8,#0
    618     VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    619     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    620 
    621     CMP         r10,#0
     622     VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 0)
    623     MVNLT       r10,#0
    624 
    625     MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    626     VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    627     VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    628 
    629     VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    630     VSUB.U8     Q11,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    631 
    632     VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    633     VADD.I8     Q9,Q9,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    634     VTBL.8      D18,{D28},D18               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    635     VTBL.8      D19,{D28},D19               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    636 
    637     VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    638 
    639     VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    640     VUZP.8      D18,D19
    641 
    642     VTBL.8      D22,{D6},D18
    643     VTBL.8      D23,{D7},D19
    644 
    645     VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    646     VZIP.8      D22,D23
    647 
    648     VADDW.S8    Q10,Q10,D22                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    649     VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    650     VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    651 
    652     VADDW.S8    Q9,Q9,D23                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    653     VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    654     VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    655 
    656 
    657 INNER_LOOP_DONE:
    658 
    659     LDR         r8,[sp,#0x118]              @Loads ht
    660     VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    661     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    662 
    663     LSL         r8,r8,#1
    664     VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    665     LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
    666 
    667 SRC_LEFT_LOOP:
    668     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    669     SUBS        r8,r8,#4
    670     STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    671     BNE         SRC_LEFT_LOOP
    672 
    673     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    674     VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    675     CMP         r6,#8                       @Check whether residue remains
    676 
    677     BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    678     LDR         r7,[sp,#0x114]              @Loads wd
    679     LDR         r0,[sp,#0x02]               @Loads *pu1_src
    680     SUB         r7,r7,r6
    681     ADD         r0,r0,r7
    682     BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    683     BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
    684 
    685 WD_16_HT_4_LOOP:
    686     LDR         r7,[sp,#0x114]              @Loads wd
    687 
    688     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    689     CMP         r6,r7                       @col == wd
    690 
    691     LDREQB      r8,[r5]                     @pu1_avail[0]
    692     MOVNE       r8,#-1
    693     VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    694 
    695     CMP         r6,#16                      @if(col == 16)
    696     VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    697 
    698     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    699     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    700     VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    701     VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    702 
    703 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    704     LDRB        r11,[r5,#2]                 @pu1_avail[2]
    705     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    706 
    707     CMP         r11,#0
    708     MOVNE       r8,r3
    709     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    710     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    711     SUB         r0,#8
    712     ADD         r8,r8,#2                    @pu1_src - src_strd + 2
    713 
    714     ADD         r3,r3,#16
    715     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    716     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    717     SUB         r8,#8
    718     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    719 
    720     LDR         r4,[sp,#0x118]              @Loads ht
    721     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    722     LDR         r7,[sp,#0x114]              @Loads wd
    723 
    724     SUB         r7,r7,r6                    @(wd - col)
    725     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    726     ADD         r7,r7,#14                   @15 + (wd - col)
    727 
    728     LDR         r8,[sp,#0x100]              @Loads *pu1_src
    729     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    730     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    731 
    732 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    733     LDRH        r8,[r7]                     @load the value and increment by src_strd
    734     SUBS        r4,r4,#1                    @decrement the loop count
    735 
    736     STRH        r8,[r5],#2                  @store it in the stack pointer
    737     ADD         r7,r7,r1
    738     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    739 
    740     VMOV.I8     Q9,#0
    741     MOV         r7,r12                      @row count, move ht_tmp to r7
    742 
    743 PU1_SRC_LOOP_WD_16_HT_4:
    744     ADD         r9,r0,r1                    @*pu1_src + src_strd
    745 
    746     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    747     VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    748     VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    749     SUB         r9,#8
    750     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    751 
    752     SUB         r11,r12,r7                  @ht_tmp - row
    753     ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]
    754     ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
    755 
    756     CMP         r5,#0
    757     BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    758     CMP         r7,#1
    759     SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]
    760 
    761 NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    762     LDRH        r5,[r8]
    763     VMOV.16     D19[3],r5                   @vsetq_lane_u8
    764     VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    765 
    766     CMP         r7,r12
    767     BLT         SIGN_UP_CHANGE_WD_16_HT_4
    768     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    769     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    770     CMP         r5,#0
    771     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    772 
    773 SIGN_UP_CHANGE_WD_16_HT_4:
    774     LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    775     SUB         r9,r0,r1
    776 
    777     LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]
    778 
    779     LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    780     SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    781 
    782     LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    783     CMP         r8,#0
    784 
    785     MVNLT       r8,#0
    786     SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    787 
    788     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    789 
    790     CMP         r10,#0
     791     VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 0)
    792     MVNLT       r10,#0
    793 
    794     MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    795     VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    796 
    797 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    798     VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    799     VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    800 
    801     VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    802     VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    803 
    804     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    805     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    806 
    807     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    808     VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    809 
    810     VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    811     VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)
    812 
    813     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    814     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    815 
    816 
    817     VUZP.8      D26,D27
    818     VTBL.8      D24,{D6},D26
    819     VTBL.8      D25,{D7},D27
    820     VZIP.8      D24,D25
    821 
    822     VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    823     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    824 
    825     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    826     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    827 
    828     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    829     VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    830 
    831     VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    832     VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    833 
    834     VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    835     VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    836 
    837     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    838     VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    839     BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    840 
    841     LDR         r8,[sp,#0x118]              @Loads ht
    842     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    843     LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
    844 
    845 SRC_LEFT_LOOP_WD_16_HT_4:
    846     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    847     SUBS        r8,r8,#2
    848     STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    849     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    850 
    851     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    852     BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    853     BGT         WD_16_HT_4_LOOP             @If not equal jump to width_loop
    854 
    855 WIDTH_RESIDUE:
    856     LDR         r7,[sp,#0x114]              @Loads wd
    857 
    858     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    859     CMP         r6,r7                       @wd_residue == wd
    860 
    861     LDREQB      r8,[r5]                     @pu1_avail[0]
    862 
    863     MOVNE       r8,#-1
    864     LDRB        r11,[r5,#1]                 @pu1_avail[1]
    865 
    866     LDRB        r9,[r5,#2]                  @pu1_avail[2]
    867     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    868     CMP         r9,#0
    869 
    870     SUBEQ       r10,r0,r1                   @pu1_src - src_strd
    871     VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    872     MOVNE       r10,r3
    873 
    874     ADD         r10,r10,#2                  @pu1_src - src_strd + 2
    875     VMOV.8      d8[6],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    876     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    877 
    878     LDR         r4,[sp,#0x118]              @Loads ht
    879     VMOV.8      d8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    880     LDR         r7,[sp,#0x114]              @Loads wd
    881 
    882     LDR         r8,[sp,#0x100]              @Loads *pu1_src
    883     VLD1.8      D10,[r10]!                  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    884     VLD1.8      D11,[r10]                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    885     SUB         r10,#8
    886     SUB         r7,r7,#2                    @(wd - 2)
    887 
    888     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]
    889 
    890 AU1_SRC_LEFT_LOOP_RESIDUE:
    891     LDRH        r8,[r7]                     @load the value and increment by src_strd
    892     ADD         r7,r7,r1
    893     STRH        r8,[r5],#2                  @store it in the stack pointer
    894     SUBS        r4,r4,#1                    @decrement the loop count
    895     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    896 
    897     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    898     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    899     SUB         r0,#8
    900 
    901     VMOV.I8     Q9,#0
    902     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    903 
    904     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    905     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    906     MOV         r7,r12                      @row count, move ht_tmp to r7
    907 
    908 PU1_SRC_LOOP_RESIDUE:
    909     ADD         r9,r0,r1                    @*pu1_src + src_strd
    910 
    911     SUB         r11,r12,r7                  @ht_tmp - row
    912     VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    913     VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    914     SUB         r9,#8
    915     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    916 
    917     LDRB        r5,[r5,#3]                  @pu1_avail[3]
    918     ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]
    919 
    920     CMP         r5,#0
    921     ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
    922 
    923     BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    924     CMP         r7,#1
    925     SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]
    926 
    927 NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    928     LDRB        r5,[r8]
    929 
    930     LDRB        r8,[r8,#1]
    931     VMOV.8      D19[6],r5                   @vsetq_lane_u8
    932     CMP         r7,r12
    933 
    934     VMOV.8      D19[7],r8                   @vsetq_lane_u8
    935     VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    936 
    937     BLT         SIGN_UP_CHANGE_RESIDUE
    938     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    939     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    940     CMP         r5,#0
    941     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
    942 
    943 SIGN_UP_CHANGE_RESIDUE:
    944     LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    945     SUB         r9,r0,r1
    946 
    947     LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]
    948 
    949     LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    950     SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    951 
    952     LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    953     CMP         r8,#0
    954 
    955     MVNLT       r8,#0
    956     SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    957 
    958     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    959 
    960     CMP         r10,#0
     961     VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 0)
    962     MVNLT       r10,#0
    963 
    964     MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    965     VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    966 
    967 SIGN_UP_CHANGE_DONE_RESIDUE:
    968     VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    969     VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    970 
    971     VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    972     VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    973 
    974     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    975     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    976 
    977     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    978     VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    979 
    980     VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
     981     VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)
    982 
    983     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    984     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    985 
    986 
    987     VUZP.8      D26,D27
    988     VTBL.8      D24,{D6},D26
    989     VTBL.8      D25,{D7},D27
    990     VZIP.8      D24,D25
    991 
    992     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    993     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    994 
    995     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    996     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    997 
    998     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    999     VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
   1000 
   1001     VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
   1002 
   1003     BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP
   1004 
   1005     LDR         r8,[sp,#0x118]              @Loads ht
   1006     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
   1007 
   1008     LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
   1009 
   1010 SRC_LEFT_LOOP_RESIDUE:
   1011     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
   1012     SUBS        r8,r8,#2
   1013     STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
   1014     BNE         SRC_LEFT_LOOP_RESIDUE
   1015 
   1016 
   1017 RE_ASSINING_LOOP:
   1018     LDR         r7,[sp,#0x114]              @Loads wd
   1019     LDR         r8,[sp,#0x118]              @Loads ht
   1020 
   1021     LDR         r0,[sp,#0x100]              @Loads *pu1_src
   1022     SUB         r10,r7,#2                   @wd - 2
   1023 
   1024     LDRH        r9,[sp,#6]
   1025     SUB         r8,r8,#1                    @ht - 1
   1026 
    1027     STRH        r9,[r0,r10]                 @pu1_src_org[wd - 2] = u1_pos_0_0_tmp_u, pu1_src_org[wd - 1] = u1_pos_0_0_tmp_v
   1028     MLA         r6,r8,r1,r0                 @pu1_src[(ht - 1) * src_strd]
   1029 
   1030     LDR         r4,[sp,#0xFC]               @Loads pu1_src_top_left
   1031 
   1032     LDRH        r9,[sp,#8]
   1033     ADD         r12,sp,#10
   1034 
    1035     STRH        r9,[r6]                     @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u, pu1_src_org[(ht - 1) * src_strd + 1] = u1_pos_wd_ht_tmp_v
   1036 
   1037     LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
   1038     STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
   1039     LDR         r3,[sp,#0x10C]              @Loads pu1_src_top
   1040 
   1041 SRC_TOP_LOOP:
   1042     VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
   1043     SUBS        r7,r7,#8                    @Decrement the width
   1044     VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
   1045     BNE         SRC_TOP_LOOP
   1046 
   1047 END_LOOPS:
   1048     ADD         sp,sp,#0xD4
   1049     LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
   1050 
   1051 
   1052 
   1053