      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_sao_edge_offset_class2_chroma.s
     22 @*
     23 @* @brief
     24 @*  Contains function definitions for the SAO (sample adaptive offset)
     25 @* class 2 (135 degree) edge offset filter for interleaved chroma. Functions
     26 @* are coded in ARM NEON assembly and can be assembled with ARM RVCT
     27 @*
     28 @* @author
     29 @*  Parthiban V
     30 @*
     31 @* @par List of Functions:
     32 @*  - ihevc_sao_edge_offset_class2_chroma_a9q()
     33 @*
     34 @* @remarks
     35 @*  None
     36 @*
     37 @*******************************************************************************
     38 @*/
     39 @void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
     40 @                              WORD32 src_strd,
     41 @                              UWORD8 *pu1_src_left,
     42 @                              UWORD8 *pu1_src_top,
     43 @                              UWORD8 *pu1_src_top_left,
     44 @                              UWORD8 *pu1_src_top_right,
     45 @                              UWORD8 *pu1_src_bot_left,
     46 @                              UWORD8 *pu1_avail,
     47 @                              WORD8 *pi1_sao_offset_u,
     48 @                              WORD8 *pi1_sao_offset_v,
     49 @                              WORD32 wd,
     50 @                              WORD32 ht)
     51 @**************Variables Vs Registers*****************************************
     52 @r0 =>  *pu1_src
     53 @r1 =>  src_strd
     54 @r2 =>  *pu1_src_left
     55 @r3 =>  *pu1_src_top
     56 @r4 =>  *pu1_src_top_left
     57 @r5 =>  *pu1_avail
     58 @r6 =>  *pi1_sao_offset_u
     59 @r9 =>  *pi1_sao_offset_v
     60 @r7 =>  wd
     61 @r8=>   ht
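        @*****************************************************************************
        @Reference behaviour (scalar sketch, added for readability):
        @The C fragment below restates what this routine computes for each interior
        @sample. It is a simplified sketch derived from the inline comments, not the
        @code this assembly was generated from; it ignores the pu1_avail border
        @handling, the corner samples and the top/left state updates that the
        @assembly performs separately.
        @
        @    for(row = 0; row < ht; row++)
        @        for(col = 0; col < wd; col++)
        @        {
        @            /* class 2: neighbours lie on the 135 degree diagonal.     */
        @            /* Chroma is interleaved, so the same-plane neighbours sit */
        @            /* 2 bytes left/right in the rows above/below.             */
        @            WORD32 edge_idx = 2
        @                + SIGN(pu1_src[row * src_strd + col]
        @                       - pu1_src[(row - 1) * src_strd + col - 2])
        @                + SIGN(pu1_src[row * src_strd + col]
        @                       - pu1_src[(row + 1) * src_strd + col + 2]);
        @            edge_idx = gi1_table_edge_idx[edge_idx];
        @
        @            WORD8 *pi1_sao_offset = (col & 1) ? pi1_sao_offset_v
        @                                              : pi1_sao_offset_u;
        @            if(0 != edge_idx)
        @                pu1_src[row * src_strd + col] =
        @                    CLIP3(pu1_src[row * src_strd + col]
        @                          + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        @        }
        @*****************************************************************************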
     62 
     63 .text
     64 .p2align 2
     65 
     66 .extern gi1_table_edge_idx
     67 .globl ihevc_sao_edge_offset_class2_chroma_a9q
     68 
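        @PC-relative offsets used to form the address of gi1_table_edge_idx at run
        @time: each constant is added to the PC at the matching ulblN label below,
        @and the -8 compensates for the ARM-state PC reading as "current instruction + 8".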
     69 gi1_table_edge_idx_addr_1:
     70 .long gi1_table_edge_idx - ulbl1 - 8
     71 
     72 gi1_table_edge_idx_addr_2:
     73 .long gi1_table_edge_idx - ulbl2 - 8
     74 
     75 gi1_table_edge_idx_addr_3:
     76 .long gi1_table_edge_idx - ulbl3 - 8
     77 
     78 gi1_table_edge_idx_addr_4:
     79 .long gi1_table_edge_idx - ulbl4 - 8
     80 
     81 gi1_table_edge_idx_addr_5:
     82 .long gi1_table_edge_idx - ulbl5 - 8
     83 
     84 ihevc_sao_edge_offset_class2_chroma_a9q:
     85 
     86 
     87     STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
     88 
     89     LDR         r7,[sp,#0x40]               @Loads wd
     90     LDR         r8,[sp,#0x44]               @Loads ht
     91     SUB         r9,r7,#2                    @wd - 2
     92 
     93     LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
     94     LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]
     95 
     96     STR         r0,[sp,#0x2C]               @Store pu1_src in sp
     97     MOV         r9,r7                       @Move width to r9 for loop count
     98 
     99     STR         r2,[sp,#0x30]               @Store pu1_src_left in sp
    100     LDR         r5,[sp,#0x34]               @Loads pu1_avail
    101     LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset_u
    102 
    103     STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
    104     SUB         sp,sp,#0xD4                 @Decrement the stack pointer to store some temp arr values
    105 
    106     STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
    107     SUB         r10,r8,#1                   @ht-1
    108     MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    109     ADD         r12,sp,#10                  @temp array
    110 
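        @Copy the bottom row of the block into au1_src_top_tmp on the stack; it is
        @written back to pu1_src_top at SRC_TOP_LOOP after the in-place filtering.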
    111 AU1_SRC_TOP_LOOP:
    112     VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    113     SUBS        r9,r9,#8                    @Decrement the loop count by 8
    114     VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    115     BNE         AU1_SRC_TOP_LOOP
    116 
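        @If the top-left neighbour is available (pu1_avail[4]), filter the top-left
        @corner U and V samples with scalar code; the results are parked on the
        @stack and written back only in the re-assigning pass at the end.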
    117 PU1_AVAIL_4_LOOP_U:
    118     LDRB        r9,[r5,#4]                  @pu1_avail[4]
    119     CMP         r9,#0
    120     LDRB        r9,[r0]                     @u1_pos_0_0_tmp_u = pu1_src[0]
    121     LDRB        r10,[r0,#1]                 @u1_pos_0_0_tmp_v = pu1_src[1]
    122     BEQ         PU1_AVAIL_7_LOOP_U
    123 
    124     LDRB        r11,[r4]                    @pu1_src_top_left[0]
    125     ADD         r14,r0,r1                   @pu1_src + src_strd
    126 
    127     SUB         r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
    128 
    129     LDRB        r14,[r14,#2]                @pu1_src[2 + src_strd]
    130     CMP         r12,#0
    131 
    132     MVNLT       r12,#0
    133     SUB         r11,r9,r14                  @pu1_src[0] - pu1_src[2 + src_strd]
    134 
    135     MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])
    136 
    137     CMP         r11,#0
    138     MVNLT       r11,#0
    139     LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
    140 ulbl1:
    141     add         r14,r14,pc
    142     MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[2 + src_strd])
    143 
    144     ADD         r11,r12,r11                 @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[2 + src_strd])
    145     ADD         r11,r11,#2                  @edge_idx
    146 
    147     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    148     CMP         r12,#0                      @0 != edge_idx
    149     BEQ         PU1_AVAIL_4_LOOP_V
    150     LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
    151     ADD         r9,r9,r11                   @pu1_src[0] + pi1_sao_offset_u[edge_idx]
    152     USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    153 
    154 PU1_AVAIL_4_LOOP_V:
    155 
    156     LDRB        r11,[r4,#1]                 @pu1_src_top_left[1]
    157     ADD         r14,r0,r1                   @pu1_src + src_strd
    158 
    159     SUB         r12,r10,r11                 @pu1_src[1] - pu1_src_top_left[1]
    160     LDRB        r14,[r14,#3]                @pu1_src[3 + src_strd]
    161 
    162     CMP         r12,#0
    163     MVNLT       r12,#0
    164     SUB         r11,r10,r14                 @pu1_src[1] - pu1_src[3 + src_strd]
    165     MOVGT       r12,#1                      @SIGN(pu1_src[1] - pu1_src_top_left[1])
    166 
    167     CMP         r11,#0
    168     MVNLT       r11,#0
    169     LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
    170 ulbl2:
    171     add         r14,r14,pc
    172     MOVGT       r11,#1                      @SIGN(pu1_src[1] - pu1_src[3 + src_strd])
    173 
    174     ADD         r11,r12,r11                 @SIGN(pu1_src[1] - pu1_src_top_left[1]) +  SIGN(pu1_src[1] - pu1_src[3 + src_strd])
    175     ADD         r11,r11,#2                  @edge_idx
    176 
    177     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    178     CMP         r12,#0                      @0 != edge_idx
    179     BEQ         PU1_AVAIL_7_LOOP_U
    180     LDR         r11,[sp,#0x110]             @Loads pi1_sao_offset_v
    181     LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
    182     ADD         r10,r10,r11                 @pu1_src[1] + pi1_sao_offset_v[edge_idx]
    183     USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    184 
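        @If the bottom-right neighbour is available (pu1_avail[7]), filter the
        @bottom-right corner U and V samples in the same scalar fashion.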
    185 PU1_AVAIL_7_LOOP_U:
    186     STRB        r10,[sp,#7]
    187     STRB        r9,[sp,#6]
    188 
    189     LDRB        r10,[r5,#7]                 @pu1_avail[7]
    190     CMP         r10,#0
    191     SUB         r10,r7,#2                   @wd - 2
    192     SUB         r11,r8,#1                   @ht - 1
    193     MLA         r12,r11,r1,r10              @wd - 2 + (ht - 1) * src_strd
    194     ADD         r12,r12,r0                  @pu1_src[wd - 2 + (ht - 1) * src_strd]
    195     LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
    196     LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd]
    197     BEQ         PU1_AVAIL_3_LOOP
    198 
    199     SUB         r11,r12,r1                  @pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
    200     SUB         r11,r11,#2                  @pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    201     LDRB        r11,[r11]                   @Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    202     SUB         r11,r10,r11                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
    203     CMP         r11,#0
    204     MVNLT       r11,#0
    205     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
    206 
    207     ADD         r14,r12,r1                  @pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
    208     ADD         r14,r14,#2                  @pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    209     LDRB        r14,[r14]                   @Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    210     SUB         r14,r10,r14                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    211     CMP         r14,#0
    212     MVNLT       r14,#0
    213     MOVGT       r14,#1                      @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
    214 
    215     ADD         r11,r11,r14                 @Add 2 sign value
    216     ADD         r11,r11,#2                  @edge_idx
    217     LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
    218 ulbl3:
    219     add         r14,r14,pc
    220 
    221     LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    222     CMP         r14,#0
    223     BEQ         PU1_AVAIL_7_LOOP_V
    224     LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
    225     ADD         r10,r10,r11                 @pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    226     USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
    227 
    228 PU1_AVAIL_7_LOOP_V:
    229     ADD         r12,r12,#1
    230     SUB         r11,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    231     SUB         r11,r11,#2                  @pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    232     LDRB        r11,[r11]                   @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    233     SUB         r11,r9,r11                  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
    234     CMP         r11,#0
    235     MVNLT       r11,#0
    236     MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
    237 
    238     ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
    239     ADD         r14,r14,#2                  @pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    240     LDRB        r14,[r14]                   @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    241     SUB         r14,r9,r14                  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    242     CMP         r14,#0
    243     MVNLT       r14,#0
    244     MOVGT       r14,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
    245 
    246     ADD         r11,r11,r14                 @Add 2 sign value
    247     ADD         r11,r11,#2                  @edge_idx
    248     LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
    249 ulbl4:
    250     add         r14,r14,pc
    251 
    252     LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    253     CMP         r12,#0
    254     BEQ         PU1_AVAIL_3_LOOP
    255     LDR         r14,[sp,#0x110]             @Loads pi1_sao_offset_v
    256     LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
    257     ADD         r9,r9,r11                   @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    258     USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    259 
    260 PU1_AVAIL_3_LOOP:
    261     STRB        r10,[sp,#8]
    262     VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    263     STRB        r9,[sp,#9]
    264 
    265     MOV         r12,r8                      @Move ht
    266     VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    267     MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    268 
    269     LDRB        r11,[r5,#3]                 @pu1_avail[3]
    270     VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    271     CMP         r11,#0
    272 
    273     SUBEQ       r12,r12,#1                  @ht_tmp--
    274     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    275 
    276     CMP         r5,#0
    277 
    278     ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    279     VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    280     SUBEQ       r12,r12,#1                  @ht_tmp--
    281 
    282     LDR         r6,[sp,#0x110]              @Loads pi1_sao_offset_v
    283     ADDEQ       r14,r14,#2                  @pu1_src_left_cpy += 2
    284 
    285     STR         r0,[sp,#2]                  @Store pu1_src in sp
    286     VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    287     LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
    288 ulbl5:
    289     add         r2,r2,pc
    290 
    291     MOV         r6,r7                       @move wd to r6 loop_count
    292     VMOV.S8     Q4,#0XFF                    @au1_mask = vdupq_n_s8(-1)
    293     CMP         r7,#16                      @Compare wd with 16
    294 
    295     BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    296     CMP         r8,#4                       @Compare ht with 4
    297     BLE         WD_16_HT_4_LOOP             @If jump to WD_16_HT_4_LOOP
    298 
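        @Main path: process the block in vertical strips 16 bytes (8 U/V pairs) wide
        @when wd >= 16 and ht > 4; rows inside a strip are handled by PU1_SRC_LOOP.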
    299 WIDTH_LOOP_16:
    300     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    301     LDR         r7,[sp,#0x114]              @Loads wd
    302     CMP         r6,r7                       @col == wd
    303     LDREQB      r8,[r5]                     @pu1_avail[0]
    304 
    305     MOVNE       r8,#-1
    306     VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    307 
    308     CMP         r6,#16                      @if(col == 16)
    309     VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    310 
    311     BNE         SKIP_AU1_MASK_VAL
    312     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    313     VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    314     VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    315 
    316 SKIP_AU1_MASK_VAL:
    317     LDRB        r9,[r5,#2]                  @pu1_avail[2]
    318     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    319     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    320     SUB         r0,#8
    321     CMP         r9,#0
    322 
    323     LDR         r4,[sp,#0x118]              @Loads ht
    324     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    325 
    326     LDR         r7,[sp,#0x114]              @Loads wd
    327     MOVNE       r8,r3                       @pu1_src_top_cpy
    328 
    329     SUB         r8,r8,#2                    @pu1_src - src_strd - 2
    330     ADD         r3,r3,#16
    331 
    332     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    333     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    334     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    335     SUB         r8,#8
    336     SUB         r7,r7,r6                    @(wd - col)
    337 
    338     ADD         r7,r7,#14                   @15 + (wd - col)
    339     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    340     LDR         r8,[sp,#0x100]              @Loads *pu1_src
    341 
    342     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    343     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    344 
    345 AU1_SRC_LEFT_LOOP:
    346     LDRH        r8,[r7]                     @load the value and increment by src_strd
    347     SUBS        r4,r4,#1                    @decrement the loop count
    348 
    349     STRH        r8,[r5],#2                  @store it in the stack pointer
    350     ADD         r7,r7,r1
    351 
    352     BNE         AU1_SRC_LEFT_LOOP
    353 
    354     ADD         r8,r0,r1                    @I *pu1_src + src_strd
    355     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    356     MOV         r7,r12                      @row count, move ht_tmp to r7
    357 
    358     VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    359     VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    360     SUB         r8,#8
    361 
    362     ADD         r8,r8,#16                   @I
    363     VMOV.I8     Q9,#0
    364     LDRH        r5,[r8]                     @I pu1_src_cpy[src_strd + 16]
    365 
    366     LDR         r10,[sp,#0x108]             @I Loads pu1_avail
    367     VMOV.16     D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    368     LDRB        r10,[r10,#2]                @I pu1_avail[2]
    369 
    370     CMP         r10,#0                      @I
    371     VEXT.8      Q9,Q8,Q9,#2                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    372     BNE         SIGN_UP_CHANGE_DONE         @I
    373 
    374     LDRB        r11,[r0]                    @I pu1_src_cpy[0]
    375     SUB         r4,r12,r7                   @I ht_tmp - row
    376 
    377     LDRB        r10,[r0,#1]                 @I pu1_src_cpy[1]
    378     LSL         r4,r4,#1                    @I (ht_tmp - row) * 2
    379 
    380     ADD         r9,r14,r4                   @I pu1_src_left_cpy[(ht_tmp - row) * 2]
    381     LDRB        r5,[r9,#-2]                 @I load the value
    382 
    383     SUB         r8,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    384     LDRB        r5,[r9,#-1]                 @I load the value
    385 
    386     CMP         r8,#0                       @I
    387     SUB         r4,r10,r5                   @I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    388 
    389     MVNLT       r8,#0                       @I
    390     MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    391 
    392     CMP         r4,#0                       @I
    393     VMOV.8      D14[0],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    394     MVNLT       r4,#0                       @I
    395 
    396     MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    397     VMOV.8      D14[1],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    398 
    399 SIGN_UP_CHANGE_DONE:
    400     VLD1.8      D30,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    401     VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    402 
    403     VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    404     VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    405 
    406     VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    407     VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
    408 
    409     VTBL.8      D18,{D30},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    410     VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)
    411 
    412     VTBL.8      D19,{D30},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    413     VEXT.8      Q7,Q7,Q7,#14                @I sign_up = vextq_s8(sign_up, sign_up, 14)
    414 
    415     VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    416     VAND        Q11,Q9,Q4                   @I edge_idx = vandq_s8(edge_idx, au1_mask)
    417 
    418     VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    419     VUZP.8      D22,D23                     @I
    420 
    421     VTBL.8      D22,{D6},D22                @I
    422     VTBL.8      D23,{D7},D23                @I
    423     VZIP.8      D22,D23                     @I
    424 
    425     VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
    426     VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    427 
    428     VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    429     VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    430 
    431     VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    432     VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    433 
    434     VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    435     SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
    436 
    437 
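        @Row loop for the 16-wide strip, software pipelined two rows per iteration
        @(the II and III tags in the comments); row I was primed above.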
    438 PU1_SRC_LOOP:
    439     ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
    440     VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    441     ADD         r11,r8,r1                   @III *pu1_src + src_strd
    442 
    443     VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    444     VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    445     SUB         r8,#8
    446     VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    447     VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    448     SUB         r11,#8
    449 
    450     ADD         r8,r8,#16                   @II
    451     VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
    452     LDRH        r5,[r8]                     @II pu1_src_cpy[src_strd + 16]
    453 
    454     ADD         r11,r11,#16                 @III
    455     VMOV.16     D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    456     LDRH        r4,[r11]                    @III pu1_src_cpy[src_strd + 16]
    457 
    458     LDRB        r8,[r0,r1]                  @II pu1_src_cpy[0]
    459     VEXT.8      Q14,Q8,Q14,#2               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    460     SUB         r5,r12,r7                   @II ht_tmp - row
    461 
    462     LSL         r5,r5,#1                    @II (ht_tmp - row) * 2
    463     VMOV.16     D18[0],r4                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    464     ADD         r9,r14,r5                   @II pu1_src_left_cpy[(ht_tmp - row) * 2]
    465 
    466     LDRB        r11,[r9,#-2]                @II load the value
    467     VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    468     SUB         r8,r8,r11                   @II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    469 
    470     CMP         r8,#0                       @II
    471     VEXT.8      Q9,Q15,Q9,#2                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    472     LDRB        r11,[r0,#1]                 @II pu1_src_cpy[1]
    473 
    474     MVNLT       r8,#0                       @II
    475     VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    476     MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    477 
    478     LDRB        r5,[r9,#-1]                 @II load the value
    479     VMOV.8      D14[0],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    480     SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    481 
    482     SUB         r11,r11,r5                  @II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    483     VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    484     CMP         r11,#0                      @II
    485 
    486     MVNLT       r11,#0                      @II
    487     VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    488     MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    489 
    490     LDRB        r4,[r0,r1]                  @III pu1_src_cpy[0]
    491     VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    492     SUB         r5,r12,r7                   @III ht_tmp - row
    493 
    494     ADD         r10,r0,r1
    495     VMOV.8      D14[1],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    496     LSL         r5,r5,#1                    @III (ht_tmp - row) * 2
    497 
    498     ADD         r9,r14,r5                   @III pu1_src_left_cpy[(ht_tmp - row) * 2]
    499     VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    500     LDRB        r10,[r10,#1]                @III pu1_src_cpy[1]
    501 
    502     LDRB        r5,[r9,#-2]                 @III load the value
    503     VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    504     SUB         r4,r4,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    505 
    506     CMP         r4,#0                       @III
    507     LDRB        r9,[r9,#-1]                 @III load the value
    508     VTBL.8      D26,{D22},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    509     VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    510 
    511     MVNLT       r4,#0                       @III
    512     SUB         r10,r10,r9                  @III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    513     VTBL.8      D27,{D22},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    514     VEXT.8      Q7,Q7,Q7,#14                @II sign_up = vextq_s8(sign_up, sign_up, 14)
    515 
    516     MOVGT       r4,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    517     VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    518     CMP         r10,#0                      @III
    519 
    520     VUZP.8      D26,D27                     @II
    521     VMOV.8      d14[0],r4                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    522 
    523     MVNLT       r10,#0                      @III
    524     MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    525     VTBL.8      D24,{D6},D26                @II
    526     VCGT.U8     Q10,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    527 
    528     VCLT.U8     Q11,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    529     VTBL.8      D25,{D7},D27                @II
    530     VSUB.U8     Q11,Q11,Q10                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    531 
    532     VMOV.8      D14[1],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    533     VZIP.8      D24,D25                     @II
    534 
    535     VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    536     VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    537 
    538     VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    539     VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    540 
    541     VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)
    542     VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    543 
    544     VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    545     VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    546     VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)
    547 
    548     VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    549     VEXT.8      Q7,Q7,Q7,#14                @III sign_up = vextq_s8(sign_up, sign_up, 14)
    550 
    551     VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    552     VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    553 
    554     VUZP.8      D18,D19                     @III
    555     VTBL.8      D22,{D6},D18                @III
    556     VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    557 
    558     VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    559     VTBL.8      D23,{D7},D19                @III
    560     VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    561 
    562     VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    563     VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    564 
    565     VZIP.8      D22,D23                     @III
    566     VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    567 
    568     VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    569     VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    570 
    571     VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    572     VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    573 
    574     VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    575     VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    576 
    577     SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    578     VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    579     CMP         r7,#1
    580 
    581     VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    582     VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    583 
    584     BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
    585     BLT         INNER_LOOP_DONE
    586 
    587     ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
    588     VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    589 
    590     LDRB        r11,[r0,r1]                 @pu1_src_cpy[0]
    591     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    592     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    593     SUB         r8,#8
    594     SUB         r4,r12,r7                   @ht_tmp - row
    595 
    596     ADD         r8,r8,#16
    597     VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    598     LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
    599 
    600     LSL         r4,r4,#1                    @(ht_tmp - row) * 2
    601     VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    602     ADD         r9,r14,r4                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
    603 
    604     LDRB        r5,[r9,#-2]                 @load the value
    605     VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    606     SUB         r8,r11,r5                   @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    607 
    608     CMP         r8,#0
    609     VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    610     MVNLT       r8,#0
    611 
    612     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    613     VLD1.8      D30,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    614 
    615     LDRB        r11,[r0,#1]                 @pu1_src_cpy[1]
    616     VMOV.8      D14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    617     LDRB        r5,[r9,#-1]                 @load the value
    618 
    619     SUB         r4,r11,r5                   @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    620     VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    621     CMP         r4,#0
    622 
    623     MVNLT       r4,#0
    624     VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    625     MOVGT       r4,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    626 
    627     VMOV.8      D14[1],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    628     VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    629 
    630     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    631     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    632 
    633     VTBL.8      D26,{D30},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    634     VTBL.8      D27,{D30},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    635 
    636     VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    637     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    638 
    639     VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    640     VUZP.8      D26,D27
    641 
    642     VTBL.8      D24,{D6},D26
    643     VTBL.8      D25,{D7},D27
    644     VZIP.8      D24,D25
    645 
    646     VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    647     VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    648     VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    649 
    650     VADDW.S8    Q9,Q9,D25                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    651     VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    652     VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    653 
    654 
    655 INNER_LOOP_DONE:
    656     LDR         r8,[sp,#0x118]              @Loads ht
    657     VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    658     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    659 
    660     LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
    661     VMOVN.I16   D21,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    662 
    663 
    664 SRC_LEFT_LOOP:
    665     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    666     SUBS        r8,r8,#2
    667     STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    668     BNE         SRC_LEFT_LOOP
    669 
    670     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    671     VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    672     CMP         r6,#8                       @Check whether residue remains
    673 
    674     BLT         RE_ASSIGNING_LOOP           @Jump to re-assigning loop
    675     LDR         r7,[sp,#0x114]              @Loads wd
    676     LDR         r0,[sp,#0x02]               @Loads *pu1_src
    677     SUB         r7,r7,r6
    678     ADD         r0,r0,r7
    679     BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    680     BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
    681 
    682 
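        @Variant of the 16-wide strip loop for short blocks (ht <= 4); rows are
        @processed one at a time without the two-row software pipelining.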
    683 WD_16_HT_4_LOOP:
    684     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    685     LDR         r7,[sp,#0x114]              @Loads wd
    686     CMP         r6,r7                       @col == wd
    687     LDREQB      r8,[r5]                     @pu1_avail[0]
    688 
    689     MOVNE       r8,#-1
    690     VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    691     VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    692 
    693     CMP         r6,#16                      @if(col == 16)
    694     BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    695     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    696     VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    697     VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    698 
    699 SKIP_AU1_MASK_VAL_WD_16_HT_4:
    700     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    701     CMP         r8,#0
    702 
    703     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    704     MOVNE       r8,r3                       @pu1_src_top_cpy
    705     SUB         r8,r8,#2                    @pu1_src - src_strd - 2
    706     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    707     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    708     SUB         r8,#8
    709 
    710     ADD         r3,r3,#16
    711     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    712     LDR         r4,[sp,#0x118]              @Loads ht
    713     LDR         r7,[sp,#0x114]              @Loads wd
    714     SUB         r7,r7,r6                    @(wd - col)
    715     ADD         r7,r7,#14                   @15 + (wd - col)
    716     LDR         r8,[sp,#0x100]              @Loads *pu1_src
    717     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    718 
    719 AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    720     LDRH        r8,[r7]                     @load the value and increment by src_strd
    721     STRH        r8,[r5],#2                  @store it in the stack pointer
    722     ADD         r7,r7,r1
    723 
    724     SUBS        r4,r4,#1                    @decrement the loop count
    725     BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
    726 
    727     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    728     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    729     SUB         r0,#8
    730 
    731     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    732     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    733     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    734     VMOV.I8     Q9,#0
    735     MOV         r7,r12                      @row count, move ht_tmp to r7
    736 
    737 PU1_SRC_LOOP_WD_16_HT_4:
    738     VMOV.I8     Q9,#0
    739     ADD         r8,r0,r1                    @*pu1_src + src_strd
    740     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    741     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    742     SUB         r8,#8
    743 
    744     ADD         r8,r8,#16
    745     LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
    746     VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    747     VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    748 
    749     CMP         r7,r12
    750     BLT         SIGN_UP_CHANGE_WD_16_HT_4
    751     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    752     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    753     CMP         r5,#0
    754     BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
    755 
    756 SIGN_UP_CHANGE_WD_16_HT_4:
    757     LDRB        r8,[r0]                     @pu1_src_cpy[0]
    758     SUB         r5,r12,r7                   @ht_tmp - row
    759     LSL         r5,r5,#1                    @(ht_tmp - row) * 2
    760     ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
    761     LDRB        r5,[r9,#-2]                 @load the value
    762     SUB         r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    763     CMP         r8,#0
    764     MVNLT       r8,#0
    765     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    766     VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    767 
    768     LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
    769     LDRB        r5,[r9,#-1]                 @load the value
    770     SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    771     CMP         r8,#0
    772     MVNLT       r8,#0
    773     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    774     VMOV.8      d14[1],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    775 
    776 SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    777     VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    778     VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    779     VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    780 
    781     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    782     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    783 
    784     VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    785     VTBL.8      D26,{D22},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    786     VTBL.8      D27,{D22},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    787 
    788     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    789 
    790     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    791     VEXT.8      Q7,Q7,Q7,#14                @sign_up = vextq_s8(sign_up, sign_up, 14)
    792 
    793     VUZP.8      D26,D27
    794     VTBL.8      D24,{D6},D26
    795     VTBL.8      D25,{D7},D27
    796     VZIP.8      D24,D25
    797 
    798     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    799     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    800     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    801     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    802 
    803     VMOVL.U8    Q13,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    804     VADDW.S8    Q13,Q13,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    805     VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    806     VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    807 
    808     VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    809     VMOVN.I16   D29,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    810 
    811     VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    812 
    813     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    814     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    815     BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
    816 
    817     LDR         r8,[sp,#0x118]              @Loads ht
    818     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    819     LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
    820 
    821 SRC_LEFT_LOOP_WD_16_HT_4:
    822     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    823     STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    824 
    825     SUBS        r8,r8,#2
    826     BNE         SRC_LEFT_LOOP_WD_16_HT_4
    827 
    828 
    829     SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    830     BLE         RE_ASSIGNING_LOOP           @Jump to re-assigning loop
    831     BGT         WD_16_HT_4_LOOP
    832 
    833 
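        @Residue path: handles the final 8-byte (4 U/V pairs) column when wd is not
        @a multiple of 16, or the whole block when wd < 16.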
    834 WIDTH_RESIDUE:
    835     LDR         r7,[sp,#0x114]              @Loads wd
    836     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    837     CMP         r6,r7                       @wd_residue == wd
    838     LDREQB      r8,[r5]                     @pu1_avail[0]
    839 
    840     MOVNE       r8,#-1
    841     VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    842     VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    843 
    844     LDRB        r8,[r5,#1]                  @pu1_avail[1]
    845     VMOV.8      d8[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    846     VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    847 
    848     LDRB        r8,[r5,#2]                  @pu1_avail[2]
    849     CMP         r8,#0
    850 
    851     SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    852     MOVNE       r8,r3
    853     SUB         r8,r8,#2                    @pu1_src - src_strd - 2
    854     VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    855     VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    856     SUB         r8,#8
    857 
    858     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    859     LDR         r4,[sp,#0x118]              @Loads ht
    860     LDR         r7,[sp,#0x114]              @Loads wd
    861     LDR         r8,[sp,#0x100]              @Loads *pu1_src
    862     SUB         r7,r7,#2                    @(wd - 2)
    863     ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]
    864 
    865 AU1_SRC_LEFT_LOOP_RESIDUE:
    866     LDRH        r8,[r7]                     @load the value and increment by src_strd
    867     STRH        r8,[r5],#2                  @store it in the stack pointer
    868     ADD         r7,r7,r1
    869     SUBS        r4,r4,#1                    @decrement the loop count
    870     BNE         AU1_SRC_LEFT_LOOP_RESIDUE
    871 
    872     VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    873     VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    874     SUB         r0,#8
    875 
    876     VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    877     VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    878     VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    879     MOV         r7,r12                      @row count, move ht_tmp to r7
    880 
    881 PU1_SRC_LOOP_RESIDUE:
    882     VMOV.I8     Q9,#0
    883     ADD         r8,r0,r1                    @*pu1_src + src_strd
    884     VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    885     VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    886     SUB         r8,#8
    887 
    888     ADD         r8,r8,#16
    889     LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
    890     VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    891     VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    892 
    893     CMP         r7,r12
    894     BLT         SIGN_UP_CHANGE_RESIDUE
    895     LDR         r5,[sp,#0x108]              @Loads pu1_avail
    896     LDRB        r5,[r5,#2]                  @pu1_avail[2]
    897     CMP         r5,#0
    898     BNE         SIGN_UP_CHANGE_DONE_RESIDUE
    899 
    900 SIGN_UP_CHANGE_RESIDUE:
    901     LDRB        r8,[r0]                     @pu1_src_cpy[0]
    902     SUB         r5,r12,r7                   @ht_tmp - row
    903     LSL         r5,r5,#1                    @(ht_tmp - row) * 2
    904     ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    905     LDRB        r5,[r9,#-2]                 @load the value
    906     SUB         r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    907     CMP         r8,#0
    908     MVNLT       r8,#0
    909     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    910     VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    911 
    912     LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
    913     LDRB        r5,[r9,#-1]                 @load the value
    914     SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    915     CMP         r8,#0
    916     MVNLT       r8,#0
    917     MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    918     VMOV.8      d14[1],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    919 
    920 SIGN_UP_CHANGE_DONE_RESIDUE:
    921     VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    922     VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    923     VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    924 
    925     VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    926     VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    927 
    928     VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    929     VTBL.8      D26,{D22},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    930     VTBL.8      D27,{D22},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    931 
    932     VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
    933 
    934     VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    935     VEXT.8      Q7,Q7,Q7,#14                @sign_up = vextq_s8(sign_up, sign_up, 14)
    936 
    937     VUZP.8      D26,D27
    938     VTBL.8      D24,{D6},D26
    939     VTBL.8      D25,{D7},D27
    940     VZIP.8      D24,D25
    941 
    942     VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    943     VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    944     VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    945     VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    946 
    947     VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    948 
    949     VST1.8      {D28},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    950 
    951     VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    952     SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    953     BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP
    954 
    955     LDR         r8,[sp,#0x118]              @Loads ht
    956     LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
    957     ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    958 
    959 SRC_LEFT_LOOP_RESIDUE:
    960     LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    961     SUBS        r8,r8,#2
    962     STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    963 
    964     BNE         SRC_LEFT_LOOP_RESIDUE
    965 
    966 
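        @Write back the scalar-filtered corner samples and restore the saved
        @pu1_src_top_left / pu1_src_top state before returning.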
    967 RE_ASSIGNING_LOOP:
    968     LDR         r8,[sp,#0x118]              @Loads ht
    969 
    970     LDR         r0,[sp,#0x100]              @Loads *pu1_src
    971     SUB         r8,r8,#1                    @ht - 1
    972 
    973     LDR         r7,[sp,#0x114]              @Loads wd
    974 
    975     LDRH        r9,[sp,#6]
    976     MLA         r6,r8,r1,r7                 @wd + (ht - 1) * src_strd
    977 
    978     STRH        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp_u, pu1_src_org[1] = u1_pos_0_0_tmp_v
    979     ADD         r6,r0,r6                    @pu1_src[wd + (ht - 1) * src_strd]
    980 
    981     LDRH        r9,[sp,#8]
    982     ADD         r12,sp,#10
    983     STRH        r9,[r6,#-2]                 @pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u, pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_v
    984 
    985     LDR         r4,[sp,#0xFC]               @Loads pu1_src_top_left
    986     LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
    987     STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
    988     LDR         r3,[sp,#0x10C]              @Loads pu1_src_top
    989 
    990 SRC_TOP_LOOP:
    991     VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
    992     SUBS        r7,r7,#8                    @Decrement the width
    993     VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    994     BNE         SRC_TOP_LOOP
    995 
    996 END_LOOPS:
    997     ADD         sp,sp,#0xD4
    998     LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
    999 
   1000 
   1001 
   1002