@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class0_chroma.s
@*
@* @brief
@*  Contains function definitions for SAO edge offset class 0 (horizontal)
@* filtering of chroma. Functions are coded in NEON assembly and can be
@* compiled using ARM RVCT
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class0_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset_u,
@                              WORD8 *pi1_sao_offset_v,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r7 =>  *pu1_avail
@r8 =>  *pi1_sao_offset_u
@r5 =>  *pi1_sao_offset_v
@r9 =>  wd
@r10=>  ht

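@/**
@* Reference sketch (informal pseudo-C, assuming 8-bit samples): the per-pixel
@* operation implemented below for SAO class 0 (horizontal) on interleaved
@* chroma. U and V samples alternate, so the left/right neighbours of a sample
@* sit 2 bytes away and separate offset tables feed the U and V lanes.
@*
@*   sign_left  = SIGN(cur - left_neighbour)
@*   sign_right = SIGN(cur - right_neighbour)
@*   edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right]
@*   edge_idx  &= au1_mask[col]        // unavailable border columns -> 0
@*   offset     = (U lane) ? pi1_sao_offset_u[edge_idx] : pi1_sao_offset_v[edge_idx]
@*   out        = CLIP3(cur + offset, 0, 255)
@*/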
.equ    pu1_src_top_left_offset,    104
.equ    pu1_src_top_right_offset,   108
.equ    pu1_src_bot_left_offset,    112
.equ    pu1_avail_offset,           116
.equ    pi1_sao_u_offset,           120
.equ    pi1_sao_v_offset,           124
.equ    wd_offset,                  128
.equ    ht_offset,                  132
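@ The .equ values above locate the stacked arguments: STMFD below saves ten
@ core registers (40 bytes) and vpush saves d8-d15 (64 bytes), so the first
@ stacked argument (pu1_src_top_left) sits at sp + 104 inside the function.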

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class0_chroma_a9q

gi1_table_edge_idx_addr:
.long gi1_table_edge_idx - ulbl1 - 8
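@ Position-independent address of gi1_table_edge_idx: the word above stores
@ the offset from (ulbl1 + 8); PC reads as (current instruction + 8) in ARM
@ state, so "add r14,r14,pc" at ulbl1 rebuilds the absolute table address.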

ihevc_sao_edge_offset_class0_chroma_a9q:


    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8  -  d15}

    LDR         r9,[sp,#wd_offset]          @Loads wd

    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
    ADD         r11,r3,r9                   @pu1_src_top[wd]

    LDR         r10,[sp,#ht_offset]         @Loads ht
    VMOV.I8     Q1,#2                       @const_2 = vdupq_n_s8(2)
    LDRH        r12,[r11,#-2]               @pu1_src_top[wd - 1]

    LDR         r7,[sp,#pu1_avail_offset]   @Loads pu1_avail
    VMOV.I16    Q2,#0                       @const_min_clip = vdupq_n_s16(0)
    STRH        r12,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]

    LDR         r8,[sp,#pi1_sao_u_offset]   @Loads pi1_sao_offset_u
    VMOV.I16    Q3,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    SUB         r4,r10,#1                   @(ht - 1)

    LDR         r14, gi1_table_edge_idx_addr @table pointer
ulbl1:
    add         r14,r14,pc
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    MUL         r4,r4,r1                    @(ht - 1) * src_strd

    LDR         r5,[sp,#pi1_sao_v_offset]   @Loads pi1_sao_offset_v
    VLD1.8      D11,[r8]                    @offset_tbl = vld1_s8(pi1_sao_offset_u)
    ADD         r4,r4,r0                    @pu1_src[(ht - 1) * src_strd]

    MOV         r6,r0                       @pu1_src_org
    VLD1.8      D10,[r14]                   @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV         r12,r9                      @Move wd to r12 for loop count

SRC_TOP_LOOP:                               @wd is always multiple of 8
    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
    SUBS        r12,r12,#8                  @Decrement the loop counter by 8
    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP
    ADD         r6,r6,#14                   @pu1_src_org[14]

    MOV         r3,r2                       @pu1_src_left backup to reload later
    VLD1.8      D0,[r5]                     @offset_tbl = vld1_s8(pi1_sao_offset_v)
    CMP         r9,#16                      @Compare wd with 16

    BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case

    MOV         r8,r9                       @move wd to r8 for loop count

WIDTH_LOOP_16:
    CMP         r8,r9                       @if(col == wd)
    BNE         AU1_MASK_FF                 @jump to else part
    LDRB        r12,[r7]                    @pu1_avail[0]
    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    VMOV.8      D8[1],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
    B           SKIP_AU1_MASK_FF            @Skip the else part

AU1_MASK_FF:
    MOV         r12,#-1                     @move -1 to r12
    VMOV.16     D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

SKIP_AU1_MASK_FF:
    CMP         r8,#16                      @If col == 16
    BNE         SKIP_MASKING_IF_NOT16       @If not skip masking
    LDRB        r12,[r7,#1]                 @pu1_avail[1]
    VMOV.8      D9[6],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    VMOV.8      D9[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_MASKING_IF_NOT16:
    MOV         r12,r0                      @pu1_src_cpy = pu1_src
    MOV         r4,r10                      @move ht to r4 for loop count

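@ Core loop: each iteration processes two rows of a 16-byte (8 U/V pairs) wide
@ column. For each row the current vector is compared against copies shifted
@ by one chroma pair to the left and to the right, the sign differences form
@ the edge index, the U/V offsets are looked up with VTBL, and the result is
@ widened, offset, clipped to [0, 255] and stored back.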
PU1_SRC_LOOP:
    LDRH        r11,[r2]                    @load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later
    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    VLD1.8      D13,[r12],r1                @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    SUB         r12,#8
    SUB         r5,r9,r8                    @wd - col

    SUB         r14,r10,r4                  @ht - row
    VMOV.16     D15[3],r11                  @vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
    MUL         r14,r14,r1                  @(ht - row) * src_strd

    VLD1.8      D30,[r12]!                  @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    VLD1.8      D31,[r12]                   @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
    SUB         r12,#8
    VEXT.8      Q7,Q7,Q6,#14                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
    SUB         r12,r12,r1

    LDRH        r11,[r2,#2]                 @II load pu1_src_left since ht - row =0
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    ADD         r5,r14,r5                   @(ht - row) * src_strd + (wd - col)

    VMOV.16     D29[3],r11                  @II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15)
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)

    LDRH        r14,[r6,r5]                 @pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
    VSUB.U8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         r4,r4,#1

    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
    VEXT.8      Q14,Q14,Q15,#14             @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)

    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    LDRB        r11,[r12,#17]               @pu1_src_cpy[17]
    VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    STRH        r14,[r2],#2                 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]

    ADD         r12,r12,r1
    VMOV.8      D14[1],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]

    VEXT.8      Q7,Q6,Q7,#2                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
    VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)

    LDRB        r11,[r12,#17]               @II pu1_src_cpy[17]
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    SUB         r12,r12,r1

    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VMOV.8      D28[1],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)

    VSUB.U8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VEXT.8      Q14,Q15,Q14,#2              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)

    VADD.U8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)

    VADD.U8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)
    VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VSUB.U8     Q10,Q12,Q13                 @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)

    VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    VUZP.8      D14,D15

    VSUB.U8     Q11,Q12,Q13                 @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    VADD.U8     Q12,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)

    VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D17,{D0},D15
    VADD.U8     Q12,Q12,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)

    VZIP.S8     D16,D17
    VTBL.8      D24,{D10},D24               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q6,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D25,{D10},D25               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VAND        Q12,Q12,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VUZP.8      D24,D25                     @II

    VADDW.S8    Q6,Q6,D17                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D26,{D11},D24               @II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    VMAX.S16    Q6,Q6,Q2                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q6,Q6,Q3                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VTBL.8      D27,{D0},D25                @II
    VMOVN.I16   D14,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])

    VMOVN.I16   D15,Q6                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    VZIP.S8     D26,D27                     @II

    SUB         r5,r9,r8                    @II wd - col
    VMOVL.U8    Q14,D30                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SUB         r14,r10,r4                  @II ht - row

    MUL         r14,r14,r1                  @II (ht - row) * src_strd
    VADDW.S8    Q14,Q14,D26                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    ADD         r5,r14,r5                   @II (ht - row) * src_strd + (wd - col)

    LDRH        r14,[r6,r5]                 @II pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)]
    VMAX.S16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    STRH        r14,[r2],#2                 @II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
    VMIN.U16    Q14,Q14,Q3                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVL.U8    Q15,D31                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VADDW.S8    Q15,Q15,D27                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VST1.8      {D14,D15},[r12],r1          @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMAX.S16    Q15,Q15,Q2                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    SUBS        r4,r4,#1                    @Decrement row by 1
    VMIN.U16    Q15,Q15,Q3                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @II vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {D28,D29},[r12],r1          @II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP                @If not equal jump to the inner loop

    ADD         r0,r0,#16                   @pu1_src += 16

    SUBS        r8,r8,#16                   @Decrement column by 16
    CMP         r8,#8                       @Check whether residue remains
    MOV         r2,r3                       @Reload pu1_src_left
    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    BLT         END_LOOPS                   @Jump to end function

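@ Residue path: handles the final 8-byte (4 U/V pairs) column when wd is not a
@ multiple of 16; the structure mirrors the main loop but stores only one D
@ register per row.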
WIDTH_RESIDUE:
    SUB         r6,r6,#14
    AND         r8,r9,#0xF                  @wd_rem = wd & 0xF
    CMP         r8,#0                       @Residue check
    BEQ         END_LOOPS                   @No Residue jump to end function

    CMP         r8,r9                       @if(wd_rem == wd)
    BNE         AU1_MASK_FF_RESIDUE         @jump to else part
    LDRB        r12,[r7]                    @pu1_avail[0]
    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    VMOV.8      D8[1],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
    B           SKIP_AU1_MASK_FF_RESIDUE    @Skip the else part

AU1_MASK_FF_RESIDUE:
    MOV         r12,#-1                     @move -1 to r12
    VMOV.16     D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

SKIP_AU1_MASK_FF_RESIDUE:
    LDRB        r12,[r7,#1]                 @pu1_avail[1]
    VMOV.8      D8[6],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    VMOV.8      D8[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

    MOV         r12,r0                      @pu1_src_cpy = pu1_src
    MOV         r4,r10                      @move ht to r4 for loop count

    318 
    319 PU1_SRC_LOOP_RESIDUE:
    320     LDRH        r11,[r2]                    @load pu1_src_left
    321     VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    322     VLD1.8      D13,[r12],r1                @pu1_cur_row = vld1q_u8(pu1_src_cpy)
    323     SUB         r12,#8
    324     SUB         r5,r9,#2                    @wd - 2
    325 
    326     SUB         r14,r10,r4                  @(ht - row)
    327     VMOV.16     D15[3],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    328     LSL         r14,r14,#1                  @(ht - row) * 2
    329 
    330     VLD1.8      D30,[r12]!                  @II pu1_cur_row = vld1q_u8(pu1_src_cpy)
    331     VLD1.8      D31,[r12]                   @II pu1_cur_row = vld1q_u8(pu1_src_cpy)
    332     SUB         r12,#8
    333     VEXT.8      Q7,Q7,Q6,#14                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    334     SUB         r12,r12,r1
    335 
    336     LDRH        r11,[r2,#2]                 @II load pu1_src_left
    337     VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    338     MUL         r14,r14,r1                  @(ht - row) * 2 * src_strd
    339 
    340     VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    341     VMOV.16     D29[3],r11                  @II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    342 
    343     LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
    344     VSUB.U8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    345     ADD         r5,r14,r5                   @(ht - row) * 2 * src_strd + (wd - 2)
    346 
    347     VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    348     VEXT.8      Q14,Q14,Q15,#14             @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    349 
    350     LDRB        r11,[r12,#17]               @pu1_src_cpy[17]
    351     VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    352     LDRH        r14,[r6, r5]                @pu1_src_org[(ht - row)  * 2* src_strd + (wd - 2)]
    353 
    354     VMOV.8      D14[1],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    355     VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    356     ADD         r12,r12,r1
    357 
    358     STRH        r14,[r2],#2                 @pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
    359     VEXT.8      Q7,Q6,Q7,#2                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
    360     LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]
    361 
    362     VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    363     VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    364 
    365     LDRB        r11,[r12,#17]               @II pu1_src_cpy[17]
    366     VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    367     SUB         r4,r4,#1                    @II Decrement row by 1
    368 
    369     VSUB.U8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    370     VMOV.8      D28[1],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    371     SUB         r12,r12,r1
    372 
    373     VADD.U8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)
    374     VEXT.8      Q14,Q15,Q14,#2              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
    375 
    376     VADD.U8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)
    377 
    378     VSUB.U8     Q10,Q12,Q13                 @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    379     VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    380     VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    381 
    382     VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    383     VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    384     VSUB.U8     Q11,Q12,Q13                 @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    385 
    386     VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    387     VUZP.8      D14,D15
    388 
    389     VADD.U8     Q14,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)
    390     VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    391     VADD.U8     Q14,Q14,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)
    392 
    393     VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    394     VTBL.8      D17,{D0},D15
    395     VMOVL.U8    Q12,D30                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    396 
    397     VZIP.S8     D16,D17
    398     VTBL.8      D28,{D10},D28               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    399     VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    400 
    401     VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    402     VTBL.8      D29,{D10},D29               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    403     VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    404 
    405     VMOVN.I16   D18,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])
    406     VAND        Q14,Q14,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    407 
    408     SUB         r5,r9,#2                    @II wd - 2
    409     VUZP.8      D28,D29                     @II
    410     SUB         r14,r10,r4                  @II (ht - row)
    411 
    412     LSL         r14,r14,#1                  @II (ht - row) * 2
    413     VTBL.8      D26,{D11},D28               @II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx))
    414     MUL         r14,r14,r1                  @II (ht - row) * 2 * src_strd
    415 
    416     ADD         r5,r14,r5                   @II (ht - row) * 2 * src_strd + (wd - 2)
    417     VTBL.8      D27,{D0},D29                @II
    418     LDRH        r14,[r6, r5]                @II pu1_src_org[(ht - row)  * 2* src_strd + (wd - 2)]
    419 
    420     VZIP.S8     D26,D27                     @II
    421     VST1.8      {D18},[r12],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    422 
    423     STRH        r14,[r2],#2                 @II pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2]
    424     VADDW.S8    Q12,Q12,D26                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    425     SUBS        r4,r4,#1                    @Decrement row by 1
    426 
    427     VMAX.S16    Q12,Q12,Q2                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    428     VMIN.U16    Q12,Q12,Q3                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    429 
    430     VMOVN.I16   D28,Q12                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    431 
    432     VST1.8      {D28},[r12],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    433 
    434     BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to the pu1_src loop
    435 
END_LOOPS:
    vpop        {d8  -  d15}
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP