@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class1.s
@*
@* @brief
@*  Contains function definitions for SAO edge offset class 1 (vertical
@*  edges). Functions are coded in ARM NEON assembly, mirroring the NEON
@*  intrinsics reference, and can be compiled using ARM RVCT.
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  ihevc_sao_edge_offset_class1_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht

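@ For reference, a minimal scalar model of what this kernel computes
@ (class 1 operates on vertical edges). This is a hedged sketch, not part
@ of the build; SIGN and CLIP3 are hypothetical helper macros:
@
@   for(row = 0; row < ht; row++)
@       for(col = 0; col < wd; col++)
@       {
@           WORD32 idx = row * src_strd + col;
@           WORD32 sign_up   = SIGN(pu1_src[idx] - pu1_src[idx - src_strd]);
@           WORD32 sign_down = SIGN(pu1_src[idx] - pu1_src[idx + src_strd]);
@           WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
@           pu1_src[idx] = CLIP3(pu1_src[idx] + pi1_sao_offset[edge_idx],
@                                0, 255);
@       }
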
.equ    pu1_src_top_left_offset,    104
.equ    pu1_src_top_right_offset,   108
.equ    pu1_src_bot_left_offset,    112
.equ    pu1_avail_offset,           116
.equ    pi1_sao_offset,             120
.equ    wd_offset,                  124
.equ    ht_offset,                  128
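@ These offsets assume the prologue below: STMFD of {r4-r12, r14} pushes
@ ten 4-byte words (40 bytes) and vpush {d8-d15} pushes eight 8-byte
@ doublewords (64 bytes), so the fifth argument (pu1_src_top_left) lands
@ at sp + 40 + 64 = sp + 104, with the remaining stack arguments following
@ at 4-byte intervals.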

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class1_a9q

gi1_table_edge_idx_addr:
.long gi1_table_edge_idx - ulbl1 - 8
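@ PC-relative literal: in ARM state the PC reads as the address of the
@ current instruction + 8, so the LDR below fetches
@ (gi1_table_edge_idx - ulbl1 - 8) and the ADD at ulbl1 adds back
@ ulbl1 + 8, leaving the absolute address of gi1_table_edge_idx in r14.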

ihevc_sao_edge_offset_class1_a9q:


    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8  -  d15}

    LDR         r7,[sp,#wd_offset]               @Loads wd
    LDR         r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left
    LDR         r5,[sp,#pu1_avail_offset]        @Loads pu1_avail
    LDR         r6,[sp,#pi1_sao_offset]          @Loads pi1_sao_offset
    LDR         r8,[sp,#ht_offset]               @Loads ht

    SUB         r9,r7,#1                    @wd - 1
    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
    STRB        r10,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]
    ADD         r10,r0,r9                   @pu1_src[row * src_strd + wd - 1]
    MOV         r11,r2                      @Move pu1_src_left pointer to r11
    MOV         r12,r8                      @Move ht to r12 for loop count
SRC_LEFT_LOOP:
    LDRB        r14,[r10],r1                @Load pu1_src[row * src_strd + wd - 1]
    STRB        r14,[r11],#1                @pu1_src_left[row]
    SUBS        r12,#1                      @Decrement the loop count
    BNE         SRC_LEFT_LOOP               @If not zero, repeat SRC_LEFT_LOOP
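@ The loop above saves the column that SAO is about to overwrite; in
@ effect (scalar sketch):
@   for(row = 0; row < ht; row++)
@       pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];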

    SUB         r12,r8,#1                   @ht - 1
    MUL         r12,r12,r1                  @(ht - 1) * src_strd
    ADD         r12,r12,r0                  @pu1_src[(ht - 1) * src_strd]

    LDRB        r4,[r5,#2]                  @pu1_avail[2]
    CMP         r4,#0                       @0 == pu1_avail[2]
    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    SUBEQ       r8,r8,#1                    @ht--

    LDRB        r4,[r5,#3]                  @pu1_avail[3]
    CMP         r4,#0                       @0 == pu1_avail[3]
    SUBEQ       r8,r8,#1                    @ht--

    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    LDR         r14, gi1_table_edge_idx_addr @table pointer
ulbl1:
    add         r14,r14,pc
    VLD1.8      D6,[r14]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)

    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, which handles the 8-pixel-wide case

WIDTH_LOOP_16:
    LDRB        r4,[r5,#2]                  @pu1_avail[2]
    CMP         r4,#0                       @0 == pu1_avail[2]
    SUBEQ       r9,r0,r1                    @pu1_src -= src_strd
    MOVNE       r9,r3                       @*pu1_src_top

    MOV         r10,r0                      @*pu1_src

    VLD1.8      D8,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    VLD1.8      D9,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    VLD1.8      D10,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D11,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)

    VLD1.8      D30,[r12]!                  @vld1q_u8(pu1_src[(ht - 1) * src_strd])
    VLD1.8      D31,[r12]!                  @vld1q_u8(pu1_src[(ht - 1) * src_strd])
    VCGT.U8     Q6,Q5,Q4                    @vcgtq_u8(pu1_cur_row, pu1_top_row)

    VST1.8      {Q15},[r3]!                 @vst1q_u8(pu1_src_top[col])
    VCLT.U8     Q7,Q5,Q4                    @vcltq_u8(pu1_cur_row, pu1_top_row)

    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r11,r8                      @move ht to r11 for loop count
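@ The sign computation above is branchless: VCGT/VCLT produce 0x00/0xFF
@ lane masks, and since 0xFF is -1 as s8, (cmp_lt - cmp_gt) per lane
@ equals (cur > nbr) - (cur < nbr), i.e. SIGN(cur - nbr) in {-1, 0, +1}.
@ In the loop below, comments tagged "II" belong to the second of the two
@ rows processed per iteration of the unrolled loop.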

PU1_SRC_LOOP:
    ADD         r10,r10,r1                  @*pu1_src + src_strd
    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r10,#8
    ADD         r6,r10,r1                   @II Iteration *pu1_src + src_strd

    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VLD1.8      D30,[r6]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r6]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r6,#8

    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         r10,r10,r1

    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOVL.U8    Q13,D18                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
    VMOVL.U8    Q14,D19                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_cur_row, pu1_next_row)

    VNEG.S8     Q8,Q10                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_cur_row, pu1_next_row)

    VSUB.U8     Q4,Q12,Q11                  @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D13,{D6},D13                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)


    VNEG.S8     Q8,Q4                       @II sign_up = vnegq_s8(sign_down)
    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8     Q11,Q11,Q4                  @II edge_idx = vaddq_s8(edge_idx, sign_down)


    VMOVL.U8    Q10,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VADDW.S8    Q10,Q10,D12                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))


    VMOVL.U8    Q4,D11                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VTBL.8      D13,{D7},D13                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOV        Q5,Q15                      @II pu1_cur_row = pu1_next_row

    VADDW.S8    Q4,Q4,D13                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q4,Q4,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q4,Q4,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))

    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOVN.I16   D21,Q4                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)


    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VST1.8      {Q10},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOVN.I16   D30,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    SUBS        r11,r11,#2                  @II Decrement the ht loop count by 2
    VMOVN.I16   D31,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q15},[r10],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BEQ         PU1_SRC_LOOP_END            @If the ht loop count has reached 0, exit the loop
    CMP         r11,#1                      @Check whether a single residual row remains
    BGT         PU1_SRC_LOOP                @If more than one row remains, repeat PU1_SRC_LOOP
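@ PU1_SRC_LOOP processes two rows per iteration; when the adjusted ht is
@ odd (one of pu1_avail[2]/[3] was 0), one residual row remains and is
@ handled by the straight-line copy of the loop body below before falling
@ through to PU1_SRC_LOOP_END.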

    ADD         r10,r10,r1                  @*pu1_src + src_strd
    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r10,#8
    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         r10,r10,r1

    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D23,{D6},D23                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q13,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q13,Q13,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8      D25,{D7},D23                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8    Q14,D11                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D30,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D31,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q15},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)

PU1_SRC_LOOP_END:
    VMOV        Q5,Q9                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#16                   @Decrement the wd loop count by 16
    CMP         r7,#8                       @Check whether an 8-pixel residue remains
    BEQ         WIDTH_RESIDUE               @If an 8-pixel residue remains, jump to WIDTH_RESIDUE
    BGT         WIDTH_LOOP_16               @If more full 16-pixel columns remain, repeat WIDTH_LOOP_16
    BLT         END_LOOPS                   @Otherwise done, jump to END_LOOPS


WIDTH_RESIDUE:
    LDRB        r4,[r5,#2]                  @pu1_avail[2]
    CMP         r4,#0                       @0 == pu1_avail[2]
    SUBEQ       r9,r0,r1                    @pu1_src -= src_strd
    MOVNE       r9,r3                       @*pu1_src_top
    MOV         r10,r0

    VLD1.8      D8,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    VLD1.8      D9,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    VLD1.8      D10,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D11,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)

    VLD1.8      D30,[r12]                   @vld1_u8(pu1_src[(ht - 1) * src_strd])
    VST1.8      {D30},[r3]                  @vst1_u8(pu1_src_top[col])

    VCGT.U8     Q6,Q5,Q4                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q7,Q5,Q4                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r11,r8                      @move ht to r11 for loop count

PU1_SRC_LOOP_RESIDUE:
    ADD         r10,r10,r1                  @*pu1_src + src_strd
    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r10,#8
    ADD         r6,r10,r1                   @II Iteration *pu1_src + src_strd

    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VLD1.8      D30,[r6]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r6]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r6,#8

    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         r10,r10,r1

    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOVL.U8    Q13,D18                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_cur_row, pu1_next_row)

    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_cur_row, pu1_next_row)

    VNEG.S8     Q8,Q10                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VSUB.U8     Q10,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q8,Q10                      @II sign_up = vnegq_s8(sign_down)

    VADD.I8     Q11,Q11,Q10                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    VMOVL.U8    Q10,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VADDW.S8    Q10,Q10,D12                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q5,Q15                      @II pu1_cur_row = pu1_next_row
    VST1.8      {D20},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOVN.I16   D30,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])

    SUBS        r11,r11,#2                  @Decrement the ht loop count by 2
    VST1.8      {D30},[r10],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BEQ         END_LOOPS                   @If the ht loop count has reached 0, exit
    CMP         r11,#1                      @Check whether a single residual row remains
    BGT         PU1_SRC_LOOP_RESIDUE        @If more than one row remains, repeat PU1_SRC_LOOP_RESIDUE
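@ As in the main loop, an odd residual row (if any) is processed below,
@ on the low 8 lanes only, before falling through to END_LOOPS.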


    ADD         r10,r10,r1                  @*pu1_src + src_strd
    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r10,#8
    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VCGT.U8     Q7,Q9,Q5                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         r10,r10,r1

    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q13,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q13,Q13,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D30,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)

END_LOOPS:
    vpop        {d8  -  d15}
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP