@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_mode_19_to_25.s
@*
@* @brief
@*  contains function definitions for chroma intra prediction angular
@* modes 19 to 25. functions are coded using neon instructions and can be
@* compiled using rvct
@*
@* @author
@*  naveen sr
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  intra prediction interpolation filter for chroma modes 19 to 25
@*
@* @par description:
@*  projects the side reference samples onto the main (top) reference
@* array using the inverse angle, then predicts each row by two-tap
@* linear interpolation between neighbouring reference sample pairs
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (one of 19 to 25)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_mode_19_to_25(uword8* pu1_ref,
@                               word32 src_strd,
@                               uword8* pu1_dst,
@                               word32 dst_strd,
@                               word32 nt,
@                               word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode

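@an approximate c sketch of what this routine computes (illustrative only,
@the actual reference code differs in details). chroma samples are
@interleaved u/v pairs, so reference indices are doubled@ ref_main points
@at the row-0 reference inside ref_temp:
@
@   for(row = 0; row < nt; row++)
@   {
@       pos   = (row + 1) * intra_pred_ang;    //intra_pred_ang < 0 here
@       idx   = pos >> 5;
@       fract = pos & 31;
@       for(col = 0; col < 2 * nt; col++)      //even col = u, odd col = v
@       {
@           pu1_dst[row * dst_strd + col] =
@               ((32 - fract) * ref_main[2 * idx + col]
@                     + fract * ref_main[2 * idx + col + 2] + 16) >> 5;
@       }
@   }
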
.text
.align 4



.globl ihevc_intra_pred_chroma_mode_19_to_25_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern gau1_ihevc_planar_factor

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl1 - 8

gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl2 - 8

gai4_ihevc_ang_table_addr_1:
.long gai4_ihevc_ang_table - ulbl3 - 8

gai4_ihevc_ang_table_addr_2:
.long gai4_ihevc_ang_table - ulbl4 - 8

.type ihevc_intra_pred_chroma_mode_19_to_25_a9q, %function

ihevc_intra_pred_chroma_mode_19_to_25_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr_1
ulbl3:
    add         r7,r7,pc

    ldr         r5,[sp,#44]                 @loads mode (19 to 25)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl1:
    add         r8,r8,pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    add         r8, r8, r5, lsl #2          @gai4_ihevc_inv_ang_table
    sub         r8, r8, #48                 @gai4_ihevc_inv_ang_table[mode - 12]

    ldr         r7, [r7]                    @intra_pred_ang
    sub         sp, sp, #132                @ref_temp[2 * max_cu_size + 2]

    ldr         r8, [r8]                    @inv_ang
    add         r6, sp, r4 , lsl #1         @ref_temp + 2 * nt

    mul         r9, r4, r7                  @nt*intra_pred_ang

    sub         r6, r6, #2                  @ref_temp + 2*nt - 2

    add         r1, r0, r4, lsl #2          @r1 = &src[4nt]
    vdup.8      d30, r7                     @intra_pred_ang

    mov         r7, r4

    asr         r9, r9, #5

    vld1.32     d0,[r1]!                    @ pu1_ref[two_nt + k]

    vst1.32     d0,[r6]!                    @ref_temp[k + nt - 1] = pu1_ref[two_nt + k]@

    subs        r7, r7, #4
    beq         end_loop_copy
    subs        r7,r7,#4
    beq         loop_copy_8
    subs        r7,r7,#8
    beq         loop_copy_16

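@the 8-byte load/store above already copied the first 4 u/v pairs, which
@covers nt = 4@ the loops below copy the remaining 2*nt - 8 bytes for the
@larger sizes, and end_loop_copy appends one extra u/v pair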
loop_copy_32:
    vld1.8      {d0,d1,d2,d3},[r1]!
    vld1.8      {d4,d5,d6},[r1]!

    vst1.8      {d0,d1,d2,d3},[r6]!


    vst1.8      {d4,d5,d6},[r6]!
    b           end_loop_copy

loop_copy_16:
    vld1.8      {d0,d1,d2},[r1]!
    vst1.8      {d0,d1,d2},[r6]!

    b           end_loop_copy

loop_copy_8:
    vld1.8      d0,[r1]!
    vst1.8      d0,[r6]!

end_loop_copy:

    ldrh        r11, [r1]
    strh        r11, [r6]

    cmp         r9, #-1
    bge         linear_filtering

    add         r6, sp, r4 ,lsl #1          @ref_temp + 2 * nt
    sub         r6, r6, #4                  @ref_temp + 2 * nt - 2 - 2

    mov         r12, #0xffffffff

    rsb         r9, r9, r12                 @count to take care of ref_idx

    add         r1, r0, r4, lsl #2          @r1 = &src[2nt]

    mov         r7, #128                    @inv_ang_sum

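@the loop below projects the left reference onto the main array for the
@negative part of the angle. a hedged c sketch (names are illustrative):
@
@   inv_ang_sum = 128;
@   for(i = 1; i <= num_proj; i++)   //num_proj = -((nt * ang) >> 5) - 1
@   {
@       inv_ang_sum += inv_ang;
@       //one u/v pair per step, walking backwards from ref_temp + 2*nt - 4
@       ref_temp_pair[nt - 1 - i] = src_pair[two_nt - (inv_ang_sum >> 8)];
@   }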
loop_copy_ref_idx:

    add         r7, r7, r8                  @inv_ang_sum += inv_ang
    mov         r0,r7, lsr #8
    mov         r0,r0, lsl #1
    ldrh        r11, [r1, -r0]
    strh        r11, [r6], #-2

    subs        r9, r9, #1

    bne         loop_copy_ref_idx


linear_filtering:
@   after copy
@   below code is taken from mode 27 to 33 and modified
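@eight rows are processed per pass: d0 holds intra_pred_ang in every lane
@and the row numbers (1..8, taken from gau1_ihevc_planar_factor + 1) sit
@in a second vector, so a single vmull.s8 yields pos for all eight rows.
@a hedged per-row sketch:
@
@   pos   = (row + 1) * intra_pred_ang;
@   idx   = (pos >> 5) * 2;     //doubled for the interleaved u/v pairs
@   fract = pos & 31;
@   //8 interleaved bytes from ref_main + idx and ref_main + idx + 2 are
@   //blended in the ratio (32 - fract) : fract and rounded by vrshrn #5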

    ldr         r6,gai4_ihevc_ang_table_addr_2 @loads word32 gai4_ihevc_ang_table[35]
ulbl4:
    add         r6,r6,pc

    lsl         r7,r4,#2                    @four_nt

    add         r8,r6,r5,lsl #2             @*gai4_ihevc_ang_table[mode]
    ldr         r9,[r8]                     @intra_pred_ang = gai4_ihevc_ang_table[mode]
    ldr         r1,gau1_ihevc_planar_factor_addr @used for ((row + 1) * intra_pred_ang) row values
ulbl2:
    add         r1,r1,pc
    add         r6,r1,#1

    add         r8, sp, r4, lsl #1          @ref_temp + 2 * nt
    sub         r8,#2                       @ref_temp + 2*nt - 2

    mov         lr,#0                       @row
    mov         r12,r4
    lsl         r4,r4,#1                    @r4 = 2*nt (row width in bytes)

core_loop_8:
    add         r8,r8,#2                    @ref_main_idx: r8 = ref_temp + 2*nt
    vdup.8      d0,r9                       @intra_pred_ang
    mov         r12,r4,lsr #4               @r12 = nt/8 (r4 holds 2*nt)

    vmov.i8     d1,#32
    mul         r7,r4,r12                   @loop counter, decremented by 8 per 8x8 block

    vmov.i16    q3,#31


    mov         r1,r8

    mov         r5,r4
    mov         r11,#2

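@the 8x8 block is software pipelined: eight rows (i)..(viii) are kept in
@flight, with the loads for one row issued alongside the multiplies of the
@previous ones. d4 holds the eight fract values and d5/d3 the eight byte
@offsets (idx, pre-doubled for the u/v interleave)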
prologue:
    vld1.8      {d3},[r6]                   @loads the row value
    vmull.s8    q1,d3,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.s16   d5,q1,#5                    @idx = pos >> 5
    vshl.s8     d5,d5,#1

    vdup.8      d31,d4[0]
    add         r0,r2,r3

    vmov.u32    lr,d5[0]                    @(i row)extract idx to the r register
@   lsl         lr,lr,#1

    vdup.8      d29,d4[1]                   @(ii)
    sbfx        r9,lr,#0,#8

    add         r10,r8,r9                   @(i row)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(i row)ref_main_idx
    sbfx        r9,lr,#8,#8

    vld1.8      {d9},[r10]                  @(i row)ref_main_idx_1
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    sbfx        r9,lr,#16,#8
    vsub.u8     d30,d1,d31                  @32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i row)vmull_u8(ref_main_idx_1, dup_const_fract)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    sbfx        r9,lr,#24,#8

    vdup.8      d25,d4[3]                   @(iv)
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i row)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1

    vdup.8      d31,d4[4]                   @(v)
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d5[1]                    @extract idx to the r register
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
@   lsl         lr,lr,#1

    vst1.8      {d10},[r2]!                 @(i row)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    sbfx        r9,lr,#0,#8
    vdup.8      d29,d4[5]                   @(vi)
    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    sbfx        r9,lr,#8,#8

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    vdup.8      d27,d4[6]                   @(vii)

    sbfx        r9,lr,#16,#8
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)
    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[7]                   @(viii)
    sbfx        r9,lr,#24,#8

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    subs        r7,r7,#8

    vst1.8      {d22},[r0],r3               @(iv)
    cmp         r4,#8                       @ go to end if 4x4
    beq         end_loops

    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    addgt       r8,r8,#8
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    subgt       r4,r4,#8

    vst1.8      {d10},[r0],r3               @(v)
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

    vld1.8      {d5},[r6]                   @loads the row value
    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))
    vmovn.i16   d4,q2
    vshrn.s16   d3,q1,#5                    @idx = pos >> 5
    vshl.s8     d3,d3,#1
    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
@   lsl         lr,lr,#1
    sbfx        r9,lr,#0,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

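@steady-state kernel: computes rows (i)..(viii) of the current 8x8 block
@while draining rows (vi)..(viii) of the previous one and preparing the
@pos/idx/fract vectors for the next iteration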
kernel_8_rows:
    vdup.8      d31,d4[0]
    subs        r4,r4,#8
    sbfx        r9,lr,#8,#8

    vld1.8      {d8},[r10],r11              @(i)ref_main_idx
    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)

    addle       r6,r6,#8                    @increment the row value
    add         r12,r8,r9                   @(ii)*pu1_ref[ref_main_idx]

    vld1.8      {d9},[r10]                  @(i)ref_main_idx_1
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d5},[r6]                   @loads the row value
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vdup.8      d29,d4[1]                   @(ii)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    sbfx        r9,lr,#16,#8

    vst1.8      {d14},[r0],r3               @(vi)
    vsub.u8     d30,d1,d31                  @(i)32-fract(dup_const_32_fract)

    add         r10,r8,r9                   @(iii)*pu1_ref[ref_main_idx]

    vld1.8      {d12},[r12],r11             @(ii)ref_main_idx
    vmull.u8    q5,d8,d30                   @(i)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d13},[r12]                 @(ii)ref_main_idx_1
    vmlal.u8    q5,d9,d31                   @(i)vmull_u8(ref_main_idx_1, dup_const_fract)

    sbfx        r9,lr,#24,#8
    movle       r4,r5                       @reload r4 = 2*nt (saved in r5)

    vmov.u32    lr,d3[1]                    @extract idx to the r register
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d27,d4[2]                   @(iii)
    vsub.u8     d28,d1,d29                  @(ii)32-fract(dup_const_32_fract)
    add         r12,r8,r9                   @(iv)*pu1_ref[ref_main_idx]

    vld1.8      {d16},[r10],r11             @(iii)ref_main_idx
    vmull.u8    q7,d12,d28                  @(ii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vmlal.u8    q7,d13,d29                  @(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d17},[r10]                 @(iii)ref_main_idx_1
    vrshrn.i16  d10,q5,#5                   @(i)shift_res = vrshrn_n_u16(add_res, 5)

    vdup.8      d25,d4[3]                   @(iv)
    vmull.s8    q1,d5,d0                    @pos = ((row + 1) * intra_pred_ang)

    vst1.8      {d22},[r0]                  @(viii)
    vsub.u8     d26,d1,d27                  @(iii)32-fract(dup_const_32_fract)

    vld1.8      {d20},[r12],r11             @(iv)ref_main_idx
    vmull.u8    q9,d16,d26                  @(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
@   lsl         lr,lr,#1

    vld1.8      {d21},[r12]                 @(iv)ref_main_idx_1
    vmlal.u8    q9,d17,d27                  @(iii)vmull_u8(ref_main_idx_1, dup_const_fract)

    sbfx        r9,lr,#0,#8
    add         r0,r2,r3

    vdup.8      d31,d4[4]                   @(v)
    vrshrn.i16  d14,q7,#5                   @(ii)shift_res = vrshrn_n_u16(add_res, 5)

    add         r10,r8,r9                   @(v)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#8,#8

    vst1.8      {d10},[r2]!                 @(i)
    vsub.u8     d24,d1,d25                  @(iv)32-fract(dup_const_32_fract)

    vdup.8      d29,d4[5]                   @(vi)
    vmull.u8    q11,d20,d24                 @(iv)vmull_u8(ref_main_idx, dup_const_32_fract)

    vdup.8      d27,d4[6]                   @(vii)
    vmlal.u8    q11,d21,d25                 @(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    add         r12,r8,r9                   @(vi)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#16,#8

    vdup.8      d25,d4[7]                   @(viii)
    vrshrn.i16  d18,q9,#5                   @(iii)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d8},[r10],r11              @(v)ref_main_idx
    vand        q2,q1,q3                    @dup_const_fract(fract = pos & (31))

    vld1.8      {d9},[r10]                  @(v)ref_main_idx_1
    vshrn.s16   d3,q1,#5                    @idx = pos >> 5

    vst1.8      {d14},[r0],r3               @(ii)
    vrshrn.i16  d22,q11,#5                  @(iv)shift_res = vrshrn_n_u16(add_res, 5)

    add         r10,r8,r9                   @(vii)*pu1_ref[ref_main_idx]
    sbfx        r9,lr,#24,#8

    vld1.8      {d12},[r12],r11             @(vi)ref_main_idx
    vsub.u8     d30,d1,d31                  @(v)32-fract(dup_const_32_fract)

    vshl.s8     d3,d3,#1

    vld1.8      {d13},[r12]                 @(vi)ref_main_idx_1
    vmull.u8    q5,d8,d30                   @(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    vmov.u32    lr,d3[0]                    @(i)extract idx to the r register
    vmlal.u8    q5,d9,d31                   @(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    add         r12,r8,r9                   @(viii)*pu1_ref[ref_main_idx]
    movle       r8,r1                       @reload the reference pointer (saved in r1)

    vld1.8      {d16},[r10],r11             @(vii)ref_main_idx
    vsub.u8     d28,d1,d29                  @(vi)32-fract(dup_const_32_fract)

    vst1.8      {d18},[r0],r3               @(iii)
    vmull.u8    q7,d12,d28                  @(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    vld1.8      {d17},[r10]                 @(vii)ref_main_idx_1
    vmlal.u8    q7,d13,d29                  @(vi)vmull_u8(ref_main_idx_1, dup_const_fract)

    vld1.8      {d20},[r12],r11             @(viii)ref_main_idx
    vrshrn.i16  d10,q5,#5                   @(v)shift_res = vrshrn_n_u16(add_res, 5)

    vld1.8      {d21},[r12]                 @(viii)ref_main_idx_1
    vsub.u8     d26,d1,d27                  @(vii)32-fract(dup_const_32_fract)

    addgt       r8,r8,#8                    @advance the source to the next set of 8 columns in the same row
    lslle       r12,r3,#3
    suble       r12,r12,r5

    vst1.8      {d22},[r0],r3               @(iv)
    vmull.u8    q9,d16,d26                  @(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    vst1.8      {d10},[r0],r3               @(v)
    vmlal.u8    q9,d17,d27                  @(vii)vmull_u8(ref_main_idx_1, dup_const_fract)

    addle       r2,r2,r12                   @increment the dst pointer to 8*dst_strd - 2*nt
    sbfx        r9,lr,#0,#8

    vmovn.i16   d4,q2
    vrshrn.i16  d14,q7,#5                   @(vi)shift_res = vrshrn_n_u16(add_res, 5)
@   lsl         lr,lr,#1

    subs        r7,r7,#8
    add         r10,r8,r9                   @(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows

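@pipeline drain: rows (vi), (vii) and (viii) of the final 8x8 block are
@still in flight and are finished and stored here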
epilogue:
    vst1.8      {d14},[r0],r3               @(vi)
    vrshrn.i16  d18,q9,#5                   @(vii)shift_res = vrshrn_n_u16(add_res, 5)

    vsub.u8     d24,d1,d25                  @(viii)32-fract(dup_const_32_fract)
    vmull.u8    q11,d20,d24                 @(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    vmlal.u8    q11,d21,d25                 @(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    vst1.8      {d18},[r0],r3               @(vii)
    vrshrn.i16  d22,q11,#5                  @(viii)shift_res = vrshrn_n_u16(add_res, 5)

    vst1.8      {d22},[r0],r3               @(viii)
    b           end_loops

core_loop_4:

end_loops:
    add         sp, sp, #132
    ldmfd       sp!,{r4-r12,r15}            @restore the registers and return
