@ (code-browser navigation header removed: "Home | History | Annotate | Download | only in arm")
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 
     20 @/**
     21 @******************************************************************************
     22 @* @file
     23 @*  ihevc_inter_pred_luma_horz_w16out.s
     24 @*
     25 @* @brief
     26 @*  contains function definitions for inter prediction  interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
     30 @*
     31 @* @author
     32 @*  parthiban v
     33 @*
     34 @* @par list of functions:
     35 @*
     36 @*  - ihevc_inter_pred_luma_horz_w16out()
     37 @*
     38 @* @remarks
     39 @*  none
     40 @*
     41 @*******************************************************************************
     42 @*/
     43 @/**
     44 @*******************************************************************************
     45 @*
     46 @* @brief
     47 @*   interprediction luma filter for horizontal 16bit output
     48 @*
     49 @* @par description:
     50 @*     applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
     51 @*     to the elements pointed by 'pu1_src' and  writes to the location pointed
     52 @*     by 'pu1_dst'  no downshifting or clipping is done and the output is  used
     53 @*     as an input for vertical filtering or weighted  prediction   assumptions :
     54 @*     the function is optimized considering the fact width is  multiple of 4 or
     55 @*     8. if width is multiple of 4 then height  should be multiple of 2, width 8
     56 @*     is optimized further.
     57 @*
     58 @* @param[in] pu1_src
     59 @*  uword8 pointer to the source
     60 @*
     61 @* @param[out] pi2_dst
     62 @*  word16 pointer to the destination
     63 @*
     64 @* @param[in] src_strd
     65 @*  integer source stride
     66 @*
     67 @* @param[in] dst_strd
     68 @*  integer destination stride
     69 @*
     70 @* @param[in] pi1_coeff
     71 @*  word8 pointer to the filter coefficients
     72 @*
     73 @* @param[in] ht
     74 @*  integer height of the array
     75 @*
     76 @* @param[in] wd
     77 @*  integer width of the array
     78 @*
     79 @* @returns
     80 @*
     81 @* @remarks
     82 @*  none
     83 @*
     84 @*******************************************************************************
     85 @*/
     86 
     87 @void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
     88 @                                word16 *pi2_dst,
     89 @                                word32 src_strd,
     90 @                                word32 dst_strd,
     91 @                                word8 *pi1_coeff,
     92 @                                word32 ht,
     93 @                                word32 wd
     94 
     95 
     96 @r0 - free
     97 @r1 - dst_ptr
     98 @r2 - src_strd
     99 @r3 - dst_strd
    100 @r4 - src_ptr2
    101 @r5 - inner loop counter
    102 @r6 - dst_ptr2
    103 @r7 - free
    104 @r8 - dst_strd2
    105 @r9 - src_strd1
    106 @r10 - wd
    107 @r11 - #1
    108 @r12 - src_ptr1
    109 @r14 - loop_counter
    110 
    111 .equ    coeff_offset,   104
    112 .equ    ht_offset,      108
    113 .equ    wd_offset,      112
    114 
    115 .text
    116 .align 4
    117 .syntax unified
    118 
    119 
    120 
    121 
    122 .globl ihevc_inter_pred_luma_horz_w16out_a9q
    123 
    124 .type ihevc_inter_pred_luma_horz_w16out_a9q, %function
    125 
    126 ihevc_inter_pred_luma_horz_w16out_a9q:
    127 
    128     bic         r14, #1                     @ clearing bit[0], so that it goes back to mode
    129     stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    130     vpush       {d8 - d15}
    131     ldr         r4,[sp,#coeff_offset]                 @loads pi1_coeff
    132     ldr         r7,[sp,#ht_offset]                 @loads ht
    133 
    134 
    135     vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    136     sub         r14,r7,#0                   @checks for ht == 0
    137     vabs.s8     d2,d0                       @vabs_s8(coeff)
    138     mov         r11,#1
    139     ldr         r10,[sp,#wd_offset]                @loads wd
    140     vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    141     sub         r12,r0,#3                   @pu1_src - 3
    142     vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    143     add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    144     vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    145     rsb         r9,r10,r2,lsl #1            @2*src_strd - wd
    146     vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    147     rsb         r8,r10,r3                   @dst_strd - wd
    148     vdup.8      d28,d2[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
    149 
    150     vdup.8      d29,d2[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    151     and         r7,r14,#1                   @calculating ht_residue ht_residue = (ht & 1)
    152     vdup.8      d30,d2[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    153     sub         r14,r14,r7                  @decrement height by ht_residue(residue value is calculated outside)
    154     vdup.8      d31,d2[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
    155 
    156     cmp         r7,#1
    157     beq         odd_height_decision
    158 
    159 even_height_decision:
    160     mov         r7,r1
    161     cmp         r10,#4
    162     ble         outer_loop_4
    163 
    164     cmp         r10,#24
    165     moveq       r10,#16
    166     addeq       r8,#8
    167     addeq       r9,#8
    168 
    169     cmp         r10,#16
    170     bge         outer_loop_16_branch
    171 
    172     cmp         r10,#12
    173     addeq       r8,#4
    174     addeq       r9,#4
    175 outer_loop_8_branch:
    176     b           outer_loop_8
    177 
    178 outer_loop_16_branch:
    179     b           outer_loop_16
    180 
    181 
    182 odd_height_decision:
    183     cmp         r10,#24
    184     beq         outer_loop_8_branch
    185     cmp         r10,#12
    186     beq         outer_loop_4
    187     b           even_height_decision
    188 
    189 outer_loop4_residual:
    190     sub         r12,r0,#3                   @pu1_src - 3
    191     mov         r1,r7
    192     add         r1,#16
    193     mov         r10,#4
    194     add         r12,#8
    195     mov         r14,#16
    196     add         r8,#4
    197     add         r9,#4
    198 
    199 outer_loop_4:
    200     add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    201     add         r4,r12,r2                   @pu1_src + src_strd
    202 
    203     subs        r5,r10,#0                   @checks wd
    204     ble         end_inner_loop_4
    205 
    206 inner_loop_4:
    207     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    208     vld1.u32    {d1},[r12],r11
    209     vld1.u32    {d2},[r12],r11
    210     vld1.u32    {d3},[r12],r11
    211     vld1.u32    {d4},[r12],r11
    212     vld1.u32    {d5},[r12],r11
    213     vld1.u32    {d6},[r12],r11
    214     vld1.u32    {d7},[r12],r11
    215     @add       r12,r12,#4                      @increment the input pointer
    216     sub         r12,r12,#4
    217     @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    218     @vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    219     @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    220 
    221     @vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    222     @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    223     @vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    224     @vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    225     vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
    226     vld1.u32    {d13},[r4],r11
    227     vzip.32     d0,d12                      @vector zip the i iteration and ii interation in single register
    228     vld1.u32    {d14},[r4],r11
    229     vzip.32     d1,d13
    230     vld1.u32    {d15},[r4],r11
    231     vzip.32     d2,d14
    232     vld1.u32    {d16},[r4],r11
    233     vzip.32     d3,d15
    234     vld1.u32    {d17},[r4],r11
    235     vzip.32     d4,d16
    236     vld1.u32    {d18},[r4],r11
    237     vzip.32     d5,d17
    238     vld1.u32    {d19},[r4],r11
    239     sub         r4,r4,#4
    240     @ add       r4,r4,#4                        @increment the input pointer
    241     @ vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    242     @ vext.u8   d15,d12,d13,#3                  @vector extract of src[0_3]
    243     @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    244     @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
    245     @ vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    246     @ vext.u8   d19,d12,d13,#7                  @vector extract of src[0_7]
    247     @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
    248 
    249 
    250 
    251 
    252 
    253 
    254 
    255     vzip.32     d6,d18
    256     vzip.32     d7,d19
    257 
    258     vmull.u8    q4,d1,d25                   @arithmetic operations for ii iteration in the same time
    259     vmlsl.u8    q4,d0,d24
    260     vmlsl.u8    q4,d2,d26
    261     vmlal.u8    q4,d3,d27
    262     vmlal.u8    q4,d4,d28
    263     vmlsl.u8    q4,d5,d29
    264     vmlal.u8    q4,d6,d30
    265     vmlsl.u8    q4,d7,d31
    266 
    267     @ vqrshrun.s16 d8,q4,#6                     @narrow right shift and saturating the result
    268     vst1.64     {d8},[r1]!                  @store the i iteration result which is in upper part of the register
    269     vst1.64     {d9},[r6]!                  @store the ii iteration result which is in lower part of the register
    270     subs        r5,r5,#4                    @decrement the wd by 4
    271     bgt         inner_loop_4
    272 
    273 end_inner_loop_4:
    274     subs        r14,r14,#2                  @decrement the ht by 4
    275     add         r12,r12,r9                  @increment the input pointer 2*src_strd-wd
    276     add         r1,r6,r8,lsl #1             @increment the output pointer 2*dst_strd-wd
    277     bgt         outer_loop_4
    278 
    279 
    280 height_residue_4:
    281 
    282     ldr         r7,[sp,#ht_offset]                 @loads ht
    283     and         r7,r7,#1                    @calculating ht_residue ht_residue = (ht & 1)
    284     cmp         r7,#0
    285     beq         end_loops
    286 
    287 outer_loop_height_residue_4:
    288 
    289 
    290     subs        r5,r10,#0                   @checks wd
    291     ble         end_inner_loop_height_residue_4
    292 
    293 inner_loop_height_residue_4:
    294     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    295     vld1.u32    {d1},[r12],r11
    296 
    297 
    298 
    299 
    300 
    301 
    302     @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    303     @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    304     @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    305 
    306 
    307 
    308     @add        r12,r12,#4                      @increment the input pointer
    309     @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    310     @ vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    311     @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    312     @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    313     vld1.u32    {d2},[r12],r11
    314     vmull.u8    q4,d1,d25                   @arithmetic operations for ii iteration in the same time
    315     vld1.u32    {d3},[r12],r11
    316     vmlsl.u8    q4,d0,d24
    317     vld1.u32    {d4},[r12],r11
    318     vmlsl.u8    q4,d2,d26
    319     vld1.u32    {d5},[r12],r11
    320     vmlal.u8    q4,d3,d27
    321     vld1.u32    {d6},[r12],r11
    322     vmlal.u8    q4,d4,d28
    323     vld1.u32    {d7},[r12],r11
    324     vmlsl.u8    q4,d5,d29
    325     sub         r12,r12,#4
    326     vmlal.u8    q4,d6,d30
    327     vmlsl.u8    q4,d7,d31                   @store the i iteration result which is in upper part of the register
    328     subs        r5,r5,#4                    @decrement the wd by 4
    329     vst1.64     {d8},[r1]!
    330     bgt         inner_loop_height_residue_4
    331 
    332 end_inner_loop_height_residue_4:
    333     subs        r7,r7,#1                    @decrement the ht by 4
    334     rsb         r9,r10,r2
    335     add         r12,r12,r9                  @increment the input pointer src_strd-wd
    336     add         r1,r1,r8                    @increment the output pointer dst_strd-wd
    337     bgt         outer_loop_height_residue_4
    338     vpop        {d8 - d15}
    339     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    340 
    341 outer_loop8_residual:
    342     sub         r12,r0,#3                   @pu1_src - 3
    343     mov         r1,r7
    344     mov         r14,#32
    345     add         r1,#32
    346     add         r12,#16
    347     mov         r10,#8
    348     add         r8,#8
    349     add         r9,#8
    350 
    351 outer_loop_8:
    352 
    353     add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    354     add         r4,r12,r2                   @pu1_src + src_strd
    355     subs        r5,r10,#0                   @checks wd
    356 
    357     ble         end_inner_loop_8
    358 
    359 inner_loop_8:
    360     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    361     vld1.u32    {d1},[r12],r11
    362     vld1.u32    {d2},[r12],r11
    363     vld1.u32    {d3},[r12],r11
    364 
    365 
    366 
    367 
    368 
    369     @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    370     @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    371     @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    372     @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    373     @ vext.u8   d6,d0,d1,#6                     @vector extract of src [0_6]
    374     @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    375     @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    376     @ vext.u8   d14,d12,d13,#2
    377 
    378     @vext.u8    d15,d12,d13,#3                  @vector extract of src[0_3]
    379     @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    380     @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
    381     @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    382     @vext.u8    d19,d12,d13,#7                  @vector extract of src[0_7]
    383     @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
    384     vld1.u32    {d4},[r12],r11
    385     vmull.u8    q4,d1,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    386     vld1.u32    {d5},[r12],r11
    387     vmlal.u8    q4,d3,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    388     vld1.u32    {d6},[r12],r11
    389     vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    390     vld1.u32    {d7},[r12],r11
    391     vmlsl.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    392     vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
    393     vmlal.u8    q4,d4,d28                   @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    394     vld1.u32    {d13},[r4],r11
    395     vmlsl.u8    q4,d5,d29                   @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    396     vld1.u32    {d14},[r4],r11
    397     vmlal.u8    q4,d6,d30                   @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    398     vld1.u32    {d15},[r4],r11
    399     vmlsl.u8    q4,d7,d31                   @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    400     vld1.u32    {d16},[r4],r11              @vector load pu1_src + src_strd
    401 
    402     vmull.u8    q5,d15,d27                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    403     vld1.u32    {d17},[r4],r11
    404     vmlsl.u8    q5,d14,d26                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    405     vld1.u32    {d18},[r4],r11
    406     vmlal.u8    q5,d16,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    407     vld1.u32    {d19},[r4],r11              @vector load pu1_src + src_strd
    408     vmlsl.u8    q5,d17,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    409     @ vqrshrun.s16  d20,q4,#6                       @right shift and saturating narrow result 1
    410     vmlal.u8    q5,d18,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    411     vmlsl.u8    q5,d19,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    412     vst1.16     {q4},[r1]!                  @store the result pu1_dst
    413     vmlsl.u8    q5,d12,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    414     vmlal.u8    q5,d13,d25                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    415 
    416 
    417 
    418     @ vqrshrun.s16 d8,q5,#6                     @right shift and saturating narrow result 2
    419     subs        r5,r5,#8                    @decrement the wd loop
    420     vst1.16     {q5},[r6]!                  @store the result pu1_dst
    421     cmp         r5,#4
    422     bgt         inner_loop_8
    423 
    424 end_inner_loop_8:
    425     subs        r14,r14,#2                  @decrement the ht loop
    426     add         r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
    427     add         r1,r6,r8,lsl #1             @increment the dst pointer by 2*dst_strd-wd
    428     bgt         outer_loop_8
    429 
    430 
    431 
    432 
    433 
    434     ldr         r10,[sp,#wd_offset]                @loads wd
    435     cmp         r10,#12
    436 
    437     beq         outer_loop4_residual
    438 
    439     ldr         r7,[sp,#ht_offset]                 @loads ht
    440     and         r7,r7,#1
    441     cmp         r7,#1
    442     beq         height_residue_4
    443 
    444 
    445     vpop        {d8 - d15}
    446     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    447 
    448 
    449 
    450 
    451 
    452 outer_loop_16:
    453     str         r0, [sp, #-4]!
    454     str         r7, [sp, #-4]!
    455     add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    456     add         r4,r12,r2                   @pu1_src + src_strd
    457     and         r0, r12, #31
    458     sub         r5,r10,#0                   @checks wd
    459     pld         [r12, r2, lsl #1]
    460     vld1.u32    {q0},[r12],r11              @vector load pu1_src
    461     pld         [r4, r2, lsl #1]
    462     vld1.u32    {q1},[r12],r11
    463     vld1.u32    {q2},[r12],r11
    464     vld1.u32    {q3},[r12],r11
    465     vld1.u32    {q6},[r12],r11
    466     vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    467     vld1.u32    {q7},[r12],r11
    468     vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    469     vld1.u32    {q8},[r12],r11
    470     vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    471     vld1.u32    {q9},[r12],r11
    472     vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    473     vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    474     vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    475     vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    476     vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    477 
    478 
    479 inner_loop_16:
    480 
    481 
    482     subs        r5,r5,#16
    483     vmull.u8    q10,d3,d25
    484 
    485     add         r12,#8
    486     vmlsl.u8    q10,d1,d24
    487 
    488     vld1.u32    {q0},[r4],r11               @vector load pu1_src
    489     vmlal.u8    q10,d7,d27
    490 
    491     vld1.u32    {q1},[r4],r11
    492     vmlsl.u8    q10,d5,d26
    493 
    494     vld1.u32    {q2},[r4],r11
    495     vmlal.u8    q10,d13,d28
    496 
    497     vld1.u32    {q3},[r4],r11
    498     vmlal.u8    q10,d17,d30
    499 
    500     vld1.u32    {q6},[r4],r11
    501     vmlsl.u8    q10,d15,d29
    502 
    503     vld1.u32    {q7},[r4],r11
    504     vmlsl.u8    q10,d19,d31
    505 
    506     vld1.u32    {q8},[r4],r11
    507     vmull.u8    q5,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    508 
    509     vld1.u32    {q9},[r4],r11
    510     vmlal.u8    q5,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    511 
    512     add         r4,#8
    513     vmlsl.u8    q5,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    514     pld         [r12, r2, lsl #2]
    515     pld         [r4, r2, lsl #2]
    516     vst1.8      {q4},[r1]!                  @store the result pu1_dst
    517     vmlsl.u8    q5,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    518 
    519     addeq       r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
    520     vmlal.u8    q5,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    521 
    522     addeq       r4,r12,r2                   @pu1_src + src_strd
    523     vmlsl.u8    q5,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    524 
    525 @   and         r7, r12, #31
    526     vmlal.u8    q5,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    527 
    528     subeq       r14,r14,#2
    529     vmlsl.u8    q5,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    530 
    531     @cmp            r7, r0
    532     vmull.u8    q11,d3,d25
    533 
    534 @   pld     [r12, r2, lsl #2]
    535     vmlsl.u8    q11,d1,d24
    536 
    537     vst1.16     {q10},[r1]!
    538     vmlal.u8    q11,d7,d27
    539 
    540 @   pld     [r4, r2, lsl #2]
    541     vmlsl.u8    q11,d5,d26
    542 
    543 @   mov         r0, r7
    544     vmlal.u8    q11,d13,d28
    545 
    546     cmp         r14,#0
    547     vmlal.u8    q11,d17,d30
    548 
    549     vst1.16     {q5},[r6]!
    550     vmlsl.u8    q11,d15,d29
    551 
    552     vmlsl.u8    q11,d19,d31
    553 
    554     beq         epilog_16
    555 
    556     vld1.u32    {q0},[r12],r11              @vector load pu1_src
    557     vld1.u32    {q1},[r12],r11
    558     vld1.u32    {q2},[r12],r11
    559     vld1.u32    {q3},[r12],r11
    560     vld1.u32    {q6},[r12],r11
    561     vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    562     vld1.u32    {q7},[r12],r11
    563     vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    564     vld1.u32    {q8},[r12],r11
    565     vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    566     vld1.u32    {q9},[r12],r11
    567     vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    568     vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    569     cmp         r5,#0
    570     vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    571     moveq       r5,r10
    572     vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    573     vst1.8      {q11},[r6]!                 @store the result pu1_dst
    574     vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    575     addeq       r1,r6,r8,lsl #1
    576     addeq       r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    577     b           inner_loop_16
    578 
    579 
    580 epilog_16:
    581 @   vqrshrun.s16 d11,q11,#6
    582     vst1.8      {q11},[r6]!                 @store the result pu1_dst
    583 
    584     ldr         r7, [sp], #4
    585     ldr         r0, [sp], #4
    586     ldr         r10,[sp,#wd_offset]
    587     cmp         r10,#24
    588     beq         outer_loop8_residual
    589     add         r1,r6,r8,lsl #1
    590     ldr         r7,[sp,#ht_offset]                 @loads ht
    591     and         r7,r7,#1
    592     cmp         r7,#1
    593     beq         height_residue_4
    594 
    595 end_loops:
    596     vpop        {d8 - d15}
    597     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    598 
    599 
    600 
    601 
    602 
    603 
    604 
    605 
    606 
    607