@ arm/ihevc_inter_pred_luma_horz_w16out.s
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 
     20 @/**
     21 @******************************************************************************
     22 @* @file
     23 @*  ihevc_inter_pred_luma_horz_w16out.s
     24 @*
     25 @* @brief
     26 @*  contains function definitions for inter prediction  interpolation.
      27 @* functions are coded using neon intrinsics and can be compiled using
      28 @* rvct
      29 
     30 @*
     31 @* @author
     32 @*  parthiban v
     33 @*
     34 @* @par list of functions:
     35 @*
     36 @*  - ihevc_inter_pred_luma_horz_w16out()
     37 @*
     38 @* @remarks
     39 @*  none
     40 @*
     41 @*******************************************************************************
     42 @*/
     43 @/**
     44 @*******************************************************************************
     45 @*
     46 @* @brief
     47 @*   interprediction luma filter for horizontal 16bit output
     48 @*
     49 @* @par description:
     50 @*     applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
     51 @*     to the elements pointed by 'pu1_src' and  writes to the location pointed
      52 @*     by 'pi2_dst'. no downshifting or clipping is done and the output is used
      53 @*     as an input for vertical filtering or weighted prediction. assumptions:
     54 @*     the function is optimized considering the fact width is  multiple of 4 or
     55 @*     8. if width is multiple of 4 then height  should be multiple of 2, width 8
     56 @*     is optimized further.
     57 @*
     58 @* @param[in] pu1_src
     59 @*  uword8 pointer to the source
     60 @*
     61 @* @param[out] pi2_dst
     62 @*  word16 pointer to the destination
     63 @*
     64 @* @param[in] src_strd
     65 @*  integer source stride
     66 @*
     67 @* @param[in] dst_strd
     68 @*  integer destination stride
     69 @*
     70 @* @param[in] pi1_coeff
     71 @*  word8 pointer to the filter coefficients
     72 @*
     73 @* @param[in] ht
     74 @*  integer height of the array
     75 @*
     76 @* @param[in] wd
     77 @*  integer width of the array
     78 @*
     79 @* @returns
     80 @*
     81 @* @remarks
     82 @*  none
     83 @*
     84 @*******************************************************************************
     85 @*/
     86 
     87 @void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
     88 @                                word16 *pi2_dst,
     89 @                                word32 src_strd,
     90 @                                word32 dst_strd,
     91 @                                word8 *pi1_coeff,
     92 @                                word32 ht,
     93 @                                word32 wd
     94 
     95 
     96 @r0 - free
     97 @r1 - dst_ptr
     98 @r2 - src_strd
     99 @r3 - dst_strd
    100 @r4 - src_ptr2
    101 @r5 - inner loop counter
    102 @r6 - dst_ptr2
    103 @r7 - free
    104 @r8 - dst_strd2
    105 @r9 - src_strd1
    106 @r10 - wd
    107 @r11 - #1
    108 @r12 - src_ptr1
    109 @r14 - loop_counter
    110 .text
    111 .align 4
    112 
    113 
    114 
    115 
    116 
    117 .globl ihevc_inter_pred_luma_horz_w16out_a9q
    118 
    119 .type ihevc_inter_pred_luma_horz_w16out_a9q, %function
    120 
@ void ihevc_inter_pred_luma_horz_w16out_a9q(uword8 *pu1_src, word16 *pi2_dst,
@                                            word32 src_strd, word32 dst_strd,
@                                            word8 *pi1_coeff, word32 ht, word32 wd)
@ r0 = pu1_src, r1 = pi2_dst, r2 = src_strd, r3 = dst_strd; the remaining args
@ live on the stack at sp+40/44/48 once the 10-register frame below (10*4 = 40
@ bytes) has been pushed.  Output is raw 16-bit filter sums (no shift/clip).
ihevc_inter_pred_luma_horz_w16out_a9q:
    122 
    123     bic         r14, #1                     @ clear bit[0] (Thumb bit) of lr so the epilogue ldm-to-pc returns in ARM state
    124     stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    125     ldr         r4,[sp,#40]                 @loads pi1_coeff (first stack arg, past the 40-byte register frame)
    126     ldr         r7,[sp,#44]                 @loads ht
    127 
    128 
    129     vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    130     sub         r14,r7,#0                   @checks for ht == 0
    131     vabs.s8     d2,d0                       @vabs_s8(coeff) - keep magnitudes only; coefficient signs are baked into the vmlal/vmlsl choice per tap below
    132     mov         r11,#1                      @ +1 post-increment step for the overlapping byte-wise vld1 loads
    133     @ble       end_loops
    134     ldr         r10,[sp,#48]                @loads wd
    135     vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    136     sub         r12,r0,#3                   @pu1_src - 3
    137     vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    138     add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    139     vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    140     rsb         r9,r10,r2,lsl #1            @2*src_strd - wd
    141     vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    142     rsb         r8,r10,r3                   @dst_strd - wd
    143     vdup.8      d28,d2[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
    144 
    145     vdup.8      d29,d2[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    146     and         r7,r14,#1                   @calculating ht_residue ht_residue = (ht & 1)
    147     vdup.8      d30,d2[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    148     sub         r14,r14,r7                  @decrement height by ht_residue(residue value is calculated outside)
    149     vdup.8      d31,d2[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
    150 
    151     cmp         r7,#1
    152     beq         odd_height_decision
    153 
    154 even_height_decision:
@ ht is even here.  Dispatch on wd (r10): wd<=4 -> 4-wide path,
@ wd>=16 -> 16-wide path, otherwise 8-wide path.  For wd==24 and wd==12
@ the row-advance deltas r8/r9 are biased so the main loop skips the
@ leftover columns, which a residual pass fills in afterwards.
    155     mov         r7,r1                       @ keep pi2_dst base for the residual passes
    156     cmp         r10,#4
    157     ble         outer_loop_4
    158 
    159     cmp         r10,#24
    160     moveq       r10,#16                     @ wd==24: do 16 columns now, 8 left for outer_loop8_residual
    161     addeq       r8,#8
    162     addeq       r9,#8
    163 
    164     cmp         r10,#16
    165     bge         outer_loop_16_branch
    166 
    167     cmp         r10,#12
    168     addeq       r8,#4                       @ wd==12: do 8 columns now, 4 left for outer_loop4_residual
    169     addeq       r9,#4
    170 outer_loop_8_branch:
    171     b           outer_loop_8
    172 
    173 outer_loop_16_branch:
    174     b           outer_loop_16
    175 
    176 
    177 odd_height_decision:
@ ht is odd: widths 24 and 12 are re-routed to the narrower even-height
@ paths; the final odd row is handled by height_residue_4 at the end.
    178     cmp         r10,#24
    179     beq         outer_loop_8_branch
    180     cmp         r10,#12
    181     beq         outer_loop_4
    182     b           even_height_decision
    183 
    184 outer_loop4_residual:
@ Entered from the 8-wide path when wd==12: filter the remaining 4 columns.
    185     sub         r12,r0,#3                   @pu1_src - 3
    186     mov         r1,r7
    187     add         r1,#16                      @ skip the 8 columns (16 bytes of 16-bit output) already produced
    188     mov         r10,#4
    189     add         r12,#8
    190     mov         r14,#16                     @ NOTE(review): row count hard-coded to 16 - presumably wd==12 only occurs for ht==16 blocks; confirm against callers
    191     add         r8,#4
    192     add         r9,#4
    193 
    194 outer_loop_4:
@ Two rows per outer iteration, 4 output samples per inner pass.
    195     add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    196     add         r4,r12,r2                   @pu1_src + src_strd
    197 
    198     subs        r5,r10,#0                   @checks wd
    199     ble         end_inner_loop_4
    200 
    201 inner_loop_4:
@ Eight overlapping loads (step +1 via r11) put src[x..x+7] windows in
@ d0..d7; the second row is vzip'd into the upper 32-bit halves so one
@ q4 accumulator filters both rows at once (d8 = row i, d9 = row ii).
    202     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    203     vld1.u32    {d1},[r12],r11
    204     vld1.u32    {d2},[r12],r11
    205     vld1.u32    {d3},[r12],r11
    206     vld1.u32    {d4},[r12],r11
    207     vld1.u32    {d5},[r12],r11
    208     vld1.u32    {d6},[r12],r11
    209     vld1.u32    {d7},[r12],r11
    210     @add       r12,r12,#4                      @increment the input pointer
    211     sub         r12,r12,#4                  @ net advance +4 columns (8 loads of +1, back up 4)
    212     @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    213     @vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    214     @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    215 
    216     @vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    217     @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    218     @vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    219     @vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    220     vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
    221     vld1.u32    {d13},[r4],r11
    222     vzip.32     d0,d12                      @vector zip the i iteration and ii iteration into a single register
    223     vld1.u32    {d14},[r4],r11
    224     vzip.32     d1,d13
    225     vld1.u32    {d15},[r4],r11
    226     vzip.32     d2,d14
    227     vld1.u32    {d16},[r4],r11
    228     vzip.32     d3,d15
    229     vld1.u32    {d17},[r4],r11
    230     vzip.32     d4,d16
    231     vld1.u32    {d18},[r4],r11
    232     vzip.32     d5,d17
    233     vld1.u32    {d19},[r4],r11
    234     sub         r4,r4,#4                    @ net advance +4 columns on the second row
    235     @ add       r4,r4,#4                        @increment the input pointer
    236     @ vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    237     @ vext.u8   d15,d12,d13,#3                  @vector extract of src[0_3]
    238     @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    239     @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
    240     @ vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    241     @ vext.u8   d19,d12,d13,#7                  @vector extract of src[0_7]
    242     @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
    243 
    244 
    245 
    246 
    247 
    248 
    249 
    250     vzip.32     d6,d18
    251     vzip.32     d7,d19
    252 
@ 8-tap filter: the mlsl/mlal choice per tap bakes in the sign pattern
@ (-,+,-,+,+,-,+,-) applied to the |coeff| values loaded via vabs above.
    253     vmull.u8    q4,d1,d25                   @arithmetic operations for both rows at the same time
    254     vmlsl.u8    q4,d0,d24
    255     vmlsl.u8    q4,d2,d26
    256     vmlal.u8    q4,d3,d27
    257     vmlal.u8    q4,d4,d28
    258     vmlsl.u8    q4,d5,d29
    259     vmlal.u8    q4,d6,d30
    260     vmlsl.u8    q4,d7,d31
    261 
    262     @ vqrshrun.s16 d8,q4,#6                     @narrow right shift and saturating the result
    263     vst1.64     {d8},[r1]!                  @store the i iteration result which is in upper part of the register
    264     vst1.64     {d9},[r6]!                  @store the ii iteration result which is in lower part of the register
    265     subs        r5,r5,#4                    @decrement the wd by 4
    266     bgt         inner_loop_4
    267 
    268 end_inner_loop_4:
    269     subs        r14,r14,#2                  @decrement the ht by 2 (two rows per pass)
    270     add         r12,r12,r9                  @increment the input pointer 2*src_strd-wd
    271     add         r1,r6,r8,lsl #1             @increment the output pointer 2*dst_strd-wd
    272     bgt         outer_loop_4
    273 
    274 
    275 height_residue_4:
@ Handles the single leftover row when ht is odd, 4 columns per inner
@ pass.  ht is reloaded from sp+44 - valid because the r4-r12,lr frame
@ pushed at entry is still intact here.  If ht was even, return now.
    276 
    277     ldr         r7,[sp,#44]                 @loads ht
    278     and         r7,r7,#1                    @calculating ht_residue ht_residue = (ht & 1)
    279     cmp         r7,#0
    280     @beq       end_loops
    281     ldmeqfd     sp!,{r4-r12,r15}            @ no odd row: restore registers and return (loads pc)
    282 
    283 outer_loop_height_residue_4:
    284 
    285 
    286     subs        r5,r10,#0                   @checks wd
    287     ble         end_inner_loop_height_residue_4
    288 
    289 inner_loop_height_residue_4:
@ Same overlapping-load scheme as inner_loop_4, but one row only: loads
@ are interleaved with the 8-tap MACs to hide latency.
    290     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    291     vld1.u32    {d1},[r12],r11
    292 
    293 
    294 
    295 
    296 
    297 
    298     @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    299     @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    300     @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    301 
    302 
    303 
    304     @add        r12,r12,#4                      @increment the input pointer
    305     @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    306     @ vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    307     @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    308     @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    309     vld1.u32    {d2},[r12],r11
    310     vmull.u8    q4,d1,d25                   @ tap 1 (+); signs baked into mlal/mlsl choice as in inner_loop_4
    311     vld1.u32    {d3},[r12],r11
    312     vmlsl.u8    q4,d0,d24
    313     vld1.u32    {d4},[r12],r11
    314     vmlsl.u8    q4,d2,d26
    315     vld1.u32    {d5},[r12],r11
    316     vmlal.u8    q4,d3,d27
    317     vld1.u32    {d6},[r12],r11
    318     vmlal.u8    q4,d4,d28
    319     vld1.u32    {d7},[r12],r11
    320     vmlsl.u8    q4,d5,d29
    321     sub         r12,r12,#4                  @ net advance +4 columns
    322     vmlal.u8    q4,d6,d30
    323     vmlsl.u8    q4,d7,d31                   @ final tap of the 8-tap filter
    324     subs        r5,r5,#4                    @decrement the wd by 4
    325     vst1.64     {d8},[r1]!                  @ store 4 x 16-bit results
    326     bgt         inner_loop_height_residue_4
    327 
    328 end_inner_loop_height_residue_4:
    329     subs        r7,r7,#1                    @decrement the ht by 1 (single residual row)
    330     rsb         r9,r10,r2                   @ src_strd - wd (one-row advance, unlike the 2-row loops)
    331     add         r12,r12,r9                  @increment the input pointer src_strd-wd
    332     add         r1,r1,r8                    @increment the output pointer dst_strd-wd
    333     bgt         outer_loop_height_residue_4
    334 
    335     ldmfd       sp!,{r4-r12,r15}            @ restore registers and return (loads pc)
    336 
    337 outer_loop8_residual:
@ Entered from epilog_16 when wd==24: filter the remaining 8 columns.
    338     sub         r12,r0,#3                   @pu1_src - 3
    339     mov         r1,r7
    340     mov         r14,#32                     @ NOTE(review): row count hard-coded to 32 - presumably wd==24 only occurs for ht==32 blocks; confirm against callers
    341     add         r1,#32                      @ skip the 16 columns (32 bytes of 16-bit output) already produced
    342     add         r12,#16
    343     mov         r10,#8
    344     add         r8,#8
    345     add         r9,#8
    346 
    347 outer_loop_8:
@ Two rows per outer iteration, 8 output samples per inner pass; loads of
@ the second row are interleaved with the first row's MACs.
    348 
    349     add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    350     add         r4,r12,r2                   @pu1_src + src_strd
    351     subs        r5,r10,#0                   @checks wd
    352 
    353     ble         end_inner_loop_8
    354 
    355 inner_loop_8:
    356     vld1.u32    {d0},[r12],r11              @vector load pu1_src
    357     vld1.u32    {d1},[r12],r11
    358     vld1.u32    {d2},[r12],r11
    359     vld1.u32    {d3},[r12],r11
    360 
    361 
    362 
    363 
    364 
    365     @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    366     @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
    367     @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    368     @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
    369     @ vext.u8   d6,d0,d1,#6                     @vector extract of src [0_6]
    370     @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
    371     @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
    372     @ vext.u8   d14,d12,d13,#2
    373 
    374     @vext.u8    d15,d12,d13,#3                  @vector extract of src[0_3]
    375     @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    376     @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
    377     @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    378     @vext.u8    d19,d12,d13,#7                  @vector extract of src[0_7]
    379     @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
    380     vld1.u32    {d4},[r12],r11
    381     vmull.u8    q4,d1,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    382     vld1.u32    {d5},[r12],r11
    383     vmlal.u8    q4,d3,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    384     vld1.u32    {d6},[r12],r11
    385     vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    386     vld1.u32    {d7},[r12],r11
    387     vmlsl.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    388     vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
    389     vmlal.u8    q4,d4,d28                   @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    390     vld1.u32    {d13},[r4],r11
    391     vmlsl.u8    q4,d5,d29                   @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    392     vld1.u32    {d14},[r4],r11
    393     vmlal.u8    q4,d6,d30                   @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    394     vld1.u32    {d15},[r4],r11
    395     vmlsl.u8    q4,d7,d31                   @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    396     vld1.u32    {d16},[r4],r11              @vector load pu1_src + src_strd
    397 
    398     vmull.u8    q5,d15,d27                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    399     vld1.u32    {d17},[r4],r11
    400     vmlsl.u8    q5,d14,d26                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    401     vld1.u32    {d18},[r4],r11
    402     vmlal.u8    q5,d16,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    403     vld1.u32    {d19},[r4],r11              @vector load pu1_src + src_strd
    404     vmlsl.u8    q5,d17,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    405     @ vqrshrun.s16  d20,q4,#6                       @right shift and saturating narrow result 1
    406     vmlal.u8    q5,d18,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    407     vmlsl.u8    q5,d19,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    408     vst1.16     {q4},[r1]!                  @store the result pu1_dst (row i, 8 x 16-bit)
    409     vmlsl.u8    q5,d12,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    410     vmlal.u8    q5,d13,d25                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    411 
    412 
    413 
    414     @ vqrshrun.s16 d8,q5,#6                     @right shift and saturating narrow result 2
    415     subs        r5,r5,#8                    @decrement the wd loop
    416     vst1.16     {q5},[r6]!                  @store the result pu1_dst (row ii, 8 x 16-bit)
    417     cmp         r5,#4
    418     bgt         inner_loop_8
    419 
    420 end_inner_loop_8:
    421     subs        r14,r14,#2                  @decrement the ht loop (two rows per pass)
    422     add         r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
    423     add         r1,r6,r8,lsl #1             @increment the dst pointer by 2*dst_strd-wd
    424     bgt         outer_loop_8
    425 
    426 
    427 
    428 
    429 
    430     ldr         r10,[sp,#48]                @loads wd
    431     cmp         r10,#12
    432 
    433     beq         outer_loop4_residual        @ wd==12: finish the remaining 4 columns
    434 
    435     ldr         r7,[sp,#44]                 @loads ht
    436     and         r7,r7,#1
    437     cmp         r7,#1
    438     beq         height_residue_4            @ odd ht: one last row to filter
    439 
    440 @end_loops
    441 
    442     ldmfd       sp!,{r4-r12,r15}            @ restore registers and return (loads pc)
    443 
    444 
    445 
    446 
    447 
    448 outer_loop_16:
@ 16-wide path, software-pipelined: the first row-pair's loads and the
@ row-i MACs are primed here; inner_loop_16 then overlaps the next loads
@ with the current MACs and stores.  r0 and r7 are spilled because the
@ loop reuses them as scratch.
    449     str         r0, [sp, #-4]!
    450     str         r7, [sp, #-4]!
    451     add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    452     add         r4,r12,r2                   @pu1_src + src_strd
    453     and         r0, r12, #31                @ src alignment-within-32B; only referenced by the commented-out tuning code below
    454     sub         r5,r10,#0                   @checks wd
    455     @ble       end_loops1
    456     pld         [r12, r2, lsl #1]           @ prefetch src two rows ahead
    457     vld1.u32    {q0},[r12],r11              @vector load pu1_src
    458     pld         [r4, r2, lsl #1]
    459     vld1.u32    {q1},[r12],r11
    460     vld1.u32    {q2},[r12],r11
    461     vld1.u32    {q3},[r12],r11
    462     vld1.u32    {q6},[r12],r11
    463     vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    464     vld1.u32    {q7},[r12],r11
    465     vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    466     vld1.u32    {q8},[r12],r11
    467     vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    468     vld1.u32    {q9},[r12],r11
    469     vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    470     vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    471     vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    472     vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    473     vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    474 
    475 
    476 inner_loop_16:
    477 
    478 
    479     subs        r5,r5,#16                   @ consume 16 columns; Z flag steers the addeq/subeq row-advance below
    480     vmull.u8    q10,d3,d25                  @ q10: columns 8..15 of the current row
    481 
    482     add         r12,#8
    483     vmlsl.u8    q10,d1,d24
    484 
    485     vld1.u32    {q0},[r4],r11               @vector load pu1_src (second row of the pair)
    486     vmlal.u8    q10,d7,d27
    487 
    488     vld1.u32    {q1},[r4],r11
    489     vmlsl.u8    q10,d5,d26
    490 
    491     vld1.u32    {q2},[r4],r11
    492     vmlal.u8    q10,d13,d28
    493 
    494     vld1.u32    {q3},[r4],r11
    495     vmlal.u8    q10,d17,d30
    496 
    497     vld1.u32    {q6},[r4],r11
    498     vmlsl.u8    q10,d15,d29
    499 
    500     vld1.u32    {q7},[r4],r11
    501     vmlsl.u8    q10,d19,d31
    502 
    503     vld1.u32    {q8},[r4],r11
    504     vmull.u8    q5,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    505 
    506     vld1.u32    {q9},[r4],r11
    507     vmlal.u8    q5,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    508 
    509     add         r4,#8
    510     vmlsl.u8    q5,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    511     pld         [r12, r2, lsl #2]           @ prefetch four rows ahead
    512     pld         [r4, r2, lsl #2]
    513     vst1.8      {q4},[r1]!                  @store the result pu1_dst
    514     vmlsl.u8    q5,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    515 
    516     addeq       r12,r12,r9                  @increment the src pointer by 2*src_strd-wd (only when the row pair is done)
    517     vmlal.u8    q5,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    518 
    519     addeq       r4,r12,r2                   @pu1_src + src_strd
    520     vmlsl.u8    q5,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    521 
    522 @   and         r7, r12, #31
    523     vmlal.u8    q5,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    524 
    525     subeq       r14,r14,#2                  @ row pair finished (Z from the subs above): drop ht by 2
    526     vmlsl.u8    q5,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    527 
    528     @cmp            r7, r0
    529     vmull.u8    q11,d3,d25                  @ q11: columns 8..15 of the second row
    530 
    531 @   pld     [r12, r2, lsl #2]
    532     vmlsl.u8    q11,d1,d24
    533 
    534     vst1.16     {q10},[r1]!
    535     vmlal.u8    q11,d7,d27
    536 
    537 @   pld     [r4, r2, lsl #2]
    538     vmlsl.u8    q11,d5,d26
    539 
    540 @   mov         r0, r7
    541     vmlal.u8    q11,d13,d28
    542 
    543     cmp         r14,#0                      @ all rows consumed?
    544     vmlal.u8    q11,d17,d30
    545 
    546     vst1.16     {q5},[r6]!
    547     vmlsl.u8    q11,d15,d29
    548 
    549     vmlsl.u8    q11,d19,d31
    550 
    551     beq         epilog_16
    552 
@ Refill the pipeline: load the next first row and compute its low-half MACs.
    553     vld1.u32    {q0},[r12],r11              @vector load pu1_src
    554     vld1.u32    {q1},[r12],r11
    555     vld1.u32    {q2},[r12],r11
    556     vld1.u32    {q3},[r12],r11
    557     vld1.u32    {q6},[r12],r11
    558     vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
    559     vld1.u32    {q7},[r12],r11
    560     vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    561     vld1.u32    {q8},[r12],r11
    562     vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    563     vld1.u32    {q9},[r12],r11
    564     vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    565     vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
    566     cmp         r5,#0
    567     vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
    568     moveq       r5,r10                      @ wd counter exhausted: reset it for the next row pair
    569     vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
    570     vst1.8      {q11},[r6]!                 @store the result pu1_dst
    571     vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
    572     addeq       r1,r6,r8,lsl #1             @ advance dst to the next row pair
    573     addeq       r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    574     b           inner_loop_16
    575 
    576 
    577 epilog_16:
@ Flush the last pipelined result, pop the r0/r7 spills, then route any
@ leftover columns (wd==24) or the odd final row to the residual handlers.
    578 @   vqrshrun.s16 d11,q11,#6
    579     vst1.8      {q11},[r6]!                 @store the result pu1_dst
    580 
    581     ldr         r7, [sp], #4
    582     ldr         r0, [sp], #4
    583     ldr         r10,[sp,#48]                @ loads wd (sp is back at the register frame here)
    584     cmp         r10,#24
    585     beq         outer_loop8_residual
    586     add         r1,r6,r8,lsl #1
    587     ldr         r7,[sp,#44]                 @loads ht
    588     and         r7,r7,#1
    589     cmp         r7,#1
    590     beq         height_residue_4
    591 
    592 end_loops1:
    593 
    594     ldmfd       sp!,{r4-r12,r15}            @ restore registers and return (loads pc)
    595 
    596 
    597 
    598 
    599 
    600 
    601 
    602 
    603 
    604