@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */


@/**
@*******************************************************************************
@*
@* @brief
@*     inter prediction luma filter for vertical input
@*
@* @par description:
@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*    the elements pointed to by 'pu1_src' and writes to the location pointed
@*    to by 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
@*    assumptions: the function is optimized considering the fact that width
@*    is a multiple of 4 or 8 and height is a multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert (
@                            uword8 *pu1_src,
@                            uword8 *pu1_dst,
@                            word32 src_strd,
@                            word32 dst_strd,
@                            word8 *pi1_coeff,
@                            word32 ht,
@                            word32 wd   )

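@/* a minimal c reference sketch of the filtering performed below (an
@   assumption reconstructed from the description above and from the note
@   that this file replicates ihevc_inter_pred_filters.c; CLIP_U8 is a
@   hypothetical helper that clamps to [0, 255]):
@
@       for(row = 0; row < ht; row++)
@       {
@           for(col = 0; col < wd; col++)
@           {
@               word32 sum = 0;
@               for(i = 0; i < 8; i++)
@                   sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
@               pu1_dst[col] = CLIP_U8((sum + 32) >> 6);
@           }
@           pu1_src += src_strd;
@           pu1_dst += dst_strd;
@       }
@
@   the (sum + 32) >> 6 with clipping is what vqrshrun.s16 #6 computes per
@   lane in the assembly below. */
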
@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r6 =>  dst_strd
@   r12 => *pi1_coeff
@   r5 =>  wd
@   r3 =>  ht
.text
.align 4
.syntax unified



.globl ihevc_inter_pred_luma_vert_a9q

.type ihevc_inter_pred_luma_vert_a9q, %function

ihevc_inter_pred_luma_vert_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r12,[sp,#40]                @load pi1_coeff
    mov         r6,r3                       @r6 = dst_strd
    ldr         r5,[sp,#48]                 @load wd
    vld1.u8     {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    sub         r12,r2,r2,lsl #2            @r12 = -3 * src_strd
    vabs.s8     d0,d0                       @vabs_s8(coeff)
    add         r0,r0,r12                   @pu1_src -= 3 * src_strd (first filter tap)
    ldr         r3,[sp,#44]                 @load ht
    subs        r7,r3,#0                    @r7 = ht, set flags
    @ble        end_loops                   @end loop jump
    vdup.u8     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp         r5,#8
    vdup.u8     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8     d26,d0[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8     d27,d0[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8     d28,d0[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8     d29,d0[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
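@ the luma filter taps are signed, but vmull/vmlal/vmlsl below operate on
@ unsigned bytes: vabs.s8 above keeps only the tap magnitudes, and the sign
@ of each tap is baked into the instruction choice (vmlal for taps 1, 3, 4
@ and 6, vmlsl for taps 0, 2, 5 and 7). e.g. with the quarter-pel filter
@ {-1, 4, -10, 58, 17, -5, 1, 0} a column accumulates as
@ 4*s1 - 1*s0 - 10*s2 + 58*s3 + 17*s4 - 5*s5 + 1*s6 - 0*s7.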
    blt         core_loop_wd_4              @core loop wd 4 jump
    str         r0, [sp, #-4]!
    str         r1, [sp, #-4]!

    bic         r4,r5,#7                    @r4 = wd & ~7
    rsb         r9,r4,r6,lsl #2             @r9 = 4*dst_strd - (wd & ~7)
    rsb         r8,r4,r2,lsl #2             @r8 = 4*src_strd - (wd & ~7)
    mov         r3, r5, lsr #3              @divide wd by 8
    mul         r7, r3                      @r7 = ht * (wd / 8) block count
    sub         r7, #4                      @subtract one kernel iteration (4) for the epilog

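@ the wd >= 8 path below is software-pipelined: 'prolog' primes the first
@ loads and results, 'kernel_8' then produces four filtered rows per
@ iteration while prefetching (pld) the source rows needed next, and
@ 'epilog'/'epilog_end' drain the results still in flight.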
prolog:

    and         r10, r0, #31
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#8
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld         [r3]
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld         [r3, r2]
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add         r3, r3, r2
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d2,d22
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9

    vmull.u8    q7,d4,d23
    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6


    blt         epilog_end                  @jumps to epilog_end
    beq         epilog                      @jumps to epilog

kernel_8:

    subs        r4,r4,#8
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12},[r14],r6

@   and         r11, r0, #31
    vqrshrun.s16 d14,q7,#6

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vst1.8      {d14},[r14],r6
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    add         r14,r1,#0
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add         r1, r1, #8
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    addle       r1,r1,r9
    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

@   cmp         r11, r10
    vmull.u8    q6,d3,d23

    add         r10, r3, r2, lsl #3         @ 10*strd - 8+2
    vmlsl.u8    q6,d2,d22

    add         r10, r10, r2                @ 11*strd
    vmlsl.u8    q6,d4,d24

    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q6,d5,d25

    vmlal.u8    q6,d6,d26
    vst1.8      {d8},[r14],r6               @vst1_u8(pu1_dst,sto_res)@

    pld         [r10]                       @11+ 0
    vmlsl.u8    q6,d7,d27

    pld         [r10, r2]                   @11+ 1*strd
    vmlal.u8    q6,d16,d28

    pld         [r10, r2, lsl #1]           @11+ 2*strd
    vmlsl.u8    q6,d17,d29

    add         r10, r10, r2                @12*strd
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    pld         [r10, r2, lsl #1]           @11+ 3*strd
    vmull.u8    q7,d4,d23

@   mov         r10, r11
    vmlsl.u8    q7,d3,d22

    subs        r7,r7,#4
    vmlsl.u8    q7,d5,d24

    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vqrshrun.s16 d12,q6,#6
    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@



    bgt         kernel_8                    @jumps to kernel_8

epilog:

    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12},[r14],r6

    vqrshrun.s16 d14,q7,#6

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8      {d14},[r14],r6

    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vmlsl.u8    q6,d2,d22
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vmlal.u8    q7,d7,d26
    vmlsl.u8    q7,d16,d27
    vmlal.u8    q7,d17,d28
    vmlsl.u8    q7,d18,d29

    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6

epilog_end:
    vst1.8      {d12},[r14],r6
    vqrshrun.s16 d14,q7,#6

    vst1.8      {d14},[r14],r6


end_loops:
    tst         r5,#7                       @return if wd is a multiple of 8;
    ldr         r1, [sp], #4                @else fall through to the last 4 columns
    ldr         r0, [sp], #4

    ldmfdeq     sp!,{r4-r12,r15}            @reload the registers from sp
    mov         r5, #4
    add         r0, r0, #8
    add         r1, r1, #8
    mov         r7, #16
    @

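@ the wd 4 path packs two consecutive 4-pixel rows into the two u32 lanes of
@ each d register, so every multiply-accumulate below filters two output
@ rows at once.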
core_loop_wd_4:
    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r5,r2,lsl #2             @r2->src_strd
    vmov.i8     d4,#0

outer_loop_wd_4:
    subs        r12,r5,#0
    ble         end_inner_loop_wd_4         @outer loop jump

inner_loop_wd_4:
    add         r3,r0,r2
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs        r12,r12,#4
    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32    {d4[0]},[r0]                @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8    q0,d5,d23                   @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add         r0,r0,#4
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8    q0,d4,d22                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q0,d6,d24                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8    q4,d7,d23
    vdup.u32    d4,d7[1]                    @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8    q1,d7,d25                   @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8    q4,d6,d22
    vmlal.u8    q0,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8    q4,d4,d24
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8    q1,d5,d27                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8    q4,d5,d25
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8    q0,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8    q4,d6,d26
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q1,d7,d29                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32    d4,d7[1]
    vadd.i16    q0,q0,q1                    @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8    q4,d7,d27
    vld1.u32    {d4[1]},[r3],r2
    vmlal.u8    q4,d4,d28
    vdup.u32    d5,d4[1]
    vqrshrun.s16 d0,q0,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32    {d5[1]},[r3]
    add         r3,r1,r6
    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@

    vmlsl.u8    q4,d5,d29
    vst1.32     {d0[1]},[r3],r6             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
    vqrshrun.s16 d8,q4,#6

    vst1.32     {d8[0]},[r3],r6
    add         r1,r1,#4
    vst1.32     {d8[1]},[r3]
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r7,r7,#4
    add         r1,r1,r9
    add         r0,r0,r8
    bgt         outer_loop_wd_4

    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp


@/**
@*******************************************************************************
@*
@* @brief
@*     inter prediction luma filter for vertical 16-bit output
@*
@* @par description:
@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*    the elements pointed to by 'pu1_src' and writes to the location pointed
@*    to by 'pi2_dst'. no downshifting or clipping is done, and the output is
@*    used as an input for weighted prediction.
@*    assumptions: the function is optimized considering the fact that width
@*    is a multiple of 4 or 8 and height is a multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
@                                    word16 *pi2_dst,
@                                    word32 src_strd,
@                                    word32 dst_strd,
@                                    word8 *pi1_coeff,
@                                    word32 ht,
@                                    word32 wd   )

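@/* a minimal c reference sketch of the w16out variant (an assumption
@   reconstructed from the description above): the taps are applied exactly
@   as in ihevc_inter_pred_luma_vert, but the raw filtered sums are stored
@   as 16-bit data without the rounding shift or the clip, for the
@   weighted-prediction stage to consume:
@
@       for(row = 0; row < ht; row++)
@       {
@           for(col = 0; col < wd; col++)
@           {
@               word32 sum = 0;
@               for(i = 0; i < 8; i++)
@                   sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
@               pi2_dst[col] = (word16)sum;
@           }
@           pu1_src += src_strd;
@           pi2_dst += dst_strd;
@       }
@
@   this is why the stores below write whole q registers (e.g. vst1.8
@   {d8,d9}) and every vqrshrun.s16 from the 8-bit variant is commented
@   out. */
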
@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pi2_dst
@   r2 =>  src_strd
@   r6 =>  dst_strd
@   r12 => *pi1_coeff
@   r5 =>  wd
@   r3 =>  ht


.globl ihevc_inter_pred_luma_vert_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r12,[sp,#40]                @load pi1_coeff
    mov         r6,r3                       @r6 = dst_strd
    ldr         r5,[sp,#48]                 @load wd
    vld1.u8     {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    sub         r12,r2,r2,lsl #2            @r12 = -3 * src_strd
    vabs.s8     d0,d0                       @vabs_s8(coeff)
    add         r0,r0,r12                   @pu1_src -= 3 * src_strd (first filter tap)
    ldr         r3,[sp,#44]                 @load ht
    subs        r7,r3,#0                    @r7 = ht, set flags
    @ble        end_loops_16out             @end loop jump
    vdup.u8     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp         r5,#8
    vdup.u8     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8     d26,d0[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8     d27,d0[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8     d28,d0[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8     d29,d0[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    blt         core_loop_wd_4_16out        @core loop wd 4 jump
    str         r0, [sp, #-4]!
    str         r1, [sp, #-4]!

    bic         r4,r5,#7                    @r4 = wd & ~7
    rsb         r9,r4,r6,lsl #2             @r9 = 4*dst_strd - (wd & ~7)
    rsb         r8,r4,r2,lsl #2             @r8 = 4*src_strd - (wd & ~7)
    mov         r6, r6, lsl #1              @dst_strd in bytes (16-bit output samples)
    mov         r3, r5, lsr #3              @divide wd by 8
    mul         r7, r3                      @r7 = ht * (wd / 8) block count
    sub         r7, #4                      @subtract one kernel iteration (4) for the epilog

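@ same software pipeline as the 8-bit variant above; the only differences
@ are that dst_strd is scaled to bytes (16-bit samples) and the stores keep
@ the full 16-bit accumulators instead of narrowing with vqrshrun.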
prolog_16out:

    and         r10, r0, #31
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@

    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#8
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


    addle       r0,r0,r8
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld         [r3]
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld         [r3, r2]
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    add         r3, r3, r2
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmull.u8    q6,d3,d23
    vld1.u8     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d2,d22
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d4,d24
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8, d9},[r1]!              @vst1_u8(pu1_dst,sto_res)@
    @vqrshrun.s16 d10,q5,#6                 @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9,lsl #1

    vmull.u8    q7,d4,d23
    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8      {d10, d11},[r14],r6         @vst1_u8(pu1_dst_tmp,sto_res)@
    @vqrshrun.s16 d12,q6,#6


    blt         epilog_end_16out
    beq         epilog_16out                @jumps to epilog
kernel_8_16out:

    subs        r4,r4,#8
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vst1.8      {d12,d13},[r14],r6
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


@   and         r11, r0, #31
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vst1.8      {d14,d15},[r14],r6
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    add         r14,r1,r6
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    vst1.8      {d8,d9},[r1]!               @vst1_u8(pu1_dst,sto_res)@
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    addle       r1,r1,r9,lsl #1
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

@   cmp         r11, r10
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r10, r3, r2, lsl #3         @ 10*strd - 8+2
    vmull.u8    q6,d3,d23

    add         r10, r10, r2                @ 11*strd
    vmlsl.u8    q6,d2,d22

    pld         [r10]                       @11+ 0
    vmlsl.u8    q6,d4,d24

    pld         [r10, r2]                   @11+ 1*strd
    vmlal.u8    q6,d5,d25

    pld         [r10, r2, lsl #1]           @11+ 2*strd
    vmlal.u8    q6,d6,d26

    add         r10, r10, r2                @12*strd
    vmlsl.u8    q6,d7,d27

    pld         [r10, r2, lsl #1]           @11+ 3*strd
    vmlal.u8    q6,d16,d28

@   mov         r10, r11
    vmlsl.u8    q6,d17,d29

    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23

    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22

    vst1.8      {d10, d11},[r14],r6         @vst1_u8(pu1_dst_tmp,sto_res)@
    vmlsl.u8    q7,d5,d24

    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d6,d25

    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26

    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27

    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28

    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29


    bgt         kernel_8_16out              @jumps to kernel_8

epilog_16out:

    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12,d13},[r14],r6

    @vqrshrun.s16 d14,q7,#6

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8      {d14,d15},[r14],r6

    @vqrshrun.s16 d8,q4,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vmlsl.u8    q6,d2,d22
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8,d9},[r1]!               @vst1_u8(pu1_dst,sto_res)@
    @vqrshrun.s16 d10,q5,#6                 @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vmlal.u8    q7,d7,d26
    vmlsl.u8    q7,d16,d27
    vmlal.u8    q7,d17,d28
    vmlsl.u8    q7,d18,d29

    vst1.8      {d10,d11},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@
    @vqrshrun.s16 d12,q6,#6

epilog_end_16out:
    vst1.8      {d12,d13},[r14],r6
    @vqrshrun.s16 d14,q7,#6

    vst1.8      {d14,d15},[r14],r6


end_loops_16out:
    tst         r5,#7                       @return if wd is a multiple of 8;
    ldr         r1, [sp], #4                @else fall through to the last 4 columns
    ldr         r0, [sp], #4

    ldmfdeq     sp!,{r4-r12,r15}            @reload the registers from sp
    mov         r5, #4
    add         r0, r0, #8
    add         r1, r1, #16                 @8 columns * 2 bytes per 16-bit sample
    mov         r7, #16
    mov         r6, r6, lsr #1              @dst_strd back to 16-bit sample units

    @

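@ as in the 8-bit variant, two 4-pixel rows share each d register on input;
@ each filtered q register then holds two rows of four 16-bit outputs,
@ stored one d register (one row) at a time.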
core_loop_wd_4_16out:
    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r5,r2,lsl #2             @r2->src_strd
    vmov.i8     d4,#0
    mov         r6, r6, lsl #1

outer_loop_wd_4_16out:
    subs        r12,r5,#0
    ble         end_inner_loop_wd_4_16out   @outer loop jump

inner_loop_wd_4_16out:
    add         r3,r0,r2
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs        r12,r12,#4
    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32    {d4[0]},[r0]                @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8    q0,d5,d23                   @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add         r0,r0,#4
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8    q0,d4,d22                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q0,d6,d24                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8    q4,d7,d23
    vdup.u32    d4,d7[1]                    @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8    q1,d7,d25                   @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8    q4,d6,d22
    vmlal.u8    q0,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8    q4,d4,d24
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8    q1,d5,d27                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8    q4,d5,d25
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8    q0,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8    q4,d6,d26
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q1,d7,d29                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32    d4,d7[1]
    vadd.i16    q0,q0,q1                    @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8    q4,d7,d27
    vld1.u32    {d4[1]},[r3],r2
    vmlal.u8    q4,d4,d28
    vdup.u32    d5,d4[1]
    @vqrshrun.s16 d0,q0,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32    {d5[1]},[r3]
    add         r3,r1,r6
    vst1.32     {d0},[r1]!                  @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@

    vmlsl.u8    q4,d5,d29
    vst1.32     {d1},[r3],r6                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
    @vqrshrun.s16 d8,q4,#6

    vst1.32     {d8},[r3],r6
    @add        r1,r1,#4
    vst1.32     {d9},[r3]
    bgt         inner_loop_wd_4_16out

end_inner_loop_wd_4_16out:
    subs        r7,r7,#4
    add         r1,r1,r9,lsl #1
    add         r0,r0,r8
    bgt         outer_loop_wd_4_16out

    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp

    948