@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */



@/**
@*******************************************************************************
@*
@* @brief
@*     interprediction luma filter for vertical input
@*
@* @par description:
@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*    the elements pointed to by 'pu1_src' and writes to the location pointed
@*    to by 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
@*    assumptions: the function is optimized assuming the width is a multiple
@*    of 4 or 8 and the height is a multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert (
@                            uword8 *pu1_src,
@                            uword8 *pu1_dst,
@                            word32 src_strd,
@                            word32 dst_strd,
@                            word8 *pi1_coeff,
@                            word32 ht,
@                            word32 wd   )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r6 =>  dst_strd
@   r12 => *pi1_coeff
@   r5 =>  ht
@   r3 =>  wd

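@/* for reference, a minimal c sketch of the behaviour implemented below
@   (an illustrative reconstruction from the description above, not the
@   verbatim code of ihevc_inter_pred_filters.c; typedefs assumed to be
@   uword8 = uint8_t, word8 = int8_t, word32 = int32_t):
@
@   void luma_vert_sketch(uword8 *pu1_src, uword8 *pu1_dst,
@                         word32 src_strd, word32 dst_strd,
@                         word8 *pi1_coeff, word32 ht, word32 wd)
@   {
@       for(word32 row = 0; row < ht; row++)
@       {
@           for(word32 col = 0; col < wd; col++)
@           {
@               word32 sum = 0;
@               for(word32 i = 0; i < 8; i++)         /* 8-tap filter */
@                   sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
@               sum = (sum + 32) >> 6;                /* round, downshift by 6 */
@               pu1_dst[col] = (uword8)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
@           }
@           pu1_src += src_strd;
@           pu1_dst += dst_strd;
@       }
@   }
@*/
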
.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112
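@ the offsets above locate the stack arguments after the prologue below:
@ stmfd {r4-r12, r14} pushes 10 registers (40 bytes) and vpush {d8 - d15}
@ another 64 bytes, so the 5th argument (pi1_coeff) lands at sp + 104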

.text
.align 4
.syntax unified



.globl ihevc_inter_pred_luma_vert_a9q

.type ihevc_inter_pred_luma_vert_a9q, %function

ihevc_inter_pred_luma_vert_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}

    ldr         r12,[sp,#coeff_offset]      @load pi1_coeff
    mov         r6,r3
    ldr         r5,[sp,#wd_offset]          @load wd
    vld1.u8     {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd
    vabs.s8     d0,d0                       @vabs_s8(coeff)
    add         r0,r0,r12                   @pu1_src -= 3*src_strd, point to the first tap row
    ldr         r3,[sp,#ht_offset]          @load ht
    subs        r7,r3,#0                    @r3->ht
    @ble        end_loops                   @end loop jump
    vdup.u8     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp         r5,#8
    vdup.u8     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8     d26,d0[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8     d27,d0[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8     d28,d0[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8     d29,d0[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
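@ note: the hevc luma filter taps alternate in sign, so vabs above makes all
@ taps non-negative and the sign is re-applied below by accumulating with
@ vmlal for the positive taps (d23,d25,d26,d28) and vmlsl for the negative
@ taps (d22,d24,d27,d29)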
    blt         core_loop_wd_4              @core loop wd 4 jump
    str         r0, [sp, #-4]!
    str         r1, [sp, #-4]!

    bic         r4,r5,#7                    @r4 = wd & ~7, the 8-aligned part of wd
    rsb         r9,r4,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r4,r2,lsl #2             @r2->src_strd
    mov         r3, r5, lsr #3              @divide wd by 8
    mul         r7, r3                      @r7 = ht * (wd/8)
    sub         r7, #4                      @reserve one 4-row iteration for the epilog
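
@ the wd >= 8 path is software pipelined over 8x4 blocks: 'prolog' primes the
@ first results, 'kernel_8' overlaps the loads and multiply-accumulates of the
@ next four rows with the stores of the previous four, and
@ 'epilog'/'epilog_end' drain the remaining results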

prolog:

    and         r10, r0, #31
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#8
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld         [r3]
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld         [r3, r2]
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add         r3, r3, r2
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d2,d22
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9

    vmull.u8    q7,d4,d23
    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6


    blt         epilog_end                  @jumps to epilog_end
    beq         epilog                      @jumps to epilog

kernel_8:

    subs        r4,r4,#8
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12},[r14],r6

@   and         r11, r0, #31
    vqrshrun.s16 d14,q7,#6

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vst1.8      {d14},[r14],r6
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    add         r14,r1,#0
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add         r1, r1, #8
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    addle       r1,r1,r9
    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

@   cmp         r11, r10
    vmull.u8    q6,d3,d23

    add         r10, r3, r2, lsl #3         @ 10*strd - 8+2
    vmlsl.u8    q6,d2,d22

    add         r10, r10, r2                @ 11*strd
    vmlsl.u8    q6,d4,d24

    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q6,d5,d25

    vmlal.u8    q6,d6,d26
    vst1.8      {d8},[r14],r6               @vst1_u8(pu1_dst,sto_res)@

    pld         [r10]                       @11+ 0
    vmlsl.u8    q6,d7,d27

    pld         [r10, r2]                   @11+ 1*strd
    vmlal.u8    q6,d16,d28

    pld         [r10, r2, lsl #1]           @11+ 2*strd
    vmlsl.u8    q6,d17,d29

    add         r10, r10, r2                @12*strd
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    pld         [r10, r2, lsl #1]           @11+ 3*strd
    vmull.u8    q7,d4,d23

@   mov         r10, r11
    vmlsl.u8    q7,d3,d22

    subs        r7,r7,#4
    vmlsl.u8    q7,d5,d24

    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vqrshrun.s16 d12,q6,#6
    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@



    bgt         kernel_8                    @jumps to kernel_8

epilog:

    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12},[r14],r6

    vqrshrun.s16 d14,q7,#6

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8      {d14},[r14],r6

    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vmlsl.u8    q6,d2,d22
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vmlal.u8    q7,d7,d26
    vmlsl.u8    q7,d16,d27
    vmlal.u8    q7,d17,d28
    vmlsl.u8    q7,d18,d29

    vst1.8      {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6

epilog_end:
    vst1.8      {d12},[r14],r6
    vqrshrun.s16 d14,q7,#6

    vst1.8      {d14},[r14],r6


end_loops:
    tst         r5,#7
    ldr         r1, [sp], #4
    ldr         r0, [sp], #4

    beq         end1

    mov         r5, #4
    add         r0, r0, #8
    add         r1, r1, #8
    mov         r7, #16
    @

core_loop_wd_4:
    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r5,r2,lsl #2             @r2->src_strd
    vmov.i8     d4,#0

outer_loop_wd_4:
    subs        r12,r5,#0
    ble         end_inner_loop_wd_4         @outer loop jump

inner_loop_wd_4:
    add         r3,r0,r2
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs        r12,r12,#4
    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32    {d4[0]},[r0]                @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8    q0,d5,d23                   @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add         r0,r0,#4
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8    q0,d4,d22                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q0,d6,d24                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8    q4,d7,d23
    vdup.u32    d4,d7[1]                    @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8    q1,d7,d25                   @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8    q4,d6,d22
    vmlal.u8    q0,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8    q4,d4,d24
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8    q1,d5,d27                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8    q4,d5,d25
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8    q0,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8    q4,d6,d26
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q1,d7,d29                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32    d4,d7[1]
    vadd.i16    q0,q0,q1                    @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8    q4,d7,d27
    vld1.u32    {d4[1]},[r3],r2
    vmlal.u8    q4,d4,d28
    vdup.u32    d5,d4[1]
    vqrshrun.s16 d0,q0,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32    {d5[1]},[r3]
    add         r3,r1,r6
    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@

    vmlsl.u8    q4,d5,d29
    vst1.32     {d0[1]},[r3],r6             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
    vqrshrun.s16 d8,q4,#6

    vst1.32     {d8[0]},[r3],r6
    add         r1,r1,#4
    vst1.32     {d8[1]},[r3]
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r7,r7,#4
    add         r1,r1,r9
    add         r0,r0,r8
    bgt         outer_loop_wd_4

end1:
    vpop        {d8 - d15}
    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp



@/**
@*******************************************************************************
@*
@* @brief
@*     interprediction luma filter for vertical 16bit output
@*
@* @par description:
@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*    the elements pointed to by 'pu1_src' and writes to the location pointed
@*    to by 'pi2_dst'. no downshifting or clipping is done and the output is
@*    used as an input for weighted prediction. assumptions: the function is
@*    optimized assuming the width is a multiple of 4 or 8 and the height is
@*    a multiple of 2.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
@                                    word16 *pi2_dst,
@                                    word32 src_strd,
@                                    word32 dst_strd,
@                                    word8 *pi1_coeff,
@                                    word32 ht,
@                                    word32 wd   )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pi2_dst
@   r2 =>  src_strd
@   r6 =>  dst_strd
@   r12 => *pi1_coeff
@   r5 =>  ht
@   r3 =>  wd

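@/* for reference, a minimal c sketch of the w16out behaviour (an
@   illustrative reconstruction from the description above, not the
@   verbatim reference code; word16 assumed to be int16_t): the 8-tap sum
@   is written out as a raw 16-bit intermediate, with no rounding, shift
@   or clipping, for later use by weighted prediction:
@
@   void luma_vert_w16out_sketch(uword8 *pu1_src, word16 *pi2_dst,
@                                word32 src_strd, word32 dst_strd,
@                                word8 *pi1_coeff, word32 ht, word32 wd)
@   {
@       for(word32 row = 0; row < ht; row++)
@       {
@           for(word32 col = 0; col < wd; col++)
@           {
@               word32 sum = 0;
@               for(word32 i = 0; i < 8; i++)         /* 8-tap filter */
@                   sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
@               pi2_dst[col] = (word16)sum;           /* raw 16-bit output */
@           }
@           pu1_src += src_strd;
@           pi2_dst += dst_strd;
@       }
@   }
@*/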


.globl ihevc_inter_pred_luma_vert_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}

    ldr         r12,[sp,#coeff_offset]      @load pi1_coeff
    mov         r6,r3
    ldr         r5,[sp,#wd_offset]          @load wd
    vld1.u8     {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd
    vabs.s8     d0,d0                       @vabs_s8(coeff)
    add         r0,r0,r12                   @pu1_src -= 3*src_strd, point to the first tap row
    ldr         r3,[sp,#ht_offset]          @load ht
    subs        r7,r3,#0                    @r3->ht
    @ble        end_loops_16out             @end loop jump
    vdup.u8     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp         r5,#8
    vdup.u8     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8     d26,d0[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8     d27,d0[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8     d28,d0[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8     d29,d0[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    blt         core_loop_wd_4_16out        @core loop wd 4 jump
    str         r0, [sp, #-4]!
    str         r1, [sp, #-4]!

    bic         r4,r5,#7                    @r5 ->wd
    rsb         r9,r4,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r4,r2,lsl #2             @r2->src_strd
    mov         r6, r6, lsl #1              @dst_strd in bytes (16-bit output samples)
    mov         r3, r5, lsr #3              @divide wd by 8
    mul         r7, r3                      @r7 = ht * (wd/8)
    sub         r7, #4                      @reserve one 4-row iteration for the epilog

prolog_16out:

    and         r10, r0, #31
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@

    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#8
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


    addle       r0,r0,r8
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld         [r3]
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld         [r3, r2]
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    add         r3, r3, r2
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    pld         [r3, r2, lsl #1]
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmull.u8    q6,d3,d23
    vld1.u8     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d2,d22
    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q6,d4,d24
    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8, d9},[r1]!              @vst1_u8(pu1_dst,sto_res)@
    @vqrshrun.s16 d10,q5,#6                 @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9,lsl #1

    vmull.u8    q7,d4,d23
    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26
    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27
    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28
    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29
    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8      {d10, d11},[r14],r6         @vst1_u8(pu1_dst_tmp,sto_res)@
    @vqrshrun.s16 d12,q6,#6


    blt         epilog_end_16out
    beq         epilog_16out                @jumps to epilog

kernel_8_16out:

    subs        r4,r4,#8
    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle       r0,r0,r8
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    bicle       r4,r5,#7                    @r5 ->wd
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vst1.8      {d12,d13},[r14],r6
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


@   and         r11, r0, #31
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vst1.8      {d14,d15},[r14],r6
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    add         r14,r1,r6
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vld1.u8     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    vst1.8      {d8,d9},[r1]!               @vst1_u8(pu1_dst,sto_res)@
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    addle       r1,r1,r9,lsl #1
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

@   cmp         r11, r10
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add         r10, r3, r2, lsl #3         @ 10*strd - 8+2
    vmull.u8    q6,d3,d23

    add         r10, r10, r2                @ 11*strd
    vmlsl.u8    q6,d2,d22

    pld         [r10]                       @11+ 0
    vmlsl.u8    q6,d4,d24

    pld         [r10, r2]                   @11+ 1*strd
    vmlal.u8    q6,d5,d25

    pld         [r10, r2, lsl #1]           @11+ 2*strd
    vmlal.u8    q6,d6,d26

    add         r10, r10, r2                @12*strd
    vmlsl.u8    q6,d7,d27

    pld         [r10, r2, lsl #1]           @11+ 3*strd
    vmlal.u8    q6,d16,d28

@   mov         r10, r11
    vmlsl.u8    q6,d17,d29

    vld1.u8     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23

    subs        r7,r7,#4
    vmlsl.u8    q7,d3,d22

    vst1.8      {d10, d11},[r14],r6         @vst1_u8(pu1_dst_tmp,sto_res)@
    vmlsl.u8    q7,d5,d24

    vld1.u8     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d6,d25

    vld1.u8     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d7,d26

    vld1.u8     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d16,d27

    vld1.u8     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8    q7,d17,d28

    vld1.u8     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8    q7,d18,d29


    bgt         kernel_8_16out              @jumps to kernel_8

epilog_16out:

    vmull.u8    q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8    q4,d0,d22                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8    q4,d2,d24                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8    q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8    q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8    q4,d5,d27                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8    q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8    q4,d7,d29                   @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8      {d12,d13},[r14],r6

    @vqrshrun.s16 d14,q7,#6

    vld1.u8     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8    q5,d1,d22                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8    q5,d3,d24                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8    q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8    q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8    q5,d6,d27                   @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8    q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8    q5,d16,d29                  @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8      {d14,d15},[r14],r6

    @vqrshrun.s16 d8,q4,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q6,d3,d23
    vmlsl.u8    q6,d2,d22
    vmlsl.u8    q6,d4,d24
    vmlal.u8    q6,d5,d25
    vmlal.u8    q6,d6,d26
    vmlsl.u8    q6,d7,d27
    vmlal.u8    q6,d16,d28
    vmlsl.u8    q6,d17,d29
    add         r14,r1,r6
    vst1.8      {d8,d9},[r1]!               @vst1_u8(pu1_dst,sto_res)@
    @vqrshrun.s16 d10,q5,#6                 @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8    q7,d4,d23
    vmlsl.u8    q7,d3,d22
    vmlsl.u8    q7,d5,d24
    vmlal.u8    q7,d6,d25
    vmlal.u8    q7,d7,d26
    vmlsl.u8    q7,d16,d27
    vmlal.u8    q7,d17,d28
    vmlsl.u8    q7,d18,d29

    vst1.8      {d10,d11},[r14],r6          @vst1_u8(pu1_dst_tmp,sto_res)@
    @vqrshrun.s16 d12,q6,#6

epilog_end_16out:
    vst1.8      {d12,d13},[r14],r6
    @vqrshrun.s16 d14,q7,#6

    vst1.8      {d14,d15},[r14],r6


end_loops_16out:
    tst         r5,#7
    ldr         r1, [sp], #4
    ldr         r0, [sp], #4

    beq         end2

    mov         r5, #4
    add         r0, r0, #8
    add         r1, r1, #16
    mov         r7, #16
    mov         r6, r6, lsr #1

    @

core_loop_wd_4_16out:
    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5  ->wd
    rsb         r8,r5,r2,lsl #2             @r2->src_strd
    vmov.i8     d4,#0
    mov         r6, r6, lsl #1

outer_loop_wd_4_16out:
    subs        r12,r5,#0
    ble         end_inner_loop_wd_4_16out   @outer loop jump

inner_loop_wd_4_16out:
    add         r3,r0,r2
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs        r12,r12,#4
    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32    {d4[0]},[r0]                @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8    q0,d5,d23                   @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add         r0,r0,#4
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8    q0,d4,d22                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q0,d6,d24                   @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8    q4,d7,d23
    vdup.u32    d4,d7[1]                    @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8    q1,d7,d25                   @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32    {d4[1]},[r3],r2             @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8    q4,d6,d22
    vmlal.u8    q0,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32    d5,d4[1]                    @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8    q4,d4,d24
    vld1.u32    {d5[1]},[r3],r2             @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8    q1,d5,d27                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32    d6,d5[1]                    @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8    q4,d5,d25
    vld1.u32    {d6[1]},[r3],r2             @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8    q0,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32    d7,d6[1]                    @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8    q4,d6,d26
    vld1.u32    {d7[1]},[r3],r2             @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8    q1,d7,d29                   @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32    d4,d7[1]
    vadd.i16    q0,q0,q1                    @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8    q4,d7,d27
    vld1.u32    {d4[1]},[r3],r2
    vmlal.u8    q4,d4,d28
    vdup.u32    d5,d4[1]
    @vqrshrun.s16 d0,q0,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32    {d5[1]},[r3]
    add         r3,r1,r6
    vst1.32     {d0},[r1]!                  @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@

    vmlsl.u8    q4,d5,d29
    vst1.32     {d1},[r3],r6                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
    @vqrshrun.s16 d8,q4,#6

    vst1.32     {d8},[r3],r6
    @add        r1,r1,#4
    vst1.32     {d9},[r3]
    bgt         inner_loop_wd_4_16out

end_inner_loop_wd_4_16out:
    subs        r7,r7,#4
    add         r1,r1,r9,lsl #1
    add         r0,r0,r8
    bgt         outer_loop_wd_4_16out

end2:
    vpop        {d8 - d15}
    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp