Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //******************************************************************************
     20 //* //file
     21 //*  ihevc_inter_pred_filters_luma_vert_w16inp.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions for inter prediction  interpolation.
     25 //* functions are coded using neon  intrinsics and can be compiled using
     26 
     27 //* rvct
     28 //*
     29 //* //author
     30 //*  yogeswaran rs
     31 //*
     32 //* //par list of functions:
     33 //*
     34 //*  - ihevc_inter_pred_luma_vert_w16inp_w16out_av8()
     35 //*
     36 //* //remarks
     37 //*  none
     38 //*
     39 //*******************************************************************************
     40 //*/
     41 
     42 ///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
     43 ///* include reconstruction */
     44 //
     45 
     46 ///**
     47 //*******************************************************************************
     48 //*
     49 //* //brief
     50 //*    luma vertical filter for 16bit input and 16bit output.
     51 //*
     52 //* //par description:
     53 //*     applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
     54 //*     the elements pointed by 'pi2_src' and  writes to the location pointed by
     55 //*    'pu1_dst'.  input is 16 bits.  an offset of (8 << 16) is subtracted from
     56 //*     the filter sum, which is then downshifted by 6 and stored as 16 bits
     57 //*     without clipping.  assumptions : the code works on blocks of 4 columns
     58 //*     by 4 rows, so width and height are expected to be multiples of 4.
     59 //*
     60 //* //param[in] pi2_src
     61 //*  word16 pointer to the source
     62 //*
     63 //* //param[out] pu1_dst
     64 //*  pointer to the destination; results are stored as 16-bit values
     65 //*
     66 //* //param[in] src_strd
     67 //*  integer source stride
     68 //*
     69 //* //param[in] dst_strd
     70 //*  integer destination stride
     71 //*
     72 //* //param[in] pi1_coeff
     73 //*  word8 pointer to the filter coefficients
     74 //*
     75 //* //param[in] ht
     76 //*  integer height of the array
     77 //*
     78 //* //param[in] wd
     79 //*  integer width of the array
     80 //*
     81 //* //returns
     82 //*
     83 //* //remarks
     84 //*  none
     85 //*
     86 //*******************************************************************************
     87 //*/
     88 
     89 //void ihevc_inter_pred_luma_vert_w16inp_w16out_av8(word16 *pi2_src,
     90 //                                    word16 *pu1_dst,
     91 //                                    word32 src_strd,
     92 //                                    word32 dst_strd,
     93 //                                    word8 *pi1_coeff,
     94 //                                    word32 ht,
     95 //                                    word32 wd   )
     96 //**************variables vs registers*****************************************
     97 //  x0 => *pi2_src
     98 //  x1 => *pu1_dst
     99 //  x2 =>  src_strd
    100 //  x3 =>  dst_strd
    101 //  x4 => *pi1_coeff
    102 //  x5 =>  ht
    103 //  x6 =>  wd
    104 
    105 .text
    106 .align 4
    107 
    108 .include "ihevc_neon_macros.s"
    109 
    110 .globl ihevc_inter_pred_luma_vert_w16inp_w16out_av8
    111 
    112 .type ihevc_inter_pred_luma_vert_w16inp_w16out_av8, %function
    113 
    114 ihevc_inter_pred_luma_vert_w16inp_w16out_av8:
            //-----------------------------------------------------------------------
            // 8-tap vertical filter: 16-bit samples in, 16-bit samples out (AArch64).
            //
            // in:   x0 = source pointer (16-bit samples)
            //       x1 = destination pointer (16-bit samples are stored)
            //       w2 = src_strd, w3 = dst_strd (doubled below to get byte strides)
            //       x4 = pi1_coeff (8 signed byte coefficients)
            //       w5 = ht, w6 = wd
            // out:  dst = (sum(src_row[k] * coeff[k], k = 0..7) - (8 << 16)) >> 6,
            //       narrowed to 16 bits with shrn (truncating, no saturation).
            //       the 8 rows used start 3 rows above the output row.
            //
            // the loop works on tiles of 4 columns x 4 rows and counts
            // ht * (wd / 4) in steps of 4, so wd and ht are expected to be
            // multiples of 4.  returns without touching memory when ht <= 0.
            //
            // register roles after setup:
            //       x2 = source stride in bytes      x6 = destination stride in bytes
            //       x5 = wd                          x4 = columns left in current band
            //       x7 = tile counter (4 per tile)   x20 = scratch for csel
            //       x8 / x9 = source / destination adjustment that moves the
            //                 pointer from the end of a 4-row band to the next band
            //       x3 = running source row pointer  x14 = running destination row pointer
            //       v22-v29 = coefficients 0..7 (16-bit lanes), v30 = 8 << 16 offset
            //
            // the code is software pipelined: prolog computes the first tile,
            // kernel_8 stores rows of the previous tile while computing the next,
            // epilog / epilog_end drain the last tile.
            // inline comments that mention *_u8 intrinsics are inherited wording;
            // the instructions operate on signed 16-bit lanes.
            //
            // clobbers: x0-x9, x12, x14-x17, v0-v7, v16-v31, flags
            //           (x19, x20 are saved and restored on the stack)
            //-----------------------------------------------------------------------
    115 
    116     //stmfd     sp!, {r4-r12, r14}  //stack stores the values of the arguments
    117 
    118     stp         x19,x20,[sp, #-16]!
            // AAPCS64 leaves bits 63:32 of a 32-bit integer argument unspecified,
            // so sign-extend the word32 arguments before using them as 64-bit values.
            sxtw        x2,w2                       //src_strd
            sxtw        x3,w3                       //dst_strd
            sxtw        x5,w5                       //ht
            sxtw        x6,w6                       //wd
    119 
    120     mov         x15,x4 // pi1_coeff
    121     mov         x16,x5 // ht
    122     mov         x17,x6 // wd
    123 
    124 
    125     mov         x12,x15                     //load pi1_coeff
    126     lsl         x6,x3,#1                    //x6 = dst_strd in bytes (16-bit output)
    127     mov         x5,x17                      //load wd
    128     ld1         {v0.8b},[x12]               //coeff = ld1_s8(pi1_coeff)
    129     lsl         x2, x2,#1                   //x2 = src_strd in bytes (16-bit input)
    130     sub         x12,x2,x2,lsl #2            //x12 = -3 * source stride (bytes)
    131     //vabs.s8   d0,d0               //vabs_s8(coeff)
    132     add         x0,x0,x12                   //start reading 3 rows above the output row
    133     mov         x3,x16                      //load ht
    134     subs        x7,x3,#0                    //x7 = ht, flags set for the guard below
    135     ble         end_loops                   //ht <= 0: nothing to do
    136     sxtl        v0.8h,v0.8b
    137     dup         v22.4h,v0.h[0]              //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
    138     dup         v23.4h,v0.h[1]              //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
    139     dup         v24.4h,v0.h[2]              //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
    140     dup         v25.4h,v0.h[3]              //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
    141     dup         v26.4h,v0.h[4]              //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
    142     dup         v27.4h,v0.h[5]              //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
    143     dup         v28.4h,v0.h[6]              //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
    144     dup         v29.4h,v0.h[7]              //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
    145     movi        v30.4s,#8, lsl #16          //offset (8 << 16) subtracted before the shift
    146 
    147     sub         x9,x5,x6,lsl #2             //x9 = wd - 4 * dst stride (bytes)
    148     neg         x9,x9
    149     sub         x8,x5,x2,lsl #2             //x8 = wd - 4 * src stride (bytes)
    150     neg         x8,x8
    151     sub         x8,x8,x5                    //x8 = 4 * src stride (bytes) - 2 * wd
    152     sub         x9,x9,x5                    //x9 = 4 * dst stride (bytes) - 2 * wd
    153     lsr         x3, x5, #2                  //divide by 4
    154     mul         x7, x7, x3                  //multiply height by width
    155     sub         x7, x7, #4                  //one tile (4 counts) is handled outside kernel_8
    156     mov         x4,x5                       //r5 ->wd
    157     //mov           r2, r2, lsl #1
    158 
    159 prolog:
    160 
    161     add         x3,x0,x2                    //pu1_src_tmp += src_strd//
    162     ld1         {v1.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
    163     ld1         {v0.4h},[x0], #8            //src_tmp1 = ld1_u8(pu1_src_tmp)//
    164     subs        x4,x4,#4                    //4 columns consumed; le => band finished
    165     ld1         {v2.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
    166     smull       v19.4s,v1.4h,v23.4h         //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
    167     ld1         {v3.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
    168     smlal       v19.4s,v0.4h,v22.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
    169     ld1         {v4.4h},[x3],x2             //src_tmp1 = ld1_u8(pu1_src_tmp)//
    170     smlal       v19.4s,v2.4h,v24.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
    171     ld1         {v5.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
    172     smlal       v19.4s,v3.4h,v25.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
    173     ld1         {v6.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
    174     smlal       v19.4s,v4.4h,v26.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
    175     ld1         {v7.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
    176     smlal       v19.4s,v5.4h,v27.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
    177     smlal       v19.4s,v6.4h,v28.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
    178     smlal       v19.4s,v7.4h,v29.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
    179 
    180     ld1         {v16.4h},[x3],x2            //src_tmp1 = ld1_u8(pu1_src_tmp)//
    181 
    182     smull       v20.4s,v2.4h,v23.4h         //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
    183     add         x20,x0,x8,lsl #0
    184     csel        x0,x20,x0,le                //band finished: move source to the next 4 rows
    185     smlal       v20.4s,v1.4h,v22.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
    186     csel        x4,x5,x4,le                 //band finished: reload the column count
    187     smlal       v20.4s,v3.4h,v24.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
    188     ld1         {v17.4h},[x3],x2            //src_tmp2 = ld1_u8(pu1_src_tmp)//
    189     smlal       v20.4s,v4.4h,v25.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
    190     ld1         {v18.4h},[x3],x2            //src_tmp3 = ld1_u8(pu1_src_tmp)//
    191     smlal       v20.4s,v5.4h,v26.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
    192     add         x3,x0,x2                    //pu1_src_tmp += src_strd//
    193     smlal       v20.4s,v6.4h,v27.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
    194     smlal       v20.4s,v7.4h,v28.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
    195     smlal       v20.4s,v16.4h,v29.4h        //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
    196     sub         v19.4s, v19.4s, v30.4s
    197 
    198     ld1         {v1.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
    199     smull       v21.4s,v3.4h,v23.4h
    200     ld1         {v0.4h},[x0],#8             //src_tmp1 = ld1_u8(pu1_src_tmp)//
    201     smlal       v21.4s,v2.4h,v22.4h
    202     ld1         {v2.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
    203     smlal       v21.4s,v4.4h,v24.4h
    204     smlal       v21.4s,v5.4h,v25.4h
    205     smlal       v21.4s,v6.4h,v26.4h
    206     smlal       v21.4s,v7.4h,v27.4h
    207     smlal       v21.4s,v16.4h,v28.4h
    208     smlal       v21.4s,v17.4h,v29.4h
    209     add         x14,x1,x6                   //x14 = second row of the current tile
    210     sub         v20.4s, v20.4s, v30.4s
    211     shrn        v19.4h, v19.4s, #6
    212     //vqrshrun d8,q4,#6         //sto_res = vqmovun_s16(sto_res_tmp)//
    213 
    214     smull       v31.4s,v4.4h,v23.4h
    215     smlal       v31.4s,v3.4h,v22.4h
    216     smlal       v31.4s,v5.4h,v24.4h
    217     smlal       v31.4s,v6.4h,v25.4h
    218     ld1         {v3.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
    219     smlal       v31.4s,v7.4h,v26.4h
    220     ld1         {v4.4h},[x3],x2             //src_tmp1 = ld1_u8(pu1_src_tmp)//
    221     smlal       v31.4s,v16.4h,v27.4h
    222     ld1         {v5.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
    223     smlal       v31.4s,v17.4h,v28.4h
    224     ld1         {v6.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
    225     smlal       v31.4s,v18.4h,v29.4h
    226     ld1         {v7.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
    227 
    228     st1         {v19.2s},[x1],#8            //st1_u8(pu1_dst,sto_res)//
    229     sub         v21.4s, v21.4s, v30.4s
    230     shrn        v20.4h, v20.4s, #6
    231     //vqrshrun d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
    232     add         x20, x1, x9
    233     csel        x1, x20, x1, le             //flags still from the subs on x4 above
    234 
    235     subs        x7,x7,#4
    236 
    237 
    238     blt         epilog_end                  //jumps to epilog_end
    239     beq         epilog                      //jumps to epilog
    240 
    241 kernel_8:
    242 
    243     smull       v19.4s,v1.4h,v23.4h         //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
    244     subs        x4,x4,#4                    //4 columns consumed; le => band finished
    245     smlal       v19.4s,v0.4h,v22.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
    246     add         x20,x0,x8,lsl #0
    247     csel        x0,x20,x0,le
    248     smlal       v19.4s,v2.4h,v24.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
    249     smlal       v19.4s,v3.4h,v25.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
    250     smlal       v19.4s,v4.4h,v26.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
    251     smlal       v19.4s,v5.4h,v27.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
    252     smlal       v19.4s,v6.4h,v28.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
    253     smlal       v19.4s,v7.4h,v29.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
    254     st1         {v20.2s},[x14],x6           //st1_u8(pu1_dst_tmp,sto_res)//
    255 
    256     sub         v31.4S, v31.4s, v30.4s
    257     shrn        v21.4h, v21.4s, #6
    258     //vqrshrun d12,q6,#6
    259     ld1         {v16.4h},[x3],x2            //src_tmp1 = ld1_u8(pu1_src_tmp)//
    260 
    261     smull       v20.4s,v2.4h,v23.4h         //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
    262     smlal       v20.4s,v1.4h,v22.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
    263     smlal       v20.4s,v3.4h,v24.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
    264     smlal       v20.4s,v4.4h,v25.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
    265     smlal       v20.4s,v5.4h,v26.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
    266     smlal       v20.4s,v6.4h,v27.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
    267     st1         {v21.2s},[x14],x6
    268 
    269     smlal       v20.4s,v7.4h,v28.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
    270     ld1         {v17.4h},[x3],x2            //src_tmp2 = ld1_u8(pu1_src_tmp)//
    271 
    272     smlal       v20.4s,v16.4h,v29.4h        //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
    273 
    274     sub         v19.4s, v19.4s, v30.4s
    275     shrn        v31.4h, v31.4s, #6
    276     //vqrshrun d14,q7,#6
    277 
    278     smull       v21.4s,v3.4h,v23.4h
    279     csel        x4,x5,x4,le
    280 
    281     smlal       v21.4s,v2.4h,v22.4h
    282     ld1         {v18.4h},[x3],x2            //src_tmp3 = ld1_u8(pu1_src_tmp)//
    283 
    284     smlal       v21.4s,v4.4h,v24.4h
    285     add         x3,x0,x2                    //pu1_src_tmp += src_strd//
    286 
    287     smlal       v21.4s,v5.4h,v25.4h
    288 
    289     smlal       v21.4s,v6.4h,v26.4h
    290     st1         {v31.2s},[x14],x6
    291 
    292     smlal       v21.4s,v7.4h,v27.4h
    293     ld1         {v1.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
    294 
    295     smlal       v21.4s,v16.4h,v28.4h
    296     add         x14,x1,x6
    297 
    298     smlal       v21.4s,v17.4h,v29.4h
    299     ld1         {v0.4h},[x0],#8             //src_tmp1 = ld1_u8(pu1_src_tmp)//
    300 
    301     sub         v20.4s, v20.4s, v30.4s
    302     shrn        v19.4h, v19.4s, #6
    303     //vqrshrun d8,q4,#6         //sto_res = vqmovun_s16(sto_res_tmp)//
    304     ld1         {v2.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
    305 
    306     smull       v31.4s,v4.4h,v23.4h
    307     smlal       v31.4s,v3.4h,v22.4h
    308     smlal       v31.4s,v5.4h,v24.4h
    309     ld1         {v3.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
    310 
    311     smlal       v31.4s,v6.4h,v25.4h
    312     ld1         {v4.4h},[x3],x2             //src_tmp1 = ld1_u8(pu1_src_tmp)//
    313     smlal       v31.4s,v7.4h,v26.4h
    314     ld1         {v5.4h},[x3],x2             //src_tmp2 = ld1_u8(pu1_src_tmp)//
    315     smlal       v31.4s,v16.4h,v27.4h
    316     ld1         {v6.4h},[x3],x2             //src_tmp3 = ld1_u8(pu1_src_tmp)//
    317     smlal       v31.4s,v17.4h,v28.4h
    318     ld1         {v7.4h},[x3],x2             //src_tmp4 = ld1_u8(pu1_src_tmp)//
    319     smlal       v31.4s,v18.4h,v29.4h
    320     st1         {v19.2s},[x1],#8            //st1_u8(pu1_dst,sto_res)//
    321 
    322     sub         v21.4s, v21.4s, v30.4s
    323     shrn        v20.4h, v20.4s, #6
    324     add         x20, x1, x9
    325     csel        x1, x20, x1, le             //flags still from the subs on x4 above
    326 
    327     //vqrshrun d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
    328     subs        x7,x7,#4
    329 
    330     bgt         kernel_8                    //jumps to kernel_8
    331 
    332 epilog:
    333 
    334     smull       v19.4s,v1.4h,v23.4h         //mul_res1 = smull_u8(src_tmp2, coeffabs_1)//
    335     smlal       v19.4s,v0.4h,v22.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_0)//
    336     smlal       v19.4s,v2.4h,v24.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_2)//
    337     smlal       v19.4s,v3.4h,v25.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_3)//
    338     smlal       v19.4s,v4.4h,v26.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp1, coeffabs_4)//
    339     smlal       v19.4s,v5.4h,v27.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp2, coeffabs_5)//
    340     smlal       v19.4s,v6.4h,v28.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp3, coeffabs_6)//
    341     smlal       v19.4s,v7.4h,v29.4h         //mul_res1 = smlal_u8(mul_res1, src_tmp4, coeffabs_7)//
    342     st1         {v20.2s},[x14],x6
    343 
    344     sub         v31.4s, v31.4s, v30.4s
    345     shrn        v21.4h, v21.4s, #6
    346     //vqrshrun d12,q6,#6
    347 
    348     ld1         {v16.4h},[x3],x2            //src_tmp1 = ld1_u8(pu1_src_tmp)//
    349     smull       v20.4s,v2.4h,v23.4h         //mul_res2 = smull_u8(src_tmp3, coeffabs_1)//
    350     smlal       v20.4s,v1.4h,v22.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_0)//
    351     smlal       v20.4s,v3.4h,v24.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_2)//
    352     smlal       v20.4s,v4.4h,v25.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_3)//
    353     smlal       v20.4s,v5.4h,v26.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp2, coeffabs_4)//
    354     smlal       v20.4s,v6.4h,v27.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp3, coeffabs_5)//
    355     smlal       v20.4s,v7.4h,v28.4h         //mul_res2 = smlal_u8(mul_res2, src_tmp4, coeffabs_6)//
    356     smlal       v20.4s,v16.4h,v29.4h        //mul_res2 = smlal_u8(mul_res2, src_tmp1, coeffabs_7)//
    357     st1         {v21.2s},[x14],x6
    358 
    359     sub         v19.4s, v19.4s, v30.4s
    360     shrn        v31.4h, v31.4s, #6
    361     //vqrshrun d14,q7,#6
    362 
    363     ld1         {v17.4h},[x3],x2            //src_tmp2 = ld1_u8(pu1_src_tmp)//
    364     smull       v21.4s,v3.4h,v23.4h
    365     smlal       v21.4s,v2.4h,v22.4h
    366     smlal       v21.4s,v4.4h,v24.4h
    367     smlal       v21.4s,v5.4h,v25.4h
    368     smlal       v21.4s,v6.4h,v26.4h
    369     smlal       v21.4s,v7.4h,v27.4h
    370     smlal       v21.4s,v16.4h,v28.4h
    371     smlal       v21.4s,v17.4h,v29.4h
    372     st1         {v31.2s},[x14],x6
    373     sub         v20.4s, v20.4s, v30.4s
    374     shrn        v19.4h, v19.4s, #6
    375     //vqrshrun d8,q4,#6         //sto_res = vqmovun_s16(sto_res_tmp)//
    376 
    377     ld1         {v18.4h},[x3],x2            //src_tmp3 = ld1_u8(pu1_src_tmp)//
    378     smull       v31.4s,v4.4h,v23.4h
    379     smlal       v31.4s,v3.4h,v22.4h
    380     smlal       v31.4s,v5.4h,v24.4h
    381     smlal       v31.4s,v6.4h,v25.4h
    382     smlal       v31.4s,v7.4h,v26.4h
    383     smlal       v31.4s,v16.4h,v27.4h
    384     smlal       v31.4s,v17.4h,v28.4h
    385     smlal       v31.4s,v18.4h,v29.4h
    386     sub         v21.4s, v21.4s, v30.4s
    387     shrn        v20.4h, v20.4s, #6
    388     //vqrshrun d10,q5,#6            //sto_res = vqmovun_s16(sto_res_tmp)//
    389 
    390     add         x14,x1,x6
    391     st1         {v19.2s},[x1],#8            //st1_u8(pu1_dst,sto_res)//
    392 
    393 epilog_end:
            // drain: rows 2-4 of the last tile are still held in v20, v21 and v31
    394     st1         {v20.2s},[x14],x6           //st1_u8(pu1_dst_tmp,sto_res)//
    395     shrn        v21.4h, v21.4s, #6
    396     //vqrshrun d12,q6,#6
    397 
    398     st1         {v21.2s},[x14],x6
    399     sub         v31.4s, v31.4s, v30.4s
    400     shrn        v31.4h, v31.4s, #6
    401     //vqrshrun d14,q7,#6
    402 
    403     st1         {v31.2s},[x14],x6
    404 
    405 
    406 end_loops:
    407 
    408     //ldmfd     sp!,{r4-r12,r15}            //reload the registers from sp
    409     ldp         x19, x20,[sp], #16
    410 
    411     ret
    412 
    413 
    414 
    415 
    416 
    417 
    418 
    419