///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//******************************************************************************
//* //file
//*  ihevc_inter_pred_luma_horz_w16out.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//* functions are coded using neon intrinsics and can be compiled using rvct
//*
//* //author
//*  parthiban v
//*
//* //par list of functions:
//*
//*  - ihevc_inter_pred_luma_horz_w16out()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*   inter prediction luma filter for horizontal 16-bit output
//*
//* //par description:
//*     applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
//*     to the elements pointed to by 'pu1_src' and writes the result to the
//*     location pointed to by 'pi2_dst'. no downshifting or clipping is done,
//*     so the output can be used as an input for vertical filtering or
//*     weighted prediction.
//*
//*     assumptions: the function is optimized assuming the width is a
//*     multiple of 4 or 8. if the width is a multiple of 4 then the height
//*     should be a multiple of 2; width 8 is optimized further.
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
//                                word16 *pi2_dst,
//                                word32 src_strd,
//                                word32 dst_strd,
//                                word8 *pi1_coeff,
//                                word32 ht,
//                                word32 wd)

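//for reference, a scalar C sketch of the computation implemented below
//(an illustrative sketch only, assuming the 8-tap filter semantics described
//above; the _ref name and the <stdint.h> types are not part of this file):
//
//    #include <stdint.h>
//
//    void ihevc_inter_pred_luma_horz_w16out_ref(uint8_t *pu1_src,
//                                               int16_t *pi2_dst,
//                                               int32_t src_strd,
//                                               int32_t dst_strd,
//                                               int8_t *pi1_coeff,
//                                               int32_t ht,
//                                               int32_t wd)
//    {
//        for(int32_t row = 0; row < ht; row++)
//        {
//            for(int32_t col = 0; col < wd; col++)
//            {
//                int32_t sum = 0;
//                /* taps run over src[col - 3] .. src[col + 4] */
//                for(int32_t k = 0; k < 8; k++)
//                    sum += (int32_t)pi1_coeff[k] *
//                           (int32_t)pu1_src[row * src_strd + col + k - 3];
//                /* 16-bit intermediate: no rounding, shift or clip */
//                pi2_dst[row * dst_strd + col] = (int16_t)sum;
//            }
//        }
//    }
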
//x0 - free
//x1 - dst_ptr
//x2 - src_strd
//x3 - dst_strd
//x8 - src_ptr2
//x9 - inner loop counter
//x10 - dst_ptr2
//x11 - free
//x12 - dst_strd2
//x13 - src_strd1
//x14 - wd
//x15 - #1
//x16 - src_ptr1
//x19 - loop_counter
.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_horz_w16out_av8

.type ihevc_inter_pred_luma_horz_w16out_av8, %function

ihevc_inter_pred_luma_horz_w16out_av8:

    // stmfd sp!, {x8-x16, x19}                //stack stores the values of the arguments
    push_v_regs
    stp         x19, x20,[sp,#-16]!
    mov         x20,#1
    bic         x19, x19, x20               // clearing bit[0], so that it goes back to mode
    mov         x8,x4                       //loads pi1_coeff
    mov         x11,x5                      //loads ht


    ld1         {v0.8b},[x8]                //coeff = vld1_s8(pi1_coeff)
    sub         x19,x11,#0                  //checks for ht == 0
    abs         v2.8b, v0.8b                //vabs_s8(coeff)
    mov         x15,#1
    //ble          end_loops
    mov         x14,x6                      //loads wd
    dup         v24.8b, v2.8b[0]            //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         x16,x0,#3                   //pu1_src - 3
    dup         v25.8b, v2.8b[1]            //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         x8,x16,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd
    dup         v26.8b, v2.8b[2]            //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    sub         x20,x14,x2,lsl #1           //2*src_strd - wd
    neg         x13, x20
    dup         v27.8b, v2.8b[3]            //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    sub         x20,x14,x3                  //dst_strd - wd
    neg         x12, x20
    dup         v28.8b, v2.8b[4]            //coeffabs_4 = vdup_lane_u8(coeffabs, 4)

    dup         v29.8b, v2.8b[5]            //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    and         x11,x19,#1                  //calculating ht_residue = (ht & 1)
    dup         v30.8b, v2.8b[6]            //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    sub         x19,x19,x11                 //decrement height by ht_residue (residue value is calculated outside)
    dup         v31.8b, v2.8b[7]            //coeffabs_7 = vdup_lane_u8(coeffabs, 7)

    cmp         x11,#1
    beq         odd_height_decision

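//width/height dispatch (descriptive note): wd <= 4 takes the 4-wide path,
//wd >= 16 the 16-wide path and the remaining widths the 8-wide path.
//wd == 24 is handled as a 16-wide pass plus an 8-wide residual pass and
//wd == 12 as an 8-wide pass plus a 4-wide residual pass; an odd ht leaves
//one residual row which is filtered in height_residue_4.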
even_height_decision:
    mov         x11,x1
    cmp         x14,#4
    ble         outer_loop_4

    cmp         x14,#24
    mov         x20,#16
    csel        x14, x20, x14,eq
    add         x20, x12,#8
    csel        x12, x20, x12,eq
    add         x20, x13,#8
    csel        x13, x20, x13,eq

    cmp         x14,#16
    bge         outer_loop_16_branch

    cmp         x14,#12
    add         x20, x12,#4
    csel        x12, x20, x12,eq
    add         x20, x13,#4
    csel        x13, x20, x13,eq
outer_loop_8_branch:
    b           outer_loop_8

outer_loop_16_branch:
    b           outer_loop_16


odd_height_decision:
    cmp         x14,#24
    beq         outer_loop_8_branch
    cmp         x14,#12
    beq         outer_loop_4
    b           even_height_decision

outer_loop4_residual:
    sub         x16,x0,#3                   //pu1_src - 3
    mov         x1,x11
    add         x1, x1,#16
    mov         x14,#4
    add         x16, x16,#8
    mov         x19,#16
    add         x12, x12,#4
    add         x13, x13,#4

outer_loop_4:
    add         x10,x1,x3,lsl #1            //pu1_dst + dst_strd
    add         x8,x16,x2                   //pu1_src + src_strd

    subs        x9,x14,#0                   //checks wd
    ble         end_inner_loop_4

inner_loop_4:
    mov         x15,#1
    ld1         {v20.2s},[x16],x15          //vector load pu1_src
    ld1         {v21.2s},[x16],x15
    ld1         {v22.2s},[x8],x15           //vector load pu1_src + src_strd
    ld1         {v23.2s},[x8],x15

    zip1        v0.2s, v20.2s, v22.2s
    zip2        v12.2s, v20.2s, v22.2s      //vector zip the i iteration and ii iteration in single register
    zip1        v1.2s, v21.2s, v23.2s
    zip2        v13.2s, v21.2s, v23.2s
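    //note: loading with a one-byte post-increment produces the eight shifted
    //source vectors directly (the vext based extraction is kept in the
    //comments further below), and zip1/zip2 interleave row i and row i+1 so
    //a single multiply-accumulate chain filters both rows at once.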

    ld1         {v20.2s},[x16],x15
    ld1         {v21.2s},[x16],x15
    ld1         {v22.2s},[x8],x15
    ld1         {v23.2s},[x8],x15

    zip1        v2.2s, v20.2s, v22.2s
    zip2        v14.2s, v20.2s, v22.2s
    zip1        v3.2s, v21.2s, v23.2s
    zip2        v15.2s, v21.2s, v23.2s

    ld1         {v20.2s},[x16],x15
    ld1         {v21.2s},[x16],x15
    ld1         {v22.2s},[x8],x15
    ld1         {v23.2s},[x8],x15

    zip1        v4.2s, v20.2s, v22.2s
    zip2        v16.2s, v20.2s, v22.2s
    zip1        v5.2s, v21.2s, v23.2s
    zip2        v17.2s, v21.2s, v23.2s

    ld1         {v20.2s},[x16],x15
    ld1         {v21.2s},[x16],x15
    ld1         {v22.2s},[x8],x15
    ld1         {v23.2s},[x8],x15

    //add        x16,x16,#4                        //increment the input pointer
    sub         x16,x16,#4
    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    //vext.u8    d3,d0,d1,#3                        //vector extract of src[0_3]
    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]

    //vext.u8    d5,d0,d1,#5                        //vector extract of src[0_5]
    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
    //vext.u8    d7,d0,d1,#7                        //vector extract of src[0_7]
    //vext.u8    d1,d0,d1,#1                        //vector extract of src[0_1]
    sub         x8,x8,#4
    // add        x8,x8,#4                        //increment the input pointer
    // vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
    // vext.u8    d15,d12,d13,#3                    //vector extract of src[0_3]
    // vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    // vext.u8    d17,d12,d13,#5                    //vector extract of src[0_5]
    // vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
    // vext.u8    d19,d12,d13,#7                    //vector extract of src[0_7]
    //vext.u8    d13,d12,d13,#1                    //vector extract of src[0_1]


    zip1        v6.2s, v20.2s, v22.2s
    zip2        v18.2s, v20.2s, v22.2s
    zip1        v7.2s, v21.2s, v23.2s
    zip2        v19.2s, v21.2s, v23.2s

    umull       v8.8h, v1.8b, v25.8b        //arithmetic operations for the i and ii iterations at the same time
    umlsl       v8.8h, v0.8b, v24.8b
    umlsl       v8.8h, v2.8b, v26.8b
    umlal       v8.8h, v3.8b, v27.8b
    umlal       v8.8h, v4.8b, v28.8b
    umlsl       v8.8h, v5.8b, v29.8b
    umlal       v8.8h, v6.8b, v30.8b
    umlsl       v8.8h, v7.8b, v31.8b

    // vqrshrun.s16 d8,q4,#6                        //narrow right shift and saturating the result
    st1         {v8.d}[0],[x1],#8           //store the i iteration result, which is in the lower half of the register
    st1         {v8.d}[1],[x10],#8          //store the ii iteration result, which is in the upper half of the register
    subs        x9,x9,#4                    //decrement the wd by 4
    bgt         inner_loop_4

end_inner_loop_4:
    subs        x19,x19,#2                  //decrement the ht by 2
    add         x16,x16,x13                 //increment the input pointer 2*src_strd-wd
    add         x1,x10,x12,lsl #1           //increment the output pointer 2*dst_strd-wd
    bgt         outer_loop_4


height_residue_4:

    mov         x11,x5                      //loads ht
    and         x11,x11,#1                  //calculating ht_residue = (ht & 1)
    cmp         x11,#0
    //beq        end_loops
    // ldmeqfd sp!,{x8-x16,pc}                  //reload the registers from sp
    bne         lbl280
    ldp         x19, x20,[sp], #16
    pop_v_regs
    ret
lbl280:

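//residual-row path: filters the single remaining row (odd ht), producing
//four 16-bit outputs per inner-loop iteration with the same coefficient
//registers as the main loops.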
outer_loop_height_residue_4:

    subs        x9,x14,#0                   //checks wd
    ble         end_inner_loop_height_residue_4

inner_loop_height_residue_4:
    mov         x15, #1
    ld1         {v0.2s},[x16],x15           //vector load pu1_src
    ld1         {v1.2s},[x16],x15

    // vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    // vext.u8    d3,d0,d1,#3                        //vector extract of src[0_3]
    // vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]

    //add        x16,x16,#4                        //increment the input pointer
    // vext.u8    d5,d0,d1,#5                        //vector extract of src[0_5]
    // vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
    // vext.u8    d7,d0,d1,#7                        //vector extract of src[0_7]
    // vext.u8    d1,d0,d1,#1                        //vector extract of src[0_1]
    ld1         {v2.2s},[x16],x15
    umull       v8.8h, v1.8b, v25.8b        //filter the single residual row
    ld1         {v3.2s},[x16],x15
    umlsl       v8.8h, v0.8b, v24.8b
    ld1         {v4.2s},[x16],x15
    umlsl       v8.8h, v2.8b, v26.8b
    ld1         {v5.2s},[x16],x15
    umlal       v8.8h, v3.8b, v27.8b
    ld1         {v6.2s},[x16],x15
    umlal       v8.8h, v4.8b, v28.8b
    ld1         {v7.2s},[x16],x15
    umlsl       v8.8h, v5.8b, v29.8b
    sub         x16,x16,#4
    umlal       v8.8h, v6.8b, v30.8b
    umlsl       v8.8h, v7.8b, v31.8b
    subs        x9,x9,#4                    //decrement the wd by 4
    st1         {v8.d}[0],[x1],#8           //store the result
    bgt         inner_loop_height_residue_4

end_inner_loop_height_residue_4:
    subs        x11,x11,#1                  //decrement the ht by 1
    sub         x20,x14,x2
    neg         x13, x20
    add         x16,x16,x13                 //increment the input pointer src_strd-wd
    add         x1,x1,x12                   //increment the output pointer dst_strd-wd
    bgt         outer_loop_height_residue_4

    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
    ret

outer_loop8_residual:
    sub         x16,x0,#3                   //pu1_src - 3
    mov         x1,x11
    mov         x19,#32
    add         x1, x1,#32
    add         x16, x16,#16
    mov         x14,#8
    add         x12, x12,#8
    add         x13, x13,#8

outer_loop_8:

    add         x10,x1,x3,lsl #1            //pu1_dst + dst_strd
    add         x8,x16,x2                   //pu1_src + src_strd
    subs        x9,x14,#0                   //checks wd

    ble         end_inner_loop_8

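//the 8-wide path filters two rows per outer iteration (x16/x1 for row i,
//x8/x10 for row i+1) and stores eight 16-bit results per row per inner
//iteration, without rounding or narrowing.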
inner_loop_8:
    mov         x15, #1
    ld1         {v0.2s},[x16],x15           //vector load pu1_src
    ld1         {v1.2s},[x16],x15
    ld1         {v2.2s},[x16],x15
    ld1         {v3.2s},[x16],x15

    // vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    // vext.u8    d3,d0,d1,#3                        //vector extract of src[0_3]
    // vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
    // vext.u8    d5,d0,d1,#5                        //vector extract of src[0_5]
    // vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
    // vext.u8    d7,d0,d1,#7                        //vector extract of src[0_7]
    // vext.u8    d1,d0,d1,#1                        //vector extract of src[0_1]
    // vext.u8    d14,d12,d13,#2

    //vext.u8    d15,d12,d13,#3                    //vector extract of src[0_3]
    // vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    // vext.u8    d17,d12,d13,#5                    //vector extract of src[0_5]
    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
    //vext.u8    d19,d12,d13,#7                    //vector extract of src[0_7]
    //vext.u8    d13,d12,d13,#1                    //vector extract of src[0_1]
    ld1         {v4.2s},[x16],x15
    umull       v8.8h, v1.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    ld1         {v5.2s},[x16],x15
    umlal       v8.8h, v3.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         {v6.2s},[x16],x15
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         {v7.2s},[x16],x15
    umlsl       v8.8h, v2.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    ld1         {v12.2s},[x8],x15           //vector load pu1_src + src_strd
    umlal       v8.8h, v4.8b, v28.8b        //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    ld1         {v13.2s},[x8],x15
    umlsl       v8.8h, v5.8b, v29.8b        //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    ld1         {v14.2s},[x8],x15
    umlal       v8.8h, v6.8b, v30.8b        //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    ld1         {v15.2s},[x8],x15
    umlsl       v8.8h, v7.8b, v31.8b        //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    ld1         {v16.2s},[x8],x15           //vector load pu1_src + src_strd

    umull       v10.8h, v15.8b, v27.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         {v17.2s},[x8],x15
    umlsl       v10.8h, v14.8b, v26.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    ld1         {v18.2s},[x8],x15
    umlal       v10.8h, v16.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    ld1         {v19.2s},[x8],x15           //vector load pu1_src + src_strd
    umlsl       v10.8h, v17.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    // vqrshrun.s16     d20,q4,#6                        //right shift and saturating narrow result 1
    umlal       v10.8h, v18.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    umlsl       v10.8h, v19.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    st1         {v8.8h},[x1],#16            //store the result pu1_dst
    umlsl       v10.8h, v12.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlal       v10.8h, v13.8b, v25.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    // vqrshrun.s16 d8,q5,#6                        //right shift and saturating narrow result 2
    subs        x9,x9,#8                    //decrement the wd loop
    st1         {v10.8h},[x10],#16          //store the result pu1_dst
    cmp         x9,#4
    bgt         inner_loop_8

end_inner_loop_8:
    subs        x19,x19,#2                  //decrement the ht loop
    add         x16,x16,x13                 //increment the src pointer by 2*src_strd-wd
    add         x1,x10,x12,lsl #1           //increment the dst pointer by 2*dst_strd-wd
    bgt         outer_loop_8


    mov         x14,x6                      //loads wd
    cmp         x14,#12

    beq         outer_loop4_residual

    mov         x11,x5                      //loads ht
    and         x11,x11,#1
    cmp         x11,#1
    beq         height_residue_4

//end_loops

    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
    ret


outer_loop_16:
    mov         x15, #-7
    stp         x0,x11,[sp,#-16]!
    add         x10,x1,x3,lsl #1            //pu1_dst + dst_strd
    add         x8,x16,x2                   //pu1_src + src_strd
    and         x0, x16, #31
    sub         x9,x14,#0                   //checks wd
    //ble          end_loops1
    add         x20,x16, x2, lsl #1
    prfm        PLDL1KEEP,[x20]
    ld1         {v0.2s},[x16],#8            //vector load pu1_src
    ld1         {v1.2s},[x16],x15           //vector load pu1_src
    add         x20,x8, x2, lsl #1
    prfm        PLDL1KEEP,[x20]
    ld1         {v2.2s},[x16],#8
    ld1         {v3.2s},[x16],x15
    ld1         {v4.2s},[x16],#8
    ld1         {v5.2s},[x16],x15
    ld1         {v6.2s},[x16],#8
    ld1         {v7.2s},[x16],x15
    ld1         {v12.2s},[x16],#8
    ld1         {v13.2s},[x16],x15
    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    ld1         {v14.2s},[x16],#8
    ld1         {v15.2s},[x16],x15
    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         {v16.2s},[x16],#8
    ld1         {v17.2s},[x16],x15
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         {v18.2s},[x16],#8
    ld1         {v19.2s},[x16],x15
    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//

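//in the 16-wide path each row is fetched as pairs of 8-byte loads whose post
//increments of +8 and -7 (x15 = -7) advance the base by one byte per pair, so
//successive pairs provide the shifted source vectors for the two 8-pixel
//halves of the row. the loop is software pipelined: the loads and prefetches
//(prfm) for the next row overlap the multiply-accumulate chain of the
//current one.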
inner_loop_16:

    subs        x9,x9,#16
    umull       v20.8h, v3.8b, v25.8b

    add         x16, x16,#8
    umlsl       v20.8h, v1.8b, v24.8b

    ld1         {v0.2s},[x8],#8             //vector load pu1_src
    ld1         {v1.2s},[x8],x15            //vector load pu1_src
    umlal       v20.8h, v7.8b, v27.8b

    ld1         {v2.2s},[x8],#8
    ld1         {v3.2s},[x8],x15
    umlsl       v20.8h, v5.8b, v26.8b

    ld1         {v4.2s},[x8],#8
    ld1         {v5.2s},[x8],x15
    umlal       v20.8h, v13.8b, v28.8b

    ld1         {v6.2s},[x8],#8
    ld1         {v7.2s},[x8],x15
    umlal       v20.8h, v17.8b, v30.8b

    ld1         {v12.2s},[x8],#8
    ld1         {v13.2s},[x8],x15
    umlsl       v20.8h, v15.8b, v29.8b

    ld1         {v14.2s},[x8],#8
    ld1         {v15.2s},[x8],x15
    umlsl       v20.8h, v19.8b, v31.8b

    ld1         {v16.2s},[x8],#8
    ld1         {v17.2s},[x8],x15
    umull       v10.8h, v2.8b, v25.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    ld1         {v18.2s},[x8],#8
    ld1         {v19.2s},[x8],x15
    umlal       v10.8h, v6.8b, v27.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//

    add         x8, x8,#8
    umlsl       v10.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    add         x20,x16, x2, lsl #2
    prfm        PLDL1KEEP,[x20]
    add         x20,x8, x2, lsl #2
    prfm        PLDL1KEEP,[x20]
    st1         {v8.16b},[x1],#16           //store the result pu1_dst
    umlsl       v10.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    add         x20,x16,x13                 //increment the src pointer by 2*src_strd-wd
    csel        x16, x20, x16,eq
    umlal       v10.8h, v12.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//

    add         x20,x16,x2                  //pu1_src + src_strd
    csel        x8, x20, x8,eq
    umlsl       v10.8h, v14.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//

//    and            x11, x16, #31
    umlal       v10.8h, v16.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//

    sub         x20,x19,#2
    csel        x19, x20, x19,eq
    umlsl       v10.8h, v18.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//

    //cmp            x11, x0
    umull       v22.8h, v3.8b, v25.8b

//    add x20,x16, x2, lsl #2
    prfm        PLDL1KEEP,[x20]
    umlsl       v22.8h, v1.8b, v24.8b

    st1         {v20.8h},[x1],#16
    umlal       v22.8h, v7.8b, v27.8b

//    add x20,x8, x2, lsl #2
    prfm        PLDL1KEEP,[x20]
    umlsl       v22.8h, v5.8b, v26.8b

//    mov            x0, x11
    umlal       v22.8h, v13.8b, v28.8b

    cmp         x19,#0
    umlal       v22.8h, v17.8b, v30.8b

    st1         {v10.8h},[x10],#16
    umlsl       v22.8h, v15.8b, v29.8b

    umlsl       v22.8h, v19.8b, v31.8b

    beq         epilog_16

    ld1         {v0.2s},[x16],#8            //vector load pu1_src
    ld1         {v1.2s},[x16],x15           //vector load pu1_src
    ld1         {v2.2s},[x16],#8
    ld1         {v3.2s},[x16],x15
    ld1         {v4.2s},[x16],#8
    ld1         {v5.2s},[x16],x15
    ld1         {v6.2s},[x16],#8
    ld1         {v7.2s},[x16],x15
    ld1         {v12.2s},[x16],#8
    ld1         {v13.2s},[x16],x15
    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    ld1         {v14.2s},[x16],#8
    ld1         {v15.2s},[x16],x15
    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         {v16.2s},[x16],#8
    ld1         {v17.2s},[x16],x15
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         {v18.2s},[x16],#8
    ld1         {v19.2s},[x16],x15
    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    cmp         x9,#0
    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    mov         x20,x14
    csel        x9, x20, x9,eq
    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    st1         {v22.16b},[x10],#16         //store the result pu1_dst
    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    add         x20,x10,x12,lsl #1
    csel        x1, x20, x1,eq
    add         x20,x1,x3,lsl #1            //pu1_dst + dst_strd
    csel        x10, x20, x10,eq
    b           inner_loop_16


epilog_16:
//    vqrshrun.s16 d11,q11,#6
    st1         {v22.16b},[x10],#16         //store the result pu1_dst

    ldp         x0,x11,[sp],#16
    mov         x14,x6
    cmp         x14,#24
    beq         outer_loop8_residual
    add         x1,x10,x12,lsl #1
    mov         x11,x5                      //loads ht
    and         x11,x11,#1
    cmp         x11,#1
    beq         height_residue_4

end_loops1:

    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
    ret