Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* //file
     21 //*  ihevc_inter_pred_chroma_horz_neon.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions for inter prediction  interpolation.
     25 //* functions are coded using neon  intrinsics and can be compiled using
     26 
     27 //* rvct
     28 //*
     29 //* //author
     30 //*  yogeswaran rs / akshaya mukund
     31 //*
     32 //* //par list of functions:
     33 //*  - ihevc_inter_pred_chroma_horz_w16out_av8()
     34 //*
     35 //* //remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* //brief
     44 //*       chroma inter prediction filter to store horizontal 16bit output
     45 //*
     46 //* //par description:
     47 //*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
     48 //*    to the elements pointed to by 'pu1_src' and writes to the location pointed
     49 //*    to by 'pi2_dst'. no downshifting or clipping is done and the output is used
     50 //*    as an input for vertical filtering or weighted prediction
     51 //*
     52 //* //param[in] pu1_src
     53 //*  uword8 pointer to the source
     54 //*
     55 //* //param[out] pi2_dst
     56 //*  word16 pointer to the destination
     57 //*
     58 //* //param[in] src_strd
     59 //*  integer source stride
     60 //*
     61 //* //param[in] dst_strd
     62 //*  integer destination stride
     63 //*
     64 //* //param[in] pi1_coeff
     65 //*  word8 pointer to the filter coefficients
     66 //*
     67 //* //param[in] ht
     68 //*  integer height of the array
     69 //*
     70 //* //param[in] wd
     71 //*  integer width of the array
     72 //*
     73 //* //returns
     74 //*
     75 //* //remarks
     76 //*  none
     77 //*
     78 //*******************************************************************************
     79 //*/
     80 //void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
     81 //                                          word16 *pi2_dst,
     82 //                                          word32 src_strd,
     83 //                                          word32 dst_strd,
     84 //                                          word8 *pi1_coeff,
     85 //                                          word32 ht,
     86 //                                          word32 wd)
     87 //**************variables vs registers*****************************************
     88 //x0 => *pu1_src
     89 //x1 => *pi2_dst
     90 //x2 =>  src_strd
     91 //x3 =>  dst_strd
     92 //x4 => *pi1_coeff, x5 => ht, x6 => wd (on entry; x4 and x6 are reused later)
     93 
     94 .text
     95 .align 4
     96 
     97 .include "ihevc_neon_macros.s"
     98 
     99 .globl ihevc_inter_pred_chroma_horz_w16out_av8
    100 
    101 
    102 .type ihevc_inter_pred_chroma_horz_w16out_av8, %function
    103 
    104 ihevc_inter_pred_chroma_horz_w16out_av8:
    105 
    106     // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments
    107 
    108     stp         d10,d11,[sp,#-16]!
    109     stp         d12,d13,[sp,#-16]!
    110     stp         d14,d15,[sp,#-16]!
    111     stp         x19, x20,[sp,#-16]!
    112 
    113     mov         x15,x4 // pi1_coeff
    114     mov         x16,x5 // ht
    115     mov         x17,x6 // wd
    116 
    117     mov         x4,x15                      //loads pi1_coeff
    118     mov         x6,x16                      //loads ht
    119     mov         x10,x17                     //loads wd
    120 
    121     ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
    122     subs        x14,x6,#0                   //checks for ht == 0
    123     abs         v2.8b, v0.8b                //vabs_s8(coeff)
    124 
    125 //******* added
    126     mov         x11, #2
    127 //******* added ends
    128 
    129     ble         end_loops
    130 
    131     dup         v24.8b, v2.b[0]             //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    132     sub         x12,x0,#2                   //pu1_src - 2
    133     dup         v25.8b, v2.b[1]             //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    134     add         x4,x12,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd
    135     dup         v26.8b, v2.b[2]             //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    136 
    137     tst         x10,#3                      //checks wd for multiples of 4
    138     lsl         x5, x10, #1                 //2wd
    139 
    140     dup         v27.8b, v2.b[3]             //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    141 
    142     and         x7,x14,#1                   //added                //calculating ht_residue ht_residue = (ht & 1)
    143     sub         x14,x14,x7                  //added                //decrement height by ht_residue(residue value is calculated outside)
    144 
    145     bne         outer_loop_4                // this branching happens when the width is 2 or 6
    146 
    147     cmp         x10,#12
    148     beq         skip_16
    149 
    150     cmp         x10,#8
    151     bge         outer_loop_16
    152 
    153 skip_16:
    154     tst         x6,#3
    155 
    156 //******* removal
    157     //mov        x11,#8
    158 //******* removal ends
    159 
    160     sub         x9,x0,#2
    161     beq         outer_loop_ht_4             //this branching happens when the height is a a multiple of 4
    162 
    163 
    164 
    165 //     cmp        x10,#12
    166 //     beq     outer_loop_8
    167 //     cmp        x10,#16
    168 //     bge    outer_loop_16
    169     b           outer_loop_8
    170 
    171 
    172 
    173 outer_loop_16:
    174     add         x4,x12,x2
    175 
    176 
    177     and         x0, x12, #31
    178     add         x20,x12, x2 , lsl #1
    179     prfm        PLDL1KEEP,[x20]
    180 
    181 
    182 
    183 
    184 
    185 
    186     add         x19,x12,#8
    187     ld1         { v0.2s},[x12],x11          //vector load pu1_src
    188     ld1         { v1.2s},[x19],x11          //vector load pu1_src
    189     mov         x10,x5                      //2wd
    190     mul         x14, x14 , x10
    191     ld1         { v2.2s},[x12],x11          //vector load pu1_src
    192     ld1         { v3.2s},[x19],x11          //vector load pu1_src
    193     add         x20,x4, x2 , lsl #1
    194     prfm        PLDL1KEEP,[x20]
    195     mov         x9,#10
    196     ld1         { v4.2s},[x12],x11          //vector load pu1_src
    197     ld1         { v5.2s},[x19],x11          //vector load pu1_src
    198     sub         x20,x3,#8
    199     neg         x6, x20
    200     sub         x8,x3,#8
    201     ld1         { v6.2s},[x12],x9           //vector load pu1_src
    202     ld1         { v7.2s},[x19],x9           //vector load pu1_src
    203 
    204 
    205     add         x19,x4,#8
    206     umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    207     ld1         { v29.2s},[x4],x11          //vector load pu1_src
    208     ld1         { v31.2s},[x19],x11         //vector load pu1_src
    209 
    210     umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    211 
    212     ld1         { v10.2s},[x4],x11          //vector load pu1_src
    213     ld1         { v11.2s},[x19],x11         //vector load pu1_src
    214 
    215     umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    216 
    217     ld1         { v12.2s},[x4],x11          //vector load pu1_src
    218     ld1         { v13.2s},[x19],x11         //vector load pu1_src
    219 
    220     umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    221 
    222     ld1         { v14.4s},[x4],x9           //vector load pu1_src
    223     ld1         { v15.2s},[x19],x9          //vector load pu1_src
    224 
    225     umull       v28.8h, v3.8b, v25.8b
    226     lsl         x6,x6,#1
    227     sub         x20,x5,x3,lsl #1
    228     neg         x3, x20
    229     umlsl       v28.8h, v1.8b, v24.8b
    230     lsl         x8,x8,#1
    231     sub         x20,x5,x2,lsl #1
    232     neg         x7, x20
    233     umlal       v28.8h, v5.8b, v26.8b
    234 
    235     umlsl       v28.8h, v7.8b, v27.8b
    236     cmp         x14,#32
    237     beq         epilog_end
    238     sub         x14, x14,#64
    239 
    240 inner_loop_16:
    241 
    242     // and            x7, x12, #31                    //decrement the wd loop
    243     // cmp            x7, x0
    244     add         x20,x12, x2 , lsl #2
    245     prfm        PLDL1KEEP,[x20]
    246     add         x20,x4, x2 , lsl #2
    247     prfm        PLDL1KEEP,[x20]
    248 
    249 
    250     subs        x10,x10,#16
    251 
    252     umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    253 
    254 
    255 
    256 //     add x20,x12,x2,lsl #1
    257     //csel x12, x20, x12,eq
    258 //     sub x20,x12,x5
    259     //csel x12, x20, x12,eq
    260     add         x20,x12,x7
    261     csel        x12, x20, x12,eq
    262     add         x20,x12,x2
    263     csel        x4, x20, x4,eq
    264 
    265 
    266     st1         { v30.8h}, [x1],#16
    267     umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    268 
    269 
    270 
    271 
    272     add         x19,x12,#8
    273     ld1         { v0.2s},[x12],x11          //vector load pu1_src
    274     ld1         { v1.2s},[x19],x11          //vector load pu1_src
    275     umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    276 
    277 
    278 
    279 
    280     ld1         { v2.2s},[x12],x11          //vector load pu1_src
    281     ld1         { v3.2s},[x19],x11          //vector load pu1_src
    282     umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    283 
    284 
    285     ld1         { v4.2s},[x12],x11          //vector load pu1_src
    286     ld1         { v5.2s},[x19],x11          //vector load pu1_src
    287     umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    288 
    289     st1         { v28.8h}, [x1],x8
    290     umlsl       v20.8h, v31.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    291 
    292     ld1         { v6.2s},[x12],x9           //vector load pu1_src
    293     ld1         { v7.2s},[x19],x9           //vector load pu1_src
    294     umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    295 
    296     add         x19,x4,#8
    297     ld1         { v29.2s},[x4],x11          //vector load pu1_src
    298     ld1         { v31.2s},[x19],x11         //vector load pu1_src
    299     umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    300 
    301 
    302     ld1         { v10.2s},[x4],x11          //vector load pu1_src
    303     ld1         { v11.2s},[x19],x11         //vector load pu1_src
    304     umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    305 
    306     ld1         { v12.2s},[x4],x11          //vector load pu1_src
    307     ld1         { v13.2s},[x19],x11         //vector load pu1_src
    308     umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    309 
    310     ld1         { v14.2s},[x4],x9           //vector load pu1_src
    311     ld1         { v15.2s},[x19],x9          //vector load pu1_src
    312     umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    313 
    314     st1         { v22.8h},[x1],#16          //store the result pu1_dst
    315     umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    316 
    317     csel        x10, x5, x10,eq             //2wd
    318     umull       v28.8h, v3.8b, v25.8b
    319 
    320 
    321 
    322     umlsl       v28.8h, v1.8b, v24.8b
    323     st1         { v20.8h},[x1],x6           //store the result pu1_dst
    324 
    325 
    326     add         x20,x1,x3,lsl #1
    327     csel        x1, x20, x1,eq
    328     umlal       v28.8h, v5.8b, v26.8b
    329 
    330     subs        x14,x14,#32                 //decrement the ht loop
    331     umlsl       v28.8h, v7.8b, v27.8b
    332 
    333 
    334 
    335 //     mov            x0, x7
    336     bgt         inner_loop_16
    337 
    338 
    339 
    340     add         x14,x14,#64
    341     cmp         x14,#32
    342     beq         epilog_end
    343 
    344 epilog:
    345 
    346     st1         { v30.8h}, [x1],#16
    347     umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    348     st1         { v28.8h}, [x1],x8
    349 
    350 
    351 
    352     umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    353     subs        x10,x10,#16                 //decrement the wd loop
    354     umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    355 //     add x20,x12,x2,lsl #1
    356     //csel x12, x20, x12,eq
    357     add         x20,x12,x7
    358     csel        x12, x20, x12,eq
    359     umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    360     // sub x20,x12,x5
    361     //csel x12, x20, x12,eq
    362     csel        x10, x5, x10,eq             //2wd
    363     add         x20,x12,x2
    364     csel        x4, x20, x4,eq
    365     umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    366 
    367     add         x19,x12,#8
    368     ld1         { v0.2s},[x12],x11          //vector load pu1_src
    369     ld1         { v1.2s},[x19],x11          //vector load pu1_src
    370 
    371     umlsl       v20.8h, v31.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    372 
    373     ld1         { v2.2s},[x12],x11          //vector load pu1_src
    374     ld1         { v3.2s},[x19],x11          //vector load pu1_src
    375 
    376     umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    377 
    378     ld1         { v4.2s},[x12],x11          //vector load pu1_src
    379     ld1         { v5.2s},[x19],x11          //vector load pu1_src
    380 
    381     umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    382     ld1         { v6.2s},[x12],x9           //vector load pu1_src
    383     ld1         { v7.2s},[x19],x9           //vector load pu1_src
    384     umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    385 
    386     add         x19,x4,#8
    387     ld1         { v29.2s},[x4],x11          //vector load pu1_src
    388     ld1         { v31.2s},[x19],x11         //vector load pu1_src
    389     umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    390 
    391     ld1         { v10.2s},[x4],x11          //vector load pu1_src
    392     ld1         { v11.2s},[x19],x11         //vector load pu1_src
    393     umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    394 
    395     umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    396 
    397     ld1         { v12.2s},[x4],x11          //vector load pu1_src
    398     ld1         { v13.2s},[x19],x11         //vector load pu1_src
    399     umull       v28.8h, v3.8b, v25.8b
    400 
    401     ld1         { v14.2s},[x4],x9           //vector load pu1_src
    402     ld1         { v15.2s},[x19],x9          //vector load pu1_src
    403 
    404     umlsl       v28.8h, v1.8b, v24.8b
    405     st1         { v22.8h},[x1],#16          //store the result pu1_dst
    406     umlal       v28.8h, v5.8b, v26.8b
    407     st1         { v20.8h},[x1],x6           //store the result pu1_dst
    408     umlsl       v28.8h, v7.8b, v27.8b
    409     add         x20,x1,x3,lsl #1
    410     csel        x1, x20, x1,eq
    411 
    412 
    413 epilog_end:
    414 
    415     umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    416     umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    417     umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    418     umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    419 
    420 
    421     umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    422     umlsl       v20.8h, v31.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    423     umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    424     umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    425 
    426 
    427     st1         { v30.8h}, [x1],#16
    428     st1         { v28.8h}, [x1],x8
    429     st1         { v22.8h},[x1],#16          //store the result pu1_dst
    430     st1         { v20.8h},[x1],x6           //store the result pu1_dst
    431 
    432 
    433     mov         x6,x16                      //loads ht
    434 
    435     and         x7,x6,#1
    436 
    437     cmp         x7,#0
    438     mov         x10,x5
    439     add         x20,x12,x2,lsl #1
    440     csel        x12, x20, x12,ne
    441     sub         x20,x12,x5
    442     csel        x12, x20, x12,ne
    443     add         x20,x1,x3,lsl #1
    444     csel        x1, x20, x1,ne
    445 
    446 
    447     bgt         loop_residue_4
    448 
    449     b           end_loops
    450 
    451 
    452 
    453 
    454 outer_loop_8:
    455 
    456     add         x6,x1,x3,lsl #1             //pu1_dst + dst_strd
    457     mov         x10,x5                      //2wd
    458     add         x4,x12,x2                   //pu1_src + src_strd
    459 
    460 inner_loop_8:
    461     //ld1 {v0.2s, v1.2s},[x12],x11                //vector load pu1_src
    462     ld1         {v0.2s},[x12],x11           //vector load pu1_src
    463     ld1         {v1.2s},[x12],x11           //vector load pu1_src
    464     ld1         {v2.2s},[x12],x11           //vector load pu1_src
    465     ld1         {v3.2s},[x12],x11           //vector load pu1_src
    466 
    467 
    468     //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    469     umull       v29.8h, v1.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    470     umlsl       v29.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    471     //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
    472     //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
    473     umlal       v29.8h, v2.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    474     umlsl       v29.8h, v3.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    475 
    476     //ld1 {v12.2s, v13.2s},[x4],x11                //vector load pu1_src + src_strd
    477     ld1         {v4.2s},[x4],x11            //vector load pu1_src
    478     ld1         {v5.2s},[x4],x11            //vector load pu1_src
    479     ld1         {v6.2s},[x4],x11            //vector load pu1_src
    480     ld1         {v7.2s},[x4],x11            //vector load pu1_src
    481     //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
    482     umull       v10.8h, v5.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    483     umlsl       v10.8h, v4.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    484     //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    485     //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
    486     umlal       v10.8h, v6.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    487     umlsl       v10.8h, v7.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    488 
    489     st1         {v29.8h}, [x1],#16
    490 
    491     subs        x10,x10,#8                  //decrement the wd loop
    492     st1         {v10.8h},[x6],#16           //store the result pu1_dst
    493     bgt         inner_loop_8
    494 
    495     sub         x12,x12,x5
    496     subs        x14,x14,#2                  //decrement the ht loop
    497     sub         x1,x1,x5,lsl #1
    498     add         x12,x12,x2,lsl #1
    499     add         x1,x1,x3,lsl #2
    500     bgt         outer_loop_8
    501 
    502     cmp         x7,#0
    503     mov         x10,x5
    504     bgt         loop_residue_4
    505 
    506     b           end_loops
    507 
    508 
    509 
    510 //height if 4 comes
    511 outer_loop_ht_4:
    512 
    513     mov         x10,x5
    514 
    515 prologue_ht_4:
    516     lsl         x8, x3, #1
    517 
    518 inner_loop_ht_4:
    519 
    520     mov         x12,x9
    521     mov         x4,x1
    522 
    523     sub         x0, x2, #6                  // not sure if x0 needs to be preserved
    524 
    525     ld1         {v0.2s},[x12],x11           //(1)vector load pu1_src
    526     ld1         {v1.2s},[x12],x11           //(1)vector load pu1_src
    527     ld1         {v2.2s},[x12],x11           //(1)vector load pu1_src
    528     ld1         {v3.2s},[x12],x0            //(1)vector load pu1_src
    529 
    530     ld1         {v4.2s},[x12],x11           //(2)vector load pu1_src
    531     ld1         {v5.2s},[x12],x11           //(2)vector load pu1_src
    532     ld1         {v6.2s},[x12],x11           //(2)vector load pu1_src
    533     ld1         {v7.2s},[x12],x0            //(2)vector load pu1_src
    534 
    535     ld1         {v14.2s},[x12],x11          //(3)vector load pu1_src
    536     umull       v29.8h, v1.8b, v25.8b       //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    537 
    538     ld1         {v15.2s},[x12],x11          //(3)vector load pu1_src
    539     umlsl       v29.8h, v0.8b, v24.8b       //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    540 
    541     ld1         {v16.2s},[x12],x11          //(3)vector load pu1_src
    542     umlal       v29.8h, v2.8b, v26.8b       //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    543 
    544     ld1         {v17.2s},[x12],x0           //(3)vector load pu1_src
    545     umlsl       v29.8h, v3.8b, v27.8b       //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    546 
    547     ld1         {v18.2s},[x12],x11          //(4)vector load pu1_src
    548     umull       v10.8h, v5.8b, v25.8b       //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    549 
    550     ld1         {v19.2s},[x12],x11          //(4)vector load pu1_src
    551     umlsl       v10.8h, v4.8b, v24.8b       //(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    552 
    553     ld1         {v20.2s},[x12],x11          //(4)vector load pu1_src
    554     umlal       v10.8h, v6.8b, v26.8b       //(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    555 
    556     ld1         {v21.2s},[x12],x2           //(4)vector load pu1_src
    557     umlsl       v10.8h, v7.8b, v27.8b       //(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    558 
    559     add         x9,x9,#8                    //(core loop)
    560 
    561     subs        x10,x10,#8                  //(prologue)decrement the wd loop
    562     beq         epilogue
    563 
    564 core_loop:
    565     st1         {v29.8h},[x4],x8            //(1)store the result pu1_dst
    566     mov         x12,x9
    567 
    568     ld1         {v0.2s},[x12],x11           //(1_1)vector load pu1_src
    569     umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    570 
    571     ld1         {v1.2s},[x12],x11           //(1_1)vector load pu1_src
    572     umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    573 
    574     ld1         {v2.2s},[x12],x11           //(1_1)vector load pu1_src
    575     umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    576 
    577     ld1         {v3.2s},[x12],x0            //(1_1)vector load pu1_src
    578     umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    579 
    580     st1         {v10.8h},[x4],x8            //(2)store the result pu1_dst
    581     add         x9,x9,#8                    //(core loop)
    582 
    583     ld1         {v4.2s},[x12],x11           //(2_1)vector load pu1_src
    584     umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    585 
    586     ld1         {v5.2s},[x12],x11           //(2_1)vector load pu1_src
    587     umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    588 
    589     ld1         {v6.2s},[x12],x11           //(2_1)vector load pu1_src
    590     umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    591 
    592     ld1         {v7.2s},[x12],x0            //(2_1)vector load pu1_src
    593     umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    594 
    595     st1         {v12.8h},[x4],x8            //(3)store the result pu1_dst
    596     add         x1,x1,#16                   //(core loop)
    597 
    598     ld1         {v14.2s},[x12],x11          //(3_1)vector load pu1_src
    599     umull       v29.8h, v1.8b, v25.8b       //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    600 
    601     ld1         {v15.2s},[x12],x11          //(3_1)vector load pu1_src
    602     umlsl       v29.8h, v0.8b, v24.8b       //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    603 
    604     ld1         {v16.2s},[x12],x11          //(3_1)vector load pu1_src
    605     umlal       v29.8h, v2.8b, v26.8b       //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    606 
    607     ld1         {v17.2s},[x12],x0           //(3_1)vector load pu1_src
    608     umlsl       v29.8h, v3.8b, v27.8b       //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    609 
    610     st1         {v22.8h}, [x4], x8          //(4)store the result pu1_dst
    611     subs        x10,x10,#8                  //(core loop)
    612 
    613     umull       v10.8h, v5.8b, v25.8b       //(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    614     ld1         {v18.2s},[x12],x11          //(4_1)vector load pu1_src
    615 
    616     ld1         {v19.2s},[x12],x11          //(4_1)vector load pu1_src
    617     umlsl       v10.8h, v4.8b, v24.8b       //(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    618 
    619     ld1         {v20.2s},[x12],x11          //(4_1)vector load pu1_src
    620     umlal       v10.8h, v6.8b, v26.8b       //(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    621 
    622     mov         x4, x1                      //(core loop)
    623 
    624     ld1         {v21.2s},[x12],x0           //(4_1)vector load pu1_src
    625     umlsl       v10.8h, v7.8b, v27.8b       //(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    626 
    627 
    628 
    629     bgt         core_loop                   //loopback
    630 
    631 epilogue:
    632     umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    633 
    634     umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    635 
    636     umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    637 
    638     umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    639 
    640     st1         {v29.8h},[x4], x8           //(1)store the result pu1_dst
    641 
    642     umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    643     umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    644 
    645     umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    646 
    647     umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    648 
    649     st1         {v10.8h},[x4], x8           //(2)store the result pu1_dst
    650 
    651     st1         {v12.8h},[x4], x8           //(3)store the result pu1_dst
    652 
    653     add         x1,x1,#16                   //(core loop)
    654 
    655     st1         {v22.8h},[x4], x8           //(4)store the result pu1_dst
    656 
    657     sub         x9,x9,x5
    658     subs        x14,x14,#4                  //decrement the ht loop
    659     sub         x1,x1,x5,lsl #1
    660     add         x9,x9,x2,lsl #2
    661     add         x1,x1,x3,lsl #3
    662     bgt         outer_loop_ht_4
    663 
    664     cmp         x7,#0
    665     mov         x10,x5
    666     csel        x12, x9, x12,gt
    667     csel        x4, x1, x4,gt
    668     bgt         loop_residue_4
    669 
    670     b           end_loops
    671 
    672 outer_loop_4:                               //4-column path: each pass filters two rows at once (src x12 -> dst x1, src x4 -> dst x6)
    673     add         x6,x1,x3,lsl #1             //x6 = x1 + 2*x3: dst pointer of the second row (pu1_dst + dst_strd, outputs are 16 bit)
    674     mov         x10,x5                      //x10 = column counter for this row pair, reloaded from x5
    675     add         x4,x12,x2                   //x4 = x12 + x2: src pointer of the second row (pu1_src + src_strd)
    676 
    677 inner_loop_4:                               //one pass: 4 outputs for row i and 4 outputs for row ii
    678     //ld1 {v0.2s, v1.2s},[x12]                    //vector load pu1_src
    679     ld1         {v20.2s},[x12],x11          //row i: 8 src bytes for the v24 tap, then x12 += x11
    680     ld1         {v21.2s},[x12],x11          //row i: 8 src bytes for the v25 tap, then x12 += x11
    681     ld1         {v22.2s},[x12],x11          //row i: 8 src bytes for the v26 tap, then x12 += x11
    682     ld1         {v23.2s},[x12]              //row i: 8 src bytes for the v27 tap, no post-increment
    683 
    684 //**** removal
    685     //add        x12,x12,#4                        //increment the input pointer
    686 //**** removal ends
    687 //**** addn
    688     sub         x12,x12,#2                  //step back 2 so the net advance of x12 per pass is 3*x11 - 2 (x11 is set outside this view, presumably 2 giving +4 - confirm)
    689 //**** addn ends
    690     ld1         {v16.2s},[x4],x11           //row ii: 8 src bytes for the v24 tap, then x4 += x11
    691     ld1         {v17.2s},[x4],x11           //row ii: 8 src bytes for the v25 tap, then x4 += x11
    692     ld1         {v18.2s},[x4],x11           //row ii: 8 src bytes for the v26 tap, then x4 += x11
    693     ld1         {v19.2s},[x4]               //row ii: 8 src bytes for the v27 tap, no post-increment
    694     //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    695     //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
    696     //ld1 {v12.2s, v13.2s},[x4]                    //vector load pu1_src + src_strd
    697     //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
    698 
    699     //add        x4,x4,#4                        //increment the input pointer
    700     sub         x4,x4,#2                    //same adjustment as for x12: net advance of x4 per pass is 3*x11 - 2
    701     //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
    702     //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    703     //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
    704 
    705 //**** removal
    706     //zip1 v0.2s, v0.2s, v12.2s
    707     //zip2  v12.2s, v0.2s, v12.2s                             //vector zip the i iteration and ii iteration in single register
    708     //zip1 v2.2s, v2.2s, v14.2s
    709     //zip2  v14.2s, v2.2s, v14.2s
    710     //zip1 v4.2s, v4.2s, v16.2s
    711     //zip2  v16.2s, v4.2s, v16.2s
    712     //zip1 v6.2s, v6.2s, v18.2s
    713     //zip2  v18.2s, v6.2s, v18.2s
    714 //**** removal ends
    715 //**** addn
    716     zip1        v0.2s, v20.2s, v16.2s       //v0 = {row i bytes 0-3 (low word), row ii bytes 0-3 (high word)}
    717     zip2        v4.2s, v20.2s, v16.2s       //v4 = {row i bytes 4-7, row ii bytes 4-7}. NOTE(review): v4-v7 are not read anywhere in the code visible from here to ret
    718     zip1        v1.2s, v21.2s, v17.2s       //v1 = low words of v21 (row i) and v17 (row ii)
    719     zip2        v5.2s, v21.2s, v17.2s       //v5 = high words of v21 and v17
    720     zip1        v2.2s, v22.2s, v18.2s       //v2 = low words of v22 (row i) and v18 (row ii)
    721     zip2        v6.2s, v22.2s, v18.2s       //v6 = high words of v22 and v18
    722     zip1        v3.2s, v23.2s, v19.2s       //v3 = low words of v23 (row i) and v19 (row ii)
    723     zip2        v7.2s, v23.2s, v19.2s       //v7 = high words of v23 and v19
    724 //**** addn ends
    725 
    726     umull       v29.8h, v1.8b, v25.8b       //v29 = v1 * v25, widened u8 x u8 -> 16 bit. Lanes 0-3 are row i, lanes 4-7 are row ii (v24-v27 are set outside this view)
    727     umlsl       v29.8h, v0.8b, v24.8b       //v29 -= v0 * v24
    728     umlal       v29.8h, v2.8b, v26.8b       //v29 += v2 * v26
    729     umlsl       v29.8h, v3.8b, v27.8b       //v29 -= v3 * v27
    730 
    731     st1         {v29.d}[0],[x1],#8          //store the row i result (low 64 bits of v29, 4 x 16 bit) and advance x1 by 8
    732     subs        x10,x10,#4                  //decrement the wd by 4 and set the flags consumed by the bgt below (the st1 in between leaves flags alone)
    733 
    734     st1         {v29.d}[1],[x6],#8          //store the row ii result (high 64 bits of v29, 4 x 16 bit) and advance x6 by 8
    735 
    736     bgt         inner_loop_4                //more columns left in this row pair
    737 
    738     sub         x12,x12,x5                  //rewind src by x5 (presumably undoes the inner loop advance - holds if x11 == 2 and x5 is a multiple of 4, confirm)
    739     subs        x14,x14,#2                  //decrement the ht by 2 and set the flags for the bgt below
    740     sub         x1,x1,x5,lsl #1             //rewind dst by 2*x5 bytes (8 bytes were written for every 4 subtracted from x10)
    741     add         x12,x12,x2,lsl #1           //src moves down two rows: x12 += 2*x2
    742     add         x1,x1,x3,lsl #2             //dst moves down two rows: x1 += 4*x3 bytes
    743     bgt         outer_loop_4                //uses the flags from the subs on x14 (sub/add above do not touch them)
    744 
    745     cmp         x7,#0                       //x7 is set outside this view. Zero means no leftover row, so skip the single-row loop that follows
    746     mov         x10,x5                      //NOTE(review): looks redundant - end_loops does not read x10 and loop_residue_4 reloads it
    747     beq         end_loops
    748 
    749 loop_residue_4:                             //single-row path: filters one row from x12 into x1, 4 outputs per pass
    750 
    751     mov         x10,x5                      //x10 = column counter, reloaded from x5 (2wd)
    752 
    753 loop_residue:
    754 
    755     //ld1 {v0.2s, v1.2s},[x12]                    //vector load pu1_src
    756     ld1         {v20.2s},[x12],x11          //8 src bytes for the v24 tap, then x12 += x11
    757     ld1         {v21.2s},[x12],x11          //8 src bytes for the v25 tap, then x12 += x11
    758     ld1         {v22.2s},[x12],x11          //8 src bytes for the v26 tap, then x12 += x11
    759     ld1         {v23.2s},[x12]              //8 src bytes for the v27 tap, no post-increment
    760     //vext.u8        d2,d0,d1,#2                //vector extract of src[0_2]
    761     //umull v8.8h, v2.8b, v25.8b                //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    762     //umlsl v8.8h, v0.8b, v24.8b                //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    763     //vext.u8        d4,d0,d1,#4                //vector extract of src[0_4]
    764     //add            x12,x12,#4                //pu1_src + 4
    765     sub         x12, x12, #2                //step back 2 so the net advance of x12 per pass is 3*x11 - 2 (x11 is set outside this view, presumably 2 giving +4 - confirm)
    766     //vext.u8        d6,d0,d1,#6                //vector extract of src[0_6]
    767     //umlal v8.8h, v4.8b, v26.8b                //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    768     //umlsl v8.8h, v6.8b, v27.8b                //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    769     umull       v29.8h, v21.8b, v25.8b      //v29 = v21 * v25, widened u8 x u8 -> 16 bit (v24-v27 are set outside this view)
    770     umlsl       v29.8h, v20.8b, v24.8b      //v29 -= v20 * v24
    771     umlal       v29.8h, v22.8b, v26.8b      //v29 += v22 * v26
    772     umlsl       v29.8h, v23.8b, v27.8b      //v29 -= v23 * v27
    773 
    774     st1         {v29.1d},[x1]               //store only the low 64 bits of v29 (4 x 16 bit results). Lanes 4-7 are computed but not written
    775     subs        x10,x10,#4                  //decrement the wd loop by 4 and set the flags for the bgt below
    776     add         x1,x1,#8                    //pi2_dst + 8 (add does not change the flags)
    777 
    778     bgt         loop_residue                //loop again while columns remain
    779 
    780     //inner loop ends. The row loop below is commented out, so one row is processed and execution falls through to end_loops
    781     //add            x8,x3,lsl #1            //2*dst_strd
    782     //sub             x8,x8,x5,lsl #1            //2*dst_strd - 2wd
    783     //sub             x9,x2,x5                //src_strd - 2wd
    784     //subs             x7,x7,#1                //decrement the ht loop
    785     //add             x12,x12,x9                //pu1_src + src_strd
    786     //add            x1,x1,x8                //pu1_dst + 2*dst_strd
    787     //bgt              outer_loop_residue_4    //loop again
    788     //b                 end_loops                //jumps to end
    789 
    790 end_loops:                                  //common exit: pop the saved registers and return
    791 
    792     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    793     ldp         x19, x20,[sp],#16           //restore x19/x20, then sp += 16
    794     ldp         d14,d15,[sp],#16            //restore d14/d15 (callee-saved low halves of v14/v15), then sp += 16
    795     ldp         d12,d13,[sp],#16            //restore d12/d13, then sp += 16
    796     ldp         d10,d11,[sp],#16            //restore d10/d11, then sp += 16. The matching saves are outside this view and presumably mirror this order - confirm
    797     ret
    798 
    799 
    800 
    801 
    802 
    803 
    804