///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_horz_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded in neon assembly and can be assembled using rvct
//*
//* //author
//*  yogeswaran rs / akshaya mukund
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*    chroma inter prediction filter for horizontal input
//*
//* //par description:
//*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
//*    to the elements pointed to by 'pu1_src' and writes the result to the
//*    location pointed to by 'pu1_dst'. the output is rounded, downshifted by 6
//*    and clipped to 8 bits.
//*    assumptions: the function is optimized assuming the width is a multiple
//*    of 2, 4 or 8. if the width is 2, the height should be a multiple of 2.
//*    widths of 4 and 8 are optimized further.
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

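//an illustrative c sketch of what the assembly below computes (readability aid
//only, not the library's reference c implementation; the loop variables 'row',
//'col', 'tap' and 'sum' are placeholders):
//
//    for(row = 0; row < ht; row++)
//        for(col = 0; col < 2 * wd; col++)        //cb and cr samples are interleaved
//        {
//            word32 sum = 0;
//            for(tap = 0; tap < 4; tap++)
//                sum += pi1_coeff[tap] * pu1_src[row * src_strd + col + 2 * (tap - 1)];
//            sum = (sum + 32) >> 6;               //round and downshift by 6 (sqrshrun #6)
//            pu1_dst[row * dst_strd + col] = (uword8)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
//        }
//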
//void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
//                                   uword8 *pu1_dst,
//                                   word32 src_strd,
//                                   word32 dst_strd,
//                                   word8 *pi1_coeff,
//                                   word32 ht,
//                                   word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 =>  src_strd
//x3 =>  dst_strd
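//x4 => *pi1_coeff
//x5 =>  ht
//x6 =>  wd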

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_horz_av8

.type ihevc_inter_pred_chroma_horz_av8, %function

ihevc_inter_pred_chroma_horz_av8:

    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments

    stp         d9,d10,[sp,#-16]!
    stp         d11,d12,[sp,#-16]!
    stp         d13,d14,[sp,#-16]!
    stp         d8,d15,[sp,#-16]!           // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
                                            // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function.
    stp         x19, x20,[sp,#-16]!

    mov         x15,x4 // pi1_coeff
    mov         x16,x5 // ht
    mov         x17,x6 // wd


    mov         x4,x15                      //loads pi1_coeff
    mov         x7,x16                      //loads ht
    mov         x10,x17                     //loads wd

    ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
    subs        x14,x7,#0                   //checks for ht == 0
    abs         v2.8b, v0.8b                //vabs_s8(coeff)
    mov         x11,#2
    ble         end_loops

    dup         v24.8b, v2.b[0]             //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         x12,x0,#2                   //pu1_src - 2
    dup         v25.8b, v2.b[1]             //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         x4,x12,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd
    dup         v26.8b, v2.b[2]             //coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         x10,#3                      //checks whether wd is a multiple of 4
    lsl         x5, x10, #1

    dup         v27.8b, v2.b[3]             //coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    bne         outer_loop_4
    cmp         x10,#12
    beq         skip_16

    cmp         x10,#8
    bge         outer_loop_16
skip_16:
    tst         x7,#3

    sub         x9,x0,#2
    beq         outer_loop_ht_4             //ht is a multiple of 4, use the 4-row pipelined path

    b           outer_loop_8
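
//dispatch summary (derived from the checks above):
//  wd not a multiple of 4               -> outer_loop_4
//  wd >= 8 and wd != 12                 -> outer_loop_16
//  remaining cases, ht a multiple of 4  -> outer_loop_ht_4
//  remaining cases, otherwise           -> outer_loop_8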


outer_loop_16:
    mov         x10,x5                      //2wd
    mul         x14, x14 , x10

    sub         x20,x3,#16
    neg         x6, x20

    add         x4,x12,x2
    mov         x9,#10
    and         x0, x12, #31
    sub         x20,x5,x3,lsl #1
    neg         x8, x20
    add         x20,x12, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]



    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src
    add         x20,x4, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]

    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src

    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src

    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src


    add         x19,x4,#8
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v9.2s},[x19],x11          //vector load pu1_src

    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src

    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src

    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    ld1         { v14.2s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x9          //vector load pu1_src

    umull       v28.8h, v3.8b, v25.8b

    umlsl       v28.8h, v1.8b, v24.8b


    umlal       v28.8h, v5.8b, v26.8b

    umlsl       v28.8h, v7.8b, v27.8b


    cmp         x14,#32
    beq         epilog_end
    sub         x14, x14,#64

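//inner_loop_16 is software pipelined: the accumulations for the current
//16-byte column were started in the prologue above (or in the previous
//iteration), and each iteration loads the next column while it rounds,
//narrows and stores the current results for two rows at a time.
//x14 holds ht*2*wd output bytes and is decremented by 32 per iteration.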
inner_loop_16:




//     bgt            l_2

//    add x20,x12, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]
//    add x20,x4, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]



    subs        x10,x10,#16

    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//


    add         x20,x12,x8
    csel        x12, x20, x12,eq
    add         x20,x12,x2
    csel        x4, x20, x4,eq
    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//



    add         x20,x12, x2 , lsl #2
    prfm        PLDL1KEEP,[x20]
    sqrshrun    v30.8b, v30.8h,#6

    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src

    sqrshrun    v31.8b, v28.8h,#6



    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//




    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//


    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src
    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//

    add         x20,x4, x2 , lsl #2
    prfm        PLDL1KEEP,[x20]
    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    //mov       v30.s[1],v31.s[0]
    add         x13,x1,#8
    st1         { v30.4h}, [x1],x3
    st1         { v31.4h}, [x13],x3
    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    add         x19,x4,#8
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v9.2s},[x19],x11          //vector load pu1_src
    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//


    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src
    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         { v14.2s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x11         //vector load pu1_src
    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    cmp         x10,#0
    sqrshrun    v22.8b, v22.8h,#6
    sqrshrun    v23.8b, v20.8h,#6



    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    csel        x10, x5, x10,eq             //reload x10 with 2wd when the row is completed
    umull       v28.8h, v3.8b, v25.8b


    //add       x13,x1,#8
    //mov       v22.s[1],v23.s[0]
    st1         { v22.4h},[x1],x6           //store the result pu1_dst
    st1         { v23.4h},[x13],x6          //store the result pu1_dst
    umlsl       v28.8h, v1.8b, v24.8b


    add         x20,x1,x8
    csel        x1, x20, x1,eq
    umlal       v28.8h, v5.8b, v26.8b

    subs        x14,x14,#32                 //decrement the ht loop
    umlsl       v28.8h, v7.8b, v27.8b

//      mov            x0, x7

    bgt         inner_loop_16



    add         x14,x14,#64
    cmp         x14,#32
    beq         epilog_end

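//epilog and epilog_end drain the software pipeline for the last iterations:
//results already accumulated above are rounded, narrowed and stored, and the
//final 16-byte column is filtered from data that has already been loaded.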
epilog:
    sqrshrun    v30.8b, v30.8h,#6
    sqrshrun    v31.8b, v28.8h,#6



    add         x13,x1,#8
    //mov       v30.s[1],v31.s[0]
    st1         { v30.4h}, [x1],x3
    st1         { v31.4h}, [x13],x3

    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//




    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    subs        x10,x10,#16                 //decrement the wd loop
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//
    add         x20,x12,x8
    csel        x12, x20, x12,eq
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//
    csel        x10, x5, x10,eq             //reload x10 with 2wd when the row is completed


    add         x20,x12,x2
    csel        x4, x20, x4,eq
    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//

    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src

    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src
    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src

    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//
    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//


    add         x19,x4,#8
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v9.2s},[x19],x11          //vector load pu1_src
    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src
    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src
    umull       v28.8h, v3.8b, v25.8b
    ld1         { v14.2s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x9          //vector load pu1_src
    umlsl       v28.8h, v1.8b, v24.8b
    sqrshrun    v22.8b, v22.8h,#6
    sqrshrun    v23.8b, v20.8h,#6

    //mov       v22.s[1],v23.s[0]
    st1         { v22.4h},[x1],x6           //store the result pu1_dst
    st1         { v23.4h},[x13],x6          //store the result pu1_dst
    umlal       v28.8h, v5.8b, v26.8b

    umlsl       v28.8h, v7.8b, v27.8b
    add         x20,x1,x8
    csel        x1, x20, x1,eq



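//epilog_end: finish the final 16-byte column purely from registers loaded
//above; no further loads are issued here.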
epilog_end:
    sqrshrun    v30.8b, v30.8h,#6
    sqrshrun    v31.8b, v28.8h,#6


    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//


    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//
    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//
    sqrshrun    v22.8b, v22.8h,#6
    sqrshrun    v23.8b, v20.8h,#6

    add         x13,x1,#8

    //mov       v30.s[1],v31.s[0]
    st1         { v30.4h}, [x1],x3
    st1         { v31.4h}, [x13],x3

    //mov       v22.s[1],v23.s[0]
    st1         { v22.4h},[x1]              //store the result pu1_dst
    st1         { v23.4h},[x13]             //store the result pu1_dst



    b           end_loops


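//outer_loop_8: wd is a multiple of 4 (but the 16-wide path was not taken) and
//ht is not a multiple of 4; each pass of inner_loop_8 filters 8 output bytes
//from each of two consecutive rows.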
outer_loop_8:


    add         x6,x1,x3                    //pu1_dst + dst_strd
    mov         x7,x5
    add         x4,x12,x2                   //pu1_src + src_strd


inner_loop_8:
    //ld1 {v0.2s, v1.2s},[x12],x11                //vector load pu1_src
    ld1         {v0.2s},[x12],x11           //vector load pu1_src
    ld1         {v1.2s},[x12],x11           //vector load pu1_src
    ld1         {v2.2s},[x12],x11           //vector load pu1_src
    ld1         {v3.2s},[x12],x11           //vector load pu1_src

    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    umull       v29.8h, v1.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    umlsl       v29.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
    umlal       v29.8h, v2.8b, v26.8b       //mul_res = vmlal_u8(src[0_2], coeffabs_2)//
    umlsl       v29.8h, v3.8b, v27.8b       //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    ld1         {v4.2s},[x4],x11            //vector load pu1_src
    ld1         {v5.2s},[x4],x11            //vector load pu1_src
    ld1         {v6.2s},[x4],x11            //vector load pu1_src
    ld1         {v7.2s},[x4],x11            //vector load pu1_src
    //ld1 {v12.2s, v13.2s},[x4],x11                //vector load pu1_src + src_strd
    //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
    umull       v10.8h, v5.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    umlsl       v10.8h, v4.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
    sqrshrun    v29.8b, v29.8h,#6           //right shift and saturating narrow result 1
    umlal       v10.8h, v6.8b, v26.8b       //mul_res = vmlal_u8(src[0_2], coeffabs_2)//
    umlsl       v10.8h, v7.8b, v27.8b       //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    st1         {v29.8b},[x1],#8            //store the result pu1_dst

    sqrshrun    v10.8b, v10.8h,#6           //right shift and saturating narrow result 2
    subs        x7,x7,#8                    //decrement the wd loop
    st1         {v10.8b},[x6],#8            //store the result pu1_dst
    bgt         inner_loop_8

    sub         x12,x12,x5
    subs        x14,x14,#2                  //decrement the ht loop
    sub         x1,x1,x5
    add         x12,x12,x2,lsl #1
    add         x1,x1,x3,lsl #1
    bgt         outer_loop_8
    b           end_loops

//path taken when ht is a multiple of 4
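//outer_loop_ht_4: four rows are software pipelined per pass (prologue_ht_4,
//core_loop, epilogue); each inner iteration produces 8 output bytes per row.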
outer_loop_ht_4:

    mov         x7,x5

prologue_ht_4:

inner_loop_ht_4:

    mov         x12,x9
    mov         x4,x1

    sub         x8, x2, #6

    ld1         {v0.2s},[x12],x11           //(1)vector load pu1_src
    ld1         {v1.2s},[x12],x11           //(1)vector load pu1_src
    ld1         {v2.2s},[x12],x11           //(1)vector load pu1_src
    //ld1 {v3.2s},[x12],x2                //(1)vector load pu1_src
    ld1         {v3.2s},[x12],x8            //(1)vector load pu1_src

    //sub        x12, x12, #6                //(1)

    ld1         {v4.2s},[x12],x11           //(2)vector load pu1_src
    ld1         {v5.2s},[x12],x11           //(2)vector load pu1_src
    ld1         {v6.2s},[x12],x11           //(2)vector load pu1_src
    //ld1 {v7.2s},[x12],x2                //(2)vector load pu1_src
    ld1         {v7.2s},[x12],x8            //(2)vector load pu1_src

    //sub        x12, x12, #6                //(2)

    ld1         {v14.2s},[x12],x11          //(3)vector load pu1_src
    umull       v29.8h, v1.8b, v25.8b       //(1)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v15.2s},[x12],x11          //(3)vector load pu1_src
    umlsl       v29.8h, v0.8b, v24.8b       //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v16.2s},[x12],x11          //(3)vector load pu1_src
    umlal       v29.8h, v2.8b, v26.8b       //(1)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    //ld1 {v17.2s},[x12],x2                //(3)vector load pu1_src
    ld1         {v17.2s},[x12],x8           //(3)vector load pu1_src
    umlsl       v29.8h, v3.8b, v27.8b       //(1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    //sub        x12, x12, #6                //(3)
    umull       v10.8h, v5.8b, v25.8b       //(2)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v18.2s},[x12],x11          //(4)vector load pu1_src
    umlsl       v10.8h, v4.8b, v24.8b       //(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v19.2s},[x12],x11          //(4)vector load pu1_src
    umlal       v10.8h, v6.8b, v26.8b       //(2)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    ld1         {v20.2s},[x12],x11          //(4)vector load pu1_src
    umlsl       v10.8h, v7.8b, v27.8b       //(2)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    ld1         {v21.2s},[x12],x2           //(4)vector load pu1_src
    sqrshrun    v29.8b, v29.8h,#6           //(1)right shift and saturating narrow result 1

    add         x9,x9,#8                    //(core loop)

    subs        x7,x7,#8                    //(prologue)decrement the wd loop
    beq         epilogue

core_loop:
    mov         x12,x9

    ld1         {v0.2s},[x12],x11           //(1_1)vector load pu1_src
    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v1.2s},[x12],x11           //(1_1)vector load pu1_src
    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v2.2s},[x12],x11           //(1_1)vector load pu1_src
    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    //ld1 {v3.2s},[x12],x2                //(1_1)vector load pu1_src
    ld1         {v3.2s},[x12],x8            //(1_1)vector load pu1_src
    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    //sub        x12, x12, #6                //(1_1)

    st1         {v29.8b},[x4],x3            //(1)store the result pu1_dst
    sqrshrun    v10.8b, v10.8h,#6           //(2)right shift and saturating narrow result 2

    ld1         {v4.2s},[x12],x11           //(2_1)vector load pu1_src
    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v5.2s},[x12],x11           //(2_1)vector load pu1_src
    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v6.2s},[x12],x11           //(2_1)vector load pu1_src
    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    //ld1 {v7.2s},[x12],x2                //(2_1)vector load pu1_src
    ld1         {v7.2s},[x12],x8            //(2_1)vector load pu1_src
    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    //sub        x12, x12, #6                //(2_1)

    st1         {v10.8b},[x4],x3            //(2)store the result pu1_dst
    sqrshrun    v12.8b, v12.8h,#6           //(3)right shift and saturating narrow result 1

    ld1         {v14.2s},[x12],x11          //(3_1)vector load pu1_src
    umull       v29.8h, v1.8b, v25.8b       //(1_1)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v15.2s},[x12],x11          //(3_1)vector load pu1_src
    umlsl       v29.8h, v0.8b, v24.8b       //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v16.2s},[x12],x11          //(3_1)vector load pu1_src
    umlal       v29.8h, v2.8b, v26.8b       //(1_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    //ld1 {v17.2s},[x12],x2                //(3_1)vector load pu1_src
    ld1         {v17.2s},[x12],x8           //(3_1)vector load pu1_src
    umlsl       v29.8h, v3.8b, v27.8b       //(1_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    //sub        x12, x12, #6                //(3_1)

    st1         {v12.8b},[x4],x3            //(3)store the result pu1_dst
    sqrshrun    v22.8b, v22.8h,#6           //(4)right shift and saturating narrow result 2

    add         x9,x9,#8                    //(core loop)

    umull       v10.8h, v5.8b, v25.8b       //(2_1)mul_res = vmull_u8(src[0_1], coeffabs_1)//
    ld1         {v18.2s},[x12],x11          //(4_1)vector load pu1_src

    ld1         {v19.2s},[x12],x11          //(4_1)vector load pu1_src
    umlsl       v10.8h, v4.8b, v24.8b       //(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v20.2s},[x12],x11          //(4_1)vector load pu1_src
    umlal       v10.8h, v6.8b, v26.8b       //(2_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    ld1         {v21.2s},[x12],x2           //(4_1)vector load pu1_src
    umlsl       v10.8h, v7.8b, v27.8b       //(2_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    add         x1,x1,#8                    //(core loop)

    subs        x7,x7,#8                    //(core loop)

    st1         {v22.8b},[x4], x3           //(4)store the result pu1_dst
    sqrshrun    v29.8b, v29.8h,#6           //(1_1)right shift and saturating narrow result 1

    mov         x4, x1                      //(core loop)

    bgt         core_loop                   //loopback

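//epilogue: complete and store the remaining pipelined results for the current
//group of four rows; no further loads are issued.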
epilogue:
    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    st1         {v29.8b},[x4],x3            //(1)store the result pu1_dst
    sqrshrun    v10.8b, v10.8h,#6           //(2)right shift and saturating narrow result 2

    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_1], coeffabs_1)//
    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    st1         {v10.8b},[x4],x3            //(2)store the result pu1_dst
    sqrshrun    v12.8b, v12.8h,#6           //(3)right shift and saturating narrow result 1

    st1         {v12.8b},[x4],x3            //(3)store the result pu1_dst

    add         x1,x1,#8                    //(core loop)

    sqrshrun    v22.8b, v22.8h,#6           //(4)right shift and saturating narrow result 2


    st1         {v22.8b},[x4], x3           //(4)store the result pu1_dst

    sub         x9,x9,x5
    subs        x14,x14,#4                  //decrement the ht loop
    sub         x1,x1,x5
    add         x9,x9,x2,lsl #2
    add         x1,x1,x3,lsl #2
    bgt         outer_loop_ht_4
    b           end_loops

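//outer_loop_4: wd is not a multiple of 4; two rows are processed together by
//zipping 4-byte groups of row i and row i+1 into one register, so one filter
//sequence produces 4 output bytes for each of the two rows per iteration.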
outer_loop_4:
    add         x6,x1,x3                    //pu1_dst + dst_strd
    mov         x7,x5
    add         x4,x12,x2                   //pu1_src + src_strd

inner_loop_4:
    //ld1 {v0.2s, v1.2s},[x12]                    //vector load pu1_src

    ld1         {v20.2s},[x12],x11          //vector load pu1_src
    ld1         {v21.2s},[x12],x11          //vector load pu1_src
    ld1         {v22.2s},[x12],x11          //vector load pu1_src
    ld1         {v23.2s},[x12]              //vector load pu1_src

    sub         x12,x12,#2                  //step the input pointer back by 2 (net advance of 4 per iteration)
    ld1         {v16.2s},[x4],x11           //vector load pu1_src
    ld1         {v17.2s},[x4],x11           //vector load pu1_src
    ld1         {v18.2s},[x4],x11           //vector load pu1_src
    ld1         {v19.2s},[x4]               //vector load pu1_src
    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
    //ld1 {v12.2s, v13.2s},[x4]                    //vector load pu1_src + src_strd
    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]

    sub         x4,x4,#2                    //step the input pointer back by 2 (net advance of 4 per iteration)
    //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
    //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]

    zip1        v0.2s, v20.2s, v16.2s
    zip2        v4.2s, v20.2s, v16.2s       //vector zip the i and ii iterations into a single register
    zip1        v1.2s, v21.2s, v17.2s
    zip2        v5.2s, v21.2s, v17.2s
    zip1        v2.2s, v22.2s, v18.2s
    zip2        v6.2s, v22.2s, v18.2s
    zip1        v3.2s, v23.2s, v19.2s
    zip2        v7.2s, v23.2s, v19.2s
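    //after the zips, lane 0 of each source register holds row i samples and
    //lane 1 holds row i+1 samples, so the filter below computes both rows at once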

    umull       v29.8h, v1.8b, v25.8b       //arithmetic operations for the i and ii iterations at the same time
    umlsl       v29.8h, v0.8b, v24.8b
    umlal       v29.8h, v2.8b, v26.8b
    umlsl       v29.8h, v3.8b, v27.8b

    sqrshrun    v29.8b, v29.8h,#6           //narrow right shift and saturating the result
    st1         {v29.s}[0],[x1],#4          //store the i iteration result which is in the lower part of the register
    subs        x7,x7,#4                    //decrement the wd by 4

    st1         {v29.s}[1],[x6],#4          //store the ii iteration result which is in the upper part of the register

    bgt         inner_loop_4

    sub         x12,x12,x5
    subs        x14,x14,#2                  //decrement the ht by 2
    sub         x1,x1,x5
    add         x12,x12,x2,lsl #1
    add         x1,x1,x3,lsl #1
    bgt         outer_loop_4

end_loops:

    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16
    ldp         d8,d15,[sp],#16             // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
                                            // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
    ldp         d13,d14,[sp],#16
    ldp         d11,d12,[sp],#16
    ldp         d9,d10,[sp],#16
    ret

    781