Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 //******************************************************************************
     22 //* @file
     23 //*  ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
     24 //*
     25 //* @brief
     26 //*  Contains function definitions for inter prediction  interpolation.
     27 //*
     28 //* @author
     29 //*  Mohit
     30 //*
     31 //* @par List of Functions:
     32 //*
     33 //*  - ih264_inter_pred_luma_horz_hpel_vert_hpel_av8()
     34 //*
     35 //* @remarks
     36 //*  None
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 
     41 
     42 
     43 //void ih264_inter_pred_luma_horz_hpel_vert_hpel_av8(UWORD8 *pu1_src,
     44 //                                UWORD8 *pu1_dst,
     45 //                                WORD32 src_strd,
     46 //                                WORD32 dst_strd,
     47 //                                WORD32 ht,
     48 //                                WORD32 wd,
     49 //                                UWORD8 *pu1_tmp,
     50 //                                UWORD32 dydx)
     51 
     52 //**************Variables Vs Registers*****************************************
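     53 //    x0 => *pu1_src
     54 //    x1 => *pu1_dst
     55 //    x2 =>  src_strd
     56 //    x3 =>  dst_strd
     57 //    x4 =>  ht
     58 //    x5 =>  wd
        //    x6 => *pu1_tmp  (never read by this function)
        //    x7 =>  dydx     (never read; x7 is overwritten in the prologue)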
     59 
     60 
     61 .text
     62 .p2align 2
     63 .include "ih264_neon_macros.s"
     64 
     65 
     66 
     67     .global ih264_inter_pred_luma_horz_hpel_vert_hpel_av8
     68 
        //------------------------------------------------------------------------------
        // ih264_inter_pred_luma_horz_hpel_vert_hpel_av8 -- entry, set-up and dispatch
        //
        // Every output pixel is produced by running the 6-tap filter
        // (1, -5, 20, 20, -5, 1) down the columns into 16-bit intermediates and
        // then running the same 6 taps along the row on those intermediates; the
        // 32-bit sum is rounded, shifted right by 10 and saturated to 8 bits.
        //
        // In:    x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
        //        w4 = ht, w5 = wd
        // Flow:  wd == 4 -> loop_4_start, wd == 8 -> loop_8_start, anything else
        //        falls through to the 16-wide path (preload below, then loop_16).
        // Regs:  v26 = 20 and v24 = 5 are the coefficients read by loop_8/loop_4;
        //        v28 = 20 and v30 = 5 are the coefficients read by loop_16.
        // Stack: push_v_regs (macro from ih264_neon_macros.s, presumably saving
        //        the callee-saved v8-v15 -- confirm there) plus 16 bytes for
        //        x19/x20; both are undone at end_func.
        //------------------------------------------------------------------------------
     69 ih264_inter_pred_luma_horz_hpel_vert_hpel_av8:
     70 
     71     //store register values to stack
     72     push_v_regs
     73     stp       x19, x20, [sp, #-16]!     // NOTE(review): x19/x20 are not used below
     74 
            // The strides, height and width are WORD32 arguments. AAPCS64 leaves
            // bits [63:32] of such an argument register unspecified, but the code
            // below uses the full x registers for addressing, loop counting and the
            // width compare, so sign-extend them once here.
            sxtw      x2, w2                    // src_strd
            sxtw      x3, w3                    // dst_strd
            sxtw      x4, w4                    // ht
            sxtw      x5, w5                    // wd

     75     sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
     76     sub       x0, x0, #2                //pu1_src-2
     77 
     78     movi      v26.8h, #0x14             // Filter coeff 20 (loop_8 / loop_4)
     79     movi      v24.8h, #0x5              // Filter coeff 5  (loop_8 / loop_4)
     80     movi      v27.8h, #0x14             // v27 is only used as a scratch register later
     81     movi      v25.8h, #0x5              // v25 is only used as a scratch register later
     82     mov       x7, #0x20                 // NOTE(review): x7 is never read afterwards
     83     mov       x8, #0x30                 // NOTE(review): x8 is never read afterwards
     84     subs      x12, x5, #4               //if wd=4 branch to loop_4
     85     beq       loop_4_start
     86 
     87     subs      x12, x5, #8               //if wd=8 branch to loop_8
     88     beq       loop_8_start
     89 
     90     //when  wd=16
     91     movi      v28.8h, #0x14             // Filter coeff 20 (loop_16)
     92     movi      v30.8h, #0x5              // Filter coeff 5  (loop_16)
            // Each row is read as 16 bytes (post-increment #16) followed by 8 more
            // bytes (post-increment x2), so x2 is reduced by 16 to make the two
            // increments add up to one source stride.
     93     sub       x2, x2, #16
            // Preload rows 0..4: bytes 0-15 go to v0/v1 .. v8/v9, bytes 16-23 go
            // to v12 .. v16.
     94     ld1       {v0.2s, v1.2s}, [x0], #16 // Vector load from src[0_0]
     95     ld1       {v12.2s}, [x0], x2        // Vector load from src[0_0]
     96     ld1       {v2.2s, v3.2s}, [x0], #16 // Vector load from src[1_0]
     97     ld1       {v13.2s}, [x0], x2        // Vector load from src[1_0]
     98     ld1       {v4.2s, v5.2s}, [x0], #16 // Vector load from src[2_0]
     99     ld1       {v14.2s}, [x0], x2        // Vector load from src[2_0]
    100     ld1       {v6.2s, v7.2s}, [x0], #16 // Vector load from src[3_0]
    101     ld1       {v15.2s}, [x0], x2        // Vector load from src[3_0]
    102     ld1       {v8.2s, v9.2s}, [x0], #16 // Vector load from src[4_0]
    103     ld1       {v16.2s}, [x0], x2        // Vector load from src[4_0]
    104 loop_16:
        //------------------------------------------------------------------------------
        // 16-wide path: each pass produces four output rows of 16 pixels.
        //
        // On entry v0/v1, v2/v3, v4/v5, v6/v7, v8/v9 hold bytes 0-15 of five
        // consecutive source rows and v12..v16 hold bytes 16-23 of the same rows.
        // v28 = 20 and v30 = 5 are the filter coefficients; x4 counts the rows
        // still to be produced.
        //
        // Per row:
        //   1. column filter  t = a + f + 20*(c + d) - 5*(b + e)  on rows a..f,
        //      giving 16-bit results in v18 (cols 0-7), v20 (8-15), v22 (16-23);
        //   2. row filter on t using ext to form the 1..5 element shifts, with
        //      32-bit accumulation;
        //   3. sqrshrun #10 (round, shift, saturate to unsigned 16 bit), uqxtn to
        //      8 bit, pack into v18/v19 and store 16 bytes.
        //------------------------------------------------------------------------------
    105 
    106     ld1       {v10.2s, v11.2s}, [x0], #16 // Vector load from src[5_0]
    107     ld1       {v17.2s}, [x0], x2        // Vector load from src[5_0]
    108 
    109 
        // Column filter for the first output row (source rows 0..5).
    110     uaddl     v20.8h, v4.8b, v6.8b
    111     uaddl     v18.8h, v0.8b, v10.8b
    112     uaddl     v22.8h, v2.8b, v8.8b
    113     mla       v18.8h, v20.8h , v28.8h
    114     uaddl     v24.8h, v5.8b, v7.8b
    115     uaddl     v20.8h, v1.8b, v11.8b
    116     uaddl     v26.8h, v3.8b, v9.8b
    117     mla       v20.8h, v24.8h , v28.8h
    118     uaddl     v24.8h, v14.8b, v15.8b
    119     mls       v18.8h, v22.8h , v30.8h
    120     uaddl     v22.8h, v12.8b, v17.8b
    121     mls       v20.8h, v26.8h , v30.8h
    122     uaddl     v26.8h, v13.8b, v16.8b
    123     mla       v22.8h, v24.8h , v28.8h
    124     mls       v22.8h, v26.8h , v30.8h
    125 
        // Row filter, output pixels 0-7: taps come from v18:v20 shifted by
        // 0..5 halfwords. v0 is free to be used as scratch here because row 0
        // has already been consumed by the column filter.
    126     ext       v24.16b, v18.16b , v20.16b , #4
    127     ext       v26.16b, v18.16b , v20.16b , #6
    128 
    129     ext       v23.16b, v18.16b , v20.16b , #10
    130     add       v0.8h, v24.8h , v26.8h
    131     ext       v24.16b, v18.16b , v20.16b , #2
    132     ext       v26.16b, v18.16b , v20.16b , #8
    133     add       v24.8h, v24.8h , v26.8h
    134 
    135     saddl     v26.4s, v18.4h, v23.4h
    136     smlal     v26.4s, v0.4h, v28.4h
    137     smlsl     v26.4s, v24.4h, v30.4h
    138 
    139     saddl2    v23.4s, v18.8h, v23.8h
    140     smlal2    v23.4s, v0.8h, v28.8h
    141     smlsl2    v23.4s, v24.8h, v30.8h
    142 
    143     sqrshrun  v18.4h, v26.4s, #10
    144     sqrshrun  v19.4h, v23.4s, #10
    145 
    146 
        // Narrow to bytes and pack the two groups of four pixels into v18.
    147     uqxtn     v18.8b, v18.8h
    148     uqxtn     v19.8b, v19.8h
    149     mov       v18.s[1], v19.s[0]
    150 
        // Row filter, output pixels 8-15: taps come from v20:v22.
    151     ext       v24.16b, v20.16b , v22.16b , #4
    152     ext       v26.16b, v20.16b , v22.16b , #6
    153     ext       v0.16b, v20.16b , v22.16b , #10
    154 
    155     add       v25.8h, v24.8h , v26.8h
    156     ext       v24.16b, v20.16b , v22.16b , #2
    157     ext       v26.16b, v20.16b , v22.16b , #8
    158     add       v24.8h, v24.8h , v26.8h
    159 
    160     saddl     v26.4s, v0.4h, v20.4h
    161     smlal     v26.4s, v25.4h, v28.4h
    162     smlsl     v26.4s, v24.4h, v30.4h
    163 
    164     saddl2    v22.4s, v0.8h, v20.8h
    165     smlal2    v22.4s, v25.8h, v28.8h
    166     smlsl2    v22.4s, v24.8h, v30.8h
    167 
    168     sqrshrun  v19.4h, v26.4s, #10
    169     sqrshrun  v25.4h, v22.4s, #10
    170 
        // From here on, the first sums of the next row's column filter are
        // interleaved with the tail of the current row.
    171     uaddl     v24.8h, v7.8b, v9.8b
    172 
    173 
    174 
    175     uqxtn     v19.8b, v19.8h
    176     uqxtn     v25.8b, v25.8h
    177     mov       v19.s[1], v25.s[0]
    178 
    179     uaddl     v22.8h, v4.8b, v10.8b
        // v0/v1 and v12 are reused for the next source row.
    180     ld1       {v0.2s, v1.2s}, [x0], #16 // Vector load from src[6_0]
    181 
    182 
    183     ld1       {v12.2s}, [x0], x2        // Vector load from src[6_0]
    184     uaddl     v20.8h, v6.8b, v8.8b
    185     uaddl     v26.8h, v5.8b, v11.8b
    186     st1       {v18.2s, v19.2s}, [x1], x3 // store row 0
    187 
    188 
    189 //ROW_2
        // Second output row: column filter over source rows 1..6
        // (v2, v4, v6, v8, v10, v0 and v13, v14, v15, v16, v17, v12).
    190 
    191 
    192     uaddl     v18.8h, v2.8b, v0.8b
    193 
    194     mla       v18.8h, v20.8h , v28.8h
    195 
    196     uaddl     v20.8h, v3.8b, v1.8b
    197 
    198     mla       v20.8h, v24.8h , v28.8h
    199     uaddl     v24.8h, v15.8b, v16.8b
    200     mls       v18.8h, v22.8h , v30.8h
    201     uaddl     v22.8h, v13.8b, v12.8b
    202     mls       v20.8h, v26.8h , v30.8h
    203     uaddl     v26.8h, v14.8b, v17.8b
    204     mla       v22.8h, v24.8h , v28.8h
    205     mls       v22.8h, v26.8h , v30.8h
    206 
        // Row filter, pixels 0-7 (v2 is scratch: its row has been consumed).
    207     ext       v24.16b, v18.16b , v20.16b , #4
    208     ext       v26.16b, v18.16b , v20.16b , #6
    209 
    210     ext       v23.16b, v18.16b , v20.16b , #10
    211     add       v2.8h, v24.8h , v26.8h
    212     ext       v24.16b, v18.16b , v20.16b , #2
    213     ext       v26.16b, v18.16b , v20.16b , #8
    214     add       v24.8h, v24.8h , v26.8h
    215 
    216     saddl     v26.4s, v18.4h, v23.4h
    217     smlal     v26.4s, v2.4h, v28.4h
    218     smlsl     v26.4s, v24.4h, v30.4h
    219 
    220     saddl2    v23.4s, v18.8h, v23.8h
    221     smlal2    v23.4s, v2.8h, v28.8h
    222     smlsl2    v23.4s, v24.8h, v30.8h
    223 
    224     sqrshrun  v18.4h, v26.4s, #10
    225     sqrshrun  v19.4h, v23.4s, #10
    226 
    227 
    228 
    229     uqxtn     v18.8b, v18.8h
    230     uqxtn     v19.8b, v19.8h
    231     mov       v18.s[1], v19.s[0]
    232 
        // Row filter, pixels 8-15.
    233     ext       v24.16b, v20.16b , v22.16b , #4
    234     ext       v26.16b, v20.16b , v22.16b , #6
    235     ext       v2.16b, v20.16b , v22.16b , #10
    236 
    237     add       v25.8h, v24.8h , v26.8h
    238     ext       v24.16b, v20.16b , v22.16b , #2
    239     ext       v26.16b, v20.16b , v22.16b , #8
    240     add       v24.8h, v24.8h , v26.8h
    241 
    242     saddl     v26.4s, v2.4h, v20.4h
    243     smlal     v26.4s, v25.4h, v28.4h
    244     smlsl     v26.4s, v24.4h, v30.4h
    245 
    246     saddl2    v22.4s, v2.8h, v20.8h
    247     smlal2    v22.4s, v25.8h, v28.8h
    248     smlsl2    v22.4s, v24.8h, v30.8h
    249 
    250     sqrshrun  v19.4h, v26.4s, #10
    251     sqrshrun  v25.4h, v22.4s, #10
    252     uaddl     v24.8h, v9.8b, v11.8b
    253 
    254     uqxtn     v19.8b, v19.8h
    255     uqxtn     v25.8b, v25.8h
    256     mov       v19.s[1], v25.s[0]
    257 
    258 
    259     uaddl     v22.8h, v6.8b, v0.8b
    260     ld1       {v2.2s, v3.2s}, [x0], #16 // Vector load from src[7_0]
    261 
    262 
    263     ld1       {v13.2s}, [x0], x2        // Vector load from src[7_0]
    264     uaddl     v20.8h, v8.8b, v10.8b
    265     uaddl     v26.8h, v7.8b, v1.8b
    266     st1       {v18.2s, v19.2s}, [x1], x3 // store row 1
    267 
    268 //ROW_3
        // Third output row: column filter over source rows 2..7
        // (v4, v6, v8, v10, v0, v2 and v14, v15, v16, v17, v12, v13).
    269 
    270 
    271     uaddl     v18.8h, v4.8b, v2.8b
    272 
    273     mla       v18.8h, v20.8h , v28.8h
    274 
    275     uaddl     v20.8h, v5.8b, v3.8b
    276 
    277     mla       v20.8h, v24.8h , v28.8h
    278     uaddl     v24.8h, v16.8b, v17.8b
    279     mls       v18.8h, v22.8h , v30.8h
    280     uaddl     v22.8h, v14.8b, v13.8b
    281     mls       v20.8h, v26.8h , v30.8h
    282     uaddl     v26.8h, v15.8b, v12.8b
    283     mla       v22.8h, v24.8h , v28.8h
    284     mls       v22.8h, v26.8h , v30.8h
    285 
        // Row filter, pixels 0-7 (v4 is scratch: its row has been consumed).
    286     ext       v24.16b, v18.16b , v20.16b , #4
    287     ext       v26.16b, v18.16b , v20.16b , #6
    288 
    289     ext       v23.16b, v18.16b , v20.16b , #10
    290     add       v4.8h, v24.8h , v26.8h
    291     ext       v24.16b, v18.16b , v20.16b , #2
    292     ext       v26.16b, v18.16b , v20.16b , #8
    293     add       v24.8h, v24.8h , v26.8h
    294 
    295     saddl     v26.4s, v18.4h, v23.4h
    296     smlal     v26.4s, v4.4h, v28.4h
    297     smlsl     v26.4s, v24.4h, v30.4h
    298 
    299     saddl2    v23.4s, v18.8h, v23.8h
    300     smlal2    v23.4s, v4.8h, v28.8h
    301     smlsl2    v23.4s, v24.8h, v30.8h
    302 
    303     sqrshrun  v18.4h, v26.4s, #10
    304     sqrshrun  v19.4h, v23.4s, #10
    305 
    306 
    307     uqxtn     v18.8b, v18.8h
    308     uqxtn     v19.8b, v19.8h
    309     mov       v18.s[1], v19.s[0]
    310 
    311 
        // Row filter, pixels 8-15.
    312     ext       v24.16b, v20.16b , v22.16b , #4
    313     ext       v26.16b, v20.16b , v22.16b , #6
    314     ext       v4.16b, v20.16b , v22.16b , #10
    315 
    316     add       v25.8h, v24.8h , v26.8h
    317     ext       v24.16b, v20.16b , v22.16b , #2
    318     ext       v26.16b, v20.16b , v22.16b , #8
    319     add       v24.8h, v24.8h , v26.8h
    320 
    321     saddl     v26.4s, v4.4h, v20.4h
    322     smlal     v26.4s, v25.4h, v28.4h
    323     smlsl     v26.4s, v24.4h, v30.4h
    324 
    325     saddl2    v22.4s, v4.8h, v20.8h
    326     smlal2    v22.4s, v25.8h, v28.8h
    327     smlsl2    v22.4s, v24.8h, v30.8h
    328 
    329     sqrshrun  v19.4h, v26.4s, #10
    330     sqrshrun  v25.4h, v22.4s, #10
    331 
    332     uaddl     v24.8h, v11.8b, v1.8b
    333 
    334 
    335     uqxtn     v19.8b, v19.8h
    336     uqxtn     v25.8b, v25.8h
    337     mov       v19.s[1], v25.s[0]
    338 
    339 
    340 
    341     uaddl     v22.8h, v8.8b, v2.8b
    342     ld1       {v4.2s, v5.2s}, [x0], #16 // Vector load from src[8_0]
    343 
    344 
    345     ld1       {v14.2s}, [x0], x2        // Vector load from src[8_0]
    346     uaddl     v20.8h, v10.8b, v0.8b
    347     uaddl     v26.8h, v9.8b, v3.8b
    348     st1       {v18.2s, v19.2s}, [x1], x3 // store row 2
    349 
    350 
    351 //ROW_4
        // Fourth output row: column filter over source rows 3..8
        // (v6, v8, v10, v0, v2, v4 and v15, v16, v17, v12, v13, v14).
    352 
    353     uaddl     v18.8h, v6.8b, v4.8b
    354 
    355     mla       v18.8h, v20.8h , v28.8h
    356 
    357     uaddl     v20.8h, v7.8b, v5.8b
    358 
    359     mla       v20.8h, v24.8h , v28.8h
    360     uaddl     v24.8h, v17.8b, v12.8b
    361     mls       v18.8h, v22.8h , v30.8h
    362     uaddl     v22.8h, v15.8b, v14.8b
    363     mls       v20.8h, v26.8h , v30.8h
    364     uaddl     v26.8h, v16.8b, v13.8b
    365     mla       v22.8h, v24.8h , v28.8h
    366     mls       v22.8h, v26.8h , v30.8h
    367 
        // Row filter, pixels 0-7 (v6 is scratch: its row has been consumed).
    368     ext       v24.16b, v18.16b , v20.16b , #4
    369     ext       v26.16b, v18.16b , v20.16b , #6
    370 
    371     ext       v23.16b, v18.16b , v20.16b , #10
    372     add       v6.8h, v24.8h , v26.8h
    373     ext       v24.16b, v18.16b , v20.16b , #2
    374     ext       v26.16b, v18.16b , v20.16b , #8
    375     add       v24.8h, v24.8h , v26.8h
    376 
    377     saddl     v26.4s, v18.4h, v23.4h
    378     smlal     v26.4s, v6.4h, v28.4h
    379     smlsl     v26.4s, v24.4h, v30.4h
    380 
    381     saddl2    v23.4s, v18.8h, v23.8h
    382     smlal2    v23.4s, v6.8h, v28.8h
    383     smlsl2    v23.4s, v24.8h, v30.8h
    384 
    385     sqrshrun  v18.4h, v26.4s, #10
    386     sqrshrun  v19.4h, v23.4s, #10
    387 
    388     uqxtn     v18.8b, v18.8h
    389     uqxtn     v19.8b, v19.8h
    390     mov       v18.s[1], v19.s[0]
    391 
    392 
        // Row filter, pixels 8-15.
    393     ext       v24.16b, v20.16b , v22.16b , #4
    394     ext       v26.16b, v20.16b , v22.16b , #6
    395     ext       v6.16b, v20.16b , v22.16b , #10
    396 
    397     add       v25.8h, v24.8h , v26.8h
    398     ext       v24.16b, v20.16b , v22.16b , #2
    399     ext       v26.16b, v20.16b , v22.16b , #8
    400     add       v24.8h, v24.8h , v26.8h
    401 
    402     saddl     v26.4s, v6.4h, v20.4h
    403     smlal     v26.4s, v25.4h, v28.4h
    404     smlsl     v26.4s, v24.4h, v30.4h
    405 
    406     saddl2    v22.4s, v6.8h, v20.8h
    407     smlal2    v22.4s, v25.8h, v28.8h
    408     smlsl2    v22.4s, v24.8h, v30.8h
    409 
        // Rotate the row registers for the next pass. At this point the most
        // recent five source rows sit in v8, v10, v0, v2, v4 (bytes 0-15) and
        // v16, v17, v12, v13, v14 (bytes 16-23); the moves below (interleaved
        // with the tail of row 4) put them back in v0, v2, v4, v6, v8 and
        // v12..v16. v10/v11 and v24 serve as temporaries for the shuffle.
    410     mov       v6.16b, v2.16b
    411     mov       v7.16b, v3.16b
    412 
    413     mov       v2.16b, v10.16b
    414     mov       v3.16b, v11.16b
    415 
        // Row counter; nothing between here and the bgt below writes the
        // condition flags.
    416     subs      x4, x4, #4
    417     sqrshrun  v19.4h, v26.4s, #10
    418     sqrshrun  v25.4h, v22.4s, #10
    419     mov       v10.16b, v0.16b
    420     mov       v11.16b, v1.16b
    421 
    422     mov       v24.8b, v14.8b
    423 
    424     mov       v14.16b, v12.16b
    425     mov       v15.16b, v13.16b
    426 
    427 
    428     uqxtn     v19.8b, v19.8h
    429     uqxtn     v25.8b, v25.8h
    430     mov       v19.s[1], v25.s[0]
    431 
    432 
    433 
    434     mov       v0.16b, v8.16b
    435     mov       v1.16b, v9.16b
    436 
    437     mov       v8.16b, v4.16b
    438     mov       v9.16b, v5.16b
    439 
    440     mov       v12.16b, v16.16b
    441     mov       v13.16b, v17.16b
    442 
    443     mov       v4.16b, v10.16b
    444     mov       v5.16b, v11.16b
    445 
    446     mov       v16.8b, v24.8b
    447     st1       {v18.2s, v19.2s}, [x1], x3 // store row 3
    448 
    449     bgt       loop_16                   // loop while rows remain (x4 > 0 after the subs above)
    450     b         end_func
    451 
    452 loop_8_start:
        //------------------------------------------------------------------------------
        // 8-wide path: each pass of loop_8 produces four output rows of 8 pixels.
        //
        // Every source row is loaded as 16 bytes into a register pair
        // (v0/v1 .. v10/v11); x2 is the unmodified source stride here.
        // v26 = 20 and v24 = 5 are the filter coefficients set up at function
        // entry. v25 and v27 are used as scratch for packed output pixels.
        //
        // Per row the column filter t = a + f + 20*(c + d) - 5*(b + e) is built in
        // a register pair (cols 0-7 / cols 8-15), then the same taps are applied
        // along the row via ext shifts of 1..5 halfwords with 32-bit accumulation,
        // followed by sqrshrun #10 and uqxtn down to bytes. The work of adjacent
        // rows is interleaved.
        //------------------------------------------------------------------------------
        // Preload source rows 0..4.
    453     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[0_0]
    454     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[1_0]
    455     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[2_0]
    456     ld1       {v6.2s, v7.2s}, [x0], x2  // Vector load from src[3_0]
    457     ld1       {v8.2s, v9.2s}, [x0], x2  // Vector load from src[4_0]
    458 
    459 loop_8:
    460 
        // Output row 0: column filter over source rows 0..5 into v12 (cols 0-7)
        // and v14 (cols 8-15).
    461     ld1       {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
    462     uaddl     v14.8h, v4.8b, v6.8b
    463     uaddl     v12.8h, v0.8b, v10.8b
    464     uaddl     v16.8h, v2.8b, v8.8b
    465     mla       v12.8h, v14.8h , v26.8h
    466     uaddl     v18.8h, v5.8b, v7.8b
    467     uaddl     v14.8h, v1.8b, v11.8b
    468     uaddl     v22.8h, v3.8b, v9.8b
    469     mla       v14.8h, v18.8h , v26.8h
    470     mls       v12.8h, v16.8h , v24.8h
    471     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[6_0]
    472     uaddl     v16.8h, v6.8b, v8.8b
    473     mls       v14.8h, v22.8h , v24.8h
        // v28 starts accumulating the column filter of output row 1 (cols 0-7).
    474     uaddl     v28.8h, v2.8b, v0.8b
    475 
        // Row filter for output row 0: v30 takes pixels 0-3, v22 pixels 4-7.
    476     ext       v22.16b, v12.16b , v14.16b , #10
    477     uaddl     v18.8h, v4.8b, v10.8b
    478     mla       v28.8h, v16.8h , v26.8h
    479     saddl     v30.4s, v12.4h, v22.4h
    480 
    481     saddl2    v22.4s, v12.8h, v22.8h
    482     ext       v16.16b, v12.16b , v14.16b , #4
    483     mls       v28.8h, v18.8h , v24.8h
    484     ext       v18.16b, v12.16b , v14.16b , #6
    485     ext       v20.16b, v12.16b , v14.16b , #8
    486     ext       v14.16b, v12.16b , v14.16b , #2
    487     add       v16.8h, v16.8h , v18.8h
    488     add       v18.8h, v14.8h , v20.8h
    489     uaddl     v20.8h, v7.8b, v9.8b
    490     smlal     v30.4s, v16.4h, v26.4h
    491     smlsl     v30.4s, v18.4h, v24.4h
    492     smlal2    v22.4s, v16.8h, v26.8h
    493     smlsl2    v22.4s, v18.8h, v24.8h
        // v14 is rebuilt as the column filter of output row 1 (cols 8-15).
    494     uaddl     v14.8h, v3.8b, v1.8b
    495 
    496     mla       v14.8h, v20.8h , v26.8h
    497     sqrshrun  v12.4h, v30.4s, #10
    498     uaddl     v16.8h, v5.8b, v11.8b
    499     sqrshrun  v13.4h, v22.4s, #10
    500     mls       v14.8h, v16.8h , v24.8h
    501     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[7_0]
        // Pack the eight row-0 pixels into v25 (stored further down via v12).
    502     uqxtn     v25.8b, v12.8h
    503     uqxtn     v13.8b, v13.8h
    504     mov       v25.s[1], v13.s[0]
    505     uaddl     v16.8h, v8.8b, v10.8b
    506 
    507 
        // Row filter for output row 1 on v28:v14; v20 starts the column filter
        // of output row 2 (cols 0-7).
    508     ext       v22.16b, v28.16b , v14.16b , #10
    509     uaddl     v20.8h, v4.8b, v2.8b
    510     saddl     v30.4s, v28.4h, v22.4h
    511     mla       v20.8h, v16.8h , v26.8h
    512 
    513     saddl2    v22.4s, v28.8h, v22.8h
    514     ext       v16.16b, v28.16b , v14.16b , #4
    515     ext       v18.16b, v28.16b , v14.16b , #6
    516     ext       v12.16b, v28.16b , v14.16b , #8
    517     ext       v14.16b, v28.16b , v14.16b , #2
    518     add       v16.8h, v16.8h , v18.8h
    519     add       v18.8h, v12.8h , v14.8h
    520 
    521     smlal     v30.4s, v16.4h, v26.4h
    522     smlsl     v30.4s, v18.4h, v24.4h
    523     smlal2    v22.4s, v16.8h, v26.8h
    524     smlsl2    v22.4s, v18.8h, v24.8h
    525 
    526 
    527     uaddl     v18.8h, v6.8b, v0.8b
    528     sqrshrun  v16.4h, v30.4s, #10
    529 
    530     sqrshrun  v17.4h, v22.4s, #10
    531 
        // Move the packed row-0 pixels to v12 for the store.
        // NOTE(review): the copy of v24 into v25 looks like a leftover from a
        // port in which this register aliased the coefficient register; v25 is
        // not read again before being overwritten -- confirm.
    532     mov       v12.8b, v25.8b
    533     mov       v25.8b, v24.8b
    534 
    535     uaddl     v28.8h, v9.8b, v11.8b
        // Pack the eight row-1 pixels into v13.
    536     uqxtn     v13.8b, v16.8h
    537     uqxtn     v17.8b, v17.8h
    538     mov       v13.s[1], v17.s[0]
    539 
    540 
        // Column filter for output row 2: v20 (cols 0-7) and v14 (cols 8-15);
        // v28 is then restarted for output row 3 (cols 0-7).
    541     uaddl     v14.8h, v5.8b, v3.8b
    542     uaddl     v22.8h, v7.8b, v1.8b
    543     mls       v20.8h, v18.8h , v24.8h
    544     st1       {v12.2s}, [x1], x3        // store row 0
    545     mla       v14.8h, v28.8h , v26.8h
    546     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[8_0]
    547     uaddl     v30.8h, v10.8b, v0.8b
    548     uaddl     v28.8h, v6.8b, v4.8b
    549     mls       v14.8h, v22.8h , v24.8h
    550     st1       {v13.2s}, [x1], x3        // store row 1
    551     mla       v28.8h, v30.8h , v26.8h
    552 
        // Row filter for output row 2 on v20:v14.
    553     ext       v22.16b, v20.16b , v14.16b , #10
    554     saddl     v30.4s, v20.4h, v22.4h
    555 
    556     saddl2    v22.4s, v20.8h, v22.8h
    557     ext       v16.16b, v20.16b , v14.16b , #4
    558     ext       v18.16b, v20.16b , v14.16b , #6
    559     ext       v12.16b, v20.16b , v14.16b , #8
    560     ext       v14.16b, v20.16b , v14.16b , #2
    561     add       v16.8h, v16.8h , v18.8h
    562     add       v18.8h, v14.8h , v12.8h
    563     uaddl     v20.8h, v8.8b, v2.8b
    564     smlal     v30.4s, v16.4h, v26.4h
    565     smlsl     v30.4s, v18.4h, v24.4h
    566     smlal2    v22.4s, v16.8h, v26.8h
    567     smlsl2    v22.4s, v18.8h, v24.8h
        // v16 takes the column filter of output row 3 (cols 8-15).
    568     uaddl     v18.8h, v11.8b, v1.8b
    569     uaddl     v16.8h, v7.8b, v5.8b
    570     sqrshrun  v12.4h, v30.4s, #10
    571     uaddl     v30.8h, v9.8b, v3.8b
    572     mla       v16.8h, v18.8h , v26.8h
    573     sqrshrun  v13.4h, v22.4s, #10
    574     mls       v28.8h, v20.8h , v24.8h
    575 
    576     mls       v16.8h, v30.8h , v24.8h
        // Pack the eight row-2 pixels into v27.
    577     uqxtn     v27.8b, v12.8h
    578     uqxtn     v13.8b, v13.8h
    579     mov       v27.s[1], v13.s[0]
    580 
    581 
        // Row filter for output row 3 on v28:v16.
    582     ext       v22.16b, v28.16b , v16.16b , #10
    583 
    584     saddl     v30.4s, v28.4h, v22.4h
    585 
    586     saddl2    v22.4s, v28.8h, v22.8h
    587     ext       v12.16b, v28.16b , v16.16b , #4
    588     ext       v18.16b, v28.16b , v16.16b , #6
    589     ext       v20.16b, v28.16b , v16.16b , #8
    590     ext       v28.16b, v28.16b , v16.16b , #2
    591     add       v12.8h, v12.8h , v18.8h
    592     add       v18.8h, v28.8h , v20.8h
    593 
    594     smlal     v30.4s, v12.4h, v26.4h
    595     smlsl     v30.4s, v18.4h, v24.4h
    596     smlal2    v22.4s, v12.8h, v26.8h
    597     smlsl2    v22.4s, v18.8h, v24.8h
    598 
    599 
        // Move the packed row-2 pixels to v12 for the store.
        // NOTE(review): as with v25 above, the copy of v26 into v27 appears to
        // have no reader -- confirm.
    600     mov       v12.8b, v27.8b
    601     mov       v27.8b, v26.8b
    602 
    603     sqrshrun  v16.4h, v30.4s, #10
    604 
        // Rotate the row registers so the five most recent source rows are
        // back in v0, v2, v4, v6, v8 (and their odd-numbered partners) for the
        // next pass; v10/v11 act as the temporary.
    605     mov       v6.16b, v2.16b
    606     mov       v7.16b, v3.16b
    607 
    608     sqrshrun  v17.4h, v22.4s, #10
    609 
    610     mov       v2.16b, v10.16b
    611     mov       v3.16b, v11.16b
    612 
    613     mov       v10.16b, v0.16b
    614     mov       v11.16b, v1.16b
    615 
        // Row counter; nothing between here and the bgt below writes the
        // condition flags.
    616     subs      x4, x4, #4
    617     uqxtn     v13.8b, v16.8h
    618     uqxtn     v17.8b, v17.8h
    619     mov       v13.s[1], v17.s[0]
    620 
    621 
    622     mov       v0.16b, v8.16b
    623     mov       v1.16b, v9.16b
    624 
    625     mov       v8.16b, v4.16b
    626     mov       v9.16b, v5.16b
    627 
    628     mov       v4.16b, v10.16b
    629     mov       v5.16b, v11.16b
    630 
    631     st1       {v12.2s}, [x1], x3        // store row 2
    632     st1       {v13.2s}, [x1], x3        // store row 3
    633 
    634     bgt       loop_8                    // loop while rows remain (x4 > 0 after the subs above)
    635     b         end_func
    636 
    637 loop_4_start:
        //------------------------------------------------------------------------------
        // 4-wide path: each pass of loop_4 produces four output rows of 4 pixels.
        //
        // Every source row is loaded as 16 bytes into a register pair
        // (v0/v1 .. v10/v11); x2 is the unmodified source stride here.
        // v26 = 20 and v24 = 5 are the filter coefficients set up at function
        // entry. v25 and v27 are used as scratch for packed output pixels.
        //
        // Only the low four lanes of each row-filter result are stored, so only
        // the v30 accumulator is computed. The previous version also carried a
        // second accumulator chain through v22 (copied from the 8-wide path)
        // that read v13, v17, v19, v21, v23 and v29 as if they were the upper
        // halves of their neighbours; its results were never stored, so it has
        // been removed.
        //------------------------------------------------------------------------------
        // Preload source rows 0..4.
    638     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[0_0]
    639     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[1_0]
    640     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[2_0]
    641     ld1       {v6.2s, v7.2s}, [x0], x2  // Vector load from src[3_0]
    642     ld1       {v8.2s, v9.2s}, [x0], x2  // Vector load from src[4_0]
    643 
    644 loop_4:
        // Output row 0: column filter over source rows 0..5 into v12 (cols 0-7)
        // and v14 (cols 8-15).
    645     ld1       {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
    646     uaddl     v14.8h, v4.8b, v6.8b      // temp1 = src[2_0] + src[3_0]
    647     uaddl     v12.8h, v0.8b, v10.8b     // temp = src[0_0] + src[5_0]
    648     uaddl     v16.8h, v2.8b, v8.8b      // temp2 = src[1_0] + src[4_0]
    649     mla       v12.8h, v14.8h , v26.8h   // temp += temp1 * 20
    650     uaddl     v18.8h, v5.8b, v7.8b      // temp1 = src[2_0] + src[3_0]
    651     uaddl     v14.8h, v1.8b, v11.8b     // temp = src[0_0] + src[5_0]
    652     uaddl     v22.8h, v3.8b, v9.8b      // temp2 = src[1_0] + src[4_0]
    653     mla       v14.8h, v18.8h , v26.8h   // temp += temp1 * 20
    654     mls       v12.8h, v16.8h , v24.8h   // temp -= temp2 * 5
    655     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[6_0]
    656     uaddl     v16.8h, v6.8b, v8.8b
    657     mls       v14.8h, v22.8h , v24.8h   // temp -= temp2 * 5
    658     //v12 and v14 now hold the column-filtered row
        // v28 starts accumulating the column filter of output row 1 (cols 0-7).
    659     uaddl     v28.8h, v2.8b, v0.8b
    660 
        // Row filter for output row 0, pixels 0-3, accumulated in v30.
    661     ext       v22.16b, v12.16b , v14.16b , #10
    662     uaddl     v18.8h, v4.8b, v10.8b
    663     mla       v28.8h, v16.8h , v26.8h
    664     saddl     v30.4s, v12.4h, v22.4h
    665 
    667     ext       v16.16b, v12.16b , v14.16b , #4
    668     mls       v28.8h, v18.8h , v24.8h
    669     ext       v18.16b, v12.16b , v14.16b , #6
    670     ext       v20.16b, v12.16b , v14.16b , #8
    671     ext       v14.16b, v12.16b , v14.16b , #2
    672     add       v16.8h, v16.8h , v18.8h
    673     add       v18.8h, v14.8h , v20.8h
    674     uaddl     v20.8h, v7.8b, v9.8b
    675     smlal     v30.4s, v16.4h, v26.4h
    676     smlsl     v30.4s, v18.4h, v24.4h
        // v14 is rebuilt as the column filter of output row 1 (cols 8-15).
    679     uaddl     v14.8h, v3.8b, v1.8b
    680 
    681     mla       v14.8h, v20.8h , v26.8h
    682     sqrshrun  v12.4h, v30.4s, #10
    683     uaddl     v16.8h, v5.8b, v11.8b
    685     mls       v14.8h, v16.8h , v24.8h
    686     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[7_0]
        // Narrow the four row-0 pixels into v25 (stored further down via v12).
    687     uqxtn     v25.8b, v12.8h
    688     uaddl     v16.8h, v8.8b, v10.8b
    689 
        // Row filter for output row 1 on v28:v14; v20 starts the column filter
        // of output row 2 (cols 0-7).
    690     ext       v22.16b, v28.16b , v14.16b , #10
    691     uaddl     v20.8h, v4.8b, v2.8b
    692     saddl     v30.4s, v28.4h, v22.4h
    693     mla       v20.8h, v16.8h , v26.8h
    694 
    696     ext       v16.16b, v28.16b , v14.16b , #4
    697     ext       v18.16b, v28.16b , v14.16b , #6
    698     ext       v12.16b, v28.16b , v14.16b , #8
    699     ext       v14.16b, v28.16b , v14.16b , #2
    700     add       v16.8h, v16.8h , v18.8h
    701     add       v18.8h, v12.8h , v14.8h
    702 
    703     smlal     v30.4s, v16.4h, v26.4h
    704     smlsl     v30.4s, v18.4h, v24.4h
    707 
    708 
    709     uaddl     v18.8h, v6.8b, v0.8b
    710     sqrshrun  v16.4h, v30.4s, #10
    711 
    713 
        // Move the row-0 pixels to v12 for the store.
        // NOTE(review): the copy of v24 into v25 has no reader before v25 is
        // overwritten; kept unchanged here -- confirm before removing.
    714     mov       v12.8b, v25.8b
    715     mov       v25.8b, v24.8b
    716 
    717     uaddl     v28.8h, v9.8b, v11.8b
        // Narrow the four row-1 pixels into v13.
    718     uqxtn     v13.8b, v16.8h
    719 
    720 
    721 
        // Column filter for output row 2: v20 (cols 0-7) and v14 (cols 8-15);
        // v28 is then restarted for output row 3 (cols 0-7).
    722     uaddl     v14.8h, v5.8b, v3.8b
    723     uaddl     v22.8h, v7.8b, v1.8b
    724     mls       v20.8h, v18.8h , v24.8h
    725     st1       {v12.s}[0], [x1], x3      // store row 0
    726     mla       v14.8h, v28.8h , v26.8h
    727     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[8_0]
    728     uaddl     v30.8h, v10.8b, v0.8b
    729     uaddl     v28.8h, v6.8b, v4.8b
    730     mls       v14.8h, v22.8h , v24.8h
    731     st1       {v13.s}[0], [x1], x3      //store row 1
    732     mla       v28.8h, v30.8h , v26.8h
    733 
        // Row filter for output row 2 on v20:v14.
    734     ext       v22.16b, v20.16b , v14.16b , #10
    735     saddl     v30.4s, v20.4h, v22.4h
    736 
    738     ext       v16.16b, v20.16b , v14.16b , #4
    739     ext       v18.16b, v20.16b , v14.16b , #6
    740     ext       v12.16b, v20.16b , v14.16b , #8
    741     ext       v14.16b, v20.16b , v14.16b , #2
    742     add       v16.8h, v16.8h , v18.8h
    743     add       v18.8h, v14.8h , v12.8h
    744     uaddl     v20.8h, v8.8b, v2.8b
    745     smlal     v30.4s, v16.4h, v26.4h
    746     smlsl     v30.4s, v18.4h, v24.4h
        // v16 takes the column filter of output row 3 (cols 8-15).
    749     uaddl     v18.8h, v11.8b, v1.8b
    750     uaddl     v16.8h, v7.8b, v5.8b
    751     sqrshrun  v12.4h, v30.4s, #10
    752     uaddl     v30.8h, v9.8b, v3.8b
    753     mla       v16.8h, v18.8h , v26.8h
    755     mls       v28.8h, v20.8h , v24.8h
    756 
    757     mls       v16.8h, v30.8h , v24.8h
        // Narrow the four row-2 pixels into v27.
    758     uqxtn     v27.8b, v12.8h
    759 
        // Row filter for output row 3 on v28:v16.
    760     ext       v22.16b, v28.16b , v16.16b , #10
    761 
    762     saddl     v30.4s, v28.4h, v22.4h
    763 
    765     ext       v12.16b, v28.16b , v16.16b , #4
    766     ext       v18.16b, v28.16b , v16.16b , #6
    767     ext       v20.16b, v28.16b , v16.16b , #8
    768     ext       v28.16b, v28.16b , v16.16b , #2
    769     add       v12.8h, v12.8h , v18.8h
    770     add       v18.8h, v28.8h , v20.8h
    771 
    772     smlal     v30.4s, v12.4h, v26.4h
    773     smlsl     v30.4s, v18.4h, v24.4h
    776 
    777 
        // Move the row-2 pixels to v12 for the store.
        // NOTE(review): the copy of v26 into v27 has no reader before v27 is
        // overwritten; kept unchanged here -- confirm before removing.
    778     mov       v12.8b, v27.8b
    779     mov       v27.8b, v26.8b
    780 
    781     sqrshrun  v16.4h, v30.4s, #10
    782 
        // Rotate the row registers so the five most recent source rows are
        // back in v0, v2, v4, v6, v8 (and their odd-numbered partners) for the
        // next pass; v10/v11 act as the temporary.
    783     mov       v6.16b, v2.16b
    784     mov       v7.16b, v3.16b
    785 
    787 
    788     mov       v2.16b, v10.16b
    789     mov       v3.16b, v11.16b
    790 
    791     mov       v10.16b, v0.16b
    792     mov       v11.16b, v1.16b
    793 
        // Row counter; nothing between here and the bgt below writes the
        // condition flags.
    794     subs      x4, x4, #4
        // Narrow the four row-3 pixels into v13.
    795     uqxtn     v13.8b, v16.8h
    796 
    797     mov       v0.16b, v8.16b
    798     mov       v1.16b, v9.16b
    799 
    800     mov       v8.16b, v4.16b
    801     mov       v9.16b, v5.16b
    802 
    803 
    804     mov       v4.16b, v10.16b
    805     mov       v5.16b, v11.16b
    806 
    807 
    808     st1       {v12.s}[0], [x1], x3      // store row 2
    809     st1       {v13.s}[0], [x1], x3      // store row 3
    810 
    811     bgt       loop_4                    // loop while rows remain (x4 > 0 after the subs above)
    812 
    813 end_func:
        // Common exit for all three width paths: undo the prologue in reverse
        // order (x19/x20 first, then the vector registers saved by
        // push_v_regs) and return. No return value is set up here; the C
        // prototype in the file header declares the function as void.
    814     //Restoring registers from stack
    815     ldp       x19, x20, [sp], #16
    816     pop_v_regs
    817     ret
    818 
    819 
    820 
    821