Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 //******************************************************************************
     22 //* @file
     23 //*  ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
     24 //*
     25 //* @brief
     26 //*  Contains function definitions for inter prediction  interpolation.
     27 //*
     28 //* @author
     29 //*  Mohit
     30 //*
     31 //* @par List of Functions:
     32 //*
     33 //*  - ih264_inter_pred_luma_horz_hpel_vert_hpel_av8()
     34 //*
     35 //* @remarks
     36 //*  None
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 
     41 
     42 
     43 //void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
     44 //                                UWORD8 *pu1_dst,
     45 //                                WORD32 src_strd,
     46 //                                WORD32 dst_strd,
     47 //                                WORD32 ht,
     48 //                                WORD32 wd,
     49 //                                    UWORD8* pu1_tmp,
     50 //                                  UWORD32 dydx)
     51 
     52 //**************Variables Vs Registers*****************************************
     53 //    x0 => *pu1_src
     54 //    x1 => *pu1_dst
     55 //    w2 =>  src_strd
     56 //    w3 =>  dst_strd
     57 //    w4 =>  ht
     58 //    w5 =>  wd
     59 
     60 
     61 .text
     62 .p2align 2
     63 .include "ih264_neon_macros.s"
     64 
     65 
     66 
     67     .global ih264_inter_pred_luma_horz_hpel_vert_hpel_av8
     68 
     69 ih264_inter_pred_luma_horz_hpel_vert_hpel_av8:
            // Entry point: luma interpolation for the position that is half-pel
            // both horizontally and vertically.
            //
            // Method used by all three width paths below:
            //   1. 6-tap filter (1, -5, 20, 20, -5, 1) down each column of six
            //      consecutive source rows, kept as 16-bit intermediates.
            //   2. The same taps across those intermediates, accumulated in 32 bits.
            //   3. Rounding right shift by 10 with unsigned saturation, then
            //      narrowing to 8 bits.
            //
            // In:  x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
            //      w4 = ht, w5 = wd.
            //      The pu1_tmp and dydx arguments of the prototype are never read.
            // wd == 4 -> loop_4_start, wd == 8 -> loop_8_start, any other value
            // falls through to the 16-wide path.
            // Each path writes four rows per iteration and repeats while
            // (ht - 4) > 0, so ht is presumably a multiple of 4 - confirm with callers.
     70 
     71     // Save registers. push_v_regs is a macro from ih264_neon_macros.s
            // (body not visible here; presumably saves the callee-saved SIMD regs).
     72     push_v_regs
     73     stp       x19, x20, [sp, #-16]!     // NOTE(review): x19/x20 are not used in this routine
     74     sxtw      x2, w2                    // sign-extend the 32-bit arguments to 64 bits
     75     sxtw      x3, w3
     76     sxtw      x4, w4
     77     sxtw      x5, w5
     78 
     79     sub       x0, x0, x2, lsl #1        // x0 = pu1_src - 2*src_strd (two rows up)
     80     sub       x0, x0, #2                // x0 -= 2 (two columns left)
     81 
            // Filter constants for the 8-wide and 4-wide paths.
     82     movi      v26.8h, #0x14             // v26 = 20 in every 16-bit lane
     83     movi      v24.8h, #0x5              // v24 = 5 in every 16-bit lane
     84     movi      v27.8h, #0x14             // v27 = 20; NOTE(review): v27 is overwritten before any read below
     85     movi      v25.8h, #0x5              // v25 = 5;  NOTE(review): v25 is overwritten before any read below
     86     mov       x7, #0x20                 // NOTE(review): x7 is not read again in this routine
     87     mov       x8, #0x30                 // NOTE(review): x8 is not read again in this routine
     88     subs      x12, x5, #4               // wd == 4 ?
     89     beq       loop_4_start
     90 
     91     subs      x12, x5, #8               // wd == 8 ?
     92     beq       loop_8_start
     93 
     94     // Fall-through: 16-wide path (comment in the original: "when wd=16").
            // v24/v26 are used as scratch in loop_16, so this path keeps its own
            // copies of the constants in v28/v30.
     95     movi      v28.8h, #0x14             // v28 = 20 in every 16-bit lane
     96     movi      v30.8h, #0x5              // v30 = 5 in every 16-bit lane
            // Each row is read as 16 bytes (post-increment 16) followed by 8 bytes
            // (post-increment x2), so x2 is reduced by 16 to land on the next row.
     97     sub       x2, x2, #16
            // Preload rows 0..4; row n lives in {v(2n), v(2n+1), v(12+n)}.
     98     ld1       {v0.2s, v1.2s}, [x0], #16 // row 0, bytes 0..15
     99     ld1       {v12.2s}, [x0], x2        // row 0, bytes 16..23
    100     ld1       {v2.2s, v3.2s}, [x0], #16 // row 1, bytes 0..15
    101     ld1       {v13.2s}, [x0], x2        // row 1, bytes 16..23
    102     ld1       {v4.2s, v5.2s}, [x0], #16 // row 2, bytes 0..15
    103     ld1       {v14.2s}, [x0], x2        // row 2, bytes 16..23
    104     ld1       {v6.2s, v7.2s}, [x0], #16 // row 3, bytes 0..15
    105     ld1       {v15.2s}, [x0], x2        // row 3, bytes 16..23
    106     ld1       {v8.2s, v9.2s}, [x0], #16 // row 4, bytes 0..15
    107     ld1       {v16.2s}, [x0], x2        // row 4, bytes 16..23
    108 loop_16:
            // 16-wide path: one iteration produces four 16-pixel output rows.
            // On entry rows 0..4 are resident:
            //   row0 = v0/v1/v12, row1 = v2/v3/v13, row2 = v4/v5/v14,
            //   row3 = v6/v7/v15, row4 = v8/v9/v16   (bytes 0-7 / 8-15 / 16-23).
            // v28 = 20 and v30 = 5 in every 16-bit lane.
            // The code is software-pipelined: sums needed by the next output row
            // are interleaved with the tail of the current one, so instruction
            // order matters.
    109 
    110     ld1       {v10.2s, v11.2s}, [x0], #16 // row 5, bytes 0..15
    111     ld1       {v17.2s}, [x0], x2        // row 5, bytes 16..23
    112 
    113 
            // ---- output row 0: vertical filter over rows 0..5 ----
    114     uaddl     v20.8h, v4.8b, v6.8b      // row2 + row3          (bytes 0-7)
    115     uaddl     v18.8h, v0.8b, v10.8b     // row0 + row5          (bytes 0-7)
    116     uaddl     v22.8h, v2.8b, v8.8b      // row1 + row4          (bytes 0-7)
    117     mla       v18.8h, v20.8h , v28.8h   // += 20 * (row2 + row3)
    118     uaddl     v24.8h, v5.8b, v7.8b      // row2 + row3          (bytes 8-15)
    119     uaddl     v20.8h, v1.8b, v11.8b     // row0 + row5          (bytes 8-15)
    120     uaddl     v26.8h, v3.8b, v9.8b      // row1 + row4          (bytes 8-15)
    121     mla       v20.8h, v24.8h , v28.8h
    122     uaddl     v24.8h, v14.8b, v15.8b    // row2 + row3          (bytes 16-23)
    123     mls       v18.8h, v22.8h , v30.8h   // -= 5 * (row1 + row4) -> v18 = vertical result, bytes 0-7
    124     uaddl     v22.8h, v12.8b, v17.8b    // row0 + row5          (bytes 16-23)
    125     mls       v20.8h, v26.8h , v30.8h   // v20 = vertical result, bytes 8-15
    126     uaddl     v26.8h, v13.8b, v16.8b    // row1 + row4          (bytes 16-23)
    127     mla       v22.8h, v24.8h , v28.8h
    128     mls       v22.8h, v26.8h , v30.8h   // v22 = vertical result, bytes 16-23
    129 
            // Horizontal filter for outputs 0..7 on the 16-bit intermediates.
            // ext by 2*k bytes yields the intermediates shifted left by k columns.
    130     ext       v24.16b, v18.16b , v20.16b , #4   // columns +2
    131     ext       v26.16b, v18.16b , v20.16b , #6   // columns +3
    132 
    133     ext       v23.16b, v18.16b , v20.16b , #10  // columns +5
    134     add       v0.8h, v24.8h , v26.8h    // centre pair (+2, +3); v0 is free, row 0 is consumed
    135     ext       v24.16b, v18.16b , v20.16b , #2   // columns +1
    136     ext       v26.16b, v18.16b , v20.16b , #8   // columns +4
    137     add       v24.8h, v24.8h , v26.8h   // inner pair (+1, +4)
    138 
    139     saddl     v26.4s, v18.4h, v23.4h    // outer pair (+0, +5), lanes 0-3, widened to 32 bits
    140     smlal     v26.4s, v0.4h, v28.4h     // += 20 * centre pair
    141     smlsl     v26.4s, v24.4h, v30.4h    // -= 5 * inner pair
    142 
    143     saddl2    v23.4s, v18.8h, v23.8h    // same for lanes 4-7
    144     smlal2    v23.4s, v0.8h, v28.8h
    145     smlsl2    v23.4s, v24.8h, v30.8h
    146 
    147     sqrshrun  v18.4h, v26.4s, #10       // rounding >> 10, saturate to unsigned 16 bits
    148     sqrshrun  v19.4h, v23.4s, #10
    149 
    150 
    151     uqxtn     v18.8b, v18.8h            // saturate to 8 bits
    152     uqxtn     v19.8b, v19.8h
    153     mov       v18.s[1], v19.s[0]        // v18 (low 64 bits) = output pixels 0..7
    154 
            // Horizontal filter for outputs 8..15 (intermediates from v20 and v22).
    155     ext       v24.16b, v20.16b , v22.16b , #4
    156     ext       v26.16b, v20.16b , v22.16b , #6
    157     ext       v0.16b, v20.16b , v22.16b , #10
    158 
    159     add       v25.8h, v24.8h , v26.8h   // centre pair
    160     ext       v24.16b, v20.16b , v22.16b , #2
    161     ext       v26.16b, v20.16b , v22.16b , #8
    162     add       v24.8h, v24.8h , v26.8h   // inner pair
    163 
    164     saddl     v26.4s, v0.4h, v20.4h     // outer pair, lanes 0-3
    165     smlal     v26.4s, v25.4h, v28.4h
    166     smlsl     v26.4s, v24.4h, v30.4h
    167 
    168     saddl2    v22.4s, v0.8h, v20.8h     // outer pair, lanes 4-7
    169     smlal2    v22.4s, v25.8h, v28.8h
    170     smlsl2    v22.4s, v24.8h, v30.8h
    171 
    172     sqrshrun  v19.4h, v26.4s, #10
    173     sqrshrun  v25.4h, v22.4s, #10
    174 
    175     uaddl     v24.8h, v7.8b, v9.8b      // (for row 1) row3 + row4, bytes 8-15
    176 
    177 
    178 
    179     uqxtn     v19.8b, v19.8h
    180     uqxtn     v25.8b, v25.8h
    181     mov       v19.s[1], v25.s[0]        // v19 (low 64 bits) = output pixels 8..15
    182 
    183     uaddl     v22.8h, v4.8b, v10.8b     // (for row 1) row2 + row5, bytes 0-7
    184     ld1       {v0.2s, v1.2s}, [x0], #16 // row 6, bytes 0..15 (reuses the row-0 registers)
    185 
    186 
    187     ld1       {v12.2s}, [x0], x2        // row 6, bytes 16..23
    188     uaddl     v20.8h, v6.8b, v8.8b      // (for row 1) row3 + row4, bytes 0-7
    189     uaddl     v26.8h, v5.8b, v11.8b     // (for row 1) row2 + row5, bytes 8-15
    190     st1       {v18.2s, v19.2s}, [x1], x3 // store output row 0 (16 bytes)
    191 
    192 
    193 //ROW_2
            // ---- output row 1: vertical filter over rows 1..6, then the same
            //      horizontal sequence as above (v2 is scratch here) ----
    194 
    195 
    196     uaddl     v18.8h, v2.8b, v0.8b      // row1 + row6, bytes 0-7
    197 
    198     mla       v18.8h, v20.8h , v28.8h
    199 
    200     uaddl     v20.8h, v3.8b, v1.8b      // row1 + row6, bytes 8-15
    201 
    202     mla       v20.8h, v24.8h , v28.8h
    203     uaddl     v24.8h, v15.8b, v16.8b    // row3 + row4, bytes 16-23
    204     mls       v18.8h, v22.8h , v30.8h
    205     uaddl     v22.8h, v13.8b, v12.8b    // row1 + row6, bytes 16-23
    206     mls       v20.8h, v26.8h , v30.8h
    207     uaddl     v26.8h, v14.8b, v17.8b    // row2 + row5, bytes 16-23
    208     mla       v22.8h, v24.8h , v28.8h
    209     mls       v22.8h, v26.8h , v30.8h
    210 
    211     ext       v24.16b, v18.16b , v20.16b , #4
    212     ext       v26.16b, v18.16b , v20.16b , #6
    213 
    214     ext       v23.16b, v18.16b , v20.16b , #10
    215     add       v2.8h, v24.8h , v26.8h
    216     ext       v24.16b, v18.16b , v20.16b , #2
    217     ext       v26.16b, v18.16b , v20.16b , #8
    218     add       v24.8h, v24.8h , v26.8h
    219 
    220     saddl     v26.4s, v18.4h, v23.4h
    221     smlal     v26.4s, v2.4h, v28.4h
    222     smlsl     v26.4s, v24.4h, v30.4h
    223 
    224     saddl2    v23.4s, v18.8h, v23.8h
    225     smlal2    v23.4s, v2.8h, v28.8h
    226     smlsl2    v23.4s, v24.8h, v30.8h
    227 
    228     sqrshrun  v18.4h, v26.4s, #10
    229     sqrshrun  v19.4h, v23.4s, #10
    230 
    231 
    232 
    233     uqxtn     v18.8b, v18.8h
    234     uqxtn     v19.8b, v19.8h
    235     mov       v18.s[1], v19.s[0]        // output pixels 0..7
    236 
    237     ext       v24.16b, v20.16b , v22.16b , #4
    238     ext       v26.16b, v20.16b , v22.16b , #6
    239     ext       v2.16b, v20.16b , v22.16b , #10
    240 
    241     add       v25.8h, v24.8h , v26.8h
    242     ext       v24.16b, v20.16b , v22.16b , #2
    243     ext       v26.16b, v20.16b , v22.16b , #8
    244     add       v24.8h, v24.8h , v26.8h
    245 
    246     saddl     v26.4s, v2.4h, v20.4h
    247     smlal     v26.4s, v25.4h, v28.4h
    248     smlsl     v26.4s, v24.4h, v30.4h
    249 
    250     saddl2    v22.4s, v2.8h, v20.8h
    251     smlal2    v22.4s, v25.8h, v28.8h
    252     smlsl2    v22.4s, v24.8h, v30.8h
    253 
    254     sqrshrun  v19.4h, v26.4s, #10
    255     sqrshrun  v25.4h, v22.4s, #10
    256     uaddl     v24.8h, v9.8b, v11.8b     // (for row 2) row4 + row5, bytes 8-15
    257 
    258     uqxtn     v19.8b, v19.8h
    259     uqxtn     v25.8b, v25.8h
    260     mov       v19.s[1], v25.s[0]        // output pixels 8..15
    261 
    262 
    263     uaddl     v22.8h, v6.8b, v0.8b      // (for row 2) row3 + row6, bytes 0-7
    264     ld1       {v2.2s, v3.2s}, [x0], #16 // row 7, bytes 0..15 (reuses the row-1 registers)
    265 
    266 
    267     ld1       {v13.2s}, [x0], x2        // row 7, bytes 16..23
    268     uaddl     v20.8h, v8.8b, v10.8b     // (for row 2) row4 + row5, bytes 0-7
    269     uaddl     v26.8h, v7.8b, v1.8b      // (for row 2) row3 + row6, bytes 8-15
    270     st1       {v18.2s, v19.2s}, [x1], x3 // store output row 1
    271 
    272 //ROW_3
            // ---- output row 2: vertical filter over rows 2..7 (v4 is scratch) ----
    273 
    274 
    275     uaddl     v18.8h, v4.8b, v2.8b      // row2 + row7, bytes 0-7
    276 
    277     mla       v18.8h, v20.8h , v28.8h
    278 
    279     uaddl     v20.8h, v5.8b, v3.8b      // row2 + row7, bytes 8-15
    280 
    281     mla       v20.8h, v24.8h , v28.8h
    282     uaddl     v24.8h, v16.8b, v17.8b    // row4 + row5, bytes 16-23
    283     mls       v18.8h, v22.8h , v30.8h
    284     uaddl     v22.8h, v14.8b, v13.8b    // row2 + row7, bytes 16-23
    285     mls       v20.8h, v26.8h , v30.8h
    286     uaddl     v26.8h, v15.8b, v12.8b    // row3 + row6, bytes 16-23
    287     mla       v22.8h, v24.8h , v28.8h
    288     mls       v22.8h, v26.8h , v30.8h
    289 
    290     ext       v24.16b, v18.16b , v20.16b , #4
    291     ext       v26.16b, v18.16b , v20.16b , #6
    292 
    293     ext       v23.16b, v18.16b , v20.16b , #10
    294     add       v4.8h, v24.8h , v26.8h
    295     ext       v24.16b, v18.16b , v20.16b , #2
    296     ext       v26.16b, v18.16b , v20.16b , #8
    297     add       v24.8h, v24.8h , v26.8h
    298 
    299     saddl     v26.4s, v18.4h, v23.4h
    300     smlal     v26.4s, v4.4h, v28.4h
    301     smlsl     v26.4s, v24.4h, v30.4h
    302 
    303     saddl2    v23.4s, v18.8h, v23.8h
    304     smlal2    v23.4s, v4.8h, v28.8h
    305     smlsl2    v23.4s, v24.8h, v30.8h
    306 
    307     sqrshrun  v18.4h, v26.4s, #10
    308     sqrshrun  v19.4h, v23.4s, #10
    309 
    310 
    311     uqxtn     v18.8b, v18.8h
    312     uqxtn     v19.8b, v19.8h
    313     mov       v18.s[1], v19.s[0]        // output pixels 0..7
    314 
    315 
    316     ext       v24.16b, v20.16b , v22.16b , #4
    317     ext       v26.16b, v20.16b , v22.16b , #6
    318     ext       v4.16b, v20.16b , v22.16b , #10
    319 
    320     add       v25.8h, v24.8h , v26.8h
    321     ext       v24.16b, v20.16b , v22.16b , #2
    322     ext       v26.16b, v20.16b , v22.16b , #8
    323     add       v24.8h, v24.8h , v26.8h
    324 
    325     saddl     v26.4s, v4.4h, v20.4h
    326     smlal     v26.4s, v25.4h, v28.4h
    327     smlsl     v26.4s, v24.4h, v30.4h
    328 
    329     saddl2    v22.4s, v4.8h, v20.8h
    330     smlal2    v22.4s, v25.8h, v28.8h
    331     smlsl2    v22.4s, v24.8h, v30.8h
    332 
    333     sqrshrun  v19.4h, v26.4s, #10
    334     sqrshrun  v25.4h, v22.4s, #10
    335 
    336     uaddl     v24.8h, v11.8b, v1.8b     // (for row 3) row5 + row6, bytes 8-15
    337 
    338 
    339     uqxtn     v19.8b, v19.8h
    340     uqxtn     v25.8b, v25.8h
    341     mov       v19.s[1], v25.s[0]        // output pixels 8..15
    342 
    343 
    344 
    345     uaddl     v22.8h, v8.8b, v2.8b      // (for row 3) row4 + row7, bytes 0-7
    346     ld1       {v4.2s, v5.2s}, [x0], #16 // row 8, bytes 0..15 (reuses the row-2 registers)
    347 
    348 
    349     ld1       {v14.2s}, [x0], x2        // row 8, bytes 16..23
    350     uaddl     v20.8h, v10.8b, v0.8b     // (for row 3) row5 + row6, bytes 0-7
    351     uaddl     v26.8h, v9.8b, v3.8b      // (for row 3) row4 + row7, bytes 8-15
    352     st1       {v18.2s, v19.2s}, [x1], x3 // store output row 2
    353 
    354 
    355 //ROW_4
            // ---- output row 3: vertical filter over rows 3..8 (v6 is scratch).
            //      The register rotation for the next iteration is interleaved
            //      with the tail of this row. ----
    356 
    357     uaddl     v18.8h, v6.8b, v4.8b      // row3 + row8, bytes 0-7
    358 
    359     mla       v18.8h, v20.8h , v28.8h
    360 
    361     uaddl     v20.8h, v7.8b, v5.8b      // row3 + row8, bytes 8-15
    362 
    363     mla       v20.8h, v24.8h , v28.8h
    364     uaddl     v24.8h, v17.8b, v12.8b    // row5 + row6, bytes 16-23
    365     mls       v18.8h, v22.8h , v30.8h
    366     uaddl     v22.8h, v15.8b, v14.8b    // row3 + row8, bytes 16-23
    367     mls       v20.8h, v26.8h , v30.8h
    368     uaddl     v26.8h, v16.8b, v13.8b    // row4 + row7, bytes 16-23
    369     mla       v22.8h, v24.8h , v28.8h
    370     mls       v22.8h, v26.8h , v30.8h
    371 
    372     ext       v24.16b, v18.16b , v20.16b , #4
    373     ext       v26.16b, v18.16b , v20.16b , #6
    374 
    375     ext       v23.16b, v18.16b , v20.16b , #10
    376     add       v6.8h, v24.8h , v26.8h
    377     ext       v24.16b, v18.16b , v20.16b , #2
    378     ext       v26.16b, v18.16b , v20.16b , #8
    379     add       v24.8h, v24.8h , v26.8h
    380 
    381     saddl     v26.4s, v18.4h, v23.4h
    382     smlal     v26.4s, v6.4h, v28.4h
    383     smlsl     v26.4s, v24.4h, v30.4h
    384 
    385     saddl2    v23.4s, v18.8h, v23.8h
    386     smlal2    v23.4s, v6.8h, v28.8h
    387     smlsl2    v23.4s, v24.8h, v30.8h
    388 
    389     sqrshrun  v18.4h, v26.4s, #10
    390     sqrshrun  v19.4h, v23.4s, #10
    391 
    392     uqxtn     v18.8b, v18.8h
    393     uqxtn     v19.8b, v19.8h
    394     mov       v18.s[1], v19.s[0]        // output pixels 0..7
    395 
    396 
    397     ext       v24.16b, v20.16b , v22.16b , #4
    398     ext       v26.16b, v20.16b , v22.16b , #6
    399     ext       v6.16b, v20.16b , v22.16b , #10
    400 
    401     add       v25.8h, v24.8h , v26.8h
    402     ext       v24.16b, v20.16b , v22.16b , #2
    403     ext       v26.16b, v20.16b , v22.16b , #8
    404     add       v24.8h, v24.8h , v26.8h
    405 
    406     saddl     v26.4s, v6.4h, v20.4h
    407     smlal     v26.4s, v25.4h, v28.4h
    408     smlsl     v26.4s, v24.4h, v30.4h
    409 
    410     saddl2    v22.4s, v6.8h, v20.8h
    411     smlal2    v22.4s, v25.8h, v28.8h
    412     smlsl2    v22.4s, v24.8h, v30.8h
    413 
            // Rotate the row registers so rows 4..8 become rows 0..4 of the next
            // iteration (v10/v11 and v24 serve as temporaries).
    414     mov       v6.16b, v2.16b            // new row3 = row 7
    415     mov       v7.16b, v3.16b
    416 
    417     mov       v2.16b, v10.16b           // new row1 = row 5
    418     mov       v3.16b, v11.16b
    419 
    420     subs      x4, x4, #4                // four rows done; flags are consumed by bgt below
    421     sqrshrun  v19.4h, v26.4s, #10
    422     sqrshrun  v25.4h, v22.4s, #10
    423     mov       v10.16b, v0.16b           // park row 6 in v10/v11
    424     mov       v11.16b, v1.16b
    425 
    426     mov       v24.8b, v14.8b            // park row 8 (bytes 16-23) in v24
    427 
    428     mov       v14.16b, v12.16b          // new row2 (bytes 16-23) = row 6
    429     mov       v15.16b, v13.16b          // new row3 (bytes 16-23) = row 7
    430 
    431 
    432     uqxtn     v19.8b, v19.8h
    433     uqxtn     v25.8b, v25.8h
    434     mov       v19.s[1], v25.s[0]        // output pixels 8..15
    435 
    436 
    437 
    438     mov       v0.16b, v8.16b            // new row0 = row 4
    439     mov       v1.16b, v9.16b
    440 
    441     mov       v8.16b, v4.16b            // new row4 = row 8
    442     mov       v9.16b, v5.16b
    443 
    444     mov       v12.16b, v16.16b          // new row0 (bytes 16-23) = row 4
    445     mov       v13.16b, v17.16b          // new row1 (bytes 16-23) = row 5
    446 
    447     mov       v4.16b, v10.16b           // new row2 = row 6
    448     mov       v5.16b, v11.16b
    449 
    450     mov       v16.8b, v24.8b            // new row4 (bytes 16-23) = row 8
    451     st1       {v18.2s, v19.2s}, [x1], x3 // store output row 3
    452 
    453     bgt       loop_16                   // repeat while remaining height > 0
    454     b         end_func
    455 
    456 loop_8_start:
            // 8-wide path. Each row is read as 16 bytes: bytes 0-7 in the even
            // register, bytes 8-15 in the odd one. Preload rows 0..4.
            // v26 = 20 and v24 = 5 in every 16-bit lane (set at function entry).
    457     ld1       {v0.2s, v1.2s}, [x0], x2  // row 0
    458     ld1       {v2.2s, v3.2s}, [x0], x2  // row 1
    459     ld1       {v4.2s, v5.2s}, [x0], x2  // row 2
    460     ld1       {v6.2s, v7.2s}, [x0], x2  // row 3
    461     ld1       {v8.2s, v9.2s}, [x0], x2  // row 4
    462 
    463 loop_8:
            // One iteration produces four 8-pixel output rows. The vertical pass
            // of the next row is interleaved with the horizontal pass of the
            // current one, so instruction order matters.
    464 
    465     ld1       {v10.2s, v11.2s}, [x0], x2 // row 5
            // ---- output row 0: vertical filter over rows 0..5 ----
    466     uaddl     v14.8h, v4.8b, v6.8b      // row2 + row3, bytes 0-7
    467     uaddl     v12.8h, v0.8b, v10.8b     // row0 + row5, bytes 0-7
    468     uaddl     v16.8h, v2.8b, v8.8b      // row1 + row4, bytes 0-7
    469     mla       v12.8h, v14.8h , v26.8h   // += 20 * (row2 + row3)
    470     uaddl     v18.8h, v5.8b, v7.8b      // row2 + row3, bytes 8-15
    471     uaddl     v14.8h, v1.8b, v11.8b     // row0 + row5, bytes 8-15
    472     uaddl     v22.8h, v3.8b, v9.8b      // row1 + row4, bytes 8-15
    473     mla       v14.8h, v18.8h , v26.8h
    474     mls       v12.8h, v16.8h , v24.8h   // v12 = vertical result, bytes 0-7
    475     ld1       {v0.2s, v1.2s}, [x0], x2  // row 6 (reuses the row-0 registers)
    476     uaddl     v16.8h, v6.8b, v8.8b      // (for row 1) row3 + row4, bytes 0-7
    477     mls       v14.8h, v22.8h , v24.8h   // v14 = vertical result, bytes 8-15
    478     uaddl     v28.8h, v2.8b, v0.8b      // (for row 1) row1 + row6, bytes 0-7
    479 
            // Horizontal filter on v12:v14; ext by 2*k bytes = shift by k columns.
    480     ext       v22.16b, v12.16b , v14.16b , #10  // columns +5
    481     uaddl     v18.8h, v4.8b, v10.8b     // (for row 1) row2 + row5, bytes 0-7
    482     mla       v28.8h, v16.8h , v26.8h
    483     saddl     v30.4s, v12.4h, v22.4h    // outer pair (+0, +5), lanes 0-3
    484 
    485     saddl2    v22.4s, v12.8h, v22.8h    // outer pair, lanes 4-7
    486     ext       v16.16b, v12.16b , v14.16b , #4   // columns +2
    487     mls       v28.8h, v18.8h , v24.8h   // v28 = row-1 vertical result, bytes 0-7
    488     ext       v18.16b, v12.16b , v14.16b , #6   // columns +3
    489     ext       v20.16b, v12.16b , v14.16b , #8   // columns +4
    490     ext       v14.16b, v12.16b , v14.16b , #2   // columns +1
    491     add       v16.8h, v16.8h , v18.8h   // centre pair (+2, +3)
    492     add       v18.8h, v14.8h , v20.8h   // inner pair (+1, +4)
    493     uaddl     v20.8h, v7.8b, v9.8b      // (for row 1) row3 + row4, bytes 8-15
    494     smlal     v30.4s, v16.4h, v26.4h    // += 20 * centre pair
    495     smlsl     v30.4s, v18.4h, v24.4h    // -= 5 * inner pair
    496     smlal2    v22.4s, v16.8h, v26.8h
    497     smlsl2    v22.4s, v18.8h, v24.8h
    498     uaddl     v14.8h, v3.8b, v1.8b      // (for row 1) row1 + row6, bytes 8-15
    499 
    500     mla       v14.8h, v20.8h , v26.8h
    501     sqrshrun  v12.4h, v30.4s, #10       // rounding >> 10, saturate to unsigned 16 bits
    502     uaddl     v16.8h, v5.8b, v11.8b     // (for row 1) row2 + row5, bytes 8-15
    503     sqrshrun  v13.4h, v22.4s, #10
    504     mls       v14.8h, v16.8h , v24.8h   // v14 = row-1 vertical result, bytes 8-15
    505     ld1       {v2.2s, v3.2s}, [x0], x2  // row 7 (reuses the row-1 registers)
    506     uqxtn     v25.8b, v12.8h            // saturate to 8 bits; v25 holds output row 0
    507     uqxtn     v13.8b, v13.8h
    508     mov       v25.s[1], v13.s[0]
    509     uaddl     v16.8h, v8.8b, v10.8b     // (for row 2) row4 + row5, bytes 0-7
    510 
    511 
            // ---- output row 1: horizontal filter on v28:v14 ----
    512     ext       v22.16b, v28.16b , v14.16b , #10
    513     uaddl     v20.8h, v4.8b, v2.8b      // (for row 2) row2 + row7, bytes 0-7
    514     saddl     v30.4s, v28.4h, v22.4h
    515     mla       v20.8h, v16.8h , v26.8h
    516 
    517     saddl2    v22.4s, v28.8h, v22.8h
    518     ext       v16.16b, v28.16b , v14.16b , #4
    519     ext       v18.16b, v28.16b , v14.16b , #6
    520     ext       v12.16b, v28.16b , v14.16b , #8
    521     ext       v14.16b, v28.16b , v14.16b , #2
    522     add       v16.8h, v16.8h , v18.8h
    523     add       v18.8h, v12.8h , v14.8h
    524 
    525     smlal     v30.4s, v16.4h, v26.4h
    526     smlsl     v30.4s, v18.4h, v24.4h
    527     smlal2    v22.4s, v16.8h, v26.8h
    528     smlsl2    v22.4s, v18.8h, v24.8h
    529 
    530 
    531     uaddl     v18.8h, v6.8b, v0.8b      // (for row 2) row3 + row6, bytes 0-7
    532     sqrshrun  v16.4h, v30.4s, #10
    533 
    534     sqrshrun  v17.4h, v22.4s, #10
    535 
    536     mov       v12.8b, v25.8b            // v12 = output row 0
    537     mov       v25.8b, v24.8b            // NOTE(review): v25 is not read again before its next overwrite; looks redundant
    538 
    539     uaddl     v28.8h, v9.8b, v11.8b     // (for row 2) row4 + row5, bytes 8-15
    540     uqxtn     v13.8b, v16.8h
    541     uqxtn     v17.8b, v17.8h
    542     mov       v13.s[1], v17.s[0]        // v13 = output row 1
    543 
    544 
    545     uaddl     v14.8h, v5.8b, v3.8b      // (for row 2) row2 + row7, bytes 8-15
    546     uaddl     v22.8h, v7.8b, v1.8b      // (for row 2) row3 + row6, bytes 8-15
    547     mls       v20.8h, v18.8h , v24.8h   // v20 = row-2 vertical result, bytes 0-7
    548     st1       {v12.2s}, [x1], x3        // store output row 0
    549     mla       v14.8h, v28.8h , v26.8h
    550     ld1       {v4.2s, v5.2s}, [x0], x2  // row 8 (reuses the row-2 registers)
    551     uaddl     v30.8h, v10.8b, v0.8b     // (for row 3) row5 + row6, bytes 0-7
    552     uaddl     v28.8h, v6.8b, v4.8b      // (for row 3) row3 + row8, bytes 0-7
    553     mls       v14.8h, v22.8h , v24.8h   // v14 = row-2 vertical result, bytes 8-15
    554     st1       {v13.2s}, [x1], x3        // store output row 1
    555     mla       v28.8h, v30.8h , v26.8h
    556 
            // ---- output row 2: horizontal filter on v20:v14 ----
    557     ext       v22.16b, v20.16b , v14.16b , #10
    558     saddl     v30.4s, v20.4h, v22.4h
    559 
    560     saddl2    v22.4s, v20.8h, v22.8h
    561     ext       v16.16b, v20.16b , v14.16b , #4
    562     ext       v18.16b, v20.16b , v14.16b , #6
    563     ext       v12.16b, v20.16b , v14.16b , #8
    564     ext       v14.16b, v20.16b , v14.16b , #2
    565     add       v16.8h, v16.8h , v18.8h
    566     add       v18.8h, v14.8h , v12.8h
    567     uaddl     v20.8h, v8.8b, v2.8b      // (for row 3) row4 + row7, bytes 0-7
    568     smlal     v30.4s, v16.4h, v26.4h
    569     smlsl     v30.4s, v18.4h, v24.4h
    570     smlal2    v22.4s, v16.8h, v26.8h
    571     smlsl2    v22.4s, v18.8h, v24.8h
    572     uaddl     v18.8h, v11.8b, v1.8b     // (for row 3) row5 + row6, bytes 8-15
    573     uaddl     v16.8h, v7.8b, v5.8b      // (for row 3) row3 + row8, bytes 8-15
    574     sqrshrun  v12.4h, v30.4s, #10
    575     uaddl     v30.8h, v9.8b, v3.8b      // (for row 3) row4 + row7, bytes 8-15
    576     mla       v16.8h, v18.8h , v26.8h
    577     sqrshrun  v13.4h, v22.4s, #10
    578     mls       v28.8h, v20.8h , v24.8h   // v28 = row-3 vertical result, bytes 0-7
    579 
    580     mls       v16.8h, v30.8h , v24.8h   // v16 = row-3 vertical result, bytes 8-15
    581     uqxtn     v27.8b, v12.8h            // v27 holds output row 2
    582     uqxtn     v13.8b, v13.8h
    583     mov       v27.s[1], v13.s[0]
    584 
    585 
            // ---- output row 3: horizontal filter on v28:v16 ----
    586     ext       v22.16b, v28.16b , v16.16b , #10
    587 
    588     saddl     v30.4s, v28.4h, v22.4h
    589 
    590     saddl2    v22.4s, v28.8h, v22.8h
    591     ext       v12.16b, v28.16b , v16.16b , #4
    592     ext       v18.16b, v28.16b , v16.16b , #6
    593     ext       v20.16b, v28.16b , v16.16b , #8
    594     ext       v28.16b, v28.16b , v16.16b , #2
    595     add       v12.8h, v12.8h , v18.8h
    596     add       v18.8h, v28.8h , v20.8h
    597 
    598     smlal     v30.4s, v12.4h, v26.4h
    599     smlsl     v30.4s, v18.4h, v24.4h
    600     smlal2    v22.4s, v12.8h, v26.8h
    601     smlsl2    v22.4s, v18.8h, v24.8h
    602 
    603 
    604     mov       v12.8b, v27.8b            // v12 = output row 2
    605     mov       v27.8b, v26.8b            // NOTE(review): v27 is not read again before its next overwrite; looks redundant
    606 
    607     sqrshrun  v16.4h, v30.4s, #10
    608 
            // Rotate the row registers so rows 4..8 become rows 0..4 of the next
            // iteration (v10/v11 serve as temporaries for row 6).
    609     mov       v6.16b, v2.16b            // new row3 = row 7
    610     mov       v7.16b, v3.16b
    611 
    612     sqrshrun  v17.4h, v22.4s, #10
    613 
    614     mov       v2.16b, v10.16b           // new row1 = row 5
    615     mov       v3.16b, v11.16b
    616 
    617     mov       v10.16b, v0.16b           // park row 6
    618     mov       v11.16b, v1.16b
    619 
    620     subs      x4, x4, #4                // four rows done; flags are consumed by bgt below
    621     uqxtn     v13.8b, v16.8h
    622     uqxtn     v17.8b, v17.8h
    623     mov       v13.s[1], v17.s[0]        // v13 = output row 3
    624 
    625 
    626     mov       v0.16b, v8.16b            // new row0 = row 4
    627     mov       v1.16b, v9.16b
    628 
    629     mov       v8.16b, v4.16b            // new row4 = row 8
    630     mov       v9.16b, v5.16b
    631 
    632     mov       v4.16b, v10.16b           // new row2 = row 6
    633     mov       v5.16b, v11.16b
    634 
    635     st1       {v12.2s}, [x1], x3        // store output row 2
    636     st1       {v13.2s}, [x1], x3        // store output row 3
    637 
    638     bgt       loop_8                    // repeat while remaining height > 0
    639     b         end_func
    640 
    641 loop_4_start:
            // 4-wide path. Each row is read as 16 bytes: bytes 0-7 in the even
            // register, bytes 8-15 in the odd one. Preload rows 0..4.
            // v26 = 20 and v24 = 5 in every 16-bit lane (set at function entry).
            //
            // Only four output pixels per row are stored, and they come solely
            // from the low four lanes of the horizontal filter (the v30
            // accumulator). The former "high half" chain
            //     saddl v22, v13|v21|v29, v23 / smlal v22, v17|v13 / smlsl v22, v19
            //     / sqrshrun v13|v17, v22
            // has been removed: it read registers this path never writes and its
            // results were always overwritten before any store. The redundant
            // "mov v25.8b, v24.8b" and "mov v27.8b, v26.8b" restores are removed
            // for the same reason. The remaining instructions and their order
            // are unchanged.
    642     ld1       {v0.2s, v1.2s}, [x0], x2  // row 0
    643     ld1       {v2.2s, v3.2s}, [x0], x2  // row 1
    644     ld1       {v4.2s, v5.2s}, [x0], x2  // row 2
    645     ld1       {v6.2s, v7.2s}, [x0], x2  // row 3
    646     ld1       {v8.2s, v9.2s}, [x0], x2  // row 4
    647 
    648 loop_4:
            // One iteration produces four 4-pixel output rows. The vertical pass
            // of the next row is interleaved with the horizontal pass of the
            // current one, so instruction order matters.
    649     ld1       {v10.2s, v11.2s}, [x0], x2 // row 5
            // ---- output row 0: vertical filter over rows 0..5 ----
    650     uaddl     v14.8h, v4.8b, v6.8b      // row2 + row3, bytes 0-7
    651     uaddl     v12.8h, v0.8b, v10.8b     // row0 + row5, bytes 0-7
    652     uaddl     v16.8h, v2.8b, v8.8b      // row1 + row4, bytes 0-7
    653     mla       v12.8h, v14.8h , v26.8h   // += 20 * (row2 + row3)
    654     uaddl     v18.8h, v5.8b, v7.8b      // row2 + row3, bytes 8-15
    655     uaddl     v14.8h, v1.8b, v11.8b     // row0 + row5, bytes 8-15
    656     uaddl     v22.8h, v3.8b, v9.8b      // row1 + row4, bytes 8-15
    657     mla       v14.8h, v18.8h , v26.8h   // += 20 * (row2 + row3)
    658     mls       v12.8h, v16.8h , v24.8h   // -= 5 * (row1 + row4)
    659     ld1       {v0.2s, v1.2s}, [x0], x2  // row 6 (reuses the row-0 registers)
    660     uaddl     v16.8h, v6.8b, v8.8b      // (for row 1) row3 + row4, bytes 0-7
    661     mls       v14.8h, v22.8h , v24.8h   // -= 5 * (row1 + row4)
    662     // v12 / v14 now hold the vertically filtered values (bytes 0-7 / 8-15)
    663     uaddl     v28.8h, v2.8b, v0.8b      // (for row 1) row1 + row6, bytes 0-7
    664 
            // Horizontal filter on v12:v14; ext by 2*k bytes = shift by k columns.
            // Only lanes 0-3 are accumulated (four output pixels).
    665     ext       v22.16b, v12.16b , v14.16b , #10  // columns +5
    666     uaddl     v18.8h, v4.8b, v10.8b     // (for row 1) row2 + row5, bytes 0-7
    667     mla       v28.8h, v16.8h , v26.8h
    668     saddl     v30.4s, v12.4h, v22.4h    // outer pair (+0, +5), widened to 32 bits
    669 
    671     ext       v16.16b, v12.16b , v14.16b , #4   // columns +2
    672     mls       v28.8h, v18.8h , v24.8h   // v28 = row-1 vertical result, bytes 0-7
    673     ext       v18.16b, v12.16b , v14.16b , #6   // columns +3
    674     ext       v20.16b, v12.16b , v14.16b , #8   // columns +4
    675     ext       v14.16b, v12.16b , v14.16b , #2   // columns +1
    676     add       v16.8h, v16.8h , v18.8h   // centre pair (+2, +3)
    677     add       v18.8h, v14.8h , v20.8h   // inner pair (+1, +4)
    678     uaddl     v20.8h, v7.8b, v9.8b      // (for row 1) row3 + row4, bytes 8-15
    679     smlal     v30.4s, v16.4h, v26.4h    // += 20 * centre pair
    680     smlsl     v30.4s, v18.4h, v24.4h    // -= 5 * inner pair
    683     uaddl     v14.8h, v3.8b, v1.8b      // (for row 1) row1 + row6, bytes 8-15
    684 
    685     mla       v14.8h, v20.8h , v26.8h
    686     sqrshrun  v12.4h, v30.4s, #10       // rounding >> 10, saturate to unsigned 16 bits
    687     uaddl     v16.8h, v5.8b, v11.8b     // (for row 1) row2 + row5, bytes 8-15
    689     mls       v14.8h, v16.8h , v24.8h   // v14 = row-1 vertical result, bytes 8-15
    690     ld1       {v2.2s, v3.2s}, [x0], x2  // row 7 (reuses the row-1 registers)
    691     uqxtn     v25.8b, v12.8h            // saturate to 8 bits; v25 holds output row 0
    692     uaddl     v16.8h, v8.8b, v10.8b     // (for row 2) row4 + row5, bytes 0-7
    693 
            // ---- output row 1: horizontal filter on v28:v14 ----
    694     ext       v22.16b, v28.16b , v14.16b , #10
    695     uaddl     v20.8h, v4.8b, v2.8b      // (for row 2) row2 + row7, bytes 0-7
    696     saddl     v30.4s, v28.4h, v22.4h
    697     mla       v20.8h, v16.8h , v26.8h
    698 
    700     ext       v16.16b, v28.16b , v14.16b , #4
    701     ext       v18.16b, v28.16b , v14.16b , #6
    702     ext       v12.16b, v28.16b , v14.16b , #8
    703     ext       v14.16b, v28.16b , v14.16b , #2
    704     add       v16.8h, v16.8h , v18.8h
    705     add       v18.8h, v12.8h , v14.8h
    706 
    707     smlal     v30.4s, v16.4h, v26.4h
    708     smlsl     v30.4s, v18.4h, v24.4h
    711 
    713     uaddl     v18.8h, v6.8b, v0.8b      // (for row 2) row3 + row6, bytes 0-7
    714     sqrshrun  v16.4h, v30.4s, #10
    715 
    718     mov       v12.8b, v25.8b            // v12 = output row 0
    720 
    721     uaddl     v28.8h, v9.8b, v11.8b     // (for row 2) row4 + row5, bytes 8-15
    722     uqxtn     v13.8b, v16.8h            // v13 = output row 1
    723 
    726     uaddl     v14.8h, v5.8b, v3.8b      // (for row 2) row2 + row7, bytes 8-15
    727     uaddl     v22.8h, v7.8b, v1.8b      // (for row 2) row3 + row6, bytes 8-15
    728     mls       v20.8h, v18.8h , v24.8h   // v20 = row-2 vertical result, bytes 0-7
    729     st1       {v12.s}[0], [x1], x3      // store output row 0 (4 bytes)
    730     mla       v14.8h, v28.8h , v26.8h
    731     ld1       {v4.2s, v5.2s}, [x0], x2  // row 8 (reuses the row-2 registers)
    732     uaddl     v30.8h, v10.8b, v0.8b     // (for row 3) row5 + row6, bytes 0-7
    733     uaddl     v28.8h, v6.8b, v4.8b      // (for row 3) row3 + row8, bytes 0-7
    734     mls       v14.8h, v22.8h , v24.8h   // v14 = row-2 vertical result, bytes 8-15
    735     st1       {v13.s}[0], [x1], x3      // store output row 1 (4 bytes)
    736     mla       v28.8h, v30.8h , v26.8h
    737 
            // ---- output row 2: horizontal filter on v20:v14 ----
    738     ext       v22.16b, v20.16b , v14.16b , #10
    739     saddl     v30.4s, v20.4h, v22.4h
    740 
    742     ext       v16.16b, v20.16b , v14.16b , #4
    743     ext       v18.16b, v20.16b , v14.16b , #6
    744     ext       v12.16b, v20.16b , v14.16b , #8
    745     ext       v14.16b, v20.16b , v14.16b , #2
    746     add       v16.8h, v16.8h , v18.8h
    747     add       v18.8h, v14.8h , v12.8h
    748     uaddl     v20.8h, v8.8b, v2.8b      // (for row 3) row4 + row7, bytes 0-7
    749     smlal     v30.4s, v16.4h, v26.4h
    750     smlsl     v30.4s, v18.4h, v24.4h
    753     uaddl     v18.8h, v11.8b, v1.8b     // (for row 3) row5 + row6, bytes 8-15
    754     uaddl     v16.8h, v7.8b, v5.8b      // (for row 3) row3 + row8, bytes 8-15
    755     sqrshrun  v12.4h, v30.4s, #10
    756     uaddl     v30.8h, v9.8b, v3.8b      // (for row 3) row4 + row7, bytes 8-15
    757     mla       v16.8h, v18.8h , v26.8h
    759     mls       v28.8h, v20.8h , v24.8h   // v28 = row-3 vertical result, bytes 0-7
    760 
    761     mls       v16.8h, v30.8h , v24.8h   // v16 = row-3 vertical result, bytes 8-15
    762     uqxtn     v27.8b, v12.8h            // v27 holds output row 2
    763 
            // ---- output row 3: horizontal filter on v28:v16 ----
    764     ext       v22.16b, v28.16b , v16.16b , #10
    765 
    766     saddl     v30.4s, v28.4h, v22.4h
    767 
    769     ext       v12.16b, v28.16b , v16.16b , #4
    770     ext       v18.16b, v28.16b , v16.16b , #6
    771     ext       v20.16b, v28.16b , v16.16b , #8
    772     ext       v28.16b, v28.16b , v16.16b , #2
    773     add       v12.8h, v12.8h , v18.8h
    774     add       v18.8h, v28.8h , v20.8h
    775 
    776     smlal     v30.4s, v12.4h, v26.4h
    777     smlsl     v30.4s, v18.4h, v24.4h
    780 
    782     mov       v12.8b, v27.8b            // v12 = output row 2
    784 
    785     sqrshrun  v16.4h, v30.4s, #10
    786 
            // Rotate the row registers so rows 4..8 become rows 0..4 of the next
            // iteration (v10/v11 serve as temporaries for row 6).
    787     mov       v6.16b, v2.16b            // new row3 = row 7
    788     mov       v7.16b, v3.16b
    789 
    792     mov       v2.16b, v10.16b           // new row1 = row 5
    793     mov       v3.16b, v11.16b
    794 
    795     mov       v10.16b, v0.16b           // park row 6
    796     mov       v11.16b, v1.16b
    797 
    798     subs      x4, x4, #4                // four rows done; flags are consumed by bgt below
    799     uqxtn     v13.8b, v16.8h            // v13 = output row 3
    800 
    801     mov       v0.16b, v8.16b            // new row0 = row 4
    802     mov       v1.16b, v9.16b
    803 
    804     mov       v8.16b, v4.16b            // new row4 = row 8
    805     mov       v9.16b, v5.16b
    806 
    808     mov       v4.16b, v10.16b           // new row2 = row 6
    809     mov       v5.16b, v11.16b
    810 
    812     st1       {v12.s}[0], [x1], x3      // store output row 2 (4 bytes)
    813     st1       {v13.s}[0], [x1], x3      // store output row 3 (4 bytes)
    814 
    815     bgt       loop_4                    // repeat while remaining height > 0; else fall through to end_func
    816 
    817 end_func:
            // Common exit for all three width paths (loop_4 falls through here,
            // loop_8 and loop_16 branch here).
    818     // Restore registers in the reverse order of the prologue
    819     ldp       x19, x20, [sp], #16       // undo "stp x19, x20, [sp, #-16]!"
    820     pop_v_regs                          // macro from ih264_neon_macros.s; counterpart of push_v_regs
    821     ret
    822 
    823 
    824 
    825