Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 //******************************************************************************
     22 //* @file
     23 //*  ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
     24 //*
     25 //* @brief
     26 //*  Contains function definitions for inter prediction  interpolation.
     27 //*
     28 //* @author
     29 //*  Mohit
     30 //*
     31 //* @par List of Functions:
     32 //*
     33 //*  - ih264_inter_pred_luma_horz_hpel_vert_qpel_av8()
     34 //*
     35 //* @remarks
     36 //*  None
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 
     41 ///* All the functions here are replicated from ih264_inter_pred_filters.c
     42 //
     43 
     44 ///**
     45 ///**
     46 ///**
     47 //*******************************************************************************
     48 //*
     49 //* @brief
     50 //*   This function implements a two stage cascaded six tap filter. It
     51 //*    applies the six tap filter in the horizontal direction on the
     52 //*    predictor values, followed by applying the same filter in the
     53 //*    vertical direction on the output of the first stage. It then averages
     54 //*    the output of the 1st stage and the output of the 2nd stage to obtain
     55 //*    the quarter pel values. The six tap filtering operation is described
     56 //*    in sec 8.4.2.2.1 titled "Luma sample interpolation process".
     57 //*
     58 //* @par Description:
     59 //*    This function is called to obtain pixels lying at the following
     60 //*    locations: (1/2,1/4) or (1/2,3/4). The function interpolates
     61 //*    the predictors first in the horizontal direction and then in the
     62 //*    vertical direction to produce the (1/2,1/2) value. It then averages
     63 //*    the output of the 1st stage and the (1/2,1/2) value to obtain (1/2,1/4)
     64 //*    or (1/2,3/4), depending on the offset.
     65 //*
     66 //* @param[in] pu1_src
     67 //*  UWORD8 pointer to the source
     68 //*
     69 //* @param[out] pu1_dst
     70 //*  UWORD8 pointer to the destination
     71 //*
     72 //* @param[in] src_strd
     73 //*  integer source stride
     74 //*
     75 //* @param[in] dst_strd
     76 //*  integer destination stride
     77 //*
     78 //* @param[in] ht
     79 //*  integer height of the array
     80 //*
     81 //* @param[in] wd
     82 //*  integer width of the array
     83 //*
     84 //* @param[in] pu1_tmp: temporary buffer
     85 //*
     86 //* @param[in] dydx: x and y reference offset for qpel calculations
     87 //*
     88 //* @returns
     89 //*  None
     90 //* @remarks
     91 //*  None
     92 //*
     93 //*******************************************************************************
     94 //*/;
     95 
     96 //void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
     97 //                                UWORD8 *pu1_dst,
     98 //                                WORD32 src_strd,
     99 //                                WORD32 dst_strd,
    100 //                                WORD32 ht,
    101 //                                WORD32 wd,
    102 //                                    UWORD8* pu1_tmp,
    103 //                                  UWORD32 dydx)
    104 
    105 //**************Variables Vs Registers*****************************************
    106 //    x0 => *pu1_src
    107 //    x1 => *pu1_dst
    108 //    w2 =>  src_strd
    109 //    w3 =>  dst_strd
    110 //    w4 =>  ht
    111 //    w5 =>  wd
    112 //    x6 => *pu1_tmp
    113 //    w7 =>  dydx
    114 
    115 .text
    116 .p2align 2
    117 .include "ih264_neon_macros.s"
    118 
    119 
    120 
    121     .global ih264_inter_pred_luma_horz_hpel_vert_qpel_av8
    122 
    123 ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:
    124 
    125 
    126     // store register values to stack
    127     push_v_regs
    128     stp       x19, x20, [sp, #-16]!
    129     sxtw      x2, w2
    130     sxtw      x3, w3
    131     sxtw      x4, w4
    132     sxtw      x5, w5
    133 
    134 
    135 
    136     sub       x0, x0, x2, lsl #1        // pu1_src-2*src_strd
    137     sub       x0, x0, #2                // pu1_src-2
    138 
    139     mov       x9, x6
    140 
    141                                         // by writing to w7 here, we clear the upper half of x7
    142     lsr       w7, w7, #3                // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit
    143 
    144     add       x7, x7, #2
    145     mov       x6, #48
    146     madd      x7, x7, x6, x9
    147 
    148     subs      x12, x5, #4               //if wd=4 branch to loop_4
    149     beq       loop_4_start
    150 
    151     subs      x12, x5, #8               //if wd=8 branch to loop_8
    152     beq       loop_8_start
    153 
    154     //when  wd=16
    155     movi      v22.8h, #20               // Filter coeff 0x14 into Q11
    156     movi      v24.8h, #5                // Filter coeff 0x5  into Q12
    157     add       x8, x0, #8
    158     add       x14, x1, #8
    159     add       x10, x9, #8
    160     mov       x12, x4
    161     add       x11, x7, #8
    162 loop_16_lowhalf_start:
    163     ld1       {v0.2s, v1.2s}, [x0], x2  // row -2 load for horizontal filter
    164     ext       v5.8b, v0.8b , v1.8b , #5
    165     uaddl     v6.8h, v0.8b, v5.8b
    166 
    167     ext       v2.8b, v0.8b , v1.8b , #2
    168     ext       v3.8b, v0.8b , v1.8b , #3
    169     uaddl     v8.8h, v2.8b, v3.8b
    170     ext       v4.8b, v0.8b , v1.8b , #4
    171     mla       v6.8h, v8.8h , v22.8h
    172     ext       v1.8b, v0.8b , v1.8b , #1
    173     uaddl     v8.8h, v1.8b, v4.8b
    174     ld1       {v0.2s, v1.2s}, [x0], x2  // row -1 load for horizontal filter
    175     mls       v6.8h, v8.8h , v24.8h
    176     ext       v5.8b, v0.8b , v1.8b , #5
    177     uaddl     v8.8h, v0.8b, v5.8b
    178     ext       v2.8b, v0.8b , v1.8b , #2
    179     ext       v3.8b, v0.8b , v1.8b , #3
    180     uaddl     v10.8h, v2.8b, v3.8b
    181 
    182     st1       {v6.4s}, [x9], x6         // store temp buffer 0
    183 
    184     ext       v4.8b, v0.8b , v1.8b , #4
    185     mla       v8.8h, v10.8h , v22.8h
    186     ext       v1.8b, v0.8b , v1.8b , #1
    187     uaddl     v10.8h, v1.8b, v4.8b
    188     ld1       {v0.2s, v1.2s}, [x0], x2  // row 0 load for horizontal filter
    189     mls       v8.8h, v10.8h , v24.8h
    190     ext       v5.8b, v0.8b , v1.8b , #5
    191     uaddl     v10.8h, v0.8b, v5.8b
    192     ext       v2.8b, v0.8b , v1.8b , #2
    193     ext       v3.8b, v0.8b , v1.8b , #3
    194     uaddl     v12.8h, v2.8b, v3.8b
    195 
    196     st1       {v8.4s}, [x9], x6         // store temp buffer 1
    197 
    198     ext       v4.8b, v0.8b , v1.8b , #4
    199     mla       v10.8h, v12.8h , v22.8h
    200     ext       v1.8b, v0.8b , v1.8b , #1
    201     uaddl     v12.8h, v1.8b, v4.8b
    202     ld1       {v0.2s, v1.2s}, [x0], x2  // row 1 load for horizontal filter
    203     mls       v10.8h, v12.8h , v24.8h
    204     ext       v5.8b, v0.8b , v1.8b , #5
    205     uaddl     v12.8h, v0.8b, v5.8b
    206     ext       v2.8b, v0.8b , v1.8b , #2
    207     ext       v3.8b, v0.8b , v1.8b , #3
    208     uaddl     v14.8h, v2.8b, v3.8b
    209 
    210     st1       {v10.4s}, [x9], x6        // store temp buffer 2
    211 
    212     ext       v4.8b, v0.8b , v1.8b , #4
    213     mla       v12.8h, v14.8h , v22.8h
    214     ext       v1.8b, v0.8b , v1.8b , #1
    215     uaddl     v14.8h, v1.8b, v4.8b
    216     ld1       {v0.2s, v1.2s}, [x0], x2  // row 2 load for horizontal filter
    217     mls       v12.8h, v14.8h , v24.8h
    218     ext       v5.8b, v0.8b , v1.8b , #5
    219     uaddl     v14.8h, v0.8b, v5.8b
    220     ext       v2.8b, v0.8b , v1.8b , #2
    221     ext       v3.8b, v0.8b , v1.8b , #3
    222     uaddl     v16.8h, v2.8b, v3.8b
    223 
    224     st1       {v12.4s}, [x9], x6        // store temp buffer 3
    225 
    226     ext       v4.8b, v0.8b , v1.8b , #4
    227     mla       v14.8h, v16.8h , v22.8h
    228     ext       v1.8b, v0.8b , v1.8b , #1
    229     uaddl     v16.8h, v1.8b, v4.8b
    230 
    231     mls       v14.8h, v16.8h , v24.8h
    232 loop_16_lowhalf:
    233 
    234     ld1       {v0.2s, v1.2s}, [x0], x2  // row 3 load for horizontal filter
    235     ext       v5.8b, v0.8b , v1.8b , #5
    236     ext       v2.8b, v0.8b , v1.8b , #2
    237     ext       v3.8b, v0.8b , v1.8b , #3
    238     uaddl     v16.8h, v0.8b, v5.8b
    239 
    240     st1       {v14.4s}, [x9], x6        // store temp buffer 4
    241 
    242     uaddl     v18.8h, v2.8b, v3.8b
    243     ext       v4.8b, v0.8b , v1.8b , #4
    244     mla       v16.8h, v18.8h , v22.8h
    245     ext       v1.8b, v0.8b , v1.8b , #1
    246     add       v28.8h, v8.8h , v14.8h
    247     uaddl     v18.8h, v1.8b, v4.8b
    248     add       v30.8h, v10.8h , v12.8h
    249     mls       v16.8h, v18.8h , v24.8h
    250     ld1       {v0.2s, v1.2s}, [x0], x2  // row 4 load for hoorizontal filter
    251     ext       v5.8b, v0.8b , v1.8b , #5
    252     ext       v2.8b, v0.8b , v1.8b , #2
    253     ext       v3.8b, v0.8b , v1.8b , #3
    254     uaddl     v20.8h, v0.8b, v5.8b
    255 
    256     st1       {v16.4s}, [x9], x6        // store temp buffer x5
    257 
    258     saddl     v18.4s, v6.4h, v16.4h
    259 
    260     ld1       {v26.4s}, [x7], x6        // load from temp buffer 0
    261 
    262     saddl2    v6.4s, v6.8h, v16.8h
    263 
    264     sqrshrun  v26.8b, v26.8h, #5
    265 
    266     smlal     v18.4s, v30.4h, v22.4h
    267     smlsl     v18.4s, v28.4h, v24.4h
    268     smlal2    v6.4s, v30.8h, v22.8h
    269     smlsl2    v6.4s, v28.8h, v24.8h
    270     uaddl     v2.8h, v2.8b, v3.8b
    271     ext       v4.8b, v0.8b , v1.8b , #4
    272     mla       v20.8h, v2.8h , v22.8h
    273     sqrshrun  v18.4h, v18.4s, #10
    274     ext       v1.8b, v0.8b , v1.8b , #1
    275     sqrshrun  v19.4h, v6.4s, #10
    276     add       v28.8h, v10.8h , v16.8h
    277     uaddl     v2.8h, v1.8b, v4.8b
    278     add       v30.8h, v12.8h , v14.8h
    279     mls       v20.8h, v2.8h , v24.8h
    280 
    281     uqxtn     v18.8b, v18.8h
    282     uqxtn     v19.8b, v19.8h
    283     mov       v18.s[1], v19.s[0]
    284 
    285     ld1       {v0.2s, v1.2s}, [x0], x2  // row 5 load for horizontal filter
    286 
    287     urhadd    v26.8b, v18.8b , v26.8b
    288 
    289     ext       v5.8b, v0.8b , v1.8b , #5
    290     ext       v2.8b, v0.8b , v1.8b , #2
    291 
    292     st1       {v20.4s}, [x9], x6        // store temp buffer x6
    293 
    294     saddl     v18.4s, v8.4h, v20.4h
    295 
    296     saddl2    v6.4s, v8.8h, v20.8h
    297 
    298     ld1       {v8.4s}, [x7], x6         //load from temp buffer 1
    299 
    300 
    301     st1       {v26.2s}, [x1], x3        // store row 0
    302 
    303     smlal     v18.4s, v30.4h, v22.4h
    304     smlsl     v18.4s, v28.4h, v24.4h
    305     smlal2    v6.4s, v30.8h, v22.8h
    306     smlsl2    v6.4s, v28.8h, v24.8h
    307 
    308     sqrshrun  v28.8b, v8.8h, #5
    309     ext       v3.8b, v0.8b , v1.8b , #3
    310     uaddl     v8.8h, v0.8b, v5.8b
    311     uaddl     v2.8h, v2.8b, v3.8b
    312     sqrshrun  v18.4h, v18.4s, #10
    313     ext       v4.8b, v0.8b , v1.8b , #4
    314     sqrshrun  v19.4h, v6.4s, #10
    315     mla       v8.8h, v2.8h , v22.8h
    316     ext       v1.8b, v0.8b , v1.8b , #1
    317     add       v26.8h, v12.8h , v20.8h
    318     uaddl     v2.8h, v1.8b, v4.8b
    319     uqxtn     v18.8b, v18.8h
    320     uqxtn     v19.8b, v19.8h
    321     mov       v18.s[1], v19.s[0]
    322     add       v30.8h, v14.8h , v16.8h
    323     mls       v8.8h, v2.8h , v24.8h
    324     ld1       {v0.2s, v1.2s}, [x0], x2  // row 6 load for horizontal filter
    325 
    326     urhadd    v28.8b, v28.8b , v18.8b
    327 
    328     ext       v5.8b, v0.8b , v1.8b , #5
    329     ext       v2.8b, v0.8b , v1.8b , #2
    330     ext       v3.8b, v0.8b , v1.8b , #3
    331 
    332     st1       {v28.2s}, [x1], x3        // store row 1
    333 
    334     uaddl     v28.8h, v0.8b, v5.8b
    335 
    336     st1       {v8.4s}, [x9], x6         // store temp buffer x7
    337 
    338     saddl     v18.4s, v10.4h, v8.4h
    339     saddl2    v6.4s, v10.8h, v8.8h
    340 
    341     ld1       {v10.4s}, [x7], x6        // load from temp buffer 2
    342 
    343     smlal     v18.4s, v30.4h, v22.4h
    344     smlsl     v18.4s, v26.4h, v24.4h
    345 
    346     smlal2    v6.4s, v30.8h, v22.8h
    347     smlsl2    v6.4s, v26.8h, v24.8h
    348 
    349     sqrshrun  v26.8b, v10.8h, #5
    350 
    351     uaddl     v2.8h, v2.8b, v3.8b
    352     ext       v4.8b, v0.8b , v1.8b , #4
    353     mla       v28.8h, v2.8h , v22.8h
    354     sqrshrun  v18.4h, v18.4s, #10
    355     ext       v1.8b, v0.8b , v1.8b , #1
    356     sqrshrun  v19.4h, v6.4s, #10
    357     add       v10.8h, v14.8h , v8.8h
    358     uaddl     v2.8h, v1.8b, v4.8b
    359     add       v30.8h, v16.8h , v20.8h
    360     mls       v28.8h, v2.8h , v24.8h
    361     uqxtn     v27.8b, v18.8h
    362     uqxtn     v19.8b, v19.8h
    363     mov       v27.s[1], v19.s[0]
    364     saddl     v18.4s, v12.4h, v28.4h
    365     saddl2    v6.4s, v12.8h, v28.8h
    366 
    367     urhadd    v26.8b, v26.8b , v27.8b
    368 
    369     smlal     v18.4s, v30.4h, v22.4h
    370     smlsl     v18.4s, v10.4h, v24.4h
    371     smlal2    v6.4s, v30.8h, v22.8h
    372     smlsl2    v6.4s, v10.8h, v24.8h
    373 
    374     st1       {v26.2s}, [x1], x3        // store row 2
    375 
    376     st1       {v28.2s, v29.2s}, [x9]
    377 
    378 
    379     sqrshrun  v18.4h, v18.4s, #10
    380 
    381     mov       v10.16b, v20.16b
    382     mov       v11.16b, v21.16b
    383     ld1       {v30.4s}, [x7], x6        // load from temp buffer 3
    384 
    385     sqrshrun  v19.4h, v6.4s, #10
    386     subs      x4, x4, #4
    387 
    388     sqrshrun  v30.8b, v30.8h, #5
    389 
    390     uqxtn     v18.8b, v18.8h
    391     uqxtn     v19.8b, v19.8h
    392     mov       v18.s[1], v19.s[0]
    393 
    394     mov       v12.16b, v8.16b
    395     mov       v13.16b, v9.16b
    396     mov       v6.16b, v14.16b
    397     mov       v7.16b, v15.16b
    398 
    399     urhadd    v30.8b, v18.8b , v30.8b
    400 
    401     mov       v8.16b, v16.16b
    402     mov       v9.16b, v17.16b
    403     mov       v14.16b, v28.16b
    404     mov       v15.16b, v29.16b
    405 
    406     st1       {v30.2s}, [x1], x3        // store row 3
    407 
    408     bgt       loop_16_lowhalf           // looping if height =16
    409 
    410 
    411 loop_16_highhalf_start:
    412     ld1       {v0.2s, v1.2s}, [x8], x2
    413     ext       v5.8b, v0.8b , v1.8b , #5
    414     uaddl     v6.8h, v0.8b, v5.8b
    415     ext       v2.8b, v0.8b , v1.8b , #2
    416     ext       v3.8b, v0.8b , v1.8b , #3
    417     uaddl     v8.8h, v2.8b, v3.8b
    418     ext       v4.8b, v0.8b , v1.8b , #4
    419     mla       v6.8h, v8.8h , v22.8h
    420     ext       v1.8b, v0.8b , v1.8b , #1
    421     uaddl     v8.8h, v1.8b, v4.8b
    422     ld1       {v0.2s, v1.2s}, [x8], x2
    423     mls       v6.8h, v8.8h , v24.8h
    424     ext       v5.8b, v0.8b , v1.8b , #5
    425     uaddl     v8.8h, v0.8b, v5.8b
    426     ext       v2.8b, v0.8b , v1.8b , #2
    427     ext       v3.8b, v0.8b , v1.8b , #3
    428     uaddl     v10.8h, v2.8b, v3.8b
    429 
    430     st1       {v6.4s}, [x10], x6
    431 
    432     ext       v4.8b, v0.8b , v1.8b , #4
    433     mla       v8.8h, v10.8h , v22.8h
    434     ext       v1.8b, v0.8b , v1.8b , #1
    435     uaddl     v10.8h, v1.8b, v4.8b
    436     ld1       {v0.2s, v1.2s}, [x8], x2
    437     mls       v8.8h, v10.8h , v24.8h
    438     ext       v5.8b, v0.8b , v1.8b , #5
    439     uaddl     v10.8h, v0.8b, v5.8b
    440     ext       v2.8b, v0.8b , v1.8b , #2
    441     ext       v3.8b, v0.8b , v1.8b , #3
    442     uaddl     v12.8h, v2.8b, v3.8b
    443 
    444     st1       {v8.4s}, [x10], x6
    445 
    446     ext       v4.8b, v0.8b , v1.8b , #4
    447     mla       v10.8h, v12.8h , v22.8h
    448     ext       v1.8b, v0.8b , v1.8b , #1
    449     uaddl     v12.8h, v1.8b, v4.8b
    450     ld1       {v0.2s, v1.2s}, [x8], x2
    451     mls       v10.8h, v12.8h , v24.8h
    452     ext       v5.8b, v0.8b , v1.8b , #5
    453     uaddl     v12.8h, v0.8b, v5.8b
    454     ext       v2.8b, v0.8b , v1.8b , #2
    455     ext       v3.8b, v0.8b , v1.8b , #3
    456     uaddl     v14.8h, v2.8b, v3.8b
    457 
    458     st1       {v10.4s}, [x10], x6
    459 
    460     ext       v4.8b, v0.8b , v1.8b , #4
    461     mla       v12.8h, v14.8h , v22.8h
    462     ext       v1.8b, v0.8b , v1.8b , #1
    463     uaddl     v14.8h, v1.8b, v4.8b
    464     ld1       {v0.2s, v1.2s}, [x8], x2
    465     mls       v12.8h, v14.8h , v24.8h
    466     ext       v5.8b, v0.8b , v1.8b , #5
    467     uaddl     v14.8h, v0.8b, v5.8b
    468     ext       v2.8b, v0.8b , v1.8b , #2
    469     ext       v3.8b, v0.8b , v1.8b , #3
    470     uaddl     v16.8h, v2.8b, v3.8b
    471 
    472     st1       {v12.4s}, [x10], x6
    473 
    474     ext       v4.8b, v0.8b , v1.8b , #4
    475     mla       v14.8h, v16.8h , v22.8h
    476     ext       v1.8b, v0.8b , v1.8b , #1
    477     uaddl     v16.8h, v1.8b, v4.8b
    478 
    479     mls       v14.8h, v16.8h , v24.8h
    480 
    481 loop_16_highhalf:
    482 
    483     ld1       {v0.2s, v1.2s}, [x8], x2
    484     ext       v5.8b, v0.8b , v1.8b , #5
    485     ext       v2.8b, v0.8b , v1.8b , #2
    486     ext       v3.8b, v0.8b , v1.8b , #3
    487     uaddl     v16.8h, v0.8b, v5.8b
    488 
    489     st1       {v14.4s}, [x10], x6
    490 
    491     uaddl     v18.8h, v2.8b, v3.8b
    492     ext       v4.8b, v0.8b , v1.8b , #4
    493     mla       v16.8h, v18.8h , v22.8h
    494     ext       v1.8b, v0.8b , v1.8b , #1
    495     add       v28.8h, v8.8h , v14.8h
    496     uaddl     v18.8h, v1.8b, v4.8b
    497     add       v30.8h, v10.8h , v12.8h
    498     mls       v16.8h, v18.8h , v24.8h
    499     ld1       {v0.2s, v1.2s}, [x8], x2
    500     ext       v5.8b, v0.8b , v1.8b , #5
    501     ext       v2.8b, v0.8b , v1.8b , #2
    502     ext       v3.8b, v0.8b , v1.8b , #3
    503     uaddl     v20.8h, v0.8b, v5.8b
    504 
    505     st1       {v16.4s}, [x10], x6
    506 
    507     saddl     v18.4s, v6.4h, v16.4h
    508 
    509     ld1       {v26.4s}, [x11], x6
    510 
    511     saddl2    v6.4s, v6.8h, v16.8h
    512 
    513     sqrshrun  v26.8b, v26.8h, #5
    514 
    515     smlal     v18.4s, v30.4h, v22.4h
    516     smlsl     v18.4s, v28.4h, v24.4h
    517     smlal2    v6.4s, v30.8h, v22.8h
    518     smlsl2    v6.4s, v28.8h, v24.8h
    519     uaddl     v2.8h, v2.8b, v3.8b
    520     ext       v4.8b, v0.8b , v1.8b , #4
    521     mla       v20.8h, v2.8h , v22.8h
    522     sqrshrun  v18.4h, v18.4s, #10
    523     ext       v1.8b, v0.8b , v1.8b , #1
    524     sqrshrun  v19.4h, v6.4s, #10
    525     add       v28.8h, v10.8h , v16.8h
    526     uaddl     v2.8h, v1.8b, v4.8b
    527     add       v30.8h, v12.8h , v14.8h
    528     mls       v20.8h, v2.8h , v24.8h
    529     uqxtn     v18.8b, v18.8h
    530     uqxtn     v19.8b, v19.8h
    531     mov       v18.s[1], v19.s[0]
    532     ld1       {v0.2s, v1.2s}, [x8], x2
    533 
    534     urhadd    v26.8b, v18.8b , v26.8b
    535 
    536     ext       v5.8b, v0.8b , v1.8b , #5
    537     ext       v2.8b, v0.8b , v1.8b , #2
    538 
    539     st1       {v20.4s}, [x10], x6
    540 
    541     saddl     v18.4s, v8.4h, v20.4h
    542     saddl2    v6.4s, v8.8h, v20.8h
    543 
    544     ld1       {v8.4s}, [x11], x6
    545 
    546 
    547     st1       {v26.2s}, [x14], x3       //store row 0
    548 
    549     smlal     v18.4s, v30.4h, v22.4h
    550     smlsl     v18.4s, v28.4h, v24.4h
    551     smlal2    v6.4s, v30.8h, v22.8h
    552     smlsl2    v6.4s, v28.8h, v24.8h
    553     sqrshrun  v28.8b, v8.8h, #5
    554     ext       v3.8b, v0.8b , v1.8b , #3
    555     uaddl     v8.8h, v0.8b, v5.8b
    556     uaddl     v2.8h, v2.8b, v3.8b
    557     sqrshrun  v18.4h, v18.4s, #10
    558     ext       v4.8b, v0.8b , v1.8b , #4
    559     sqrshrun  v19.4h, v6.4s, #10
    560     mla       v8.8h, v2.8h , v22.8h
    561     ext       v1.8b, v0.8b , v1.8b , #1
    562     add       v26.8h, v12.8h , v20.8h
    563     uaddl     v2.8h, v1.8b, v4.8b
    564     uqxtn     v18.8b, v18.8h
    565     uqxtn     v19.8b, v19.8h
    566     mov       v18.s[1], v19.s[0]
    567     add       v30.8h, v14.8h , v16.8h
    568     mls       v8.8h, v2.8h , v24.8h
    569     ld1       {v0.2s, v1.2s}, [x8], x2
    570 
    571     urhadd    v28.8b, v28.8b , v18.8b
    572 
    573     ext       v5.8b, v0.8b , v1.8b , #5
    574     ext       v2.8b, v0.8b , v1.8b , #2
    575     ext       v3.8b, v0.8b , v1.8b , #3
    576 
    577     st1       {v28.2s}, [x14], x3       //store row 1
    578 
    579     uaddl     v28.8h, v0.8b, v5.8b
    580 
    581     st1       {v8.4s}, [x10], x6
    582 
    583     saddl     v18.4s, v10.4h, v8.4h
    584     saddl2    v6.4s, v10.8h, v8.8h
    585 
    586     ld1       {v10.4s}, [x11], x6
    587 
    588     smlal     v18.4s, v30.4h, v22.4h
    589     smlsl     v18.4s, v26.4h, v24.4h
    590     smlal2    v6.4s, v30.8h, v22.8h
    591     smlsl2    v6.4s, v26.8h, v24.8h
    592 
    593     sqrshrun  v26.8b, v10.8h, #5
    594     uaddl     v2.8h, v2.8b, v3.8b
    595     ext       v4.8b, v0.8b , v1.8b , #4
    596     mla       v28.8h, v2.8h , v22.8h
    597     sqrshrun  v18.4h, v18.4s, #10
    598     ext       v1.8b, v0.8b , v1.8b , #1
    599     sqrshrun  v19.4h, v6.4s, #10
    600     add       v10.8h, v14.8h , v8.8h
    601     uaddl     v2.8h, v1.8b, v4.8b
    602     add       v30.8h, v16.8h , v20.8h
    603     mls       v28.8h, v2.8h , v24.8h
    604     uqxtn     v27.8b, v18.8h
    605     uqxtn     v19.8b, v19.8h
    606     mov       v27.s[1], v19.s[0]
    607 
    608 
    609     saddl     v18.4s, v12.4h, v28.4h
    610     saddl2    v6.4s, v12.8h, v28.8h
    611 
    612     urhadd    v26.8b, v26.8b , v27.8b
    613 
    614     smlal     v18.4s, v30.4h, v22.4h
    615     smlsl     v18.4s, v10.4h, v24.4h
    616     smlal2    v6.4s, v30.8h, v22.8h
    617     smlsl2    v6.4s, v10.8h, v24.8h
    618 
    619     st1       {v26.2s}, [x14], x3       // store row 2
    620 
    621     st1       {v28.4s}, [x10]
    622 
    623     sqrshrun  v18.4h, v18.4s, #10
    624     mov       v10.16b, v20.16b
    625     mov       v11.16b, v21.16b
    626     ld1       {v30.4s}, [x11], x6
    627 
    628     sqrshrun  v19.4h, v6.4s, #10
    629     subs      x12, x12, #4
    630 
    631     sqrshrun  v30.8b, v30.8h, #5
    632 
    633     uqxtn     v18.8b, v18.8h
    634     uqxtn     v19.8b, v19.8h
    635     mov       v18.s[1], v19.s[0]
    636 
    637     mov       v12.16b, v8.16b
    638     mov       v13.16b, v9.16b
    639     mov       v6.16b, v14.16b
    640     mov       v7.16b, v15.16b
    641     urhadd    v30.8b, v18.8b , v30.8b
    642 
    643     mov       v8.16b, v16.16b
    644     mov       v9.16b, v17.16b
    645     mov       v14.16b, v28.16b
    646     mov       v15.16b, v29.16b
    647     st1       {v30.2s}, [x14], x3       // store row 3
    648 
    649     bgt       loop_16_highhalf          // looping if height = 8 or 16
    650     b         end_func
    651 
    652 loop_8_start:
    653 
    654     movi      v22.8h, #0x14             // Filter coeff 20 into Q11
    655     movi      v24.8h, #5                // Filter coeff 5  into Q12
    656     ld1       {v0.2s, v1.2s}, [x0], x2  // row -2 load for horizontal filter
    657     ext       v5.8b, v0.8b , v1.8b , #5
    658     uaddl     v6.8h, v0.8b, v5.8b
    659 
    660     ext       v2.8b, v0.8b , v1.8b , #2
    661     ext       v3.8b, v0.8b , v1.8b , #3
    662     uaddl     v8.8h, v2.8b, v3.8b
    663     ext       v4.8b, v0.8b , v1.8b , #4
    664     mla       v6.8h, v8.8h , v22.8h
    665     ext       v1.8b, v0.8b , v1.8b , #1
    666     uaddl     v8.8h, v1.8b, v4.8b
    667     ld1       {v0.2s, v1.2s}, [x0], x2  // row -1 load for horizontal filter
    668     mls       v6.8h, v8.8h , v24.8h
    669     ext       v5.8b, v0.8b , v1.8b , #5
    670     uaddl     v8.8h, v0.8b, v5.8b
    671     ext       v2.8b, v0.8b , v1.8b , #2
    672     ext       v3.8b, v0.8b , v1.8b , #3
    673     uaddl     v10.8h, v2.8b, v3.8b
    674 
    675     st1       {v6.4s}, [x9], x6         // store temp buffer 0
    676 
    677     ext       v4.8b, v0.8b , v1.8b , #4
    678     mla       v8.8h, v10.8h , v22.8h
    679     ext       v1.8b, v0.8b , v1.8b , #1
    680     uaddl     v10.8h, v1.8b, v4.8b
    681     ld1       {v0.2s, v1.2s}, [x0], x2  // row 0 load for horizontal filter
    682     mls       v8.8h, v10.8h , v24.8h
    683     ext       v5.8b, v0.8b , v1.8b , #5
    684     uaddl     v10.8h, v0.8b, v5.8b
    685     ext       v2.8b, v0.8b , v1.8b , #2
    686     ext       v3.8b, v0.8b , v1.8b , #3
    687     uaddl     v12.8h, v2.8b, v3.8b
    688 
    689     st1       {v8.4s}, [x9], x6         // store temp buffer 1
    690 
    691     ext       v4.8b, v0.8b , v1.8b , #4
    692     mla       v10.8h, v12.8h , v22.8h
    693     ext       v1.8b, v0.8b , v1.8b , #1
    694     uaddl     v12.8h, v1.8b, v4.8b
    695     ld1       {v0.2s, v1.2s}, [x0], x2  // row 1 load for horizontal filter
    696     mls       v10.8h, v12.8h , v24.8h
    697     ext       v5.8b, v0.8b , v1.8b , #5
    698     uaddl     v12.8h, v0.8b, v5.8b
    699     ext       v2.8b, v0.8b , v1.8b , #2
    700     ext       v3.8b, v0.8b , v1.8b , #3
    701     uaddl     v14.8h, v2.8b, v3.8b
    702 
    703     st1       {v10.4s}, [x9], x6        // store temp buffer 2
    704 
    705     ext       v4.8b, v0.8b , v1.8b , #4
    706     mla       v12.8h, v14.8h , v22.8h
    707     ext       v1.8b, v0.8b , v1.8b , #1
    708     uaddl     v14.8h, v1.8b, v4.8b
    709     ld1       {v0.2s, v1.2s}, [x0], x2  // row 2 load for horizontal filter
    710     mls       v12.8h, v14.8h , v24.8h
    711     ext       v5.8b, v0.8b , v1.8b , #5
    712     uaddl     v14.8h, v0.8b, v5.8b
    713     ext       v2.8b, v0.8b , v1.8b , #2
    714     ext       v3.8b, v0.8b , v1.8b , #3
    715     uaddl     v16.8h, v2.8b, v3.8b
    716 
    717     st1       {v12.4s}, [x9], x6        // store temp buffer 3
    718 
    719     ext       v4.8b, v0.8b , v1.8b , #4
    720     mla       v14.8h, v16.8h , v22.8h
    721     ext       v1.8b, v0.8b , v1.8b , #1
    722     uaddl     v16.8h, v1.8b, v4.8b
    723 
    724     mls       v14.8h, v16.8h , v24.8h
    725 loop_8:
    726 
    727     ld1       {v0.2s, v1.2s}, [x0], x2  // row 3 load for horizontal filter
    728     ext       v5.8b, v0.8b , v1.8b , #5
    729     ext       v2.8b, v0.8b , v1.8b , #2
    730     ext       v3.8b, v0.8b , v1.8b , #3
    731     uaddl     v16.8h, v0.8b, v5.8b
    732 
    733     st1       {v14.4s}, [x9], x6        // store temp buffer 4
    734 
    735     uaddl     v18.8h, v2.8b, v3.8b
    736     ext       v4.8b, v0.8b , v1.8b , #4
    737     mla       v16.8h, v18.8h , v22.8h
    738     ext       v1.8b, v0.8b , v1.8b , #1
    739     add       v28.8h, v8.8h , v14.8h
    740     uaddl     v18.8h, v1.8b, v4.8b
    741     add       v30.8h, v10.8h , v12.8h
    742     mls       v16.8h, v18.8h , v24.8h
    743     ld1       {v0.2s, v1.2s}     , [x0], x2 // row 4 load for hoorizontal filter
    744     ext       v5.8b, v0.8b , v1.8b , #5
    745     ext       v2.8b, v0.8b , v1.8b , #2
    746     ext       v3.8b, v0.8b , v1.8b , #3
    747     uaddl     v20.8h, v0.8b, v5.8b
    748 
    749     st1       {v16.4s}, [x9], x6        // store temp buffer x5
    750 
    751     saddl     v18.4s, v6.4h, v16.4h
    752 
    753     ld1       {v26.4s}, [x7], x6        // load from temp buffer 0
    754 
    755     saddl2    v6.4s, v6.8h, v16.8h
    756 
    757     sqrshrun  v26.8b, v26.8h, #5
    758 
    759     smlal     v18.4s, v30.4h, v22.4h
    760     smlsl     v18.4s, v28.4h, v24.4h
    761     smlal2    v6.4s, v30.8h, v22.8h
    762     smlsl2    v6.4s, v28.8h, v24.8h
    763     uaddl     v2.8h, v2.8b, v3.8b
    764     ext       v4.8b, v0.8b , v1.8b , #4
    765     mla       v20.8h, v2.8h , v22.8h
    766     sqrshrun  v18.4h, v18.4s, #10
    767     ext       v1.8b, v0.8b , v1.8b , #1
    768     sqrshrun  v19.4h, v6.4s, #10
    769     add       v28.8h, v10.8h , v16.8h
    770     uaddl     v2.8h, v1.8b, v4.8b
    771     add       v30.8h, v12.8h , v14.8h
    772     mls       v20.8h, v2.8h , v24.8h
    773 
    774     uqxtn     v18.8b, v18.8h
    775     uqxtn     v19.8b, v19.8h
    776     mov       v18.s[1], v19.s[0]
    777 
    778     ld1       {v0.2s, v1.2s}, [x0], x2  // row 5 load for horizontal filter
    779 
    780     urhadd    v26.8b, v18.8b , v26.8b
    781 
    782     ext       v5.8b, v0.8b , v1.8b , #5
    783     ext       v2.8b, v0.8b , v1.8b , #2
    784 
    785     st1       {v20.4s}, [x9], x6        // store temp buffer x6
    786 
    787     saddl     v18.4s, v8.4h, v20.4h
    788 
    789     saddl2    v6.4s, v8.8h, v20.8h
    790 
    791     ld1       {v8.4s}, [x7], x6         //load from temp buffer 1
    792 
    793 
    794     st1       {v26.2s}, [x1], x3        // store row 0
    795 
    796     smlal     v18.4s, v30.4h, v22.4h
    797     smlsl     v18.4s, v28.4h, v24.4h
    798 
    799 
    800 
    801     smlal2    v6.4s, v30.8h, v22.8h
    802     smlsl2    v6.4s, v28.8h, v24.8h
    803 
    804     sqrshrun  v28.8b, v8.8h, #5
    805 
    806     ext       v3.8b, v0.8b , v1.8b , #3
    807     uaddl     v8.8h, v0.8b, v5.8b
    808     uaddl     v2.8h, v2.8b, v3.8b
    809     sqrshrun  v18.4h, v18.4s, #10
    810     ext       v4.8b, v0.8b , v1.8b , #4
    811     sqrshrun  v19.4h, v6.4s, #10
    812     mla       v8.8h, v2.8h , v22.8h
    813     ext       v1.8b, v0.8b , v1.8b , #1
    814     add       v26.8h, v12.8h , v20.8h
    815     uaddl     v2.8h, v1.8b, v4.8b
    816 
    817 
    818     uqxtn     v18.8b, v18.8h
    819     uqxtn     v19.8b, v19.8h
    820     mov       v18.s[1], v19.s[0]
    821 
    822     add       v30.8h, v14.8h , v16.8h
    823     mls       v8.8h, v2.8h , v24.8h
    824     ld1       {v0.2s, v1.2s}, [x0], x2  // row 6 load for horizontal filter
    825 
    826     urhadd    v28.8b, v28.8b , v18.8b
    827 
    828     ext       v5.8b, v0.8b , v1.8b , #5
    829     ext       v2.8b, v0.8b , v1.8b , #2
    830     ext       v3.8b, v0.8b , v1.8b , #3
    831 
    832     st1       {v28.2s}, [x1], x3        // store row 1
    833 
    834     uaddl     v28.8h, v0.8b, v5.8b
    835 
    836     st1       {v8.4s}, [x9], x6         // store temp buffer x7
    837 
    838     saddl     v18.4s, v10.4h, v8.4h
    839     saddl2    v6.4s, v10.8h, v8.8h
    840 
    841     ld1       {v10.4s}, [x7], x6        // load from temp buffer 2
    842 
    843     smlal     v18.4s, v30.4h, v22.4h
    844     smlsl     v18.4s, v26.4h, v24.4h
    845     smlal2    v6.4s, v30.8h, v22.8h
    846     smlsl2    v6.4s, v26.8h, v24.8h
    847 
    848     sqrshrun  v26.8b, v10.8h, #5
    849     uaddl     v2.8h, v2.8b, v3.8b
    850     ext       v4.8b, v0.8b , v1.8b , #4
    851     mla       v28.8h, v2.8h , v22.8h
    852     sqrshrun  v18.4h, v18.4s, #10
    853     ext       v1.8b, v0.8b , v1.8b , #1
    854     sqrshrun  v19.4h, v6.4s, #10
    855     add       v10.8h, v14.8h , v8.8h
    856     uaddl     v2.8h, v1.8b, v4.8b
    857     add       v30.8h, v16.8h , v20.8h
    858     mls       v28.8h, v2.8h , v24.8h
    859 
    860     uqxtn     v27.8b, v18.8h
    861     uqxtn     v19.8b, v19.8h
    862 
    863     mov       v27.s[1], v19.s[0]
    864 
    865     saddl     v18.4s, v12.4h, v28.4h
    866     saddl2    v6.4s, v12.8h, v28.8h
    867 
    868     urhadd    v26.8b, v26.8b , v27.8b
    869 
    870     smlal     v18.4s, v30.4h, v22.4h
    871     smlsl     v18.4s, v10.4h, v24.4h
    872     smlal2    v6.4s, v30.8h, v22.8h
    873     smlsl2    v6.4s, v10.8h, v24.8h
    874 
    875     st1       {v26.2s}, [x1], x3        // store row 2
    876 
    877     st1       {v28.2s, v29.2s}, [x9]
    878 
    879 
    880     sqrshrun  v18.4h, v18.4s, #10
    881     mov       v10.16b, v20.16b
    882     mov       v11.16b, v21.16b
    883     ld1       {v30.4s}, [x7], x6        // load from temp buffer 3
    884 
    885     sqrshrun  v19.4h, v6.4s, #10
    886     subs      x4, x4, #4
    887 
    888     sqrshrun  v30.8b, v30.8h, #5
    889 
    890 
    891     uqxtn     v18.8b, v18.8h
    892     uqxtn     v19.8b, v19.8h
    893     mov       v18.s[1], v19.s[0]
    894 
    895 
    896     mov       v12.16b, v8.16b
    897     mov       v13.16b, v9.16b
    898     mov       v6.16b, v14.16b
    899     mov       v7.16b, v15.16b
    900 
    901     urhadd    v30.8b, v18.8b , v30.8b
    902     mov       v8.16b, v16.16b
    903     mov       v9.16b, v17.16b
    904     mov       v14.16b, v28.16b
    905     mov       v15.16b, v29.16b
    906     st1       {v30.2s}, [x1], x3        // store row 3
    907 
    908     bgt       loop_8                    //if height =8 or 16  loop
    909     b         end_func
    910 
    // loop_4_start: set-up for the 4-pixel-wide path.
    //
    // Loads five source rows (labelled -2 .. 2) and applies the horizontal
    // six-tap filter to each:
    //     f[i] = s[i] + s[i+5] + 20*(s[i+2] + s[i+3]) - 5*(s[i+1] + s[i+4])
    // Only the low four 16-bit lanes are accumulated (.4h forms).
    // Results are kept in v6, v8, v10, v12, v14 (rows -2, -1, 0, 1, 2).
    // Rows -2 .. 1 are also written to the temp buffer through x9; row 2
    // (v14) is written by the first store inside loop_4.
    //
    // Register use visible in this section:
    //   x0 = source read pointer, advanced by x2 after every row load
    //   x9 = temp-buffer write pointer, advanced by x6 after every store
    //   v22/v23 = filter coefficients 20 and 5 in every 16-bit lane
    911 loop_4_start:
    912     movi      v22.8h, #20               // v22 = filter coefficient 20 in each 16-bit lane
    913     movi      v23.8h, #5                // v23 = filter coefficient 5 in each 16-bit lane
    914 
    915     ld1       {v0.2s, v1.2s}, [x0], x2  // row -2 load: 16 source bytes, then x0 += x2
    916     ext       v5.8b, v0.8b , v1.8b , #5  // v5[i] = s[i+5]
    917     uaddl     v6.8h, v0.8b, v5.8b  // v6 = s[i] + s[i+5] (outer taps, widened to 16 bit)
    918     ext       v2.8b, v0.8b , v1.8b , #2  // v2[i] = s[i+2]
    919     ext       v3.8b, v0.8b , v1.8b , #3  // v3[i] = s[i+3]
    920     uaddl     v8.8h, v2.8b, v3.8b  // v8 = s[i+2] + s[i+3] (centre taps); v8 is scratch here
    921     ext       v4.8b, v0.8b , v1.8b , #4  // v4[i] = s[i+4]
    922     mla       v6.4h, v8.4h , v22.4h  // v6 += 20 * centre sum (low 4 lanes only)
    923     ext       v1.8b, v0.8b , v1.8b , #1  // v1[i] = s[i+1]
    924     uaddl     v8.8h, v1.8b, v4.8b  // v8 = s[i+1] + s[i+4] (inner taps)
    925     ld1       {v0.2s, v1.2s}, [x0], x2  // row -1 load
    926     mls       v6.4h, v8.4h , v23.4h  // v6 -= 5 * inner sum -> v6 = filtered row -2
    927     ext       v5.8b, v0.8b , v1.8b , #5
    928     uaddl     v8.8h, v0.8b, v5.8b  // v8 = outer taps of row -1
    929     ext       v2.8b, v0.8b , v1.8b , #2
    930     ext       v3.8b, v0.8b , v1.8b , #3
    931     uaddl     v10.8h, v2.8b, v3.8b  // v10 = centre taps of row -1 (scratch use of v10)
    932 
    933     st1       {v6.2s}, [x9], x6         // store temp buffer 0 (filtered row -2, 4 x 16 bit), x9 += x6
    934 
    935     ext       v4.8b, v0.8b , v1.8b , #4
    936     mla       v8.4h, v10.4h , v22.4h  // v8 += 20 * centre sum
    937     ext       v1.8b, v0.8b , v1.8b , #1
    938     uaddl     v10.8h, v1.8b, v4.8b  // v10 = inner taps of row -1
    939     ld1       {v0.2s, v1.2s}, [x0], x2  // row 0 load
    940     mls       v8.4h, v10.4h , v23.4h  // v8 = filtered row -1
    941     ext       v5.8b, v0.8b , v1.8b , #5
    942     uaddl     v10.8h, v0.8b, v5.8b  // v10 = outer taps of row 0
    943     ext       v2.8b, v0.8b , v1.8b , #2
    944     ext       v3.8b, v0.8b , v1.8b , #3
    945     uaddl     v12.8h, v2.8b, v3.8b  // v12 = centre taps of row 0 (scratch use of v12)
    946 
    947     st1       {v8.2s}, [x9], x6         // store temp buffer 1 (filtered row -1)
    948 
    949     ext       v4.8b, v0.8b , v1.8b , #4
    950     mla       v10.4h, v12.4h , v22.4h  // v10 += 20 * centre sum
    951     ext       v1.8b, v0.8b , v1.8b , #1
    952     uaddl     v12.8h, v1.8b, v4.8b  // v12 = inner taps of row 0
    953     ld1       {v0.2s, v1.2s}, [x0], x2  // row 1 load
    954     mls       v10.4h, v12.4h , v23.4h  // v10 = filtered row 0
    955     ext       v5.8b, v0.8b , v1.8b , #5
    956     uaddl     v12.8h, v0.8b, v5.8b  // v12 = outer taps of row 1
    957     ext       v2.8b, v0.8b , v1.8b , #2
    958     ext       v3.8b, v0.8b , v1.8b , #3
    959     uaddl     v14.8h, v2.8b, v3.8b  // v14 = centre taps of row 1 (scratch use of v14)
    960 
    961     st1       {v10.2s}, [x9], x6        // store temp buffer 2 (filtered row 0)
    962 
    963     ext       v4.8b, v0.8b , v1.8b , #4
    964     mla       v12.4h, v14.4h , v22.4h  // v12 += 20 * centre sum
    965     ext       v1.8b, v0.8b , v1.8b , #1
    966     uaddl     v14.8h, v1.8b, v4.8b  // v14 = inner taps of row 1
    967     ld1       {v0.2s, v1.2s}, [x0], x2  // row 2 load
    968     mls       v12.4h, v14.4h , v23.4h  // v12 = filtered row 1
    969     ext       v5.8b, v0.8b , v1.8b , #5
    970     uaddl     v14.8h, v0.8b, v5.8b  // v14 = outer taps of row 2
    971     ext       v2.8b, v0.8b , v1.8b , #2
    972     ext       v3.8b, v0.8b , v1.8b , #3
    973     uaddl     v16.8h, v2.8b, v3.8b  // v16 = centre taps of row 2 (scratch use of v16)
    974     ext       v4.8b, v0.8b , v1.8b , #4
    975     mla       v14.4h, v16.4h , v22.4h  // v14 += 20 * centre sum
    976     ext       v1.8b, v0.8b , v1.8b , #1
    977     uaddl     v16.8h, v1.8b, v4.8b  // v16 = inner taps of row 2
    978 
    979     st1       {v12.2s}, [x9], x6        // store temp buffer 3 (filtered row 1)
    980 
    981     mls       v14.4h, v16.4h , v23.4h  // v14 = filtered row 2; stored at the top of loop_4
    982 
    // loop_4: main loop of the 4-pixel-wide path; writes 4 output rows per pass.
    //
    // On entry v6, v8, v10, v12, v14 hold five consecutive horizontally
    // filtered rows (16-bit, low 4 lanes), oldest first. Each pass:
    //   1. loads 4 more source rows and filters them horizontally into
    //      v16, v26, v28, v30, writing each filtered row to the temp buffer
    //      through x9;
    //   2. runs the vertical six-tap filter over the 16-bit rows in 32-bit
    //      precision, rounds with a shift of 10 and saturates to bytes;
    //   3. reloads 4 stored rows through x7, rounds them with a shift of 5
    //      and saturates to bytes;
    //   4. stores the rounding average (urhadd) of (2) and (3), 4 bytes per
    //      row, through x1 (advanced by x3 per row);
    //   5. rotates the newest five filtered rows into v6, v8, v10, v12, v14.
    //
    // x4 is decremented by 4 per pass and the loop repeats while it is > 0.
    // The "row N" labels in the comments are those of the first pass; every
    // later pass works 4 rows further down.
    // NOTE(review): the starting value of x7 is set outside this section, so
    // which temp-buffer rows are averaged in is not visible here - confirm
    // against the set-up code.
    983 loop_4:
    984 
    985     ld1       {v0.2s, v1.2s}, [x0], x2  // row 3 load: 16 source bytes, then x0 += x2
    986     ext       v5.8b, v0.8b , v1.8b , #5  // v5[i] = s[i+5]
    987     uaddl     v16.8h, v0.8b, v5.8b  // v16 = s[i] + s[i+5] (outer taps)
    988     ext       v2.8b, v0.8b , v1.8b , #2
    989     ext       v3.8b, v0.8b , v1.8b , #3
    990     uaddl     v18.8h, v2.8b, v3.8b  // v18 = s[i+2] + s[i+3] (centre taps, scratch)
    991     st1       {v14.2s}, [x9], x6        // store temp buffer 4 (filtered row 2 held in v14), x9 += x6
    992     ext       v4.8b, v0.8b , v1.8b , #4
    993     mla       v16.4h, v18.4h , v22.4h  // v16 += 20 * centre sum
    994     ext       v1.8b, v0.8b , v1.8b , #1
    995     uaddl     v18.8h, v1.8b, v4.8b  // v18 = s[i+1] + s[i+4] (inner taps)
    996     add       v2.4h, v10.4h , v12.4h  // vertical centre taps for output row 0: rows 0 + 1
    997     mls       v16.4h, v18.4h , v23.4h  // v16 = filtered row 3
    998     add       v3.4h, v8.4h , v14.4h  // vertical inner taps for output row 0: rows -1 + 2
    999     ld1       {v18.2s, v19.2s}, [x0], x2 // row 4 load
   1000     ext       v25.8b, v18.8b , v19.8b , #5
   1001     uaddl     v26.8h, v18.8b, v25.8b  // v26 = outer taps of row 4
   1002     ext       v20.8b, v18.8b , v19.8b , #2
   1003 
   1004     st1       {v16.2s}, [x9], x6        // store temp buffer 5 (filtered row 3)
   1005 
   1006     saddl     v0.4s, v6.4h, v16.4h  // v0 = rows -2 + 3 (vertical outer taps, widened to 32 bit)
   1007     smlal     v0.4s, v2.4h, v22.4h  // v0 += 20 * (rows 0 + 1)
   1008     ext       v21.8b, v18.8b , v19.8b , #3
   1009     uaddl     v28.8h, v20.8b, v21.8b  // v28 = centre taps of row 4 (scratch)
   1010     ext       v24.8b, v18.8b , v19.8b , #4
   1011     smlsl     v0.4s, v3.4h, v23.4h  // v0 -= 5 * (rows -1 + 2)
   1012     mla       v26.4h, v28.4h , v22.4h  // v26 += 20 * centre sum
   1013     ext       v19.8b, v18.8b , v19.8b , #1
   1014     uaddl     v28.8h, v19.8b, v24.8b  // v28 = inner taps of row 4
   1015     add       v2.4h, v12.4h , v14.4h  // vertical centre taps for output row 1: rows 1 + 2
   1016     mls       v26.4h, v28.4h , v23.4h  // v26 = filtered row 4
   1017     sqrshrun  v0.4h, v0.4s, #0xa  // round, shift right by 10, saturate to unsigned 16 bit
   1018     add       v3.4h, v10.4h , v16.4h  // vertical inner taps for output row 1: rows 0 + 3
   1019     ld1       {v18.2s, v19.2s}, [x0], x2 // row 5 load
   1020     ext       v25.8b, v18.8b , v19.8b , #5
   1021     uqxtn     v11.8b, v0.8h  // v11 low 4 bytes = two-stage filtered pixels of output row 0
   1022     uaddl     v28.8h, v18.8b, v25.8b  // v28 = outer taps of row 5
   1023 
   1024     st1       {v26.2s}, [x9], x6        // store temp buffer 6 (filtered row 4)
   1025 
   1026     // v6 and v7 are free for reuse here: the oldest row in v6 was consumed by the saddl above
   1027     ld1       {v6.2s}, [x7], x6         // load from temp buffer 0 (first row at x7), x7 += x6
   1028     ld1       {v7.2s}, [x7], x6         // load from temp buffer 1 (next row at x7)
   1029 
   1030     sqrshrun  v9.8b, v6.8h, #5  // round, shift right by 5, saturate to bytes: horizontal-only pixels
   1031     sqrshrun  v7.8b, v7.8h, #5
   1032     mov       v9.s[1], v7.s[0]  // v9 = [first reloaded row | second reloaded row], 4 bytes each
   1033 
   1034     ext       v20.8b, v18.8b , v19.8b , #2
   1035 
   1036     saddl     v0.4s, v8.4h, v26.4h  // output row 1: rows -1 + 4
   1037     smlal     v0.4s, v2.4h, v22.4h  // += 20 * (rows 1 + 2)
   1038     ext       v21.8b, v18.8b , v19.8b , #3
   1039     uaddl     v6.8h, v20.8b, v21.8b  // v6 = centre taps of row 5 (v6 reused as scratch)
   1040     ext       v24.8b, v18.8b , v19.8b , #4
   1041     smlsl     v0.4s, v3.4h, v23.4h  // -= 5 * (rows 0 + 3)
   1042     mla       v28.4h, v6.4h , v22.4h  // v28 += 20 * centre sum
   1043     ext       v19.8b, v18.8b , v19.8b , #1
   1044     uaddl     v6.8h, v19.8b, v24.8b  // v6 = inner taps of row 5
   1045     add       v2.4h, v14.4h , v16.4h  // vertical centre taps for output row 2: rows 2 + 3
   1046     mls       v28.4h, v6.4h , v23.4h  // v28 = filtered row 5
   1047     sqrshrun  v0.4h, v0.4s, #0xa
   1048     add       v3.4h, v12.4h , v26.4h  // vertical inner taps for output row 2: rows 1 + 4
   1049     ld1       {v18.2s, v19.2s}, [x0], x2 // row 6 load
   1050     ext       v25.8b, v18.8b , v19.8b , #5
   1051     uqxtn     v13.8b, v0.8h  // v13 low 4 bytes = two-stage filtered pixels of output row 1
   1052 
   1053     trn1      v11.2s, v11.2s, v13.2s  // v11 = [output row 0 | output row 1]
   1054     trn2      v13.2s, v11.2s, v13.2s  // result is not read again in this loop
   1055     saddl     v0.4s, v10.4h, v28.4h  // output row 2: rows 0 + 5
   1056     urhadd    v9.8b, v9.8b , v11.8b  // rounding average of horizontal-only and two-stage pixels
   1057 
   1058     st1       {v28.2s}, [x9], x6        // store temp buffer 7 (filtered row 5)
   1059 
   1060     smlal     v0.4s, v2.4h, v22.4h  // += 20 * (rows 2 + 3)
   1061     uaddl     v30.8h, v18.8b, v25.8b  // v30 = outer taps of row 6
   1062 
   1063     st1       {v9.s}[0], [x1], x3       // store row 0 (4 bytes), x1 += x3
   1064 
   1065     ext       v20.8b, v18.8b , v19.8b , #2
   1066 
   1067     st1       {v9.s}[1], [x1], x3       // store row 1
   1068 
   1069     ext       v21.8b, v18.8b , v19.8b , #3
   1070     smlsl     v0.4s, v3.4h, v23.4h  // -= 5 * (rows 1 + 4)
   1071     uaddl     v8.8h, v20.8b, v21.8b  // v8 = centre taps of row 6 (v8 reused as scratch)
   1072     ext       v24.8b, v18.8b , v19.8b , #4
   1073     mla       v30.4h, v8.4h , v22.4h  // v30 += 20 * centre sum
   1074     ext       v19.8b, v18.8b , v19.8b , #1
   1075     uaddl     v8.8h, v19.8b, v24.8b  // v8 = inner taps of row 6
   1076     sqrshrun  v0.4h, v0.4s, #0xa
   1077     add       v2.4h, v16.4h , v26.4h  // vertical centre taps for output row 3: rows 3 + 4
   1078     mls       v30.4h, v8.4h , v23.4h  // v30 = filtered row 6
   1079     uqxtn     v4.8b, v0.8h  // v4 low 4 bytes = two-stage filtered pixels of output row 2
   1080 
   1081     add       v3.4h, v14.4h , v28.4h  // vertical inner taps for output row 3: rows 2 + 5
   1082 
   1083 
   1084     saddl     v0.4s, v12.4h, v30.4h  // output row 3: rows 1 + 6
   1085 
   1086     st1       {v30.2s}, [x9]  // filtered row 6, no pointer advance: the next pass stores v14 (= v30) here again
   1087 
   1088     smlal     v0.4s, v2.4h, v22.4h  // += 20 * (rows 3 + 4)
   1089 
   1090     ld1       {v8.2s}, [x7], x6         // load from temp buffer 2 (third row from this pass's x7)
   1091     ld1       {v9.2s}, [x7], x6         // load from temp buffer 3 (fourth row from this pass's x7)
   1092     smlsl     v0.4s, v3.4h, v23.4h  // -= 5 * (rows 2 + 5)
   1093     subs      x4, x4, #4  // 4 rows done; flags are consumed by the bgt at the end of the loop
   1094 
   1095     sqrshrun  v10.8b, v8.8h, #5  // horizontal-only pixels for output row 2
   1096     sqrshrun  v9.8b, v9.8h, #5  // horizontal-only pixels for output row 3
   1097     mov       v10.s[1], v9.s[0]  // v10 = [row 2 | row 3], 4 bytes each
   1098 
   1099     mov       v12.8b, v28.8b  // rotate window: v12 <- filtered row 5
   1100 
   1101     sqrshrun  v0.4h, v0.4s, #0xa
   1102     mov       v6.8b, v14.8b  // rotate window: v6 <- filtered row 2
   1103     mov       v8.8b, v16.8b  // rotate window: v8 <- filtered row 3
   1104 
   1105     uqxtn     v5.8b, v0.8h  // v5 low 4 bytes = two-stage filtered pixels of output row 3
   1106 
   1107     trn1      v4.2s, v4.2s, v5.2s  // v4 = [output row 2 | output row 3]
   1108     trn2      v5.2s, v4.2s, v5.2s  // result is not read again in this loop
   1109     urhadd    v4.8b, v4.8b , v10.8b  // rounding average with the horizontal-only pixels
   1110     mov       v10.8b, v26.8b  // rotate window: v10 <- filtered row 4 (after v10's last use above)
   1111     mov       v14.8b, v30.8b  // rotate window: v14 <- filtered row 6
   1112 
   1113     st1       {v4.s}[0], [x1], x3       // store row 2
   1114     st1       {v4.s}[1], [x1], x3       // store row 3
   1115 
   1116     bgt       loop_4  // repeat while x4 > 0 after the subs above
   1117 
   1118 end_func:  // common exit, reached by the branch after loop_8 and by falling out of loop_4
   1119     // Restore registers from the stack and return to the caller
   1120     ldp       x19, x20, [sp], #16  // reload x19/x20 and release their 16-byte stack slot
   1121     pop_v_regs  // macro defined outside this section; presumably restores saved SIMD registers - TODO confirm
   1122     ret
   1123 
   1124 
   1125 
   1126