Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 //******************************************************************************
     22 //* @file
     23 //*  ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
     24 //*
     25 //* @brief
     26 //*  Contains function definitions for inter prediction  interpolation.
     27 //*
     28 //* @author
     29 //*  Mohit
     30 //*
     31 //* @par List of Functions:
     32 //*
     33 //*  - ih264_inter_pred_luma_horz_qpel_vert_hpel_av8()
     34 //*
     35 //* @remarks
     36 //*  None
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 
     41 ///* All the functions here are replicated from ih264_inter_pred_filters.c
     42 //
     43 
     44 ///**
     45 ///**
     46 ///**
     47 //*******************************************************************************
     48 //*
     49 //* @brief
     50 //*   This function implements a two stage cascaded six tap filter. It
     51 //*   applies the six tap filter in the vertical direction on the
     52 //*   predictor values, followed by applying the same filter in the
     53 //*   horizontal direction on the output of the first stage. It then averages
     54 //*   the output of the 1st stage and the final stage to obtain the quarter
     55 //*   pel values. The six tap filtering operation is described in sec 8.4.2.2.1
     56 //*   titled "Luma sample interpolation process".
     57 //*
     58 //* @par Description:
     59 //*    This function is called to obtain pixels lying at the following
     60 //*    location (1/4,1/2) or (3/4,1/2). The function interpolates
     61 //*    the predictors first in the vertical direction and then in the
     62 //*    horizontal direction to output the (1/2,1/2). It then averages
     63 //*    the output of the 1st stage and the (1/2,1/2) value to obtain (1/4,1/2)
     64 //*    or (3/4,1/2) depending on the offset.
     65 //*
     66 //* @param[in] pu1_src
     67 //*  UWORD8 pointer to the source
     68 //*
     69 //* @param[out] pu1_dst
     70 //*  UWORD8 pointer to the destination
     71 //*
     72 //* @param[in] src_strd
     73 //*  integer source stride
     74 //*
     75 //* @param[in] dst_strd
     76 //*  integer destination stride
     77 //*
     78 //* @param[in] ht
     79 //*  integer height of the array
     80 //*
     81 //* @param[in] wd
     82 //*  integer width of the array
     83 //*
     84 //* @param[in] pu1_tmp: temporary buffer
     85 //*
     86 //* @param[in] dydx: x and y reference offset for qpel calculations
     87 //*
     88 //* @returns
     89 //*
     90 //* @remarks
     91 //*  None
     92 //*
     93 //*******************************************************************************
     94 //*/;
     95 
     96 //void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
     97 //                                UWORD8 *pu1_dst,
     98 //                                WORD32 src_strd,
     99 //                                WORD32 dst_strd,
    100 //                                WORD32 ht,
    101 //                                WORD32 wd,
    102 //                                    UWORD8* pu1_tmp,
    103 //                                  UWORD32 dydx)
    104 
    105 //**************Variables Vs Registers*****************************************
    106 //    x0 => *pu1_src
    107 //    x1 => *pu1_dst
    108 //    x2 =>  src_strd
    109 //    x3 =>  dst_strd
    110 //    x4 =>  ht
    111 //    x5 =>  wd
    112 //    x6 =>  dydx     (copied from x7 at function entry)
    113 //    x9 => *pu1_tmp  (copied from x6 at function entry)
    114 
    115 .text
    116 .p2align 2
    117 .include "ih264_neon_macros.s"
    118 
    119 
    120 
    121     .global ih264_inter_pred_luma_horz_qpel_vert_hpel_av8
    122 
    123 ih264_inter_pred_luma_horz_qpel_vert_hpel_av8:
    124 
    125     // STMFD sp!, {x4-x12, x14}          //store register values to stack
    126     push_v_regs
    127     stp       x19, x20, [sp, #-16]!
    128 
    129     sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
    130     sub       x0, x0, #2                //pu1_src-2
    131     mov       x9, x6
    132     mov       x6, x7
    133 
    134     and       x6, x6, #2                // dydx & 0x3 followed by dydx>>1 and dydx<<1
    135 
    136     add       x7, x9, #4
    137     add       x6, x7, x6                // pi16_pred1_temp += (x_offset>>1)
    138 
    139     movi      v26.8h, #0x14             // Filter coeff 20 into Q13
    140     movi      v24.8h, #0x5              // Filter coeff 5  into Q12
    141     movi      v27.8h, #0x14             // Filter coeff 20 into Q13
    142     movi      v25.8h, #0x5              // Filter coeff 5  into Q12
    143     mov       x7, #0x20
    144     mov       x8, #0x30
    145     subs      x12, x5, #4               //if wd=4 branch to loop_4
    146     beq       loop_4_start
    147 
    148     subs      x12, x5, #8               //if wd=8 branch to loop_8
    149     beq       loop_8_start
    150 
    151     //when  wd=16
    152     movi      v28.8h, #0x14             // Filter coeff 20 into Q13
    153     movi      v30.8h, #0x5              // Filter coeff 5  into Q12
    154     sub       x2, x2, #16
    155     ld1       {v0.2s, v1.2s}, [x0], #16 // Vector load from src[0_0]
    156     ld1       {v12.2s}, [x0], x2        // Vector load from src[0_0]
    157     ld1       {v2.2s, v3.2s}, [x0], #16 // Vector load from src[1_0]
    158     ld1       {v13.2s}, [x0], x2        // Vector load from src[1_0]
    159     ld1       {v4.2s, v5.2s}, [x0], #16 // Vector load from src[2_0]
    160     ld1       {v14.2s}, [x0], x2        // Vector load from src[2_0]
    161     ld1       {v6.2s, v7.2s}, [x0], #16 // Vector load from src[3_0]
    162     ld1       {v15.2s}, [x0], x2        // Vector load from src[3_0]
    163     ld1       {v8.2s, v9.2s}, [x0], #16 // Vector load from src[4_0]
    164     ld1       {v16.2s}, [x0], x2        // Vector load from src[4_0]
    165 
    166 loop_16:
    167 
    168     ld1       {v10.2s, v11.2s}, [x0], #16 // Vector load from src[5_0]
    169     ld1       {v17.2s}, [x0], x2        // Vector load from src[5_0]
    170 
    171 
    172     uaddl     v20.8h, v4.8b, v6.8b
    173     uaddl     v18.8h, v0.8b, v10.8b
    174     uaddl     v22.8h, v2.8b, v8.8b
    175     mla       v18.8h, v20.8h , v28.8h
    176     uaddl     v24.8h, v5.8b, v7.8b
    177     uaddl     v20.8h, v1.8b, v11.8b
    178     uaddl     v26.8h, v3.8b, v9.8b
    179     mla       v20.8h, v24.8h , v28.8h
    180     uaddl     v24.8h, v14.8b, v15.8b
    181     mls       v18.8h, v22.8h , v30.8h
    182     uaddl     v22.8h, v12.8b, v17.8b
    183     mls       v20.8h, v26.8h , v30.8h
    184     uaddl     v26.8h, v13.8b, v16.8b
    185     mla       v22.8h, v24.8h , v28.8h
    186     mls       v22.8h, v26.8h , v30.8h
    187     st1       {v18.4s }, [x9], #16
    188     st1       {v20.4s}, [x9], #16
    189     ext       v24.16b, v18.16b , v20.16b , #4
    190     ext       v26.16b, v18.16b , v20.16b , #6
    191     st1       {v22.4s}, [x9]
    192     ext       v22.16b, v18.16b , v20.16b , #10
    193     add       v0.8h, v24.8h , v26.8h
    194     ext       v24.16b, v18.16b , v20.16b , #2
    195     ext       v26.16b, v18.16b , v20.16b , #8
    196     add       v24.8h, v24.8h , v26.8h
    197 
    198     saddl     v26.4s, v18.4h, v22.4h
    199     smlal     v26.4s, v0.4h, v28.4h
    200     smlsl     v26.4s, v24.4h, v30.4h
    201 
    202     saddl2    v22.4s, v18.8h, v22.8h
    203     smlal2    v22.4s, v0.8h, v28.8h
    204     smlsl2    v22.4s, v24.8h, v30.8h
    205 
    206     sqrshrun  v18.4h, v26.4s, #10
    207     sqrshrun  v19.4h, v22.4s, #10
    208     ld1       {v22.4s}, [x9], #16
    209 
    210     uqxtn     v18.8b, v18.8h
    211     uqxtn     v19.8b, v19.8h
    212     mov       v18.s[1], v19.s[0]
    213 
    214     ext       v24.16b, v20.16b , v22.16b , #4
    215     ext       v26.16b, v20.16b , v22.16b , #6
    216     ext       v0.16b, v20.16b , v22.16b , #10
    217     st1       {v18.2s}, [x1]
    218     add       v18.8h, v24.8h , v26.8h
    219     ext       v24.16b, v20.16b , v22.16b , #2
    220     ext       v26.16b, v20.16b , v22.16b , #8
    221     add       v24.8h, v24.8h , v26.8h
    222 
    223     saddl     v26.4s, v0.4h, v20.4h
    224     smlal     v26.4s, v18.4h, v28.4h
    225     smlsl     v26.4s, v24.4h, v30.4h
    226 
    227     saddl2    v22.4s, v0.8h, v20.8h
    228     smlal2    v22.4s, v18.8h, v28.8h
    229     smlsl2    v22.4s, v24.8h, v30.8h
    230 
    231     sqrshrun  v19.4h, v26.4s, #10
    232     sqrshrun  v18.4h, v22.4s, #10
    233 
    234     uaddl     v24.8h, v7.8b, v9.8b
    235     ld1       {v20.4s}, [x6], #16
    236     ld1       {v22.4s}, [x6], x7
    237 
    238 
    239     uqxtn     v19.8b, v19.8h
    240     uqxtn     v18.8b, v18.8h
    241     mov       v19.s[1], v18.s[0]
    242 
    243     ld1       {v18.2s}, [x1]
    244     sqrshrun  v20.8b, v20.8h, #5
    245     sqrshrun  v21.8b, v22.8h, #5
    246     uaddl     v22.8h, v4.8b, v10.8b
    247     ld1       {v0.2s, v1.2s}, [x0], #16 // Vector load from src[6_0]
    248     urhadd    v18.16b, v18.16b , v20.16b
    249     urhadd    v19.16b, v19.16b , v21.16b
    250 
    251     ld1       {v12.2s}, [x0], x2        // Vector load from src[6_0]
    252     uaddl     v20.8h, v6.8b, v8.8b
    253     uaddl     v26.8h, v5.8b, v11.8b
    254     st1       {v18.2s, v19.2s}, [x1], x3 // store row 0
    255 
    256 
    257 //ROW_2
    258 
    259 
    260     uaddl     v18.8h, v2.8b, v0.8b
    261 
    262     mla       v18.8h, v20.8h , v28.8h
    263 
    264     uaddl     v20.8h, v3.8b, v1.8b
    265 
    266     mla       v20.8h, v24.8h , v28.8h
    267     uaddl     v24.8h, v15.8b, v16.8b
    268     mls       v18.8h, v22.8h , v30.8h
    269     uaddl     v22.8h, v13.8b, v12.8b
    270     mls       v20.8h, v26.8h , v30.8h
    271     uaddl     v26.8h, v14.8b, v17.8b
    272     mla       v22.8h, v24.8h , v28.8h
    273     mls       v22.8h, v26.8h , v30.8h
    274     st1       {v18.4s}, [x9], #16
    275     st1       {v20.4s}, [x9], #16
    276     ext       v24.16b, v18.16b , v20.16b , #4
    277     ext       v26.16b, v18.16b , v20.16b , #6
    278     st1       {v22.4s}, [x9]
    279     ext       v22.16b, v18.16b , v20.16b , #10
    280     add       v2.8h, v24.8h , v26.8h
    281     ext       v24.16b, v18.16b , v20.16b , #2
    282     ext       v26.16b, v18.16b , v20.16b , #8
    283     add       v24.8h, v24.8h , v26.8h
    284 
    285     saddl     v26.4s, v18.4h, v22.4h
    286     smlal     v26.4s, v2.4h, v28.4h
    287     smlsl     v26.4s, v24.4h, v30.4h
    288 
    289     saddl2    v22.4s, v18.8h, v22.8h
    290     smlal2    v22.4s, v2.8h, v28.8h
    291     smlsl2    v22.4s, v24.8h, v30.8h
    292 
    293     sqrshrun  v18.4h, v26.4s, #10
    294     sqrshrun  v19.4h, v22.4s, #10
    295 
    296     ld1       {v22.4s}, [x9], #16
    297 
    298     uqxtn     v18.8b, v18.8h
    299     uqxtn     v19.8b, v19.8h
    300     mov       v18.s[1], v19.s[0]
    301 
    302     ext       v24.16b, v20.16b , v22.16b , #4
    303     ext       v26.16b, v20.16b , v22.16b , #6
    304     ext       v2.16b, v20.16b , v22.16b , #10
    305     st1       {v18.2s}, [x1]
    306     add       v18.8h, v24.8h , v26.8h
    307     ext       v24.16b, v20.16b , v22.16b , #2
    308     ext       v26.16b, v20.16b , v22.16b , #8
    309     add       v24.8h, v24.8h , v26.8h
    310 
    311     saddl     v26.4s, v2.4h, v20.4h
    312     smlal     v26.4s, v18.4h, v28.4h
    313     smlsl     v26.4s, v24.4h, v30.4h
    314 
    315     saddl2    v22.4s, v2.8h, v20.8h
    316     smlal2    v22.4s, v18.8h, v28.8h
    317     smlsl2    v22.4s, v24.8h, v30.8h
    318 
    319     sqrshrun  v19.4h, v26.4s, #10
    320     sqrshrun  v18.4h, v22.4s, #10
    321     uaddl     v24.8h, v9.8b, v11.8b
    322     ld1       {v20.4s}, [x6], #16
    323     ld1       {v22.4s}, [x6], x7
    324     uqxtn     v19.8b, v19.8h
    325     uqxtn     v18.8b, v18.8h
    326     mov       v19.s[1], v18.s[0]
    327     ld1       {v18.4s}, [x1]
    328     sqrshrun  v20.8b, v20.8h, #5
    329     sqrshrun  v21.8b, v22.8h, #5
    330 
    331     uaddl     v22.8h, v6.8b, v0.8b
    332     ld1       {v2.2s, v3.2s}, [x0], #16 // Vector load from src[7_0]
    333 
    334     urhadd    v18.16b, v18.16b , v20.16b
    335     urhadd    v19.16b, v19.16b , v21.16b
    336     ld1       {v13.2s}, [x0], x2        // Vector load from src[7_0]
    337     uaddl     v20.8h, v8.8b, v10.8b
    338     uaddl     v26.8h, v7.8b, v1.8b
    339     st1       {v18.2s, v19.2s}, [x1], x3 // store row 1
    340 
    341 //ROW_3
    342 
    343 
    344     uaddl     v18.8h, v4.8b, v2.8b
    345 
    346     mla       v18.8h, v20.8h , v28.8h
    347 
    348     uaddl     v20.8h, v5.8b, v3.8b
    349 
    350     mla       v20.8h, v24.8h , v28.8h
    351     uaddl     v24.8h, v16.8b, v17.8b
    352     mls       v18.8h, v22.8h , v30.8h
    353     uaddl     v22.8h, v14.8b, v13.8b
    354     mls       v20.8h, v26.8h , v30.8h
    355     uaddl     v26.8h, v15.8b, v12.8b
    356     mla       v22.8h, v24.8h , v28.8h
    357     mls       v22.8h, v26.8h , v30.8h
    358     st1       {v18.4s}, [x9], #16
    359     st1       {v20.4s}, [x9], #16
    360     ext       v24.16b, v18.16b , v20.16b , #4
    361     ext       v26.16b, v18.16b , v20.16b , #6
    362     st1       {v22.4s}, [x9]
    363     ext       v22.16b, v18.16b , v20.16b , #10
    364     add       v4.8h, v24.8h , v26.8h
    365     ext       v24.16b, v18.16b , v20.16b , #2
    366     ext       v26.16b, v18.16b , v20.16b , #8
    367     add       v24.8h, v24.8h , v26.8h
    368 
    369     saddl     v26.4s, v18.4h, v22.4h
    370     smlal     v26.4s, v4.4h, v28.4h
    371     smlsl     v26.4s, v24.4h, v30.4h
    372 
    373     saddl2    v22.4s, v18.8h, v22.8h
    374     smlal2    v22.4s, v4.8h, v28.8h
    375     smlsl2    v22.4s, v24.8h, v30.8h
    376 
    377     sqrshrun  v18.4h, v26.4s, #10
    378     sqrshrun  v19.4h, v22.4s, #10
    379     ld1       {v22.4s}, [x9], #16
    380 
    381     uqxtn     v18.8b, v18.8h
    382     uqxtn     v19.8b, v19.8h
    383     mov       v18.s[1], v19.s[0]
    384 
    385 
    386     ext       v24.16b, v20.16b , v22.16b , #4
    387     ext       v26.16b, v20.16b , v22.16b , #6
    388     ext       v4.16b, v20.16b , v22.16b , #10
    389     st1       {v18.2s}, [x1]
    390     add       v18.8h, v24.8h , v26.8h
    391     ext       v24.16b, v20.16b , v22.16b , #2
    392     ext       v26.16b, v20.16b , v22.16b , #8
    393     add       v24.8h, v24.8h , v26.8h
    394 
    395     saddl     v26.4s, v4.4h, v20.4h
    396     smlal     v26.4s, v18.4h, v28.4h
    397     smlsl     v26.4s, v24.4h, v30.4h
    398 
    399     saddl2    v22.4s, v4.8h, v20.8h
    400     smlal2    v22.4s, v18.8h, v28.8h
    401     smlsl2    v22.4s, v24.8h, v30.8h
    402 
    403     sqrshrun  v19.4h, v26.4s, #10
    404     sqrshrun  v18.4h, v22.4s, #10
    405 
    406     uaddl     v24.8h, v11.8b, v1.8b
    407     ld1       {v20.4s}, [x6], #16
    408     ld1       {v22.4s}, [x6], x7
    409 
    410     uqxtn     v19.8b, v19.8h
    411     uqxtn     v18.8b, v18.8h
    412     mov       v19.s[1], v18.s[0]
    413 
    414     ld1       {v18.2s}, [x1]
    415     sqrshrun  v20.8b, v20.8h, #5
    416     sqrshrun  v21.8b, v22.8h, #5
    417 
    418     uaddl     v22.8h, v8.8b, v2.8b
    419     ld1       {v4.2s, v5.2s}, [x0], #16 // Vector load from src[8_0]
    420 
    421     urhadd    v18.16b, v18.16b , v20.16b
    422     urhadd    v19.16b, v19.16b , v21.16b
    423     ld1       {v14.2s}, [x0], x2        // Vector load from src[8_0]
    424     uaddl     v20.8h, v10.8b, v0.8b
    425     uaddl     v26.8h, v9.8b, v3.8b
    426     st1       {v18.2s, v19.2s}, [x1], x3 // store row 2
    427 
    428 
    429 //ROW_4
    430 
    431     uaddl     v18.8h, v6.8b, v4.8b
    432 
    433     mla       v18.8h, v20.8h , v28.8h
    434 
    435     uaddl     v20.8h, v7.8b, v5.8b
    436 
    437     mla       v20.8h, v24.8h , v28.8h
    438     uaddl     v24.8h, v17.8b, v12.8b
    439     mls       v18.8h, v22.8h , v30.8h
    440     uaddl     v22.8h, v15.8b, v14.8b
    441     mls       v20.8h, v26.8h , v30.8h
    442     uaddl     v26.8h, v16.8b, v13.8b
    443     mla       v22.8h, v24.8h , v28.8h
    444     mls       v22.8h, v26.8h , v30.8h
    445     st1       {v18.4s}, [x9], #16
    446     st1       {v20.4s}, [x9], #16
    447     ext       v24.16b, v18.16b , v20.16b , #4
    448     ext       v26.16b, v18.16b , v20.16b , #6
    449     st1       {v22.4s}, [x9]
    450     ext       v22.16b, v18.16b , v20.16b , #10
    451     add       v6.8h, v24.8h , v26.8h
    452     ext       v24.16b, v18.16b , v20.16b , #2
    453     ext       v26.16b, v18.16b , v20.16b , #8
    454     add       v24.8h, v24.8h , v26.8h
    455 
    456     saddl     v26.4s, v18.4h, v22.4h
    457     smlal     v26.4s, v6.4h, v28.4h
    458     smlsl     v26.4s, v24.4h, v30.4h
    459 
    460     saddl2    v22.4s, v18.8h, v22.8h
    461     smlal2    v22.4s, v6.8h, v28.8h
    462     smlsl2    v22.4s, v24.8h, v30.8h
    463 
    464     sqrshrun  v18.4h, v26.4s, #10
    465     sqrshrun  v19.4h, v22.4s, #10
    466     ld1       {v22.4s}, [x9], #16
    467     uqxtn     v18.8b, v18.8h
    468     uqxtn     v19.8b, v19.8h
    469     mov       v18.s[1], v19.s[0]
    470 
    471 
    472     ext       v24.16b, v20.16b , v22.16b , #4
    473     ext       v26.16b, v20.16b , v22.16b , #6
    474     ext       v6.16b, v20.16b , v22.16b , #10
    475     st1       {v18.2s}, [x1]
    476     add       v18.8h, v24.8h , v26.8h
    477     ext       v24.16b, v20.16b , v22.16b , #2
    478     ext       v26.16b, v20.16b , v22.16b , #8
    479     add       v24.8h, v24.8h , v26.8h
    480 
    481     saddl     v26.4s, v6.4h, v20.4h
    482     smlal     v26.4s, v18.4h, v28.4h
    483     smlsl     v26.4s, v24.4h, v30.4h
    484 
    485     saddl2    v22.4s, v6.8h, v20.8h
    486     smlal2    v22.4s, v18.8h, v28.8h
    487     smlsl2    v22.4s, v24.8h, v30.8h
    488 
    489     mov       v6.16b, v2.16b
    490     mov       v7.16b, v3.16b
    491 
    492     mov       v2.16b, v10.16b
    493     mov       v3.16b, v11.16b
    494 
    495     subs      x4, x4, #4
    496     sqrshrun  v19.4h, v26.4s, #10
    497     sqrshrun  v18.4h, v22.4s, #10
    498     mov       v10.16b, v0.16b
    499     mov       v11.16b, v1.16b
    500 
    501     mov       v24.8b, v14.8b
    502 
    503     mov       v14.16b, v12.16b
    504     mov       v15.16b, v13.16b
    505 
    506 
    507     uqxtn     v19.8b, v19.8h
    508     uqxtn     v18.8b, v18.8h
    509     mov       v19.s[1], v18.s[0]
    510 
    511     ld1       {v20.4s}, [x6], #16
    512     ld1       {v22.4s}, [x6], x7
    513     ld1       {v18.2s}, [x1]
    514     sqrshrun  v20.8b, v20.8h, #5
    515     sqrshrun  v21.8b, v22.8h, #5
    516 
    517     mov       v0.16b, v8.16b
    518     mov       v1.16b, v9.16b
    519 
    520     mov       v8.16b, v4.16b
    521     mov       v9.16b, v5.16b
    522 
    523     mov       v12.16b, v16.16b
    524     mov       v13.16b, v17.16b
    525     urhadd    v18.16b, v18.16b , v20.16b
    526     urhadd    v19.16b, v19.16b , v21.16b
    527 
    528     mov       v4.16b, v10.16b
    529     mov       v5.16b, v11.16b
    530 
    531     mov       v16.8b, v24.8b
    532     st1       {v18.2s, v19.2s}, [x1], x3 // store row 3
    533 
    534     bgt       loop_16                   // looping if height =16
    535     b         end_func
    536 
    537 loop_8_start:
    538     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[0_0]
    539     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[1_0]
    540     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[2_0]
    541     ld1       {v6.2s, v7.2s}, [x0], x2  // Vector load from src[3_0]
    542     ld1       {v8.2s, v9.2s}, [x0], x2  // Vector load from src[4_0]
    543 
    544 loop_8:
    545 
    546     ld1       {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
    547     uaddl     v14.8h, v4.8b, v6.8b
    548     uaddl     v12.8h, v0.8b, v10.8b
    549     uaddl     v16.8h, v2.8b, v8.8b
    550     mla       v12.8h, v14.8h , v26.8h
    551     uaddl     v18.8h, v5.8b, v7.8b
    552     uaddl     v14.8h, v1.8b, v11.8b
    553     uaddl     v22.8h, v3.8b, v9.8b
    554     mla       v14.8h, v18.8h , v26.8h
    555     mls       v12.8h, v16.8h , v24.8h
    556     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[6_0]
    557     uaddl     v16.8h, v6.8b, v8.8b
    558     mls       v14.8h, v22.8h , v24.8h
    559     uaddl     v28.8h, v2.8b, v0.8b
    560     st1       {v12.4s}, [x9], #16       // store row 0 to temp buffer: col 0
    561     ext       v22.16b, v12.16b , v14.16b , #10
    562     uaddl     v18.8h, v4.8b, v10.8b
    563     mla       v28.8h, v16.8h , v26.8h
    564     saddl     v30.4s, v12.4h, v22.4h
    565     st1       {v14.4s}, [x9], x7        // store row 0 to temp buffer: col 1
    566     saddl2    v22.4s, v12.8h, v22.8h
    567     ext       v16.16b, v12.16b , v14.16b , #4
    568     mls       v28.8h, v18.8h , v24.8h
    569     ext       v18.16b, v12.16b , v14.16b , #6
    570     ext       v20.16b, v12.16b , v14.16b , #8
    571     ext       v14.16b, v12.16b , v14.16b , #2
    572     add       v16.8h, v16.8h , v18.8h
    573     add       v18.8h, v14.8h , v20.8h
    574     uaddl     v20.8h, v7.8b, v9.8b
    575     smlal     v30.4s, v16.4h, v26.4h
    576     smlsl     v30.4s, v18.4h, v24.4h
    577     smlal2    v22.4s, v16.8h, v26.8h
    578     smlsl2    v22.4s, v18.8h, v24.8h
    579     uaddl     v14.8h, v3.8b, v1.8b
    580     st1       {v28.4s}, [x9], #16       // store row 1 to temp buffer: col 0
    581     mla       v14.8h, v20.8h , v26.8h
    582     sqrshrun  v12.4h, v30.4s, #10
    583     uaddl     v16.8h, v5.8b, v11.8b
    584     sqrshrun  v13.4h, v22.4s, #10
    585     mls       v14.8h, v16.8h , v24.8h
    586     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[7_0]
    587     uqxtn     v25.8b, v12.8h
    588     uqxtn     v13.8b, v13.8h
    589     mov       v25.s[1], v13.s[0]
    590     uaddl     v16.8h, v8.8b, v10.8b
    591 
    592 
    593     ext       v22.16b, v28.16b , v14.16b , #10
    594     uaddl     v20.8h, v4.8b, v2.8b
    595     saddl     v30.4s, v28.4h, v22.4h
    596     mla       v20.8h, v16.8h , v26.8h
    597     st1       {v14.4s}, [x9], x7        // store row 1 to temp buffer: col 1
    598     saddl2    v22.4s, v28.8h, v22.8h
    599     ext       v16.16b, v28.16b , v14.16b , #4
    600     ext       v18.16b, v28.16b , v14.16b , #6
    601     ext       v12.16b, v28.16b , v14.16b , #8
    602     ext       v14.16b, v28.16b , v14.16b , #2
    603     add       v16.8h, v16.8h , v18.8h
    604     add       v18.8h, v12.8h , v14.8h
    605     ld1       {v14.4s, v15.4s}, [x6], x8 // load row 0 from temp buffer
    606     smlal     v30.4s, v16.4h, v26.4h
    607     smlsl     v30.4s, v18.4h, v24.4h
    608     smlal2    v22.4s, v16.8h, v26.8h
    609     smlsl2    v22.4s, v18.8h, v24.8h
    610     sqrshrun  v14.8b, v14.8h, #0x5
    611     ld1       {v28.4s, v29.4s}, [x6], x8 // load row 1 from temp buffer
    612     uaddl     v18.8h, v6.8b, v0.8b
    613     sqrshrun  v16.4h, v30.4s, #10
    614     sqrshrun  v15.8b, v28.8h, #0x5
    615     sqrshrun  v17.4h, v22.4s, #10
    616 
    617     mov       v12.8b, v25.8b
    618     mov       v25.8b, v24.8b
    619 
    620     uaddl     v28.8h, v9.8b, v11.8b
    621     uqxtn     v13.8b, v16.8h
    622     uqxtn     v17.8b, v17.8h
    623     mov       v13.s[1], v17.s[0]
    624 
    625     urhadd    v12.16b, v12.16b , v14.16b
    626     urhadd    v13.16b, v13.16b , v15.16b
    627     uaddl     v14.8h, v5.8b, v3.8b
    628     uaddl     v22.8h, v7.8b, v1.8b
    629     mls       v20.8h, v18.8h , v24.8h
    630     st1       {v12.2s}, [x1], x3        // store row 0
    631     mla       v14.8h, v28.8h , v26.8h
    632     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[8_0]
    633     uaddl     v30.8h, v10.8b, v0.8b
    634     uaddl     v28.8h, v6.8b, v4.8b
    635     mls       v14.8h, v22.8h , v24.8h
    636     st1       {v13.2s}, [x1], x3        // store row 1
    637     mla       v28.8h, v30.8h , v26.8h
    638     st1       {v20.4s}, [x9], #16       // store row 2 to temp buffer: col 0
    639     ext       v22.16b, v20.16b , v14.16b , #10
    640     saddl     v30.4s, v20.4h, v22.4h
    641     st1       {v14.2s, v15.2s}, [x9], x7 // store row 2 to temp buffer: col 0
    642     saddl2    v22.4s, v20.8h, v22.8h
    643     ext       v16.16b, v20.16b , v14.16b , #4
    644     ext       v18.16b, v20.16b , v14.16b , #6
    645     ext       v12.16b, v20.16b , v14.16b , #8
    646     ext       v14.16b, v20.16b , v14.16b , #2
    647     add       v16.8h, v16.8h , v18.8h
    648     add       v18.8h, v14.8h , v12.8h
    649     uaddl     v20.8h, v8.8b, v2.8b
    650     smlal     v30.4s, v16.4h, v26.4h
    651     smlsl     v30.4s, v18.4h, v24.4h
    652     smlal2    v22.4s, v16.8h, v26.8h
    653     smlsl2    v22.4s, v18.8h, v24.8h
    654     uaddl     v18.8h, v11.8b, v1.8b
    655     uaddl     v16.8h, v7.8b, v5.8b
    656     sqrshrun  v12.4h, v30.4s, #10
    657     uaddl     v30.8h, v9.8b, v3.8b
    658     mla       v16.8h, v18.8h , v26.8h
    659     sqrshrun  v13.4h, v22.4s, #10
    660     mls       v28.8h, v20.8h , v24.8h
    661     ld1       {v14.4s, v15.4s}, [x6], x8 // load row 2 from temp buffer
    662     mls       v16.8h, v30.8h , v24.8h
    663     uqxtn     v27.8b, v12.8h
    664     uqxtn     v13.8b, v13.8h
    665     mov       v27.s[1], v13.s[0]
    666 
    667     sqrshrun  v14.8b, v14.8h, #5
    668     ext       v22.16b, v28.16b , v16.16b , #10
    669     st1       {v28.4s}, [x9], #16       // store row 3 to temp buffer: col 0
    670     saddl     v30.4s, v28.4h, v22.4h
    671     st1       {v16.2s, v17.2s}, [x9], x7 // store row 3 to temp buffer: col 1
    672     saddl2    v22.4s, v28.8h, v22.8h
    673     ext       v12.16b, v28.16b , v16.16b , #4
    674     ext       v18.16b, v28.16b , v16.16b , #6
    675     ext       v20.16b, v28.16b , v16.16b , #8
    676     ext       v28.16b, v28.16b , v16.16b , #2
    677     add       v12.8h, v12.8h , v18.8h
    678     add       v18.8h, v28.8h , v20.8h
    679     ld1       {v16.4s, v17.4s}, [x6], x8 // load row 3 from temp buffer
    680     smlal     v30.4s, v12.4h, v26.4h
    681     smlsl     v30.4s, v18.4h, v24.4h
    682     smlal2    v22.4s, v12.8h, v26.8h
    683     smlsl2    v22.4s, v18.8h, v24.8h
    684     sqrshrun  v15.8b, v16.8h, #0x5
    685 
    686     mov       v12.8b, v27.8b
    687     mov       v27.8b, v26.8b
    688 
    689     sqrshrun  v16.4h, v30.4s, #10
    690 
    691     mov       v6.16b, v2.16b
    692     mov       v7.16b, v3.16b
    693 
    694     sqrshrun  v17.4h, v22.4s, #10
    695 
    696     mov       v2.16b, v10.16b
    697     mov       v3.16b, v11.16b
    698 
    699     mov       v10.16b, v0.16b
    700     mov       v11.16b, v1.16b
    701 
    702     subs      x4, x4, #4
    703     uqxtn     v13.8b, v16.8h
    704     uqxtn     v17.8b, v17.8h
    705     mov       v13.s[1], v17.s[0]
    706     urhadd    v12.16b, v12.16b , v14.16b
    707     urhadd    v13.16b, v13.16b , v15.16b
    708 
    709     mov       v0.16b, v8.16b
    710     mov       v1.16b, v9.16b
    711 
    712     mov       v8.16b, v4.16b
    713     mov       v9.16b, v5.16b
    714 
    715     mov       v4.16b, v10.16b
    716     mov       v5.16b, v11.16b
    717 
    718     st1       {v12.2s}, [x1], x3        // store row 2
    719     st1       {v13.2s}, [x1], x3        // store row 3
    720 
    721     bgt       loop_8                    //if height =8  loop
    722     b         end_func
    723 
    724 loop_4_start:
    725     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[0_0]
    726     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[1_0]
    727     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[2_0]
    728     ld1       {v6.2s, v7.2s}, [x0], x2  // Vector load from src[3_0]
    729     ld1       {v8.2s, v9.2s}, [x0], x2  // Vector load from src[4_0]
    730 
    731 loop_4:
    732     ld1       {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
    733     uaddl     v14.8h, v4.8b, v6.8b      // temp1 = src[2_0] + src[3_0]
    734     uaddl     v12.8h, v0.8b, v10.8b     // temp = src[0_0] + src[5_0]
    735     uaddl     v16.8h, v2.8b, v8.8b      // temp2 = src[1_0] + src[4_0]
    736     mla       v12.8h, v14.8h , v26.8h   // temp += temp1 * 20
    737     uaddl     v18.8h, v5.8b, v7.8b      // temp1 = src[2_0] + src[3_0]
    738     uaddl     v14.8h, v1.8b, v11.8b     // temp = src[0_0] + src[5_0]
    739     uaddl     v22.8h, v3.8b, v9.8b      // temp2 = src[1_0] + src[4_0]
    740     mla       v14.8h, v18.8h , v26.8h   // temp += temp1 * 20
    741     mls       v12.8h, v16.8h , v24.8h   // temp -= temp2 * 5
    742     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[6_0]
    743     uaddl     v16.8h, v6.8b, v8.8b
    744     mls       v14.8h, v22.8h , v24.8h   // temp -= temp2 * 5
    745     //Q6 and Q7 have filtered values
    746     uaddl     v28.8h, v2.8b, v0.8b
    747     st1       {v12.4s}, [x9], #16       // store row 0 to temp buffer: col 0
    748     ext       v22.16b, v12.16b , v14.16b , #10
    749     uaddl     v18.8h, v4.8b, v10.8b
    750     mla       v28.8h, v16.8h , v26.8h
    751     saddl     v30.4s, v12.4h, v22.4h
    752     st1       {v14.4s}, [x9], x7        // store row 0 to temp buffer: col 1
    753     saddl     v22.4s, v13.4h, v23.4h
    754     ext       v16.16b, v12.16b , v14.16b , #4
    755     mls       v28.8h, v18.8h , v24.8h
    756     ext       v18.16b, v12.16b , v14.16b , #6
    757     ext       v20.16b, v12.16b , v14.16b , #8
    758     ext       v14.16b, v12.16b , v14.16b , #2
    759     add       v16.8h, v16.8h , v18.8h
    760     add       v18.8h, v14.8h , v20.8h
    761     uaddl     v20.8h, v7.8b, v9.8b
    762     smlal     v30.4s, v16.4h, v26.4h
    763     smlsl     v30.4s, v18.4h, v24.4h
    764     smlal     v22.4s, v17.4h, v26.4h
    765     smlsl     v22.4s, v19.4h, v24.4h
    766     uaddl     v14.8h, v3.8b, v1.8b
    767     st1       {v28.4s}, [x9], #16       // store row 1 to temp buffer: col 0
    768     mla       v14.8h, v20.8h , v26.8h
    769     sqrshrun  v12.4h, v30.4s, #10
    770     uaddl     v16.8h, v5.8b, v11.8b
    771     sqrshrun  v13.4h, v22.4s, #10
    772     mls       v14.8h, v16.8h , v24.8h
    773     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[7_0]
    774     uqxtn     v25.8b, v12.8h
    775     uaddl     v16.8h, v8.8b, v10.8b
    776 
    777     ext       v22.16b, v28.16b , v14.16b , #10
    778     uaddl     v20.8h, v4.8b, v2.8b
    779     saddl     v30.4s, v28.4h, v22.4h
    780     mla       v20.8h, v16.8h , v26.8h
    781     st1       {v14.4s}, [x9], x7        // store row 1 to temp buffer: col 1
    782     saddl     v22.4s, v29.4h, v23.4h
    783     ext       v16.16b, v28.16b , v14.16b , #4
    784     ext       v18.16b, v28.16b , v14.16b , #6
    785     ext       v12.16b, v28.16b , v14.16b , #8
    786     ext       v14.16b, v28.16b , v14.16b , #2
    787     add       v16.8h, v16.8h , v18.8h
    788     add       v18.8h, v12.8h , v14.8h
    789     ld1       {v14.2s}, [x6], x8        //load row 0 from temp buffer
    790     smlal     v30.4s, v16.4h, v26.4h
    791     smlsl     v30.4s, v18.4h, v24.4h
    792     smlal     v22.4s, v17.4h, v26.4h
    793     smlsl     v22.4s, v19.4h, v24.4h
    794     sqrshrun  v14.8b, v14.8h, #0x5
    795     ld1       {v28.2s}, [x6], x8        //load row 1 from temp buffer
    796     uaddl     v18.8h, v6.8b, v0.8b
    797     sqrshrun  v16.4h, v30.4s, #10
    798     sqrshrun  v15.8b, v28.8h, #0x5
    799     sqrshrun  v17.4h, v22.4s, #10
    800 
    801     mov       v12.8b, v25.8b
    802     mov       v25.8b, v24.8b
    803 
    804     uaddl     v28.8h, v9.8b, v11.8b
    805     uqxtn     v13.8b, v16.8h
    806 
    807     urhadd    v12.16b, v12.16b , v14.16b
    808     urhadd    v13.16b, v13.16b , v15.16b
    809 
    810     uaddl     v14.8h, v5.8b, v3.8b
    811     uaddl     v22.8h, v7.8b, v1.8b
    812     mls       v20.8h, v18.8h , v24.8h
    813     st1       {v12.s}[0], [x1], x3      // store row 0
    814     mla       v14.8h, v28.8h , v26.8h
    815     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[8_0]
    816     uaddl     v30.8h, v10.8b, v0.8b
    817     uaddl     v28.8h, v6.8b, v4.8b
    818     mls       v14.8h, v22.8h , v24.8h
    819     st1       {v13.s}[0], [x1], x3      //store row 1
    820     mla       v28.8h, v30.8h , v26.8h
    821     st1       {v20.4s}, [x9], #16       // store row 2 to temp buffer: col 0
    822     ext       v22.16b, v20.16b , v14.16b , #10
    823     saddl     v30.4s, v20.4h, v22.4h
    824     st1       {v14.4s}, [x9], x7        // store row 2 to temp buffer: col 1
    825     saddl     v22.4s, v21.4h, v23.4h
    826     ext       v16.16b, v20.16b , v14.16b , #4
    827     ext       v18.16b, v20.16b , v14.16b , #6
    828     ext       v12.16b, v20.16b , v14.16b , #8
    829     ext       v14.16b, v20.16b , v14.16b , #2
    830     add       v16.8h, v16.8h , v18.8h
    831     add       v18.8h, v14.8h , v12.8h
    832     uaddl     v20.8h, v8.8b, v2.8b
    833     smlal     v30.4s, v16.4h, v26.4h
    834     smlsl     v30.4s, v18.4h, v24.4h
    835     smlal     v22.4s, v17.4h, v26.4h
    836     smlsl     v22.4s, v19.4h, v24.4h
    837     uaddl     v18.8h, v11.8b, v1.8b
    838     uaddl     v16.8h, v7.8b, v5.8b
    839     sqrshrun  v12.4h, v30.4s, #10
    840     uaddl     v30.8h, v9.8b, v3.8b
    841     mla       v16.8h, v18.8h , v26.8h
    842     sqrshrun  v13.4h, v22.4s, #10
    843     mls       v28.8h, v20.8h , v24.8h
    844     ld1       {v14.2s}, [x6], x8        //load row 3 from temp buffer
    845     mls       v16.8h, v30.8h , v24.8h
    846     uqxtn     v27.8b, v12.8h
    847     sqrshrun  v14.8b, v14.8h, #5
    848     ext       v22.16b, v28.16b , v16.16b , #10
    849     st1       {v28.4s}, [x9], #16       // store row 3 to temp buffer: col 0
    850     saddl     v30.4s, v28.4h, v22.4h
    851     st1       {v16.4s}, [x9], x7        // store row 3 to temp buffer: col 1
    852     saddl     v22.4s, v29.4h, v23.4h
    853     ext       v12.16b, v28.16b , v16.16b , #4
    854     ext       v18.16b, v28.16b , v16.16b , #6
    855     ext       v20.16b, v28.16b , v16.16b , #8
    856     ext       v28.16b, v28.16b , v16.16b , #2
    857     add       v12.8h, v12.8h , v18.8h
    858     add       v18.8h, v28.8h , v20.8h
    859     ld1       {v16.2s}, [x6], x8        //load row 4 from temp buffer
    860     smlal     v30.4s, v12.4h, v26.4h
    861     smlsl     v30.4s, v18.4h, v24.4h
    862     smlal     v22.4s, v13.4h, v26.4h
    863     smlsl     v22.4s, v19.4h, v24.4h
    864     sqrshrun  v15.8b, v16.8h, #0x5
    865 
    866     mov       v12.8b, v27.8b
    867     mov       v27.8b, v26.8b
    868 
    869     sqrshrun  v16.4h, v30.4s, #10
    870 
    871     mov       v6.16b, v2.16b
    872     mov       v7.16b, v3.16b
    873 
    874     sqrshrun  v17.4h, v22.4s, #10
    875 
    876     mov       v2.16b, v10.16b
    877     mov       v3.16b, v11.16b
    878 
    879     mov       v10.16b, v0.16b
    880     mov       v11.16b, v1.16b
    881 
    882     subs      x4, x4, #4
    883     uqxtn     v13.8b, v16.8h
    884     urhadd    v12.16b, v12.16b , v14.16b
    885     urhadd    v13.16b, v13.16b , v15.16b
    886 
    887     mov       v0.16b, v8.16b
    888     mov       v1.16b, v9.16b
    889 
    890     mov       v8.16b, v4.16b
    891     mov       v9.16b, v5.16b
    892 
    893 
    894     mov       v4.16b, v10.16b
    895     mov       v5.16b, v11.16b
    896 
    897 
    898     st1       {v12.s}[0], [x1], x3      // store row 2
    899     st1       {v13.s}[0], [x1], x3      // store row 3
    900 
    901     bgt       loop_4
    902 
    903 end_func:                                // common exit: reached by fall-through once the loop_4 row counter (x4) is no longer > 0
    904     // Legacy note, apparently from a 32-bit ARM version: LDMFD sp!,{x4-x12,PC} -- restoring registers from stack
    905     ldp       x19, x20, [sp], #16        // reload x19 and x20 from [sp], then post-increment sp by 16 bytes
    906     pop_v_regs                           // macro defined outside this view; presumably restores the saved SIMD registers -- confirm against its definition
    907     ret                                  // return to the caller via the link register (x30)
    908 
    909 
    910 
    911