Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 //******************************************************************************
     22 //* @file
     23 //*  ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
     24 //*
     25 //* @brief
     26 //*  Contains function definitions for inter prediction  interpolation.
     27 //*
     28 //* @author
     29 //*  Mohit
     30 //*
     31 //* @par List of Functions:
     32 //*
     33 //*  - ih264_inter_pred_luma_horz_qpel_vert_hpel_av8()
     34 //*
     35 //* @remarks
     36 //*  None
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 
     41 ///* All the functions here are replicated from ih264_inter_pred_filters.c
     42 //
     43 
     44 ///**
     45 ///**
     46 ///**
     47 //*******************************************************************************
     48 //*
     49 //* @brief
     50 //*   This function implements a two stage cascaded six tap filter. It
     51 //*   applies the six tap filter in the vertical direction on the
     52 //*   predictor values, followed by applying the same filter in the
     53 //*   horizontal direction on the output of the first stage. It then averages
     54 //*     the output of the 1st stage and the final stage to obtain the quarter
     55 //*   pel values. The six tap filtering operation is described in sec 8.4.2.2.1
     56 //*   titled "Luma sample interpolation process".
     57 //*
     58 //* @par Description:
     59 //*    This function is called to obtain pixels lying at the following
     60 //*    location (1/4,1/2) or (3/4,1/2). The function interpolates
     61 //*    the predictors first in the vertical direction and then in the
     62 //*    horizontal direction to output the (1/2,1/2). It then averages
     63 //*      the output of the 1st stage and (1/2,1/2) value to obtain (1/4,1/2)
     64 //*       or (3/4,1/2) depending on the offset.
     65 //*
     66 //* @param[in] pu1_src
     67 //*  UWORD8 pointer to the source
     68 //*
     69 //* @param[out] pu1_dst
     70 //*  UWORD8 pointer to the destination
     71 //*
     72 //* @param[in] src_strd
     73 //*  integer source stride
     74 //*
     75 //* @param[in] dst_strd
     76 //*  integer destination stride
     77 //*
     78 //* @param[in] ht
     79 //*  integer height of the array
     80 //*
     81 //* @param[in] wd
     82 //*  integer width of the array
     83 //*
     84 //* @param[in] pu1_tmp: temporary buffer
     85 //*
     86 //* @param[in] dydx: x and y reference offset for qpel calculations
     87 //*
     88 //* @returns
     89 //*
     90 //* @remarks
     91 //*  None
     92 //*
     93 //*******************************************************************************
     94 //*/;
     95 
     96 //void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
     97 //                                UWORD8 *pu1_dst,
     98 //                                WORD32 src_strd,
     99 //                                WORD32 dst_strd,
    100 //                                WORD32 ht,
    101 //                                WORD32 wd,
    102 //                                    UWORD8* pu1_tmp,
    103 //                                  UWORD32 dydx)
    104 
    105 //**************Variables Vs Registers*****************************************
    106 //    x0 => *pu1_src
    107 //    x1 => *pu1_dst
    108 //    w2 =>  src_strd
    109 //    w3 =>  dst_strd
    110 //    w4 =>  ht
    111 //    w5 =>  wd
    112 //    x6 => *pu1_tmp
    113 //    w7 =>  dydx
    114 
    115 .text                               // assemble what follows into the code section
    116 .p2align 2                          // align to a 2^2 = 4-byte boundary
    117 .include "ih264_neon_macros.s"      // shared macro file; presumably defines push_v_regs used below - confirm
    118 
    119 
    120 
    121     .global ih264_inter_pred_luma_horz_qpel_vert_hpel_av8
    122 
    123 ih264_inter_pred_luma_horz_qpel_vert_hpel_av8:
    124 
    125     // STMFD sp!, {x4-x12, x14}          //store register values to stack
    126     push_v_regs
    127     stp       x19, x20, [sp, #-16]!
    128     sxtw      x2, w2
    129     sxtw      x3, w3
    130     sxtw      x4, w4
    131     sxtw      x5, w5
    132 
    133     sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
    134     sub       x0, x0, #2                //pu1_src-2
    135     mov       x9, x6
    136     mov       w6, w7
    137 
    138     and       x6, x6, #2                // dydx & 0x3 followed by dydx>>1 and dydx<<1
    139 
    140     add       x7, x9, #4
    141     add       x6, x7, x6                // pi16_pred1_temp += (x_offset>>1)
    142 
    143     movi      v26.8h, #0x14             // Filter coeff 20 into Q13
    144     movi      v24.8h, #0x5              // Filter coeff 5  into Q12
    145     movi      v27.8h, #0x14             // Filter coeff 20 into Q13
    146     movi      v25.8h, #0x5              // Filter coeff 5  into Q12
    147     mov       x7, #0x20
    148     mov       x8, #0x30
    149     subs      x12, x5, #4               //if wd=4 branch to loop_4
    150     beq       loop_4_start
    151 
    152     subs      x12, x5, #8               //if wd=8 branch to loop_8
    153     beq       loop_8_start
    154 
    155     //when  wd=16
    156     movi      v28.8h, #0x14             // Filter coeff 20 into Q13
    157     movi      v30.8h, #0x5              // Filter coeff 5  into Q12
    158     sub       x2, x2, #16
    159     ld1       {v0.2s, v1.2s}, [x0], #16 // Vector load from src[0_0]
    160     ld1       {v12.2s}, [x0], x2        // Vector load from src[0_0]
    161     ld1       {v2.2s, v3.2s}, [x0], #16 // Vector load from src[1_0]
    162     ld1       {v13.2s}, [x0], x2        // Vector load from src[1_0]
    163     ld1       {v4.2s, v5.2s}, [x0], #16 // Vector load from src[2_0]
    164     ld1       {v14.2s}, [x0], x2        // Vector load from src[2_0]
    165     ld1       {v6.2s, v7.2s}, [x0], #16 // Vector load from src[3_0]
    166     ld1       {v15.2s}, [x0], x2        // Vector load from src[3_0]
    167     ld1       {v8.2s, v9.2s}, [x0], #16 // Vector load from src[4_0]
    168     ld1       {v16.2s}, [x0], x2        // Vector load from src[4_0]
    169 
    170 loop_16:
    171 
    172     ld1       {v10.2s, v11.2s}, [x0], #16 // Vector load from src[5_0]
    173     ld1       {v17.2s}, [x0], x2        // Vector load from src[5_0]
    174 
    175 
    176     uaddl     v20.8h, v4.8b, v6.8b
    177     uaddl     v18.8h, v0.8b, v10.8b
    178     uaddl     v22.8h, v2.8b, v8.8b
    179     mla       v18.8h, v20.8h , v28.8h
    180     uaddl     v24.8h, v5.8b, v7.8b
    181     uaddl     v20.8h, v1.8b, v11.8b
    182     uaddl     v26.8h, v3.8b, v9.8b
    183     mla       v20.8h, v24.8h , v28.8h
    184     uaddl     v24.8h, v14.8b, v15.8b
    185     mls       v18.8h, v22.8h , v30.8h
    186     uaddl     v22.8h, v12.8b, v17.8b
    187     mls       v20.8h, v26.8h , v30.8h
    188     uaddl     v26.8h, v13.8b, v16.8b
    189     mla       v22.8h, v24.8h , v28.8h
    190     mls       v22.8h, v26.8h , v30.8h
    191     st1       {v18.4s }, [x9], #16
    192     st1       {v20.4s}, [x9], #16
    193     ext       v24.16b, v18.16b , v20.16b , #4
    194     ext       v26.16b, v18.16b , v20.16b , #6
    195     st1       {v22.4s}, [x9]
    196     ext       v22.16b, v18.16b , v20.16b , #10
    197     add       v0.8h, v24.8h , v26.8h
    198     ext       v24.16b, v18.16b , v20.16b , #2
    199     ext       v26.16b, v18.16b , v20.16b , #8
    200     add       v24.8h, v24.8h , v26.8h
    201 
    202     saddl     v26.4s, v18.4h, v22.4h
    203     smlal     v26.4s, v0.4h, v28.4h
    204     smlsl     v26.4s, v24.4h, v30.4h
    205 
    206     saddl2    v22.4s, v18.8h, v22.8h
    207     smlal2    v22.4s, v0.8h, v28.8h
    208     smlsl2    v22.4s, v24.8h, v30.8h
    209 
    210     sqrshrun  v18.4h, v26.4s, #10
    211     sqrshrun  v19.4h, v22.4s, #10
    212     ld1       {v22.4s}, [x9], #16
    213 
    214     uqxtn     v18.8b, v18.8h
    215     uqxtn     v19.8b, v19.8h
    216     mov       v18.s[1], v19.s[0]
    217 
    218     ext       v24.16b, v20.16b , v22.16b , #4
    219     ext       v26.16b, v20.16b , v22.16b , #6
    220     ext       v0.16b, v20.16b , v22.16b , #10
    221     st1       {v18.2s}, [x1]
    222     add       v18.8h, v24.8h , v26.8h
    223     ext       v24.16b, v20.16b , v22.16b , #2
    224     ext       v26.16b, v20.16b , v22.16b , #8
    225     add       v24.8h, v24.8h , v26.8h
    226 
    227     saddl     v26.4s, v0.4h, v20.4h
    228     smlal     v26.4s, v18.4h, v28.4h
    229     smlsl     v26.4s, v24.4h, v30.4h
    230 
    231     saddl2    v22.4s, v0.8h, v20.8h
    232     smlal2    v22.4s, v18.8h, v28.8h
    233     smlsl2    v22.4s, v24.8h, v30.8h
    234 
    235     sqrshrun  v19.4h, v26.4s, #10
    236     sqrshrun  v18.4h, v22.4s, #10
    237 
    238     uaddl     v24.8h, v7.8b, v9.8b
    239     ld1       {v20.4s}, [x6], #16
    240     ld1       {v22.4s}, [x6], x7
    241 
    242 
    243     uqxtn     v19.8b, v19.8h
    244     uqxtn     v18.8b, v18.8h
    245     mov       v19.s[1], v18.s[0]
    246 
    247     ld1       {v18.2s}, [x1]
    248     sqrshrun  v20.8b, v20.8h, #5
    249     sqrshrun  v21.8b, v22.8h, #5
    250     uaddl     v22.8h, v4.8b, v10.8b
    251     ld1       {v0.2s, v1.2s}, [x0], #16 // Vector load from src[6_0]
    252     urhadd    v18.16b, v18.16b , v20.16b
    253     urhadd    v19.16b, v19.16b , v21.16b
    254 
    255     ld1       {v12.2s}, [x0], x2        // Vector load from src[6_0]
    256     uaddl     v20.8h, v6.8b, v8.8b
    257     uaddl     v26.8h, v5.8b, v11.8b
    258     st1       {v18.2s, v19.2s}, [x1], x3 // store row 0
    259 
    260 
    261 //ROW_2
    262 
    263 
    264     uaddl     v18.8h, v2.8b, v0.8b
    265 
    266     mla       v18.8h, v20.8h , v28.8h
    267 
    268     uaddl     v20.8h, v3.8b, v1.8b
    269 
    270     mla       v20.8h, v24.8h , v28.8h
    271     uaddl     v24.8h, v15.8b, v16.8b
    272     mls       v18.8h, v22.8h , v30.8h
    273     uaddl     v22.8h, v13.8b, v12.8b
    274     mls       v20.8h, v26.8h , v30.8h
    275     uaddl     v26.8h, v14.8b, v17.8b
    276     mla       v22.8h, v24.8h , v28.8h
    277     mls       v22.8h, v26.8h , v30.8h
    278     st1       {v18.4s}, [x9], #16
    279     st1       {v20.4s}, [x9], #16
    280     ext       v24.16b, v18.16b , v20.16b , #4
    281     ext       v26.16b, v18.16b , v20.16b , #6
    282     st1       {v22.4s}, [x9]
    283     ext       v22.16b, v18.16b , v20.16b , #10
    284     add       v2.8h, v24.8h , v26.8h
    285     ext       v24.16b, v18.16b , v20.16b , #2
    286     ext       v26.16b, v18.16b , v20.16b , #8
    287     add       v24.8h, v24.8h , v26.8h
    288 
    289     saddl     v26.4s, v18.4h, v22.4h
    290     smlal     v26.4s, v2.4h, v28.4h
    291     smlsl     v26.4s, v24.4h, v30.4h
    292 
    293     saddl2    v22.4s, v18.8h, v22.8h
    294     smlal2    v22.4s, v2.8h, v28.8h
    295     smlsl2    v22.4s, v24.8h, v30.8h
    296 
    297     sqrshrun  v18.4h, v26.4s, #10
    298     sqrshrun  v19.4h, v22.4s, #10
    299 
    300     ld1       {v22.4s}, [x9], #16
    301 
    302     uqxtn     v18.8b, v18.8h
    303     uqxtn     v19.8b, v19.8h
    304     mov       v18.s[1], v19.s[0]
    305 
    306     ext       v24.16b, v20.16b , v22.16b , #4
    307     ext       v26.16b, v20.16b , v22.16b , #6
    308     ext       v2.16b, v20.16b , v22.16b , #10
    309     st1       {v18.2s}, [x1]
    310     add       v18.8h, v24.8h , v26.8h
    311     ext       v24.16b, v20.16b , v22.16b , #2
    312     ext       v26.16b, v20.16b , v22.16b , #8
    313     add       v24.8h, v24.8h , v26.8h
    314 
    315     saddl     v26.4s, v2.4h, v20.4h
    316     smlal     v26.4s, v18.4h, v28.4h
    317     smlsl     v26.4s, v24.4h, v30.4h
    318 
    319     saddl2    v22.4s, v2.8h, v20.8h
    320     smlal2    v22.4s, v18.8h, v28.8h
    321     smlsl2    v22.4s, v24.8h, v30.8h
    322 
    323     sqrshrun  v19.4h, v26.4s, #10
    324     sqrshrun  v18.4h, v22.4s, #10
    325     uaddl     v24.8h, v9.8b, v11.8b
    326     ld1       {v20.4s}, [x6], #16
    327     ld1       {v22.4s}, [x6], x7
    328     uqxtn     v19.8b, v19.8h
    329     uqxtn     v18.8b, v18.8h
    330     mov       v19.s[1], v18.s[0]
    331     ld1       {v18.4s}, [x1]
    332     sqrshrun  v20.8b, v20.8h, #5
    333     sqrshrun  v21.8b, v22.8h, #5
    334 
    335     uaddl     v22.8h, v6.8b, v0.8b
    336     ld1       {v2.2s, v3.2s}, [x0], #16 // Vector load from src[7_0]
    337 
    338     urhadd    v18.16b, v18.16b , v20.16b
    339     urhadd    v19.16b, v19.16b , v21.16b
    340     ld1       {v13.2s}, [x0], x2        // Vector load from src[7_0]
    341     uaddl     v20.8h, v8.8b, v10.8b
    342     uaddl     v26.8h, v7.8b, v1.8b
    343     st1       {v18.2s, v19.2s}, [x1], x3 // store row 1
    344 
    345 //ROW_3
    346 
    347 
    348     uaddl     v18.8h, v4.8b, v2.8b
    349 
    350     mla       v18.8h, v20.8h , v28.8h
    351 
    352     uaddl     v20.8h, v5.8b, v3.8b
    353 
    354     mla       v20.8h, v24.8h , v28.8h
    355     uaddl     v24.8h, v16.8b, v17.8b
    356     mls       v18.8h, v22.8h , v30.8h
    357     uaddl     v22.8h, v14.8b, v13.8b
    358     mls       v20.8h, v26.8h , v30.8h
    359     uaddl     v26.8h, v15.8b, v12.8b
    360     mla       v22.8h, v24.8h , v28.8h
    361     mls       v22.8h, v26.8h , v30.8h
    362     st1       {v18.4s}, [x9], #16
    363     st1       {v20.4s}, [x9], #16
    364     ext       v24.16b, v18.16b , v20.16b , #4
    365     ext       v26.16b, v18.16b , v20.16b , #6
    366     st1       {v22.4s}, [x9]
    367     ext       v22.16b, v18.16b , v20.16b , #10
    368     add       v4.8h, v24.8h , v26.8h
    369     ext       v24.16b, v18.16b , v20.16b , #2
    370     ext       v26.16b, v18.16b , v20.16b , #8
    371     add       v24.8h, v24.8h , v26.8h
    372 
    373     saddl     v26.4s, v18.4h, v22.4h
    374     smlal     v26.4s, v4.4h, v28.4h
    375     smlsl     v26.4s, v24.4h, v30.4h
    376 
    377     saddl2    v22.4s, v18.8h, v22.8h
    378     smlal2    v22.4s, v4.8h, v28.8h
    379     smlsl2    v22.4s, v24.8h, v30.8h
    380 
    381     sqrshrun  v18.4h, v26.4s, #10
    382     sqrshrun  v19.4h, v22.4s, #10
    383     ld1       {v22.4s}, [x9], #16
    384 
    385     uqxtn     v18.8b, v18.8h
    386     uqxtn     v19.8b, v19.8h
    387     mov       v18.s[1], v19.s[0]
    388 
    389 
    390     ext       v24.16b, v20.16b , v22.16b , #4
    391     ext       v26.16b, v20.16b , v22.16b , #6
    392     ext       v4.16b, v20.16b , v22.16b , #10
    393     st1       {v18.2s}, [x1]
    394     add       v18.8h, v24.8h , v26.8h
    395     ext       v24.16b, v20.16b , v22.16b , #2
    396     ext       v26.16b, v20.16b , v22.16b , #8
    397     add       v24.8h, v24.8h , v26.8h
    398 
    399     saddl     v26.4s, v4.4h, v20.4h
    400     smlal     v26.4s, v18.4h, v28.4h
    401     smlsl     v26.4s, v24.4h, v30.4h
    402 
    403     saddl2    v22.4s, v4.8h, v20.8h
    404     smlal2    v22.4s, v18.8h, v28.8h
    405     smlsl2    v22.4s, v24.8h, v30.8h
    406 
    407     sqrshrun  v19.4h, v26.4s, #10
    408     sqrshrun  v18.4h, v22.4s, #10
    409 
    410     uaddl     v24.8h, v11.8b, v1.8b
    411     ld1       {v20.4s}, [x6], #16
    412     ld1       {v22.4s}, [x6], x7
    413 
    414     uqxtn     v19.8b, v19.8h
    415     uqxtn     v18.8b, v18.8h
    416     mov       v19.s[1], v18.s[0]
    417 
    418     ld1       {v18.2s}, [x1]
    419     sqrshrun  v20.8b, v20.8h, #5
    420     sqrshrun  v21.8b, v22.8h, #5
    421 
    422     uaddl     v22.8h, v8.8b, v2.8b
    423     ld1       {v4.2s, v5.2s}, [x0], #16 // Vector load from src[8_0]
    424 
    425     urhadd    v18.16b, v18.16b , v20.16b
    426     urhadd    v19.16b, v19.16b , v21.16b
    427     ld1       {v14.2s}, [x0], x2        // Vector load from src[8_0]
    428     uaddl     v20.8h, v10.8b, v0.8b
    429     uaddl     v26.8h, v9.8b, v3.8b
    430     st1       {v18.2s, v19.2s}, [x1], x3 // store row 2
    431 
    432 
    433 //ROW_4
    434 
    435     uaddl     v18.8h, v6.8b, v4.8b
    436 
    437     mla       v18.8h, v20.8h , v28.8h
    438 
    439     uaddl     v20.8h, v7.8b, v5.8b
    440 
    441     mla       v20.8h, v24.8h , v28.8h
    442     uaddl     v24.8h, v17.8b, v12.8b
    443     mls       v18.8h, v22.8h , v30.8h
    444     uaddl     v22.8h, v15.8b, v14.8b
    445     mls       v20.8h, v26.8h , v30.8h
    446     uaddl     v26.8h, v16.8b, v13.8b
    447     mla       v22.8h, v24.8h , v28.8h
    448     mls       v22.8h, v26.8h , v30.8h
    449     st1       {v18.4s}, [x9], #16
    450     st1       {v20.4s}, [x9], #16
    451     ext       v24.16b, v18.16b , v20.16b , #4
    452     ext       v26.16b, v18.16b , v20.16b , #6
    453     st1       {v22.4s}, [x9]
    454     ext       v22.16b, v18.16b , v20.16b , #10
    455     add       v6.8h, v24.8h , v26.8h
    456     ext       v24.16b, v18.16b , v20.16b , #2
    457     ext       v26.16b, v18.16b , v20.16b , #8
    458     add       v24.8h, v24.8h , v26.8h
    459 
    460     saddl     v26.4s, v18.4h, v22.4h
    461     smlal     v26.4s, v6.4h, v28.4h
    462     smlsl     v26.4s, v24.4h, v30.4h
    463 
    464     saddl2    v22.4s, v18.8h, v22.8h
    465     smlal2    v22.4s, v6.8h, v28.8h
    466     smlsl2    v22.4s, v24.8h, v30.8h
    467 
    468     sqrshrun  v18.4h, v26.4s, #10
    469     sqrshrun  v19.4h, v22.4s, #10
    470     ld1       {v22.4s}, [x9], #16
    471     uqxtn     v18.8b, v18.8h
    472     uqxtn     v19.8b, v19.8h
    473     mov       v18.s[1], v19.s[0]
    474 
    475 
    476     ext       v24.16b, v20.16b , v22.16b , #4
    477     ext       v26.16b, v20.16b , v22.16b , #6
    478     ext       v6.16b, v20.16b , v22.16b , #10
    479     st1       {v18.2s}, [x1]
    480     add       v18.8h, v24.8h , v26.8h
    481     ext       v24.16b, v20.16b , v22.16b , #2
    482     ext       v26.16b, v20.16b , v22.16b , #8
    483     add       v24.8h, v24.8h , v26.8h
    484 
    485     saddl     v26.4s, v6.4h, v20.4h
    486     smlal     v26.4s, v18.4h, v28.4h
    487     smlsl     v26.4s, v24.4h, v30.4h
    488 
    489     saddl2    v22.4s, v6.8h, v20.8h
    490     smlal2    v22.4s, v18.8h, v28.8h
    491     smlsl2    v22.4s, v24.8h, v30.8h
    492 
    493     mov       v6.16b, v2.16b
    494     mov       v7.16b, v3.16b
    495 
    496     mov       v2.16b, v10.16b
    497     mov       v3.16b, v11.16b
    498 
    499     subs      x4, x4, #4
    500     sqrshrun  v19.4h, v26.4s, #10
    501     sqrshrun  v18.4h, v22.4s, #10
    502     mov       v10.16b, v0.16b
    503     mov       v11.16b, v1.16b
    504 
    505     mov       v24.8b, v14.8b
    506 
    507     mov       v14.16b, v12.16b
    508     mov       v15.16b, v13.16b
    509 
    510 
    511     uqxtn     v19.8b, v19.8h
    512     uqxtn     v18.8b, v18.8h
    513     mov       v19.s[1], v18.s[0]
    514 
    515     ld1       {v20.4s}, [x6], #16
    516     ld1       {v22.4s}, [x6], x7
    517     ld1       {v18.2s}, [x1]
    518     sqrshrun  v20.8b, v20.8h, #5
    519     sqrshrun  v21.8b, v22.8h, #5
    520 
    521     mov       v0.16b, v8.16b
    522     mov       v1.16b, v9.16b
    523 
    524     mov       v8.16b, v4.16b
    525     mov       v9.16b, v5.16b
    526 
    527     mov       v12.16b, v16.16b
    528     mov       v13.16b, v17.16b
    529     urhadd    v18.16b, v18.16b , v20.16b
    530     urhadd    v19.16b, v19.16b , v21.16b
    531 
    532     mov       v4.16b, v10.16b
    533     mov       v5.16b, v11.16b
    534 
    535     mov       v16.8b, v24.8b
    536     st1       {v18.2s, v19.2s}, [x1], x3 // store row 3
    537 
    538     bgt       loop_16                   // looping if height =16
    539     b         end_func
    540 
    541 loop_8_start:
    542     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[0_0]
    543     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[1_0]
    544     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[2_0]
    545     ld1       {v6.2s, v7.2s}, [x0], x2  // Vector load from src[3_0]
    546     ld1       {v8.2s, v9.2s}, [x0], x2  // Vector load from src[4_0]
    547 
    548 loop_8:
    549 
    550     ld1       {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
    551     uaddl     v14.8h, v4.8b, v6.8b
    552     uaddl     v12.8h, v0.8b, v10.8b
    553     uaddl     v16.8h, v2.8b, v8.8b
    554     mla       v12.8h, v14.8h , v26.8h
    555     uaddl     v18.8h, v5.8b, v7.8b
    556     uaddl     v14.8h, v1.8b, v11.8b
    557     uaddl     v22.8h, v3.8b, v9.8b
    558     mla       v14.8h, v18.8h , v26.8h
    559     mls       v12.8h, v16.8h , v24.8h
    560     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[6_0]
    561     uaddl     v16.8h, v6.8b, v8.8b
    562     mls       v14.8h, v22.8h , v24.8h
    563     uaddl     v28.8h, v2.8b, v0.8b
    564     st1       {v12.4s}, [x9], #16       // store row 0 to temp buffer: col 0
    565     ext       v22.16b, v12.16b , v14.16b , #10
    566     uaddl     v18.8h, v4.8b, v10.8b
    567     mla       v28.8h, v16.8h , v26.8h
    568     saddl     v30.4s, v12.4h, v22.4h
    569     st1       {v14.4s}, [x9], x7        // store row 0 to temp buffer: col 1
    570     saddl2    v22.4s, v12.8h, v22.8h
    571     ext       v16.16b, v12.16b , v14.16b , #4
    572     mls       v28.8h, v18.8h , v24.8h
    573     ext       v18.16b, v12.16b , v14.16b , #6
    574     ext       v20.16b, v12.16b , v14.16b , #8
    575     ext       v14.16b, v12.16b , v14.16b , #2
    576     add       v16.8h, v16.8h , v18.8h
    577     add       v18.8h, v14.8h , v20.8h
    578     uaddl     v20.8h, v7.8b, v9.8b
    579     smlal     v30.4s, v16.4h, v26.4h
    580     smlsl     v30.4s, v18.4h, v24.4h
    581     smlal2    v22.4s, v16.8h, v26.8h
    582     smlsl2    v22.4s, v18.8h, v24.8h
    583     uaddl     v14.8h, v3.8b, v1.8b
    584     st1       {v28.4s}, [x9], #16       // store row 1 to temp buffer: col 0
    585     mla       v14.8h, v20.8h , v26.8h
    586     sqrshrun  v12.4h, v30.4s, #10
    587     uaddl     v16.8h, v5.8b, v11.8b
    588     sqrshrun  v13.4h, v22.4s, #10
    589     mls       v14.8h, v16.8h , v24.8h
    590     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[7_0]
    591     uqxtn     v25.8b, v12.8h
    592     uqxtn     v13.8b, v13.8h
    593     mov       v25.s[1], v13.s[0]
    594     uaddl     v16.8h, v8.8b, v10.8b
    595 
    596 
    597     ext       v22.16b, v28.16b , v14.16b , #10
    598     uaddl     v20.8h, v4.8b, v2.8b
    599     saddl     v30.4s, v28.4h, v22.4h
    600     mla       v20.8h, v16.8h , v26.8h
    601     st1       {v14.4s}, [x9], x7        // store row 1 to temp buffer: col 1
    602     saddl2    v22.4s, v28.8h, v22.8h
    603     ext       v16.16b, v28.16b , v14.16b , #4
    604     ext       v18.16b, v28.16b , v14.16b , #6
    605     ext       v12.16b, v28.16b , v14.16b , #8
    606     ext       v14.16b, v28.16b , v14.16b , #2
    607     add       v16.8h, v16.8h , v18.8h
    608     add       v18.8h, v12.8h , v14.8h
    609     ld1       {v14.4s, v15.4s}, [x6], x8 // load row 0 from temp buffer
    610     smlal     v30.4s, v16.4h, v26.4h
    611     smlsl     v30.4s, v18.4h, v24.4h
    612     smlal2    v22.4s, v16.8h, v26.8h
    613     smlsl2    v22.4s, v18.8h, v24.8h
    614     sqrshrun  v14.8b, v14.8h, #0x5
    615     ld1       {v28.4s, v29.4s}, [x6], x8 // load row 1 from temp buffer
    616     uaddl     v18.8h, v6.8b, v0.8b
    617     sqrshrun  v16.4h, v30.4s, #10
    618     sqrshrun  v15.8b, v28.8h, #0x5
    619     sqrshrun  v17.4h, v22.4s, #10
    620 
    621     mov       v12.8b, v25.8b
    622     mov       v25.8b, v24.8b
    623 
    624     uaddl     v28.8h, v9.8b, v11.8b
    625     uqxtn     v13.8b, v16.8h
    626     uqxtn     v17.8b, v17.8h
    627     mov       v13.s[1], v17.s[0]
    628 
    629     urhadd    v12.16b, v12.16b , v14.16b
    630     urhadd    v13.16b, v13.16b , v15.16b
    631     uaddl     v14.8h, v5.8b, v3.8b
    632     uaddl     v22.8h, v7.8b, v1.8b
    633     mls       v20.8h, v18.8h , v24.8h
    634     st1       {v12.2s}, [x1], x3        // store row 0
    635     mla       v14.8h, v28.8h , v26.8h
    636     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[8_0]
    637     uaddl     v30.8h, v10.8b, v0.8b
    638     uaddl     v28.8h, v6.8b, v4.8b
    639     mls       v14.8h, v22.8h , v24.8h
    640     st1       {v13.2s}, [x1], x3        // store row 1
    641     mla       v28.8h, v30.8h , v26.8h
    642     st1       {v20.4s}, [x9], #16       // store row 2 to temp buffer: col 0
    643     ext       v22.16b, v20.16b , v14.16b , #10
    644     saddl     v30.4s, v20.4h, v22.4h
    645     st1       {v14.2s, v15.2s}, [x9], x7 // store row 2 to temp buffer: col 0
    646     saddl2    v22.4s, v20.8h, v22.8h
    647     ext       v16.16b, v20.16b , v14.16b , #4
    648     ext       v18.16b, v20.16b , v14.16b , #6
    649     ext       v12.16b, v20.16b , v14.16b , #8
    650     ext       v14.16b, v20.16b , v14.16b , #2
    651     add       v16.8h, v16.8h , v18.8h
    652     add       v18.8h, v14.8h , v12.8h
    653     uaddl     v20.8h, v8.8b, v2.8b
    654     smlal     v30.4s, v16.4h, v26.4h
    655     smlsl     v30.4s, v18.4h, v24.4h
    656     smlal2    v22.4s, v16.8h, v26.8h
    657     smlsl2    v22.4s, v18.8h, v24.8h
    658     uaddl     v18.8h, v11.8b, v1.8b
    659     uaddl     v16.8h, v7.8b, v5.8b
    660     sqrshrun  v12.4h, v30.4s, #10
    661     uaddl     v30.8h, v9.8b, v3.8b
    662     mla       v16.8h, v18.8h , v26.8h
    663     sqrshrun  v13.4h, v22.4s, #10
    664     mls       v28.8h, v20.8h , v24.8h
    665     ld1       {v14.4s, v15.4s}, [x6], x8 // load row 2 from temp buffer
    666     mls       v16.8h, v30.8h , v24.8h
    667     uqxtn     v27.8b, v12.8h
    668     uqxtn     v13.8b, v13.8h
    669     mov       v27.s[1], v13.s[0]
    670 
    671     sqrshrun  v14.8b, v14.8h, #5
    672     ext       v22.16b, v28.16b , v16.16b , #10
    673     st1       {v28.4s}, [x9], #16       // store row 3 to temp buffer: col 0
    674     saddl     v30.4s, v28.4h, v22.4h
    675     st1       {v16.2s, v17.2s}, [x9], x7 // store row 3 to temp buffer: col 1
    676     saddl2    v22.4s, v28.8h, v22.8h
    677     ext       v12.16b, v28.16b , v16.16b , #4
    678     ext       v18.16b, v28.16b , v16.16b , #6
    679     ext       v20.16b, v28.16b , v16.16b , #8
    680     ext       v28.16b, v28.16b , v16.16b , #2
    681     add       v12.8h, v12.8h , v18.8h
    682     add       v18.8h, v28.8h , v20.8h
    683     ld1       {v16.4s, v17.4s}, [x6], x8 // load row 3 from temp buffer
    684     smlal     v30.4s, v12.4h, v26.4h
    685     smlsl     v30.4s, v18.4h, v24.4h
    686     smlal2    v22.4s, v12.8h, v26.8h
    687     smlsl2    v22.4s, v18.8h, v24.8h
    688     sqrshrun  v15.8b, v16.8h, #0x5
    689 
    690     mov       v12.8b, v27.8b
    691     mov       v27.8b, v26.8b
    692 
    693     sqrshrun  v16.4h, v30.4s, #10
    694 
    695     mov       v6.16b, v2.16b
    696     mov       v7.16b, v3.16b
    697 
    698     sqrshrun  v17.4h, v22.4s, #10
    699 
    700     mov       v2.16b, v10.16b
    701     mov       v3.16b, v11.16b
    702 
    703     mov       v10.16b, v0.16b
    704     mov       v11.16b, v1.16b
    705 
    706     subs      x4, x4, #4
    707     uqxtn     v13.8b, v16.8h
    708     uqxtn     v17.8b, v17.8h
    709     mov       v13.s[1], v17.s[0]
    710     urhadd    v12.16b, v12.16b , v14.16b
    711     urhadd    v13.16b, v13.16b , v15.16b
    712 
    713     mov       v0.16b, v8.16b
    714     mov       v1.16b, v9.16b
    715 
    716     mov       v8.16b, v4.16b
    717     mov       v9.16b, v5.16b
    718 
    719     mov       v4.16b, v10.16b
    720     mov       v5.16b, v11.16b
    721 
    722     st1       {v12.2s}, [x1], x3        // store row 2
    723     st1       {v13.2s}, [x1], x3        // store row 3
    724 
    725     bgt       loop_8                    //if height =8  loop
    726     b         end_func
    727 
    728 loop_4_start:
    729     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[0_0]
    730     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[1_0]
    731     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[2_0]
    732     ld1       {v6.2s, v7.2s}, [x0], x2  // Vector load from src[3_0]
    733     ld1       {v8.2s, v9.2s}, [x0], x2  // Vector load from src[4_0]
    734 
    735 loop_4:
    736     ld1       {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]
    737     uaddl     v14.8h, v4.8b, v6.8b      // temp1 = src[2_0] + src[3_0]
    738     uaddl     v12.8h, v0.8b, v10.8b     // temp = src[0_0] + src[5_0]
    739     uaddl     v16.8h, v2.8b, v8.8b      // temp2 = src[1_0] + src[4_0]
    740     mla       v12.8h, v14.8h , v26.8h   // temp += temp1 * 20
    741     uaddl     v18.8h, v5.8b, v7.8b      // temp1 = src[2_0] + src[3_0]
    742     uaddl     v14.8h, v1.8b, v11.8b     // temp = src[0_0] + src[5_0]
    743     uaddl     v22.8h, v3.8b, v9.8b      // temp2 = src[1_0] + src[4_0]
    744     mla       v14.8h, v18.8h , v26.8h   // temp += temp1 * 20
    745     mls       v12.8h, v16.8h , v24.8h   // temp -= temp2 * 5
    746     ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[6_0]
    747     uaddl     v16.8h, v6.8b, v8.8b
    748     mls       v14.8h, v22.8h , v24.8h   // temp -= temp2 * 5
    749     //Q6 and Q7 have filtered values
    750     uaddl     v28.8h, v2.8b, v0.8b
    751     st1       {v12.4s}, [x9], #16       // store row 0 to temp buffer: col 0
    752     ext       v22.16b, v12.16b , v14.16b , #10
    753     uaddl     v18.8h, v4.8b, v10.8b
    754     mla       v28.8h, v16.8h , v26.8h
    755     saddl     v30.4s, v12.4h, v22.4h
    756     st1       {v14.4s}, [x9], x7        // store row 0 to temp buffer: col 1
    757     saddl     v22.4s, v13.4h, v23.4h
    758     ext       v16.16b, v12.16b , v14.16b , #4
    759     mls       v28.8h, v18.8h , v24.8h
    760     ext       v18.16b, v12.16b , v14.16b , #6
    761     ext       v20.16b, v12.16b , v14.16b , #8
    762     ext       v14.16b, v12.16b , v14.16b , #2
    763     add       v16.8h, v16.8h , v18.8h
    764     add       v18.8h, v14.8h , v20.8h
    765     uaddl     v20.8h, v7.8b, v9.8b
    766     smlal     v30.4s, v16.4h, v26.4h
    767     smlsl     v30.4s, v18.4h, v24.4h
    768     smlal     v22.4s, v17.4h, v26.4h
    769     smlsl     v22.4s, v19.4h, v24.4h
    770     uaddl     v14.8h, v3.8b, v1.8b
    771     st1       {v28.4s}, [x9], #16       // store row 1 to temp buffer: col 0
    772     mla       v14.8h, v20.8h , v26.8h
    773     sqrshrun  v12.4h, v30.4s, #10
    774     uaddl     v16.8h, v5.8b, v11.8b
    775     sqrshrun  v13.4h, v22.4s, #10
    776     mls       v14.8h, v16.8h , v24.8h
    777     ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[7_0]
    778     uqxtn     v25.8b, v12.8h
    779     uaddl     v16.8h, v8.8b, v10.8b
    780 
    781     ext       v22.16b, v28.16b , v14.16b , #10
    782     uaddl     v20.8h, v4.8b, v2.8b
    783     saddl     v30.4s, v28.4h, v22.4h
    784     mla       v20.8h, v16.8h , v26.8h
    785     st1       {v14.4s}, [x9], x7        // store row 1 to temp buffer: col 1
    786     saddl     v22.4s, v29.4h, v23.4h
    787     ext       v16.16b, v28.16b , v14.16b , #4
    788     ext       v18.16b, v28.16b , v14.16b , #6
    789     ext       v12.16b, v28.16b , v14.16b , #8
    790     ext       v14.16b, v28.16b , v14.16b , #2
    791     add       v16.8h, v16.8h , v18.8h
    792     add       v18.8h, v12.8h , v14.8h
    793     ld1       {v14.2s}, [x6], x8        //load row 0 from temp buffer
    794     smlal     v30.4s, v16.4h, v26.4h
    795     smlsl     v30.4s, v18.4h, v24.4h
    796     smlal     v22.4s, v17.4h, v26.4h
    797     smlsl     v22.4s, v19.4h, v24.4h
    798     sqrshrun  v14.8b, v14.8h, #0x5
    799     ld1       {v28.2s}, [x6], x8        //load row 1 from temp buffer
    800     uaddl     v18.8h, v6.8b, v0.8b
    801     sqrshrun  v16.4h, v30.4s, #10
    802     sqrshrun  v15.8b, v28.8h, #0x5
    803     sqrshrun  v17.4h, v22.4s, #10
    804 
    805     mov       v12.8b, v25.8b
    806     mov       v25.8b, v24.8b
    807 
    808     uaddl     v28.8h, v9.8b, v11.8b
    809     uqxtn     v13.8b, v16.8h
    810 
    811     urhadd    v12.16b, v12.16b , v14.16b
    812     urhadd    v13.16b, v13.16b , v15.16b
    813 
    814     uaddl     v14.8h, v5.8b, v3.8b
    815     uaddl     v22.8h, v7.8b, v1.8b
    816     mls       v20.8h, v18.8h , v24.8h
    817     st1       {v12.s}[0], [x1], x3      // store row 0
    818     mla       v14.8h, v28.8h , v26.8h
    819     ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[8_0]
    820     uaddl     v30.8h, v10.8b, v0.8b
    821     uaddl     v28.8h, v6.8b, v4.8b
    822     mls       v14.8h, v22.8h , v24.8h
    823     st1       {v13.s}[0], [x1], x3      //store row 1
    824     mla       v28.8h, v30.8h , v26.8h
    825     st1       {v20.4s}, [x9], #16       // store row 2 to temp buffer: col 0
    826     ext       v22.16b, v20.16b , v14.16b , #10
    827     saddl     v30.4s, v20.4h, v22.4h
    828     st1       {v14.4s}, [x9], x7        // store row 2 to temp buffer: col 1
    829     saddl     v22.4s, v21.4h, v23.4h
    830     ext       v16.16b, v20.16b , v14.16b , #4
    831     ext       v18.16b, v20.16b , v14.16b , #6
    832     ext       v12.16b, v20.16b , v14.16b , #8
    833     ext       v14.16b, v20.16b , v14.16b , #2
    834     add       v16.8h, v16.8h , v18.8h
    835     add       v18.8h, v14.8h , v12.8h
    836     uaddl     v20.8h, v8.8b, v2.8b
    837     smlal     v30.4s, v16.4h, v26.4h
    838     smlsl     v30.4s, v18.4h, v24.4h
    839     smlal     v22.4s, v17.4h, v26.4h
    840     smlsl     v22.4s, v19.4h, v24.4h
    841     uaddl     v18.8h, v11.8b, v1.8b
    842     uaddl     v16.8h, v7.8b, v5.8b
    843     sqrshrun  v12.4h, v30.4s, #10
    844     uaddl     v30.8h, v9.8b, v3.8b
    845     mla       v16.8h, v18.8h , v26.8h
    846     sqrshrun  v13.4h, v22.4s, #10
    847     mls       v28.8h, v20.8h , v24.8h
    848     ld1       {v14.2s}, [x6], x8        //load row 3 from temp buffer
    849     mls       v16.8h, v30.8h , v24.8h
    850     uqxtn     v27.8b, v12.8h
    851     sqrshrun  v14.8b, v14.8h, #5
    852     ext       v22.16b, v28.16b , v16.16b , #10
    853     st1       {v28.4s}, [x9], #16       // store row 3 to temp buffer: col 0
    854     saddl     v30.4s, v28.4h, v22.4h
    855     st1       {v16.4s}, [x9], x7        // store row 3 to temp buffer: col 1
    856     saddl     v22.4s, v29.4h, v23.4h
    857     ext       v12.16b, v28.16b , v16.16b , #4
    858     ext       v18.16b, v28.16b , v16.16b , #6
    859     ext       v20.16b, v28.16b , v16.16b , #8
    860     ext       v28.16b, v28.16b , v16.16b , #2
    861     add       v12.8h, v12.8h , v18.8h
    862     add       v18.8h, v28.8h , v20.8h
    863     ld1       {v16.2s}, [x6], x8        //load row 4 from temp buffer
    864     smlal     v30.4s, v12.4h, v26.4h
    865     smlsl     v30.4s, v18.4h, v24.4h
    866     smlal     v22.4s, v13.4h, v26.4h
    867     smlsl     v22.4s, v19.4h, v24.4h
    868     sqrshrun  v15.8b, v16.8h, #0x5
    869 
    870     mov       v12.8b, v27.8b
    871     mov       v27.8b, v26.8b
    872 
    873     sqrshrun  v16.4h, v30.4s, #10
    874 
    875     mov       v6.16b, v2.16b
    876     mov       v7.16b, v3.16b
    877 
    878     sqrshrun  v17.4h, v22.4s, #10
    879 
    880     mov       v2.16b, v10.16b
    881     mov       v3.16b, v11.16b
    882 
    883     mov       v10.16b, v0.16b
    884     mov       v11.16b, v1.16b
    885 
    886     subs      x4, x4, #4
    887     uqxtn     v13.8b, v16.8h
    888     urhadd    v12.16b, v12.16b , v14.16b
    889     urhadd    v13.16b, v13.16b , v15.16b
    890 
    891     mov       v0.16b, v8.16b
    892     mov       v1.16b, v9.16b
    893 
    894     mov       v8.16b, v4.16b
    895     mov       v9.16b, v5.16b
    896 
    897 
    898     mov       v4.16b, v10.16b
    899     mov       v5.16b, v11.16b
    900 
    901 
    902     st1       {v12.s}[0], [x1], x3      // store row 2
    903     st1       {v13.s}[0], [x1], x3      // store row 3
    904 
    905     bgt       loop_4
    906 
    907 end_func:                             // function exit: reached by fall-through once the loop_4 branch above is not taken
    908     // Legacy (commented-out) ARM-style epilogue kept for reference: LDMFD sp!,{x4-x12,PC} -- restoring registers from stack
    909     ldp       x19, x20, [sp], #16     // reload x19/x20 from the top of the stack, then post-increment sp by 16 bytes
    910     pop_v_regs                        // macro defined outside this chunk; presumably restores the callee-saved SIMD registers (v8-v15 are used above) -- TODO confirm against its definition
    911     ret                               // return to the caller through the link register (x30)
    912 
    913 
    914 
    915