Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 //******************************************************************************
     22 //* @file
     23 //*  ih264_inter_pred_luma_horz_qpel_av8.s
     24 //*
     25 //* @brief
     26 //*  Contains function definitions for inter prediction horizontal quarter pel interpolation.
     27 //*
     28 //* @author
     29 //*  Mohit
     30 //*
     31 //* @par List of Functions:
     32 //*
     33 //*  - ih264_inter_pred_luma_horz_qpel_av8()
     34 //*
     35 //* @remarks
     36 //*  None
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 
     41 ///* All the functions here are replicated from ih264_inter_pred_filters.c
     42 //
     43 
     44 ///**
     45 ///**
     46 //*******************************************************************************
     47 //*
     48 //* @brief
     49 //*     Quarter pel interprediction luma filter for horizontal input
     50 //*
     51 //* @par Description:
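     52 //* Applies a 6-tap horizontal filter, clips the output to 8 bits, and then takes the rounded average with the source sample at pu1_src + (x_offset >> 1), where x_offset = dydx & 3.
     53 //* See sec 8.4.2.2.1 titled "Luma sample interpolation process"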
     54 //*
     55 //* @param[in] pu1_src
     56 //*  UWORD8 pointer to the source
     57 //*
     58 //* @param[out] pu1_dst
     59 //*  UWORD8 pointer to the destination
     60 //*
     61 //* @param[in] src_strd
     62 //*  integer source stride
     63 //*
     64 //* @param[in] dst_strd
     65 //*  integer destination stride
     66 //*
     67 //* @param[in] ht
     68 //*  integer height of the array
     69 //*
     70 //* @param[in] wd
     71 //*  integer width of the array
     72 //*
     73 //* @param[in] pu1_tmp: temporary buffer; UNUSED in this function
     74 //*
     75 //* @param[in] dydx: x and y reference offset for qpel calculations; only the x offset (dydx & 3) is used by this function.
     76 //* @returns
     77 //*
     78 //* @remarks
     79 //*  None
     80 //*
     81 //*******************************************************************************
     82 //*/
     83 
     84 //void ih264_inter_pred_luma_horz_qpel_av8 (
     85 //                            UWORD8 *pu1_src,
     86 //                            UWORD8 *pu1_dst,
     87 //                            WORD32 src_strd,
     88 //                            WORD32 dst_strd,
     89 //                            WORD32 ht,
     90 //                            WORD32 wd,
     91 //                            UWORD8 *pu1_tmp,
     92 //                            UWORD32 dydx)
     93 
     94 //**************Variables Vs Registers*****************************************
     95 //    x0 => *pu1_src
     96 //    x1 => *pu1_dst
     97 //    w2 =>  src_strd
     98 //    w3 =>  dst_strd
     99 //    w4 =>  ht
    100 //    w5 =>  wd
    101 //    w7 =>  dydx
    102 
    103 .text                                   // assemble what follows into the code section
    104 .p2align 2                              // align to 2^2 = 4 bytes
    105 .include "ih264_neon_macros.s"          // shared macro file; push_v_regs (used at function entry) is presumably defined there - confirm
    106 
    107 
    108 
    109 
    110     .global ih264_inter_pred_luma_horz_qpel_av8
    111 
    112 ih264_inter_pred_luma_horz_qpel_av8:
    113 
    114 
    115     push_v_regs
    116     stp       x19, x20, [sp, #-16]!
    117     sxtw      x2, w2
    118     sxtw      x3, w3
    119     sxtw      x4, w4
    120     sxtw      x5, w5
    121 
    122 
    123     and       x7, x7, #3                //Finds x-offset
    124     add       x7, x0, x7, lsr #1        //pu1_src + (x_offset>>1)
    125     sub       x0, x0, #2                //pu1_src-2
    126     sub       x14, x4, #16
    127     movi      v0.16b, #5                //filter coeff
    128     subs      x12, x5, #8               //if wd=8 branch to loop_8
    129     movi      v1.16b, #20               //filter coeff
    130 
    131     beq       loop_8
    132 
    133     subs      x12, x5, #4               //if wd=4 branch to loop_4
    134     beq       loop_4
    135 
    136 loop_16:                                //when  wd=16
    137     //// Processing row0 and row1
    138     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0
    139     add       x14, x14, #1              //for checking loop
    140     ext       v31.8b, v2.8b , v3.8b , #5
    141     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1
    142     ext       v30.8b, v3.8b , v4.8b , #5
    143     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row0)
    144     ext       v28.8b, v5.8b , v6.8b , #5
    145     uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row0)
    146     ext       v27.8b, v6.8b , v7.8b , #5
    147     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row1)
    148     ext       v31.8b, v2.8b , v3.8b , #2
    149     uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row1)
    150     ext       v30.8b, v3.8b , v4.8b , #2
    151     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    152     ext       v28.8b, v5.8b , v6.8b , #2
    153     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row0)
    154     ext       v27.8b, v6.8b , v7.8b , #2
    155     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row1)
    156     ext       v31.8b, v2.8b , v3.8b , #3
    157     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row1)
    158     ext       v30.8b, v3.8b , v4.8b , #3
    159     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    160     ext       v28.8b, v5.8b , v6.8b , #3
    161     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row0)
    162     ext       v27.8b, v6.8b , v7.8b , #3
    163     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row1)
    164     ext       v31.8b, v2.8b , v3.8b , #1
    165     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row1)
    166     ext       v30.8b, v3.8b , v4.8b , #1
    167     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    168     ext       v28.8b, v5.8b , v6.8b , #1
    169     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row0)
    170     ext       v27.8b, v6.8b , v7.8b , #1
    171     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
    172     ext       v31.8b, v2.8b , v3.8b , #4
    173     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row1)
    174     ext       v30.8b, v3.8b , v4.8b , #4
    175     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    176     ext       v28.8b, v5.8b , v6.8b , #4
    177     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row0)
    178     ext       v27.8b, v6.8b , v7.8b , #4
    179     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
    180     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row2
    181     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row1)
    182 
    183     ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (column1,row0)
    184     sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    185     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row3
    186     sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    187     ext       v31.8b, v2.8b , v3.8b , #5
    188     urhadd    v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
    189     urhadd    v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
    190 
    191     sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
    192     st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row0
    193     ext       v30.8b, v3.8b , v4.8b , #5
    194     sqrshrun  v19.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row1)
    195 
    196 
    197 
    198 //// Processing row2 and row3
    199     ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (column1,row1)
    200     ext       v28.8b, v5.8b , v6.8b , #5
    201     urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
    202     urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
    203 
    204     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row2)
    205     st1       {v18.8b, v19.8b}, [x1], x3 ////Store dest row1
    206     uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row2)
    207     ext       v27.8b, v6.8b , v7.8b , #5
    208     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row3)
    209     ext       v31.8b, v2.8b , v3.8b , #2
    210     uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row3)
    211     ext       v30.8b, v3.8b , v4.8b , #2
    212     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row2)
    213     ext       v27.8b, v6.8b , v7.8b , #2
    214     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row2)
    215     ext       v28.8b, v5.8b , v6.8b , #2
    216     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row3)
    217     ext       v31.8b, v2.8b , v3.8b , #3
    218     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row3)
    219     ext       v30.8b, v3.8b , v4.8b , #3
    220     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row2)
    221     ext       v28.8b, v5.8b , v6.8b , #3
    222     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row2)
    223     ext       v27.8b, v6.8b , v7.8b , #3
    224     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row3)
    225     ext       v31.8b, v2.8b , v3.8b , #1
    226     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row3)
    227     ext       v30.8b, v3.8b , v4.8b , #1
    228     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row2)
    229     ext       v28.8b, v5.8b , v6.8b , #1
    230     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row2)
    231     ext       v27.8b, v6.8b , v7.8b , #1
    232     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row3)
    233     ext       v31.8b, v2.8b , v3.8b , #4
    234     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row3)
    235     ext       v30.8b, v3.8b , v4.8b , #4
    236     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row2)
    237     ext       v28.8b, v5.8b , v6.8b , #4
    238     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row2)
    239     ext       v27.8b, v6.8b , v7.8b , #4
    240     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row3)
    241     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row4
    242     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row3)
    243 
    244     ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (column1,row2)
    245     sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row2)
    246     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row5
    247     sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row2)
    248     ext       v31.8b, v2.8b , v3.8b , #5
    249     urhadd    v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
    250     urhadd    v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
    251 
    252     sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row3)
    253     ext       v30.8b, v3.8b , v4.8b , #5
    254     st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row2
    255     sqrshrun  v19.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row3)
    256     ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (column1,row3)
    257 
    258 //// Processing row4 and row5
    259     ext       v28.8b, v5.8b , v6.8b , #5
    260     urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
    261     urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
    262 
    263     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row4)
    264     st1       {v18.8b, v19.8b}, [x1], x3 ////Store dest row3
    265     uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row4)
    266     ext       v27.8b, v6.8b , v7.8b , #5
    267     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row5)
    268     ext       v31.8b, v2.8b , v3.8b , #2
    269     uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row5)
    270     ext       v30.8b, v3.8b , v4.8b , #2
    271     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row4)
    272     ext       v27.8b, v6.8b , v7.8b , #2
    273     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row4)
    274     ext       v28.8b, v5.8b , v6.8b , #2
    275     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row5)
    276     ext       v31.8b, v2.8b , v3.8b , #3
    277     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row5)
    278     ext       v30.8b, v3.8b , v4.8b , #3
    279     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row4)
    280     ext       v28.8b, v5.8b , v6.8b , #3
    281     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row4)
    282     ext       v27.8b, v6.8b , v7.8b , #3
    283     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row5)
    284     ext       v31.8b, v2.8b , v3.8b , #1
    285     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row5)
    286     ext       v30.8b, v3.8b , v4.8b , #1
    287     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row4)
    288     ext       v28.8b, v5.8b , v6.8b , #1
    289     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row4)
    290     ext       v27.8b, v6.8b , v7.8b , #1
    291     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row4)
    292     ext       v31.8b, v2.8b , v3.8b , #4
    293     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row5)
    294     ext       v30.8b, v3.8b , v4.8b , #4
    295     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row4)
    296     ext       v28.8b, v5.8b , v6.8b , #4
    297     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row4)
    298     ext       v27.8b, v6.8b , v7.8b , #4
    299     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row5)
    300     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row6
    301     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row5)
    302     ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (column1,row4)
    303     sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row4)
    304     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row7
    305     sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row4)
    306     ext       v31.8b, v2.8b , v3.8b , #5
    307     urhadd    v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
    308     urhadd    v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
    309 
    310     sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row5)
    311     st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row4
    312     ext       v30.8b, v3.8b , v4.8b , #5
    313     sqrshrun  v19.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row5)
    314     ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (column1,row5)
    315 
    316 
    317     //// Processing row6 and row7
    318 
    319     ext       v28.8b, v5.8b , v6.8b , #5
    320     urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
    321     urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
    322 
    323     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row6)
    324     st1       {v18.8b, v19.8b}, [x1], x3 ////Store dest row5
    325     uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row6)
    326     ext       v27.8b, v6.8b , v7.8b , #5
    327     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row7)
    328     ext       v31.8b, v2.8b , v3.8b , #2
    329     uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row7)
    330     ext       v30.8b, v3.8b , v4.8b , #2
    331     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row6)
    332     ext       v27.8b, v6.8b , v7.8b , #2
    333     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row6)
    334     ext       v28.8b, v5.8b , v6.8b , #2
    335     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row7)
    336     ext       v31.8b, v2.8b , v3.8b , #3
    337     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row7)
    338     ext       v30.8b, v3.8b , v4.8b , #3
    339     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row6)
    340     ext       v28.8b, v5.8b , v6.8b , #3
    341     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row6)
    342     ext       v27.8b, v6.8b , v7.8b , #3
    343     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row7)
    344     ext       v31.8b, v2.8b , v3.8b , #1
    345     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row7)
    346     ext       v30.8b, v3.8b , v4.8b , #1
    347     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row6)
    348     ext       v28.8b, v5.8b , v6.8b , #1
    349     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row6)
    350     ext       v27.8b, v6.8b , v7.8b , #1
    351     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row6)
    352     ext       v31.8b, v2.8b , v3.8b , #4
    353     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row7)
    354     ext       v30.8b, v3.8b , v4.8b , #4
    355     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row6)
    356     ext       v28.8b, v5.8b , v6.8b , #4
    357     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row6)
    358     ext       v27.8b, v6.8b , v7.8b , #4
    359     ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (column1,row6)
    360     sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row6)
    361     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row7)
    362     sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row6)
    363     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row7)
    364     urhadd    v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
    365     urhadd    v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
    366 
    367     ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (column1,row7)
    368     sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row7)
    369     st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row6
    370     sqrshrun  v19.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row7)
    371     urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
    372     urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
    373 
    374     subs      x12, x14, #1              // if height==16  - looping
    375     st1       {v18.8b, v19.8b}, [x1], x3 ////Store dest row7
    376 
    377 
    378 
    379     beq       loop_16
    380     b         end_func
    381 
    382 loop_8:
    383 //// Processing row0 and row1
    384 
    385     ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row1
    386     add       x14, x14, #1              //for checking loop
    387     ext       v28.8b, v5.8b , v6.8b , #5
    388     ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row0
    389     ext       v25.8b, v5.8b , v6.8b , #2
    390     ext       v31.8b, v2.8b , v3.8b , #5
    391     ext       v24.8b, v5.8b , v6.8b , #3
    392     ext       v23.8b, v5.8b , v6.8b , #1
    393     ext       v22.8b, v5.8b , v6.8b , #4
    394     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row1)
    395     ext       v29.8b, v2.8b , v3.8b , #3
    396     umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row1)
    397     umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row1)
    398     umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
    399     umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
    400     ext       v30.8b, v2.8b , v3.8b , #2
    401     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row0)
    402     ext       v27.8b, v2.8b , v3.8b , #1
    403     ext       v26.8b, v2.8b , v3.8b , #4
    404     ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row2
    405     umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    406     umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    407     umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    408     umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    409     ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row3
    410     sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    411 
    412     //// Processing row2 and row3
    413     ext       v28.8b, v5.8b , v6.8b , #5
    414     ext       v25.8b, v5.8b , v6.8b , #2
    415     ext       v31.8b, v2.8b , v3.8b , #5
    416     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row3)
    417     ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row0)
    418     ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row1)
    419     ext       v24.8b, v5.8b , v6.8b , #3
    420     ext       v23.8b, v5.8b , v6.8b , #1
    421     sqrshrun  v19.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
    422     ext       v22.8b, v5.8b , v6.8b , #4
    423     ext       v29.8b, v2.8b , v3.8b , #3
    424     umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row3)
    425     umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row3)
    426     umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row3)
    427     umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row3)
    428     urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
    429     urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
    430 
    431     st1       {v18.8b}, [x1], x3        ////Store dest row0
    432     st1       {v19.8b}, [x1], x3        ////Store dest row1
    433     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row2)
    434     ext       v30.8b, v2.8b , v3.8b , #2
    435     ext       v27.8b, v2.8b , v3.8b , #1
    436     ext       v26.8b, v2.8b , v3.8b , #4
    437     ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row4
    438     umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row2)
    439     umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row2)
    440     umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row2)
    441     umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row2)
    442     ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row5
    443     subs      x9, x4, #4
    444     sqrshrun  v19.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row3)
    445     ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row2)
    446     ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row3)
    447     ext       v28.8b, v5.8b , v6.8b , #5
    448     ext       v25.8b, v5.8b , v6.8b , #2
    449     ext       v31.8b, v2.8b , v3.8b , #5
    450     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row5)
    451     ext       v24.8b, v5.8b , v6.8b , #3
    452     sqrshrun  v18.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row2)
    453     ext       v22.8b, v5.8b , v6.8b , #4
    454     ext       v29.8b, v2.8b , v3.8b , #3
    455     urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
    456     urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
    457 
    458     st1       {v18.8b}, [x1], x3        ////Store dest row2
    459     ext       v30.8b, v2.8b , v3.8b , #2
    460     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row4)
    461     st1       {v19.8b}, [x1], x3        ////Store dest row3
    462     beq       end_func                  // Branch if height==4
    463 
    464 //// Processing row4 and row5
    465     ext       v23.8b, v5.8b , v6.8b , #1
    466     umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row5)
    467     umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row5)
    468     umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row5)
    469     umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row5)
    470     ext       v27.8b, v2.8b , v3.8b , #1
    471     ext       v26.8b, v2.8b , v3.8b , #4
    472     ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row6
    473     umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row4)
    474     umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row4)
    475     umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row4)
    476     umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row4)
    477     sqrshrun  v19.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row5)
    478     ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row7
    479     ext       v31.8b, v2.8b , v3.8b , #5
    480     ext       v28.8b, v5.8b , v6.8b , #5
    481     ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row4)
    482     ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row5)
    483     ext       v25.8b, v5.8b , v6.8b , #2
    484     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row7)
    485     ext       v24.8b, v5.8b , v6.8b , #3
    486     ext       v22.8b, v5.8b , v6.8b , #4
    487     sqrshrun  v18.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row4)
    488     ext       v29.8b, v2.8b , v3.8b , #3
    489     ext       v30.8b, v2.8b , v3.8b , #2
    490     urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
    491     urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
    492 
    493     st1       {v18.8b}, [x1], x3        ////Store dest row4
    494     ext       v27.8b, v2.8b , v3.8b , #1
    495     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row6)
    496     ext       v26.8b, v2.8b , v3.8b , #4
    497     umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row6)
    498     umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row6)
    499     umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row6)
    500     umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row6)
    501     //// Processing row6 and row7
    502     st1       {v19.8b}, [x1], x3        ////Store dest row5
    503     ext       v23.8b, v5.8b , v6.8b , #1
    504     umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row7)
    505     umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row7)
    506     umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row7)
    507     umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row7)
    508     ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row6)
    509     ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row7)
    510     sqrshrun  v18.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row6)
    511     subs      x12, x14, #1
    512     sqrshrun  v19.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row7)
    513     urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
    514     urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
    515 
    516     st1       {v18.8b}, [x1], x3        ////Store dest row6
    517     st1       {v19.8b}, [x1], x3        ////Store dest row7
    518 
    519     beq       loop_8                    //looping if height ==16
    520 
    521     b         end_func
    522 
    //// loop_4: produces four destination rows per pass and stores only the low
    //// 32 bits (4 pixels) of each filtered row.
    //// For every row: 16 source bytes are loaded through x0 (post-incremented by x2),
    //// the byte-shifted taps a1..a5 are formed with ext (a0 is the unshifted low half),
    //// the sum a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 is accumulated in 16-bit lanes,
    //// sqrshrun rounds, shifts right by 5 and saturates to unsigned 8 bits, and
    //// urhadd forms the rounded average with 8 bytes read through x7
    //// (post-incremented by x2). Results are written through x1 (post-incremented by x3).
    //// Rows are handled in pairs; loads for the next pair are interleaved with the
    //// arithmetic of the current pair, so instruction order matters.
    //// NOTE(review): v0 and v1 are assumed to hold the constants 5 and 20, x4 the
    //// block height and x7 the pointer to the samples used for the quarter-pel
    //// average; all are set up before this label - confirm against the prologue.
    523 loop_4:
    524     ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row0 (first row read, stored as dest row0)
    525     ext       v28.8b, v5.8b , v6.8b , #5  //// a5                                  (column1,row0)
    526     ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row1 (second row read, stored as dest row1)
    527     ext       v25.8b, v5.8b , v6.8b , #2  //// a2                                  (column1,row0)
    528     ext       v31.8b, v2.8b , v3.8b , #5  //// a5                                  (column1,row1)
    529     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row0)
    530     ext       v24.8b, v5.8b , v6.8b , #3  //// a3                                  (column1,row0)
    531     ext       v23.8b, v5.8b , v6.8b , #1  //// a1                                  (column1,row0)
    532     ext       v22.8b, v5.8b , v6.8b , #4  //// a4                                  (column1,row0)
    533     ext       v29.8b, v2.8b , v3.8b , #3  //// a3                                  (column1,row1)
    534     umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
    535     umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    536     umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    537     umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    538     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row1)
    539     ext       v30.8b, v2.8b , v3.8b , #2  //// a2                                  (column1,row1)
    540     ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row0)
    541     ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row1)
    542     ext       v27.8b, v2.8b , v3.8b , #1  //// a1                                  (column1,row1)
    543     ext       v26.8b, v2.8b , v3.8b , #4  //// a4                                  (column1,row1)
    544     ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row2 (all row1 taps were already extracted from v2/v3)
    545     umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a3                         (column1,row1)
    546     umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row1)
    547     umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
    548     umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
    549     ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row3 (row0 sum is complete in v14)
    550     ext       v28.8b, v5.8b , v6.8b , #5  //// a5                                  (column1,row3)
    551     ext       v25.8b, v5.8b , v6.8b , #2  //// a2                                  (column1,row3)
    552     sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    553     ext       v31.8b, v2.8b , v3.8b , #5  //// a5                                  (column1,row2)
    554     ext       v24.8b, v5.8b , v6.8b , #3  //// a3                                  (column1,row3)
    555 
    556     ext       v23.8b, v5.8b , v6.8b , #1  //// a1                                  (column1,row3)
    557     ext       v22.8b, v5.8b , v6.8b , #4  //// a4                                  (column1,row3)
    558     ext       v29.8b, v2.8b , v3.8b , #3  //// a3                                  (column1,row2)
    559     sqrshrun  v19.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
    560     ext       v30.8b, v2.8b , v3.8b , #2  //// a2                                  (column1,row2)
    561     ext       v27.8b, v2.8b , v3.8b , #1  //// a1                                  (column1,row2)
    562 
    563     //// Finish row0 and row1 (average + store), then process row2 and row3
    564     urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation: rounded average, row0
    565     urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation: rounded average, row1
    566 
    567     st1       {v18.s}[0], [x1], x3      ////Store dest row0 (low 4 bytes only)
    568     st1       {v19.s}[0], [x1], x3      ////Store dest row1 (low 4 bytes only)
    569     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row3)
    570     ext       v26.8b, v2.8b , v3.8b , #4  //// a4                                  (column1,row2)
    571     ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row2)
    572     ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row3)
    573 
    574     umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row3)
    575     umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row3)
    576     umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row3)
    577     umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row3)
    578     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row2)
    579     umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a3                         (column1,row2)
    580     umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row2)
    581     umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row2)
    582     umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row2)
    583     sqrshrun  v19.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row3)
    584     sqrshrun  v18.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row2)
    585     urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation: rounded average, row2
    586     urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation: rounded average, row3
    587 
    588     st1       {v18.s}[0], [x1], x3      ////Store dest row2 (low 4 bytes only)
    589     subs      x4, x4, #8                // x4 -= 8; Z is set only if x4 was 8, i.e. four more rows remain (presumably height 8 - confirm)
    590     st1       {v19.s}[0], [x1], x3      ////Store dest row3 (low 4 bytes only); the store leaves the flags from subs intact
    591 
    592     beq       loop_4                    // second pass only when the subs above gave zero; otherwise fall through to end_func
    593 
    //// end_func: common exit path; loop_8 branches here and loop_4 falls through.
    //// Undoes the stack usage of the prologue (not visible in this part of the file)
    //// and returns to the caller.
    594 end_func:
    595 
    596     ldp       x19, x20, [sp], #16       // reload x19 and x20 from the stack, then sp += 16
    597     pop_v_regs                          // macro defined outside this view; presumably restores the saved SIMD registers - confirm
    598     ret                                 // return to the address in x30
    599 
    600 
    601 
    602