//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_vert_qpel_av8.s
//*
//* @brief
//*  Contains the function definition for inter prediction vertical quarter pel
//*  interpolation.
//*
//* @author
//*  Mohit
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_vert_qpel_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_inter_pred_filters.c
//

///**
//*******************************************************************************
//*
//* @brief
//*     Quarter pel interprediction luma filter for vertical input
//*
//* @par Description:
//*  Applies a 6 tap vertical filter, clips the result to 8 bits and averages it
//*  (with rounding) with the source sample selected by dydx to obtain the
//*  quarter pel value.
//*  See sec 8.4.2.2.1 titled "Luma sample interpolation process"
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @param[in] pu1_tmp
//*  temporary buffer: UNUSED in this function
//*
//* @param[in] dydx
//*  x and y reference offset for qpel calculations
//*
//* @returns
//*  None
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_vert_qpel_av8 (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd,
//                            UWORD8 *pu1_tmp,
//                            UWORD32 dydx)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  ht
//    x5 =>  wd
//    x6 => *pu1_tmp (unused)
//    x7 =>  dydx

    .text
    .p2align 2
    // push_v_regs, pop_v_regs and swp used below come from this include.
    .include "ih264_neon_macros.s"

//------------------------------------------------------------------------------
// ih264_inter_pred_luma_vert_qpel_av8
//
// Vertical quarter-pel luma interpolation. For every output row:
//
//     half = CLIP_U8((outer - 5*inner + 20*centre + 16) >> 5)
//     dst  = (half + ref + 1) >> 1
//
// where, for the six vertically adjacent source rows s0..s5 of the current
// window (the first window starts at pu1_src - 2*src_strd):
//     outer  = s0 + s5,   inner = s1 + s4,   centre = s2 + s3
// and ref is the matching row read through x7, which starts at
//     pu1_src + ((dydx & 12) >> 3) * src_strd.
//
// In:   x0 = pu1_src    x1 = pu1_dst    w2 = src_strd   w3 = dst_strd
//       w4 = ht         w5 = wd         x6 = pu1_tmp (never read)
//       w7 = dydx (only bits 3:2 are used)
//
// Width dispatch:  wd == 8 -> loop_8_start, wd == 4 -> loop_4_start,
//                  any other value falls through to the 16-wide path.
// Height handling: 16-wide: 8 rows, repeated once more only when ht == 16.
//                  8-wide : 4 rows when ht == 4, otherwise 8 rows, repeated
//                           once more only when ht == 16.
//                  4-wide : 4 rows when ht == 4, otherwise 8 rows.
//
// Writes: x0, x1, x2-x5 (sign-extended in place), x7, x9, x12, x14, flags,
//         v0-v22, v24, v26-v31.
// NOTE(review): v8-v15 are overwritten here; push_v_regs / pop_v_regs are
// defined in the include and presumably save and restore them - confirm.
//
// Comment notation in the 16-wide path: "lo" = columns 0-7, "hi" = columns
// 8-15 of the row being produced.
//------------------------------------------------------------------------------

    .global ih264_inter_pred_luma_vert_qpel_av8

ih264_inter_pred_luma_vert_qpel_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!

    // src_strd, dst_strd, ht and wd are 32-bit arguments. AAPCS64 leaves bits
    // 63:32 of w2-w5 unspecified, and all four are used as 64-bit values
    // below, so widen them first.
    sxtw      x2, w2                    // src_strd
    sxtw      x3, w3                    // dst_strd
    sxtw      x4, w4                    // ht
    sxtw      x5, w5                    // wd

    and       x7, x7, #12               // keep bits 3:2 of dydx
    lsr       x7, x7, #3                // -> 0 or 1
    mul       x7, x2, x7
    add       x7, x0, x7                // x7 = pu1_src + ((dydx & 12) >> 3) * src_strd
    sub       x14, x4, #16              // x14 = ht - 16 (loop bookkeeping)
    movi      v22.8h, #20               // v22 = filter coefficient 20
    sub       x0, x0, x2, lsl #1        // x0 = pu1_src - 2*src_strd
    subs      x12, x5, #8               // wd == 8 ?
    movi      v24.8h, #5                // v24 = filter coefficient 5
    beq       loop_8_start

    subs      x12, x5, #4               // wd == 4 ?
    beq       loop_4_start

    // ---- 16-wide path: prime the six-row window in v0..v11 ------------------
    ld1       {v0.2s, v1.2s}, [x0], x2  // s0
    ld1       {v2.2s, v3.2s}, [x0], x2  // s1
    ld1       {v4.2s, v5.2s}, [x0], x2  // s2
    ld1       {v6.2s, v7.2s}, [x0], x2  // s3
    add       x14, x14, #1              // x14 = ht - 15; equals 1 only when ht == 16
    ld1       {v8.2s, v9.2s}, [x0], x2  // s4
    uaddl     v12.8h, v4.8b, v6.8b      // row 0 lo: centre
    ld1       {v10.2s, v11.2s}, [x0], x2 // s5

loop_16:                                // eight output rows per pass, software pipelined

    uaddl     v14.8h, v0.8b, v10.8b     // row 0 lo: outer
    uaddl     v16.8h, v2.8b, v8.8b      // row 0 lo: inner
    mla       v14.8h, v12.8h , v22.8h   // row 0 lo: += centre * 20
    uaddl     v20.8h, v1.8b, v11.8b     // row 0 hi: outer
    uaddl     v18.8h, v5.8b, v7.8b      // row 0 hi: centre
    mla       v20.8h, v18.8h , v22.8h   // row 0 hi: += centre * 20
    ld1       {v0.2s, v1.2s}, [x0], x2  // next source row replaces the oldest one
    uaddl     v26.8h, v3.8b, v9.8b      // row 0 hi: inner
    uaddl     v12.8h, v6.8b, v8.8b      // row 1 lo: centre
    mls       v14.8h, v16.8h , v24.8h   // row 0 lo: -= inner * 5
    uaddl     v16.8h, v2.8b, v0.8b      // row 1 lo: outer
    uaddl     v18.8h, v4.8b, v10.8b     // row 1 lo: inner
    mla       v16.8h, v12.8h , v22.8h   // row 1 lo: += centre * 20
    mls       v20.8h, v26.8h , v24.8h   // row 0 hi: -= inner * 5
    uaddl     v26.8h, v5.8b, v11.8b     // row 1 hi: inner
    uaddl     v12.8h, v7.8b, v9.8b      // row 1 hi: centre
    sqrshrun  v30.8b, v14.8h, #5        // row 0 lo: CLIP_U8((sum + 16) >> 5)
    uaddl     v14.8h, v3.8b, v1.8b      // row 1 hi: outer
    ld1       {v2.2s, v3.2s}, [x0], x2  // next source row
    mla       v14.8h, v12.8h , v22.8h   // row 1 hi: += centre * 20
    mls       v16.8h, v18.8h , v24.8h   // row 1 lo: -= inner * 5
    sqrshrun  v31.8b, v20.8h, #5        // row 0 hi: CLIP_U8((sum + 16) >> 5)
    ld1       {v20.2s, v21.2s}, [x7], x2 // reference row 0
    urhadd    v30.16b, v20.16b , v30.16b // row 0 lo: (half + ref + 1) >> 1
    urhadd    v31.16b, v21.16b , v31.16b // row 0 hi: (half + ref + 1) >> 1
    uaddl     v18.8h, v4.8b, v2.8b      // row 2 lo: outer
    uaddl     v12.8h, v8.8b, v10.8b     // row 2 lo: centre
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 0
    mla       v18.8h, v12.8h , v22.8h   // row 2 lo: += centre * 20
    uaddl     v20.8h, v6.8b, v0.8b      // row 2 lo: inner
    mls       v14.8h, v26.8h , v24.8h   // row 1 hi: -= inner * 5
    sqrshrun  v30.8b, v16.8h, #5        // row 1 lo
    uaddl     v12.8h, v9.8b, v11.8b     // row 2 hi: centre
    uaddl     v16.8h, v5.8b, v3.8b      // row 2 hi: outer
    uaddl     v26.8h, v7.8b, v1.8b      // row 2 hi: inner
    mla       v16.8h, v12.8h , v22.8h   // row 2 hi: += centre * 20
    mls       v18.8h, v20.8h , v24.8h   // row 2 lo: -= inner * 5
    ld1       {v4.2s, v5.2s}, [x0], x2  // next source row
    sqrshrun  v31.8b, v14.8h, #5        // row 1 hi
    ld1       {v14.2s, v15.2s}, [x7], x2 // reference row 1
    uaddl     v12.8h, v10.8b, v0.8b     // row 3 lo: centre
    urhadd    v30.16b, v14.16b , v30.16b // row 1 lo: average with reference
    urhadd    v31.16b, v15.16b , v31.16b // row 1 hi: average with reference
    uaddl     v14.8h, v6.8b, v4.8b      // row 3 lo: outer
    uaddl     v20.8h, v8.8b, v2.8b      // row 3 lo: inner
    mla       v14.8h, v12.8h , v22.8h   // row 3 lo: += centre * 20
    mls       v16.8h, v26.8h , v24.8h   // row 2 hi: -= inner * 5
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 1
    sqrshrun  v30.8b, v18.8h, #5        // row 2 lo
    uaddl     v18.8h, v7.8b, v5.8b      // row 3 hi: outer
    uaddl     v12.8h, v11.8b, v1.8b     // row 3 hi: centre
    mla       v18.8h, v12.8h , v22.8h   // row 3 hi: += centre * 20
    uaddl     v26.8h, v9.8b, v3.8b      // row 3 hi: inner
    mls       v14.8h, v20.8h , v24.8h   // row 3 lo: -= inner * 5
    ld1       {v6.2s, v7.2s}, [x0], x2  // next source row
    sqrshrun  v31.8b, v16.8h, #5        // row 2 hi
    ld1       {v16.2s, v17.2s}, [x7], x2 // reference row 2
    mls       v18.8h, v26.8h , v24.8h   // row 3 hi: -= inner * 5
    urhadd    v30.16b, v16.16b , v30.16b // row 2 lo: average with reference
    urhadd    v31.16b, v17.16b , v31.16b // row 2 hi: average with reference
    uaddl     v12.8h, v0.8b, v2.8b      // row 4 lo: centre
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 2
    uaddl     v16.8h, v10.8b, v4.8b     // row 4 lo: inner
    uaddl     v20.8h, v9.8b, v7.8b      // row 4 hi: outer
    sqrshrun  v30.8b, v14.8h, #5        // row 3 lo
    uaddl     v26.8h, v5.8b, v11.8b     // row 4 hi: inner
    uaddl     v14.8h, v8.8b, v6.8b      // row 4 lo: outer
    sqrshrun  v31.8b, v18.8h, #5        // row 3 hi
    ld1       {v18.2s, v19.2s}, [x7], x2 // reference row 3
    mla       v14.8h, v12.8h , v22.8h   // row 4 lo: += centre * 20
    urhadd    v30.16b, v18.16b , v30.16b // row 3 lo: average with reference
    urhadd    v31.16b, v19.16b , v31.16b // row 3 hi: average with reference
    uaddl     v18.8h, v1.8b, v3.8b      // row 4 hi: centre
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 3
    // 4 rows processed
    mla       v20.8h, v18.8h , v22.8h   // row 4 hi: += centre * 20
    ld1       {v8.2s, v9.2s}, [x0], x2  // next source row
    uaddl     v12.8h, v2.8b, v4.8b      // row 5 lo: centre
    uaddl     v18.8h, v3.8b, v5.8b      // row 5 hi: centre
    mls       v14.8h, v16.8h , v24.8h   // row 4 lo: -= inner * 5
    uaddl     v28.8h, v9.8b, v11.8b     // row 5 hi: outer
    uaddl     v16.8h, v6.8b, v0.8b      // row 5 lo: inner
    mla       v28.8h, v18.8h , v22.8h   // row 5 hi: += centre * 20
    mls       v20.8h, v26.8h , v24.8h   // row 4 hi: -= inner * 5
    uaddl     v26.8h, v1.8b, v7.8b      // row 5 hi: inner
    uaddl     v18.8h, v5.8b, v7.8b      // row 6 hi: centre
    sqrshrun  v30.8b, v14.8h, #5        // row 4 lo
    uaddl     v14.8h, v8.8b, v10.8b     // row 5 lo: outer
    sqrshrun  v31.8b, v20.8h, #5        // row 4 hi
    ld1       {v20.2s, v21.2s}, [x7], x2 // reference row 4
    ld1       {v10.2s, v11.2s}, [x0], x2 // next source row
    urhadd    v30.16b, v20.16b , v30.16b // row 4 lo: average with reference
    urhadd    v31.16b, v21.16b , v31.16b // row 4 hi: average with reference
    mls       v28.8h, v26.8h , v24.8h   // row 5 hi: -= inner * 5
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 4
    mla       v14.8h, v12.8h , v22.8h   // row 5 lo: += centre * 20
    uaddl     v20.8h, v11.8b, v1.8b     // row 6 hi: outer
    uaddl     v26.8h, v3.8b, v9.8b      // row 6 hi: inner
    mla       v20.8h, v18.8h , v22.8h   // row 6 hi: += centre * 20
    uaddl     v12.8h, v6.8b, v4.8b      // row 6 lo: centre
    uaddl     v18.8h, v7.8b, v9.8b      // row 7 hi: centre
    sqrshrun  v31.8b, v28.8h, #5        // row 5 hi
    mls       v14.8h, v16.8h , v24.8h   // row 5 lo: -= inner * 5
    uaddl     v16.8h, v8.8b, v2.8b      // row 6 lo: inner
    sqrshrun  v30.8b, v14.8h, #5        // row 5 lo
    ld1       {v14.2s, v15.2s}, [x7], x2 // reference row 5
    mls       v20.8h, v26.8h , v24.8h   // row 6 hi: -= inner * 5
    urhadd    v30.16b, v14.16b , v30.16b // row 5 lo: average with reference
    urhadd    v31.16b, v15.16b , v31.16b // row 5 hi: average with reference
    uaddl     v14.8h, v10.8b, v0.8b     // row 6 lo: outer
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 5
    mla       v14.8h, v12.8h , v22.8h   // row 6 lo: += centre * 20
    ld1       {v0.2s, v1.2s}, [x0], x2  // next source row
    uaddl     v26.8h, v5.8b, v11.8b     // row 7 hi: inner
    uaddl     v12.8h, v8.8b, v6.8b      // row 7 lo: centre
    uaddl     v28.8h, v0.8b, v2.8b      // row 7 lo: outer
    sqrshrun  v31.8b, v20.8h, #5        // row 6 hi
    mla       v28.8h, v12.8h , v22.8h   // row 7 lo: += centre * 20
    uaddl     v20.8h, v1.8b, v3.8b      // row 7 hi: outer
    mls       v14.8h, v16.8h , v24.8h   // row 6 lo: -= inner * 5
    mla       v20.8h, v18.8h , v22.8h   // row 7 hi: += centre * 20
    uaddl     v16.8h, v10.8b, v4.8b     // row 7 lo: inner
    sqrshrun  v30.8b, v14.8h, #5        // row 6 lo
    ld1       {v14.2s, v15.2s}, [x7], x2 // reference row 6
    // From here on the source rows are moved back into window order so that a
    // second pass can start with s0..s4 in v0..v9 again.
    mov       v2.8b, v6.8b
    mov       v3.8b, v7.8b
    urhadd    v30.16b, v14.16b , v30.16b // row 6 lo: average with reference
    urhadd    v31.16b, v15.16b , v31.16b // row 6 hi: average with reference

    mls       v28.8h, v16.8h , v24.8h   // row 7 lo: -= inner * 5
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 6
    sqrshrun  v30.8b, v28.8h, #5        // row 7 lo
    swp       v0.8b, v4.8b              // swp is a macro from the include (register exchange)
    swp       v1.8b, v5.8b

    mls       v20.8h, v26.8h , v24.8h   // row 7 hi: -= inner * 5
    mov       v6.8b, v10.8b
    mov       v7.8b, v11.8b
    subs      x12, x14, #1              // Z set only on the first pass of ht == 16
    swp       v4.8b, v8.8b
    swp       v5.8b, v9.8b
    sqrshrun  v31.8b, v20.8h, #5        // row 7 hi
    ld1       {v20.2s, v21.2s}, [x7], x2 // reference row 7
    urhadd    v30.16b, v20.16b , v30.16b // row 7 lo: average with reference
    urhadd    v31.16b, v21.16b , v31.16b // row 7 hi: average with reference
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 7
    bne       end_func                  // done unless this was pass 1 of ht == 16
    add       x14, x14, #1              // makes the test above fail after the second pass
    ld1       {v10.2s, v11.2s}, [x0], x2 // s5 of the next window
    uaddl     v12.8h, v4.8b, v6.8b      // row 0 lo of the next pass: centre

    b         loop_16

    // ---- 8-wide path: prime the six-row window in v0..v5 --------------------
loop_8_start:

    ld1       {v0.2s}, [x0], x2         // s0
    ld1       {v1.2s}, [x0], x2         // s1
    ld1       {v2.2s}, [x0], x2         // s2
    ld1       {v3.2s}, [x0], x2         // s3
    add       x14, x14, #1              // x14 = ht - 15; equals 1 only when ht == 16
    ld1       {v4.2s}, [x0], x2         // s4
    ld1       {v5.2s}, [x0], x2         // s5

loop_8:
    uaddl     v6.8h, v2.8b, v3.8b       // row 0: centre
    uaddl     v8.8h, v0.8b, v5.8b       // row 0: outer
    uaddl     v10.8h, v1.8b, v4.8b      // row 0: inner
    mla       v8.8h, v6.8h , v22.8h     // row 0: += centre * 20
    ld1       {v6.2s}, [x0], x2         // next source row
    uaddl     v14.8h, v3.8b, v4.8b      // row 1: centre
    uaddl     v16.8h, v1.8b, v6.8b      // row 1: outer
    uaddl     v18.8h, v2.8b, v5.8b      // row 1: inner
    mls       v8.8h, v10.8h , v24.8h    // row 0: -= inner * 5
    mla       v16.8h, v14.8h , v22.8h   // row 1: += centre * 20
    ld1       {v7.2s}, [x0], x2         // next source row
    uaddl     v20.8h, v4.8b, v5.8b      // row 2: centre
    uaddl     v12.8h, v2.8b, v7.8b      // row 2: outer
    uaddl     v10.8h, v3.8b, v6.8b      // row 2: inner
    mls       v16.8h, v18.8h , v24.8h   // row 1: -= inner * 5
    sqrshrun  v26.8b, v8.8h, #5         // row 0: CLIP_U8((sum + 16) >> 5)
    mla       v12.8h, v20.8h , v22.8h   // row 2: += centre * 20
    ld1       {v8.2s}, [x7], x2         // reference row 0
    ld1       {v9.2s}, [x7], x2         // reference row 1
    ld1       {v0.2s}, [x0], x2         // next source row
    uaddl     v14.8h, v5.8b, v6.8b      // row 3: centre
    sqrshrun  v27.8b, v16.8h, #5        // row 1
    urhadd    v26.16b, v8.16b , v26.16b // row 0: (half + ref + 1) >> 1
    urhadd    v27.16b, v9.16b , v27.16b // row 1: (half + ref + 1) >> 1

    uaddl     v20.8h, v3.8b, v0.8b      // row 3: outer
    mls       v12.8h, v10.8h , v24.8h   // row 2: -= inner * 5
    st1       {v26.2s}, [x1], x3        // store row 0
    uaddl     v18.8h, v4.8b, v7.8b      // row 3: inner
    mla       v20.8h, v14.8h , v22.8h   // row 3: += centre * 20
    st1       {v27.2s}, [x1], x3        // store row 1
    sqrshrun  v28.8b, v12.8h, #5        // row 2
    mls       v20.8h, v18.8h , v24.8h   // row 3: -= inner * 5
    ld1       {v12.2s}, [x7], x2        // reference row 2
    ld1       {v13.2s}, [x7], x2        // reference row 3
    ld1       {v1.2s}, [x0], x2         // next source row (first used by row 4)
    sqrshrun  v29.8b, v20.8h, #5        // row 3
    subs      x9, x4, #4                // ht == 4 ?
    urhadd    v28.16b, v12.16b , v28.16b // row 2: average with reference
    urhadd    v29.16b, v13.16b , v29.16b // row 3: average with reference
    st1       {v28.2s}, [x1], x3        // store row 2
    st1       {v29.2s}, [x1], x3        // store row 3
    beq       end_func                  // ht == 4: done
    uaddl     v14.8h, v6.8b, v7.8b      // row 4: centre
    uaddl     v16.8h, v0.8b, v5.8b      // row 4: inner
    uaddl     v18.8h, v1.8b, v4.8b      // row 4: outer
    mla       v18.8h, v14.8h , v22.8h   // row 4: += centre * 20
    ld1       {v2.2s}, [x0], x2         // next source row
    mls       v18.8h, v16.8h , v24.8h   // row 4: -= inner * 5
    uaddl     v8.8h, v0.8b, v7.8b       // row 5: centre
    uaddl     v10.8h, v1.8b, v6.8b      // row 5: inner
    uaddl     v12.8h, v2.8b, v5.8b      // row 5: outer
    sqrshrun  v26.8b, v18.8h, #5        // row 4
    mla       v12.8h, v8.8h , v22.8h    // row 5: += centre * 20
    ld1       {v18.2s}, [x7], x2        // reference row 4
    ld1       {v19.2s}, [x7], x2        // reference row 5
    ld1       {v3.2s}, [x0], x2         // next source row
    mls       v12.8h, v10.8h , v24.8h   // row 5: -= inner * 5
    sqrshrun  v27.8b, v12.8h, #5        // row 5
    urhadd    v26.16b, v18.16b , v26.16b // row 4: average with reference
    urhadd    v27.16b, v19.16b , v27.16b // row 5: average with reference

    st1       {v26.2s}, [x1], x3        // store row 4
    st1       {v27.2s}, [x1], x3        // store row 5
    uaddl     v14.8h, v0.8b, v1.8b      // row 6: centre
    uaddl     v16.8h, v2.8b, v7.8b      // row 6: inner
    uaddl     v18.8h, v3.8b, v6.8b      // row 6: outer
    mla       v18.8h, v14.8h , v22.8h   // row 6: += centre * 20
    ld1       {v4.2s}, [x0], x2         // next source row
    mls       v18.8h, v16.8h , v24.8h   // row 6: -= inner * 5
    uaddl     v8.8h, v2.8b, v1.8b       // row 7: centre
    uaddl     v10.8h, v3.8b, v0.8b      // row 7: inner
    uaddl     v12.8h, v4.8b, v7.8b      // row 7: outer
    sqrshrun  v26.8b, v18.8h, #5        // row 6
    mla       v12.8h, v8.8h , v22.8h    // row 7: += centre * 20
    ld1       {v18.2s}, [x7], x2        // reference row 6
    ld1       {v19.2s}, [x7], x2        // reference row 7
    ld1       {v5.2s}, [x0], x2         // completes s0..s5 in v0..v5 for a second pass
    mls       v12.8h, v10.8h , v24.8h   // row 7: -= inner * 5
    sqrshrun  v27.8b, v12.8h, #5        // row 7
    urhadd    v26.16b, v18.16b , v26.16b // row 6: average with reference
    urhadd    v27.16b, v19.16b , v27.16b // row 7: average with reference

    subs      x12, x14, #1              // Z set only on the first pass of ht == 16
    st1       {v26.2s}, [x1], x3        // store row 6
    st1       {v27.2s}, [x1], x3        // store row 7
    add       x14, x14, #1              // add leaves the flags from subs intact
    beq       loop_8                    // second pass when ht == 16

    b         end_func


    // ---- 4-wide path ---------------------------------------------------------
    // Same schedule as the 8-wide path, but every row is a 4-byte lane load and
    // only lane s[0] of each result is stored; the remaining lanes are not
    // meaningful.
loop_4_start:

    ld1       {v0.s}[0], [x0], x2       // s0
    ld1       {v1.s}[0], [x0], x2       // s1
    ld1       {v2.s}[0], [x0], x2       // s2
    ld1       {v3.s}[0], [x0], x2       // s3
    ld1       {v4.s}[0], [x0], x2       // s4
    ld1       {v5.s}[0], [x0], x2       // s5

    uaddl     v6.8h, v2.8b, v3.8b       // row 0: centre
    uaddl     v8.8h, v0.8b, v5.8b       // row 0: outer
    uaddl     v10.8h, v1.8b, v4.8b      // row 0: inner
    mla       v8.8h, v6.8h , v22.8h     // row 0: += centre * 20
    ld1       {v6.s}[0], [x0], x2       // next source row (4 bytes, like every other row here)
    uaddl     v14.8h, v3.8b, v4.8b      // row 1: centre
    uaddl     v16.8h, v1.8b, v6.8b      // row 1: outer
    uaddl     v18.8h, v2.8b, v5.8b      // row 1: inner
    mls       v8.8h, v10.8h , v24.8h    // row 0: -= inner * 5
    ld1       {v7.s}[0], [x0], x2       // next source row
    mla       v16.8h, v14.8h , v22.8h   // row 1: += centre * 20
    uaddl     v20.8h, v4.8b, v5.8b      // row 2: centre
    uaddl     v12.8h, v2.8b, v7.8b      // row 2: outer
    uaddl     v10.8h, v3.8b, v6.8b      // row 2: inner
    mls       v16.8h, v18.8h , v24.8h   // row 1: -= inner * 5
    sqrshrun  v26.8b, v8.8h, #5         // row 0: CLIP_U8((sum + 16) >> 5)
    ld1       {v8.s}[0], [x7], x2       // reference row 0
    ld1       {v9.s}[0], [x7], x2       // reference row 1
    mla       v12.8h, v20.8h , v22.8h   // row 2: += centre * 20
    ld1       {v0.s}[0], [x0], x2       // next source row
    uaddl     v14.8h, v5.8b, v6.8b      // row 3: centre
    sqrshrun  v27.8b, v16.8h, #5        // row 1
    uaddl     v20.8h, v3.8b, v0.8b      // row 3: outer
    urhadd    v26.16b, v26.16b , v8.16b // row 0: (half + ref + 1) >> 1
    urhadd    v27.16b, v27.16b , v9.16b // row 1: (half + ref + 1) >> 1

    mls       v12.8h, v10.8h , v24.8h   // row 2: -= inner * 5
    st1       {v26.s}[0], [x1], x3      // store row 0
    uaddl     v18.8h, v4.8b, v7.8b      // row 3: inner
    mla       v20.8h, v14.8h , v22.8h   // row 3: += centre * 20
    st1       {v27.s}[0], [x1], x3      // store row 1
    sqrshrun  v28.8b, v12.8h, #5        // row 2
    ld1       {v12.s}[0], [x7], x2      // reference row 2
    ld1       {v13.s}[0], [x7], x2      // reference row 3

    mls       v20.8h, v18.8h , v24.8h   // row 3: -= inner * 5
    ld1       {v1.s}[0], [x0], x2       // next source row (first used by row 4)
    sqrshrun  v29.8b, v20.8h, #5        // row 3
    urhadd    v28.16b, v12.16b , v28.16b // row 2: average with reference
    urhadd    v29.16b, v13.16b , v29.16b // row 3: average with reference

    st1       {v28.s}[0], [x1], x3      // store row 2
    st1       {v29.s}[0], [x1], x3      // store row 3

    subs      x9, x4, #4                // ht == 4 ?
    beq       end_func                  // ht == 4: done


    uaddl     v14.8h, v6.8b, v7.8b      // row 4: centre
    uaddl     v16.8h, v0.8b, v5.8b      // row 4: inner
    uaddl     v18.8h, v1.8b, v4.8b      // row 4: outer
    mla       v18.8h, v14.8h , v22.8h   // row 4: += centre * 20
    ld1       {v2.s}[0], [x0], x2       // next source row
    mls       v18.8h, v16.8h , v24.8h   // row 4: -= inner * 5
    uaddl     v8.8h, v0.8b, v7.8b       // row 5: centre
    uaddl     v10.8h, v1.8b, v6.8b      // row 5: inner
    uaddl     v12.8h, v2.8b, v5.8b      // row 5: outer
    sqrshrun  v26.8b, v18.8h, #5        // row 4
    ld1       {v18.s}[0], [x7], x2      // reference row 4
    ld1       {v19.s}[0], [x7], x2      // reference row 5
    mla       v12.8h, v8.8h , v22.8h    // row 5: += centre * 20
    ld1       {v3.s}[0], [x0], x2       // next source row
    mls       v12.8h, v10.8h , v24.8h   // row 5: -= inner * 5
    sqrshrun  v27.8b, v12.8h, #5        // row 5
    urhadd    v26.16b, v18.16b , v26.16b // row 4: average with reference
    urhadd    v27.16b, v27.16b , v19.16b // row 5: average with reference

    st1       {v26.s}[0], [x1], x3      // store row 4
    st1       {v27.s}[0], [x1], x3      // store row 5
    uaddl     v14.8h, v0.8b, v1.8b      // row 6: centre
    uaddl     v16.8h, v2.8b, v7.8b      // row 6: inner
    uaddl     v18.8h, v3.8b, v6.8b      // row 6: outer
    mla       v18.8h, v14.8h , v22.8h   // row 6: += centre * 20
    ld1       {v4.s}[0], [x0], x2       // next source row
    mls       v18.8h, v16.8h , v24.8h   // row 6: -= inner * 5
    uaddl     v8.8h, v2.8b, v1.8b       // row 7: centre
    uaddl     v10.8h, v3.8b, v0.8b      // row 7: inner
    uaddl     v12.8h, v4.8b, v7.8b      // row 7: outer
    sqrshrun  v26.8b, v18.8h, #5        // row 6
    ld1       {v18.s}[0], [x7], x2      // reference row 6
    ld1       {v19.s}[0], [x7], x2      // reference row 7
    mla       v12.8h, v8.8h , v22.8h    // row 7: += centre * 20
    ld1       {v5.s}[0], [x0], x2       // loaded but not consumed: this path does not loop
    mls       v12.8h, v10.8h , v24.8h   // row 7: -= inner * 5
    sqrshrun  v27.8b, v12.8h, #5        // row 7
    urhadd    v26.16b, v18.16b , v26.16b // row 6: average with reference
    urhadd    v27.16b, v19.16b , v27.16b // row 7: average with reference

    st1       {v26.s}[0], [x1], x3      // store row 6
    st1       {v27.s}[0], [x1], x3      // store row 7


end_func:                               // common exit for all three width paths
    ldp       x19, x20, [sp], #16       // undo the stp from the prologue
    pop_v_regs                          // counterpart of push_v_regs (macro from the include)
    ret