Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 // *******************************************************************************
     22 // * @file
     23 // *  ih264e_half_pel.s
     24 // *
     25 // * @brief
     26 // *
     27 // *
     28 // * @author
     29 // *  Ittiam
     30 // *
     31 // * @par List of Functions:
     32 // *  ih264e_sixtapfilter_horz
     33 // *  ih264e_sixtap_filter_2dvh_vert
     34 //
     35 // *
     36 // * @remarks
     37 // *  None
     38 // *
     39 // *******************************************************************************
     40 // */
     41 
     42 
     43 .text                                   // emit the following code into the text (code) section
     44 .p2align 2                              // align to 2^2 = 4 bytes
     45 .include "ih264_neon_macros.s"          // presumably defines push_v_regs / pop_v_regs used below - confirm
     46 
     47 ///*******************************************************************************
     48 //*
     49 //* @brief
     50 //*     Inter-prediction luma filter for horizontal input (filter run for width = 17 and height = 16)
     51 //*
     52 //* @par Description:
     53 //*    Applies a 6-tap horizontal filter. The output is clipped to 8 bits.
     54 //*    See sec 8.4.2.2.1 titled "Luma sample interpolation process".
     55 //*
     56 //* @param[in] pu1_src
     57 //*  UWORD8 pointer to the source
     58 //*
     59 //* @param[out] pu1_dst
     60 //*  UWORD8 pointer to the destination
     61 //*
     62 //* @param[in] src_strd
     63 //*  integer source stride
     64 //*
     65 //* @param[in] dst_strd
     66 //*  integer destination stride
     67 //*
     68 //*
     69 //* @returns
     70 //*
     71 //* @remarks
     72 //*  None
     73 //*
     74 //*******************************************************************************
     75 //*/
     76 //void ih264e_sixtapfilter_horz(UWORD8 *pu1_src,
     77 //                                UWORD8 *pu1_dst,
     78 //                                WORD32 src_strd,
     79 //                                WORD32 dst_strd);
     80 
     81 
     82 .equ halfpel_width ,  17 + 1            //( 17 rounded up to an even 18, two rows are processed at a time) NOTE(review): not referenced in the visible part of this file
     83 
     84 
     85         .global ih264e_sixtapfilter_horz_av8
     86 ih264e_sixtapfilter_horz_av8:
     87     // ARMv7 original: STMFD sp!,{x14}. In: x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd (per the C prototype above)
     88     push_v_regs                         // macro not defined in this file; presumably saves the callee-saved NEON regs (v8-v14 are written below) - confirm
     89     sxtw      x2, w2                    // x2 = src_strd sign-extended from 32 to 64 bits
     90     sxtw      x3, w3                    // x3 = dst_strd sign-extended from 32 to 64 bits
     91     stp       x19, x20, [sp, #-16]!     // save x19/x20 (restored before ret; not otherwise referenced in this function)
     92 
     93     movi      v0.8b, #5                 // v0 = filter coefficient 5 in every byte lane
     94     sub       x0, x0, #2                // start 2 bytes left of pu1_src so that tap a0 sits at offset -2
     95     sub       x3, x3, #16               // x3 = dst_strd - 16: each row store first post-increments x1 by 16
     96     movi      v1.8b, #20                // v1 = filter coefficient 20 in every byte lane
     97     mov       x14, #16                  // x14 = rows remaining; two rows are consumed per iteration
     98 
     99 filter_horz_loop:
    100     // Per iteration two rows are filtered. Accumulators: v8/v14 = outputs 0-7, v10/v16 = outputs 8-15, v12/v18 = outputs 16-17 (row0/row1).
    101     // "ext ..., #k" on adjacent 8-byte registers yields tap a<k>; the ext for the next tap is interleaved with the current accumulation.
    102     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0 (24 bytes), then advance x0 by src_strd
    103     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1 (24 bytes), then advance x0 by src_strd
    104 
    105     //// Processing row0 and row1
    106 
    107     ext       v31.8b, v2.8b , v3.8b , #5 // tap a5: source window shifted by 5 bytes
    108     ext       v30.8b, v3.8b , v4.8b , #5
    109 
    110     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row0)
    111     ext       v29.8b, v4.8b , v4.8b , #5 // column3 rotates v4 onto itself; only lanes 0-1 of its result are stored later
    112     uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row0)
    113     ext       v28.8b, v5.8b , v6.8b , #5
    114     uaddl     v12.8h, v29.8b, v4.8b     //// a0 + a5                             (column3,row0)
    115     ext       v27.8b, v6.8b , v7.8b , #5
    116     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row1)
    117     ext       v26.8b, v7.8b , v7.8b , #5
    118 
    119     uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row1)
    120     ext       v31.8b, v2.8b , v3.8b , #2 // tap a2
    121     uaddl     v18.8h, v26.8b, v7.8b     //// a0 + a5                             (column3,row1)
    122     ext       v30.8b, v3.8b , v4.8b , #2
    123     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    124     ext       v29.8b, v4.8b , v4.8b , #2
    125     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row0)
    126     ext       v28.8b, v5.8b , v6.8b , #2
    127     umlal     v12.8h, v29.8b, v1.8b     //// a0 + a5 + 20a2                         (column3,row0)
    128     ext       v27.8b, v6.8b , v7.8b , #2
    129     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row1)
    130     ext       v26.8b, v7.8b , v7.8b , #2
    131 
    132     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row1)
    133     ext       v31.8b, v2.8b , v3.8b , #3 // tap a3
    134     umlal     v18.8h, v26.8b, v1.8b     //// a0 + a5 + 20a2                         (column3,row1)
    135     ext       v30.8b, v3.8b , v4.8b , #3
    136     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    137     ext       v29.8b, v4.8b , v4.8b , #3
    138     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row0)
    139     ext       v28.8b, v5.8b , v6.8b , #3
    140     umlal     v12.8h, v29.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column3,row0)
    141     ext       v27.8b, v6.8b , v7.8b , #3
    142     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row1)
    143     ext       v26.8b, v7.8b , v7.8b , #3
    144 
    145     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row1)
    146     ext       v31.8b, v2.8b , v3.8b , #1 // tap a1
    147     umlal     v18.8h, v26.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column3,row1)
    148     ext       v30.8b, v3.8b , v4.8b , #1
    149     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    150     ext       v29.8b, v4.8b , v4.8b , #1
    151     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row0)
    152     ext       v28.8b, v5.8b , v6.8b , #1
    153     umlsl     v12.8h, v29.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column3,row0)
    154     ext       v27.8b, v6.8b , v7.8b , #1
    155     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
    156     ext       v26.8b, v7.8b , v7.8b , #1
    157 
    158     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row1)
    159     ext       v31.8b, v2.8b , v3.8b , #4 // tap a4
    160     umlsl     v18.8h, v26.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column3,row1)
    161     ext       v30.8b, v3.8b , v4.8b , #4
    162     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    163     ext       v29.8b, v4.8b , v4.8b , #4
    164     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row0)
    165     ext       v28.8b, v5.8b , v6.8b , #4
    166     umlsl     v12.8h, v29.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column3,row0)
    167     ext       v27.8b, v6.8b , v7.8b , #4
    168     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
    169     ext       v26.8b, v7.8b , v7.8b , #4
    170 
    171     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row1)
    172     umlsl     v18.8h, v26.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column3,row1)
    173     // sqrshrun reads the 16-bit sums as signed, rounds, shifts right by 5 and saturates to unsigned 8-bit
    174     sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    175     sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    176     sqrshrun  v22.8b, v12.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    177     sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
    178     sqrshrun  v24.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row1)
    179     sqrshrun  v25.8b, v18.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row1)
    180     // Each output row is written as 16 bytes followed by 2 bytes (18 bytes in total)
    181     st1       {v20.8b, v21.8b}, [x1], #16 ////Store dest row0, bytes 0-15
    182     st1       {v22.h}[0], [x1], x3      //// Store dest row0, bytes 16-17; x1 += dst_strd - 16 -> start of next dst row
    183     st1       {v23.8b, v24.8b}, [x1], #16 ////Store dest row1, bytes 0-15
    184     st1       {v25.h}[0], [x1], x3      //// Store dest row1, bytes 16-17; x1 += dst_strd - 16 -> start of next dst row
    185 
    186     subs      x14, x14, #2              //    two rows done: decrement row counter and set flags
    187 
    188     bne       filter_horz_loop          // repeat until all 16 rows have been processed
    189 
    190 
    191     // ARMv7 original: LDMFD sp!,{pc}
    192     ldp       x19, x20, [sp], #16       // restore x19/x20 saved in the prologue
    193     pop_v_regs                          // counterpart of push_v_regs (macro not defined in this file)
    194     ret
    195 
    196 
    197 
    198 
    199 
    200 
    201 
    202 
    203 
    204 ///**
    205 //*******************************************************************************
    206 //*
    207 //* @brief
    208 //*   This function implements a two stage cascaded six tap filter. It
    209 //*    applies the six tap filter in the vertical direction on the
    210 //*    predictor values, followed by applying the same filter in the
    211 //*    horizontal direction on the output of the first stage. The six tap
    212 //*    filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
    213 //*    interpolation process"
    214 //*    (Filter run for width = 17 and height =17)
    215 //* @par Description:
    216 //*    The function interpolates
    217 //*    the predictors first in the vertical direction and then in the
    218 //*    horizontal direction to output the (1/2,1/2) samples. The output of the first
    219 //*    stage of the filter is stored in the buffer pointed to by pi16_pred1 (only in C)
    220 //*    in 16-bit precision.
    221 //*
    222 //*
    223 //* @param[in] pu1_src
    224 //*  UWORD8 pointer to the source
    225 //*
    226 //* @param[out] pu1_dst1
    227 //*  UWORD8 pointer to the destination(vertical filtered output)
    228 //*
    229 //* @param[out] pu1_dst2
    230 //*  UWORD8 pointer to the destination (output after applying the horizontal filter to the intermediate vertical output)
    231 //*
    232 //* @param[in] src_strd
    233 //*  integer source stride
    234 //*
    235 //* @param[in] dst_strd
    236 //*  integer destination stride of pu1_dst
    237 //*
    238 //* @param[in] pi16_pred1
    239 //*  Pointer to 16-bit intermediate buffer (used only in C)
    240 //*
    241 //* @param[in] pi16_pred1_strd
    242 //*  integer destination stride of pi16_pred1
    243 //*
    244 //*
    245 //* @returns
    246 //*
    247 //* @remarks
    248 //*  None
    249 //*
    250 //*******************************************************************************
    251 //*/
    252 //void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src,
    253 //                                UWORD8 *pu1_dst1,
    254 //                                UWORD8 *pu1_dst2,
    255 //                                WORD32 src_strd,
    256 //                                WORD32 dst_strd,
    257 //                                WORD32 *pi16_pred1, /* Pointer to 16-bit intermediate buffer (used only in C) */
    258 //                                WORD32 pi16_pred1_strd)
    259 
    260 
    261 
    262 
    263         .global ih264e_sixtap_filter_2dvh_vert_av8
    264 
    265 ih264e_sixtap_filter_2dvh_vert_av8:
    266     // STMFD sp!,{x10,x11,x12,x14}
    267     push_v_regs
    268     sxtw      x3, w3
    269     sxtw      x4, w4
    270     stp       x19, x20, [sp, #-16]!
    271 
    272 ////x0 - pu1_ref
    273 ////x3 - u4_ref_width
    274 
    275     //// Load six rows for vertical interpolation
    276     lsl       x12, x3, #1
    277     sub       x0, x0, x12
    278     sub       x0, x0, #2
    279     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3
    280     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3
    281     ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3
    282     mov       x12, #5
    283     ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3
    284     mov       x14, #20
    285     ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3
    286     mov       v0.h[0], w12
    287     mov       v0.h[1], w14
    288     ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3
    289     movi      v1.8b, #20
    290 
    291 //// x12 - u2_buff1_width
    292 //// x14 - u2_buff2_width
    293     mov       x12, x4
    294     add       x11, x1, #16
    295 
    296     mov       x14, x12
    297 
    298     mov       x10, #3 //loop counter
    299     sub       x16 , x12, #8
    300     sub       x19, x14, #16
    301 filter_2dvh_loop:
    302 
    303     //// ////////////// ROW 1 ///////////////////////
    304 
    305 //// Process first vertical interpolated row
    306 //// each column is
    307     uaddl     v20.8h, v2.8b, v17.8b     //// a0 + a5                             (column1,row0)
    308     movi      v31.8b, #5
    309     umlal     v20.8h, v8.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    310     umlal     v20.8h, v11.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    311     umlsl     v20.8h, v5.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    312     umlsl     v20.8h, v14.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    313     mov       v21.d[0], v20.d[1]
    314 
    315     uaddl     v22.8h, v3.8b, v18.8b     //// a0 + a5                                (column2,row0)
    316     umlal     v22.8h, v9.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
    317     umlal     v22.8h, v12.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    318     umlsl     v22.8h, v6.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    319     umlsl     v22.8h, v15.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    320     ext       v30.8b, v20.8b , v21.8b , #4
    321     mov       v23.d[0], v22.d[1]
    322 
    323 
    324     uaddl     v24.8h, v4.8b, v19.8b     //// a0 + a5                                (column3,row0)
    325     ext       v29.8b, v20.8b , v21.8b , #6
    326     umlal     v24.8h, v10.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    327     umlal     v24.8h, v13.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    328     umlsl     v24.8h, v7.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    329     umlsl     v24.8h, v16.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    330     mov       v25.d[0], v24.d[1]
    331 
    332     sqrshrun  v2.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    333     ext       v31.8b, v21.8b , v22.8b , #2
    334     sqrshrun  v3.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    335     ext       v28.8b, v20.8b , v21.8b , #2
    336 
    337     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    338     ext       v31.8b, v22.8b , v23.8b , #2
    339     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    340     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    341     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    342     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    343     ext       v30.8b, v21.8b , v22.8b , #4
    344 
    345     sqrshrun  v4.8b, v24.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    346     ext       v29.8b, v21.8b , v22.8b , #6
    347 
    348     ext       v28.8b, v21.8b , v22.8b , #2
    349     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    350     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    351     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    352     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    353     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    354     ext       v31.8b, v23.8b , v24.8b , #2
    355     mov       v21.d[0], v20.d[1]
    356     ext       v2.8b, v2.8b , v3.8b , #2
    357     ext       v3.8b, v3.8b , v4.8b , #2
    358     ext       v4.8b, v4.8b , v4.8b , #2
    359 
    360     st1       {v2.8b, v3.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    361     st1       {v4.h}[0], [x11], x12     //// store row1 - 1,1/2 grid
    362 
    363     ext       v30.8b, v22.8b , v23.8b , #4
    364     ext       v29.8b, v22.8b , v23.8b , #6
    365 
    366     saddl     v2.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
    367     ext       v28.8b, v22.8b , v23.8b , #2
    368     smlal     v2.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
    369     smlal     v2.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
    370     smlsl     v2.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    371     smlsl     v2.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    372     ext       v31.8b, v24.8b , v25.8b , #2
    373 
    374     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    375     ext       v30.8b, v23.8b , v24.8b , #4
    376     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    377     ext       v29.8b, v23.8b , v24.8b , #6
    378 
    379     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    380     ext       v28.8b, v23.8b , v24.8b , #2
    381     ext       v31.8b, v25.8b , v25.8b , #2
    382     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    383     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    384     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    385     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    386     ext       v30.8b, v24.8b , v25.8b , #4
    387 
    388     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    389     ext       v29.8b, v24.8b , v25.8b , #6
    390 
    391     ext       v31.8b, v24.8b , v25.8b , #2
    392     shrn      v28.4h, v2.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    393 
    394     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next Row data
    395     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    396     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    397     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    398     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    399     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    400     mov       v20.d[1], v21.d[0]
    401     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    402 
    403 
    404     ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
    405     ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
    406 
    407     ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
    408 
    409     ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
    410     //// ////////////// ROW 2 ///////////////////////
    411 
    412 //// Process first vertical interpolated row
    413 //// each column is
    414     uaddl     v20.8h, v5.8b, v2.8b      //// a0 + a5                             (column1,row0)
    415     movi      v31.8b, #5
    416     umlal     v20.8h, v11.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
    417     umlal     v20.8h, v14.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    418     umlsl     v20.8h, v8.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    419     umlsl     v20.8h, v17.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    420     mov       v21.d[0], v20.d[1]
    421 
    422     mov       v28.d[1], v29.d[0]
    423     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    424 
    425     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    426 
    427     uaddl     v22.8h, v6.8b, v3.8b      //// a0 + a5                                (column2,row0)
    428     umlal     v22.8h, v12.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
    429     umlal     v22.8h, v15.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    430     umlsl     v22.8h, v9.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    431     umlsl     v22.8h, v18.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    432     mov       v23.d[0], v22.d[1]
    433 
    434     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    435     ext       v30.8b, v20.8b , v21.8b , #4
    436 
    437     uaddl     v24.8h, v7.8b, v4.8b      //// a0 + a5                                (column3,row0)
    438     ext       v29.8b, v20.8b , v21.8b , #6
    439     umlal     v24.8h, v13.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    440     umlal     v24.8h, v16.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    441     umlsl     v24.8h, v10.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    442     umlsl     v24.8h, v19.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    443     mov       v25.d[0], v24.d[1]
    444 
    445     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    446     st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
    447 
    448     sqrshrun  v5.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    449     ext       v31.8b, v21.8b , v22.8b , #2
    450     sqrshrun  v6.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    451     ext       v28.8b, v20.8b , v21.8b , #2
    452 
    453     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    454     ext       v31.8b, v22.8b , v23.8b , #2
    455     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    456     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    457     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    458     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    459     ext       v30.8b, v21.8b , v22.8b , #4
    460 
    461     sqrshrun  v7.8b, v24.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    462     ext       v29.8b, v21.8b , v22.8b , #6
    463 
    464     ext       v28.8b, v21.8b , v22.8b , #2
    465     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    466     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    467     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    468     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    469     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    470     ext       v31.8b, v23.8b , v24.8b , #2
    471 
    472     ext       v5.8b, v5.8b , v6.8b , #2
    473     ext       v6.8b, v6.8b , v7.8b , #2
    474     ext       v7.8b, v7.8b , v7.8b , #2
    475 
    476     st1       {v5.8b, v6.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    477     st1       {v7.h}[0], [x11], x12     //// store row1 - 1,1/2 grid
    478 
    479     ext       v30.8b, v22.8b , v23.8b , #4
    480     ext       v29.8b, v22.8b , v23.8b , #6
    481 
    482     saddl     v6.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
    483     ext       v28.8b, v22.8b , v23.8b , #2
    484     smlal     v6.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
    485     smlal     v6.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
    486     smlsl     v6.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    487     smlsl     v6.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    488     ext       v31.8b, v24.8b , v25.8b , #2
    489 
    490     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    491     ext       v30.8b, v23.8b , v24.8b , #4
    492     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    493     ext       v29.8b, v23.8b , v24.8b , #6
    494 
    495     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    496     ext       v28.8b, v23.8b , v24.8b , #2
    497     ext       v31.8b, v25.8b , v25.8b , #2
    498     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    499     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    500     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    501     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    502     ext       v30.8b, v24.8b , v25.8b , #4
    503 
    504     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    505     ext       v29.8b, v24.8b , v25.8b , #6
    506 
    507     ext       v31.8b, v24.8b , v25.8b , #2
    508     shrn      v28.4h, v6.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    509 
    510     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next Row data
    511     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    512     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    513     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    514     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    515     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    516     mov       v20.d[1], v21.d[0]
    517     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    518 
    519 
    520     ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
    521     ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
    522 
    523     ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
    524 
    525     ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
    526     //// ////////////// ROW 3 ///////////////////////
    527 
    528 //// Process first vertical interpolated row
    529 //// each column is
    530     uaddl     v20.8h, v8.8b, v5.8b      //// a0 + a5                             (column1,row0)
    531     movi      v31.8b, #5
    532     umlal     v20.8h, v14.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
    533     umlal     v20.8h, v17.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    534     umlsl     v20.8h, v11.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    535     umlsl     v20.8h, v2.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    536     mov       v21.d[0], v20.d[1]
    537 
    538     mov       v28.d[1], v29.d[0]
    539     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    540     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    541 
    542     uaddl     v22.8h, v9.8b, v6.8b      //// a0 + a5                                (column2,row0)
    543     umlal     v22.8h, v15.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
    544     umlal     v22.8h, v18.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    545     umlsl     v22.8h, v12.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    546     umlsl     v22.8h, v3.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    547     mov       v23.d[0], v22.d[1]
    548 
    549     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    550     ext       v30.8b, v20.8b , v21.8b , #4
    551 
    552     uaddl     v24.8h, v10.8b, v7.8b     //// a0 + a5                                (column3,row0)
    553     ext       v29.8b, v20.8b , v21.8b , #6
    554     umlal     v24.8h, v16.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    555     umlal     v24.8h, v19.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    556     umlsl     v24.8h, v13.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    557     umlsl     v24.8h, v4.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    558     mov       v25.d[0], v24.d[1]
    559 
    560     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    561     st1       { v28.h}[0], [x2], x19    //// store 1/2,1,2 grif values
    562 
    563     sqrshrun  v8.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    564     ext       v31.8b, v21.8b , v22.8b , #2
    565     sqrshrun  v9.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    566     ext       v28.8b, v20.8b , v21.8b , #2
    567 
    568     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    569     ext       v31.8b, v22.8b , v23.8b , #2
    570     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    571     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    572     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    573     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    574     ext       v30.8b, v21.8b , v22.8b , #4
    575 
    576     sqrshrun  v10.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    577     ext       v29.8b, v21.8b , v22.8b , #6
    578 
    579     ext       v28.8b, v21.8b , v22.8b , #2
    580     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    581     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    582     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    583     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    584     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    585     ext       v31.8b, v23.8b , v24.8b , #2
    586 
    587     ext       v8.8b, v8.8b , v9.8b , #2
    588     ext       v9.8b, v9.8b , v10.8b , #2
    589     ext       v10.8b, v10.8b , v10.8b , #2
    590 
    591     st1       {v8.8b, v9.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    592     st1       {v10.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
    593 
    594     ext       v30.8b, v22.8b , v23.8b , #4
    595     ext       v29.8b, v22.8b , v23.8b , #6
    596 
    597     saddl     v8.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
    598     ext       v28.8b, v22.8b , v23.8b , #2
    599     smlal     v8.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
    600     smlal     v8.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
    601     smlsl     v8.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    602     smlsl     v8.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    603     ext       v31.8b, v24.8b , v25.8b , #2
    604 
    605     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    606     ext       v30.8b, v23.8b , v24.8b , #4
    607     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    608     ext       v29.8b, v23.8b , v24.8b , #6
    609 
    610     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    611     ext       v28.8b, v23.8b , v24.8b , #2
    612     ext       v31.8b, v25.8b , v25.8b , #2
    613     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    614     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    615     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    616     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    617     ext       v30.8b, v24.8b , v25.8b , #4
    618 
    619     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    620     ext       v29.8b, v24.8b , v25.8b , #6
    621 
    622     ext       v31.8b, v24.8b , v25.8b , #2
    623     shrn      v28.4h, v8.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    624 
    625     ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next Row data
    626     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    627     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    628     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    629     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    630     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    631     mov       v20.d[1], v21.d[0]
    632     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    633 
    634 
    635     ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
    636     ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
    637 
    638     ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
    639 
    640     ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
    641     //// ////////////// ROW 4 ///////////////////////
    642 
    643 //// Process first vertical interpolated row
    644 //// each column is
    645     uaddl     v20.8h, v11.8b, v8.8b     //// a0 + a5                             (column1,row0)
    646     movi      v31.8b, #5
    647     umlal     v20.8h, v17.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
    648     umlal     v20.8h, v2.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    649     umlsl     v20.8h, v14.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    650     umlsl     v20.8h, v5.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    651     mov       v21.d[0], v20.d[1]
    652     mov       v28.d[1], v29.d[0]
    653     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    654     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    655 
    656     uaddl     v22.8h, v12.8b, v9.8b     //// a0 + a5                                (column2,row0)
    657     umlal     v22.8h, v18.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
    658     umlal     v22.8h, v3.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    659     umlsl     v22.8h, v15.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    660     umlsl     v22.8h, v6.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    661     mov       v23.d[0], v22.d[1]
    662 
    663     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    664     ext       v30.8b, v20.8b , v21.8b , #4
    665 
    666     uaddl     v24.8h, v13.8b, v10.8b    //// a0 + a5                                (column3,row0)
    667     ext       v29.8b, v20.8b , v21.8b , #6
    668     umlal     v24.8h, v19.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    669     umlal     v24.8h, v4.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    670     umlsl     v24.8h, v16.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    671     umlsl     v24.8h, v7.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    672     mov       v25.d[0], v24.d[1]
    673 
    674     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    675     st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
    676 
    677     sqrshrun  v11.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    678     ext       v31.8b, v21.8b , v22.8b , #2
    679     sqrshrun  v12.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    680     ext       v28.8b, v20.8b , v21.8b , #2
    681 
    682     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    683     ext       v31.8b, v22.8b , v23.8b , #2
    684     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    685     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    686     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    687     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    688     ext       v30.8b, v21.8b , v22.8b , #4
    689 
    690     sqrshrun  v13.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    691     ext       v29.8b, v21.8b , v22.8b , #6
    692 
    693     ext       v28.8b, v21.8b , v22.8b , #2
    694     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    695     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    696     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    697     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    698     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    699     ext       v31.8b, v23.8b , v24.8b , #2
    700 
    701     ext       v11.8b, v11.8b , v12.8b , #2
    702     ext       v12.8b, v12.8b , v13.8b , #2
    703     ext       v13.8b, v13.8b , v13.8b , #2
    704 
    705     st1       {v11.8b, v12.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    706     st1       {v13.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
    707 
    708     ext       v30.8b, v22.8b , v23.8b , #4
    709     ext       v29.8b, v22.8b , v23.8b , #6
    710 
    711     saddl     v12.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
    712     ext       v28.8b, v22.8b , v23.8b , #2
    713     smlal     v12.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
    714     smlal     v12.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
    715     smlsl     v12.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    716     smlsl     v12.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    717     ext       v31.8b, v24.8b , v25.8b , #2
    718 
    719     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    720     ext       v30.8b, v23.8b , v24.8b , #4
    721     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    722     ext       v29.8b, v23.8b , v24.8b , #6
    723 
    724     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    725     ext       v28.8b, v23.8b , v24.8b , #2
    726     ext       v31.8b, v25.8b , v25.8b , #2
    727     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    728     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    729     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    730     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    731     ext       v30.8b, v24.8b , v25.8b , #4
    732 
    733     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    734     ext       v29.8b, v24.8b , v25.8b , #6
    735 
    736     ext       v31.8b, v24.8b , v25.8b , #2
    737     shrn      v28.4h, v12.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    738 
    739     ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3 //// Load next Row data
    740     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    741     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    742     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    743     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    744     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    745     mov       v20.d[1], v21.d[0]
    746     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    747 
    748 
    749     ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
    750     ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
    751 
    752     ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
    753 
    754     ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
    755     //// ////////////// ROW 5 ///////////////////////
    756 
    757 //// Process first vertical interpolated row
    758 //// each column is
    759     uaddl     v20.8h, v14.8b, v11.8b    //// a0 + a5                             (column1,row0)
    760     movi      v31.8b, #5
    761     umlal     v20.8h, v2.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    762     umlal     v20.8h, v5.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    763     umlsl     v20.8h, v17.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    764     umlsl     v20.8h, v8.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    765     mov       v21.d[0], v20.d[1]
    766     mov       v28.d[1], v29.d[0]
    767     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    768     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    769 
    770     uaddl     v22.8h, v15.8b, v12.8b    //// a0 + a5                                (column2,row0)
    771     umlal     v22.8h, v3.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
    772     umlal     v22.8h, v6.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    773     umlsl     v22.8h, v18.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    774     umlsl     v22.8h, v9.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    775     mov       v23.d[0], v22.d[1]
    776 
    777     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    778     ext       v30.8b, v20.8b , v21.8b , #4
    779 
    780     uaddl     v24.8h, v16.8b, v13.8b    //// a0 + a5                                (column3,row0)
    781     ext       v29.8b, v20.8b , v21.8b , #6
    782     umlal     v24.8h, v4.8b, v1.8b      //// a0 + a5 + 20a2                        (column3,row0)
    783     umlal     v24.8h, v7.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    784     umlsl     v24.8h, v19.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    785     umlsl     v24.8h, v10.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    786     mov       v25.d[0], v24.d[1]
    787 
    788     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    789     st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
    790 
    791     sqrshrun  v14.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    792     ext       v31.8b, v21.8b , v22.8b , #2
    793     sqrshrun  v15.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    794     ext       v28.8b, v20.8b , v21.8b , #2
    795 
    796     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    797     ext       v31.8b, v22.8b , v23.8b , #2
    798     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    799     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    800     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    801     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    802     ext       v30.8b, v21.8b , v22.8b , #4
    803 
    804     sqrshrun  v16.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    805     ext       v29.8b, v21.8b , v22.8b , #6
    806 
    807     ext       v28.8b, v21.8b , v22.8b , #2
    808     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    809     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    810     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    811     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    812     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    813     ext       v31.8b, v23.8b , v24.8b , #2
    814 
    815     ext       v14.8b, v14.8b , v15.8b , #2
    816     ext       v15.8b, v15.8b , v16.8b , #2
    817     ext       v16.8b, v16.8b , v16.8b , #2
    818 
    819     st1       {v14.8b, v15.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    820     st1       {v16.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
    821 
    822     ext       v30.8b, v22.8b , v23.8b , #4
    823     ext       v29.8b, v22.8b , v23.8b , #6
    824 
    825     saddl     v14.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
    826     ext       v28.8b, v22.8b , v23.8b , #2
    827     smlal     v14.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
    828     smlal     v14.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
    829     smlsl     v14.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    830     smlsl     v14.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    831     ext       v31.8b, v24.8b , v25.8b , #2
    832 
    833     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    834     ext       v30.8b, v23.8b , v24.8b , #4
    835     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    836     ext       v29.8b, v23.8b , v24.8b , #6
    837 
    838     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    839     ext       v28.8b, v23.8b , v24.8b , #2
    840     ext       v31.8b, v25.8b , v25.8b , #2
    841     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    842     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    843     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    844     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    845     ext       v30.8b, v24.8b , v25.8b , #4
    846 
    847     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    848     ext       v29.8b, v24.8b , v25.8b , #6
    849 
    850     ext       v31.8b, v24.8b , v25.8b , #2
    851     shrn      v28.4h, v14.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    852 
    853     ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3 //// Load next Row data
    854     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    855     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    856     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    857     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    858     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    859     mov       v20.d[1], v21.d[0]
    860     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    861 
    862 
    863     ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
    864     ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
    865 
    866     ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
    867 
    868     ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
    869     //// ////////////// ROW 6 ///////////////////////
    870 
    871 //// Process first vertical interpolated row
    872 //// each column is
    873 
    874     cmp       x10, #1                   //// if it 17 rows are complete skip
    875     beq       filter_2dvh_skip_row
    876     uaddl     v20.8h, v17.8b, v14.8b    //// a0 + a5                             (column1,row0)
    877     movi      v31.8b, #5
    878     umlal     v20.8h, v5.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    879     umlal     v20.8h, v8.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    880     umlsl     v20.8h, v2.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    881     umlsl     v20.8h, v11.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    882     mov       v21.d[0], v20.d[1]
    883     mov       v28.d[1], v29.d[0]
    884     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    885     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    886 
    887     uaddl     v22.8h, v18.8b, v15.8b    //// a0 + a5                                (column2,row0)
    888     umlal     v22.8h, v6.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
    889     umlal     v22.8h, v9.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    890     umlsl     v22.8h, v3.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    891     umlsl     v22.8h, v12.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    892     mov       v23.d[0], v22.d[1]
    893 
    894     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    895     ext       v30.8b, v20.8b , v21.8b , #4
    896 
    897     uaddl     v24.8h, v19.8b, v16.8b    //// a0 + a5                                (column3,row0)
    898     ext       v29.8b, v20.8b , v21.8b , #6
    899     umlal     v24.8h, v7.8b, v1.8b      //// a0 + a5 + 20a2                        (column3,row0)
    900     umlal     v24.8h, v10.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    901     umlsl     v24.8h, v4.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    902     umlsl     v24.8h, v13.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    903     mov       v25.d[0], v24.d[1]
    904 
    905     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    906     st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
    907 
    908     sqrshrun  v17.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    909     ext       v31.8b, v21.8b , v22.8b , #2
    910     sqrshrun  v18.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    911     ext       v28.8b, v20.8b , v21.8b , #2
    912 
    913     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    914     ext       v31.8b, v22.8b , v23.8b , #2
    915     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    916     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    917     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    918     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    919     ext       v30.8b, v21.8b , v22.8b , #4
    920 
    921     sqrshrun  v19.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    922     ext       v29.8b, v21.8b , v22.8b , #6
    923 
    924     ext       v28.8b, v21.8b , v22.8b , #2
    925     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    926     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    927     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    928     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    929     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    930     ext       v31.8b, v23.8b , v24.8b , #2
    931 
    932     ext       v17.8b, v17.8b , v18.8b , #2
    933     ext       v18.8b, v18.8b , v19.8b , #2
    934     ext       v19.8b, v19.8b , v19.8b , #2
    935 
    936     st1       {v17.8b, v18.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    937     st1       {v19.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
    938 
    939     ext       v30.8b, v22.8b , v23.8b , #4
    940     ext       v29.8b, v22.8b , v23.8b , #6
    941 
    942     saddl     v18.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
    943     ext       v28.8b, v22.8b , v23.8b , #2
    944     smlal     v18.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
    945     smlal     v18.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
    946     smlsl     v18.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    947     smlsl     v18.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    948     ext       v31.8b, v24.8b , v25.8b , #2
    949 
    950     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    951     ext       v30.8b, v23.8b , v24.8b , #4
    952     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    953     ext       v29.8b, v23.8b , v24.8b , #6
    954 
    955     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    956     ext       v28.8b, v23.8b , v24.8b , #2
    957     ext       v31.8b, v25.8b , v25.8b , #2
    958     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    959     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    960     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    961     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    962     ext       v30.8b, v24.8b , v25.8b , #4
    963 
    964     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    965     ext       v29.8b, v24.8b , v25.8b , #6
    966 
    967     ext       v31.8b, v24.8b , v25.8b , #2
    968     shrn      v28.4h, v18.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    969 
    970     ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3 //// Load next Row data
    971     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    972     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    973     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    974     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    975     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    976     mov       v20.d[1], v21.d[0]
    977     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    978 
    979     mov       v28.d[1], v29.d[0]
    980     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    981     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    982 
    983     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    984 
    985     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    986     st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
    987 
    988     subs      x10, x10, #1              ////decrement loop counter
    989 
    990     bne       filter_2dvh_loop
    991 
    992 
    993 //// Process first vertical interpolated row
    994 //// each column is
    995     //// ////////////// ROW 13 ///////////////////////
    996 
    997 //// Process first vertical interpolated row
    998 //// each column is
    999 
   1000     // LDMFD sp!,{x10,x11,x12,pc}
   1001     ldp       x19, x20, [sp], #16
   1002     pop_v_regs
   1003     ret
   1004 
    1005 filter_2dvh_skip_row:                  //// early exit taken by the "cmp x10, #1 / beq" check: finish and store the pending half,half row, then return
    1006     mov       v28.d[1], v29.d[0]        //// pack set3 (v28 low half) and set4 (v29 low half) into one 8h vector
    1007     sqrshrun  v27.8b, v28.8h, #2        //// half,half grid set3,4: rounding >> 2 with unsigned saturation to 8 bits
    1008     shrn      v28.4h, v22.4s, #8        //// set5: shift by 8 here; the remaining shift by 2 with rounding is done by the sqrshrun below
    1009 
    1010     sqrshrun  v28.8b, v28.8h, #2        //// half,half grid set5: rounding >> 2 with unsigned saturation to 8 bits
    1011 
    1012     st1       {v26.8b, v27.8b}, [x2], #16 //// store half,half grid sets 1-4 (16 bytes) and advance x2 by 16
    1013     st1       {v28.h}[0], [x2], x19     //// store the first halfword (2 bytes) of set5, then post-increment x2 by x19
    1014     // LDMFD sp!,{x10,x11,x12,pc}       //// legacy epilogue kept for reference (appears to come from a 32-bit ARM version - TODO confirm)
    1015     ldp       x19, x20, [sp], #16       //// reload x19/x20 from the stack and release 16 bytes (presumably saved in the prologue - not visible here)
    1016     pop_v_regs                          //// macro, presumably from ih264_neon_macros.s; expected to restore the saved vector registers - verify against the macro definition
    1017     ret                                 //// return to caller
   1018 
   1019 
   1020 ///*****************************************
   1021