Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 // *******************************************************************************
     22 // * @file
     23 // *  ih264e_half_pel.s
     24 // *
     25 // * @brief
     26 // *
     27 // *
     28 // * @author
     29 // *  Ittiam
     30 // *
     31 // * @par List of Functions:
     32 // *  ih264e_sixtapfilter_horz
     33 // *  ih264e_sixtap_filter_2dvh_vert
     34 //
     35 // *
     36 // * @remarks
     37 // *  None
     38 // *
     39 // *******************************************************************************
     40 // */
     41 
     42 
     43 .text
     44 .p2align 2
     45 .include "ih264_neon_macros.s"
     46 
     47 ///*******************************************************************************
     48 //*
     49 //* @brief
      50 //*     Inter-prediction luma filter for horizontal input (filter runs for width = 17 and height = 16)
     51 //*
     52 //* @par Description:
      53 //*    Applies a 6-tap horizontal filter. The output is clipped to 8 bits.
      54 //*    See sec 8.4.2.2.1 titled "Luma sample interpolation process".
     55 //*
     56 //* @param[in] pu1_src
     57 //*  UWORD8 pointer to the source
     58 //*
     59 //* @param[out] pu1_dst
     60 //*  UWORD8 pointer to the destination
     61 //*
     62 //* @param[in] src_strd
     63 //*  integer source stride
     64 //*
     65 //* @param[in] dst_strd
     66 //*  integer destination stride
     67 //*
     68 //*
     69 //* @returns
     70 //*
     71 //* @remarks
     72 //*  None
     73 //*
     74 //*******************************************************************************
     75 //*/
     76 //void ih264e_sixtapfilter_horz(UWORD8 *pu1_src,
     77 //                                UWORD8 *pu1_dst,
     78 //                                WORD32 src_strd,
     79 //                                WORD32 dst_strd);
     80 
     81 
      82 .equ halfpel_width ,  17 + 1            // half-pel width of 17 rounded up to an even 18 (original note: two rows are processed at a time)
     83 
     84 
     85         .global ih264e_sixtapfilter_horz_av8
     86 ih264e_sixtapfilter_horz_av8:
     87     // STMFD sp!,{x14}
     88     push_v_regs
     89     stp       x19, x20, [sp, #-16]!
     90 
     91     movi      v0.8b, #5
     92     sub       x0, x0, #2
     93     sub       x3, x3, #16
     94     movi      v1.8b, #20
     95     mov       x14, #16
     96 
     97 filter_horz_loop:
     98 
     99 
    100     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0
    101     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1
    102 
    103     //// Processing row0 and row1
    104 
    105     ext       v31.8b, v2.8b , v3.8b , #5
    106     ext       v30.8b, v3.8b , v4.8b , #5
    107 
    108     uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row0)
    109     ext       v29.8b, v4.8b , v4.8b , #5
    110     uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row0)
    111     ext       v28.8b, v5.8b , v6.8b , #5
    112     uaddl     v12.8h, v29.8b, v4.8b     //// a0 + a5                             (column3,row0)
    113     ext       v27.8b, v6.8b , v7.8b , #5
    114     uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row1)
    115     ext       v26.8b, v7.8b , v7.8b , #5
    116 
    117     uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row1)
    118     ext       v31.8b, v2.8b , v3.8b , #2
    119     uaddl     v18.8h, v26.8b, v7.8b     //// a0 + a5                             (column3,row1)
    120     ext       v30.8b, v3.8b , v4.8b , #2
    121     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    122     ext       v29.8b, v4.8b , v4.8b , #2
    123     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row0)
    124     ext       v28.8b, v5.8b , v6.8b , #2
    125     umlal     v12.8h, v29.8b, v1.8b     //// a0 + a5 + 20a2                         (column3,row0)
    126     ext       v27.8b, v6.8b , v7.8b , #2
    127     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row1)
    128     ext       v26.8b, v7.8b , v7.8b , #2
    129 
    130     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row1)
    131     ext       v31.8b, v2.8b , v3.8b , #3
    132     umlal     v18.8h, v26.8b, v1.8b     //// a0 + a5 + 20a2                         (column3,row1)
    133     ext       v30.8b, v3.8b , v4.8b , #3
    134     umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    135     ext       v29.8b, v4.8b , v4.8b , #3
    136     umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row0)
    137     ext       v28.8b, v5.8b , v6.8b , #3
    138     umlal     v12.8h, v29.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column3,row0)
    139     ext       v27.8b, v6.8b , v7.8b , #3
    140     umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row1)
    141     ext       v26.8b, v7.8b , v7.8b , #3
    142 
    143     umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row1)
    144     ext       v31.8b, v2.8b , v3.8b , #1
    145     umlal     v18.8h, v26.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column3,row1)
    146     ext       v30.8b, v3.8b , v4.8b , #1
    147     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    148     ext       v29.8b, v4.8b , v4.8b , #1
    149     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row0)
    150     ext       v28.8b, v5.8b , v6.8b , #1
    151     umlsl     v12.8h, v29.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column3,row0)
    152     ext       v27.8b, v6.8b , v7.8b , #1
    153     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
    154     ext       v26.8b, v7.8b , v7.8b , #1
    155 
    156     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row1)
    157     ext       v31.8b, v2.8b , v3.8b , #4
    158     umlsl     v18.8h, v26.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column3,row1)
    159     ext       v30.8b, v3.8b , v4.8b , #4
    160     umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    161     ext       v29.8b, v4.8b , v4.8b , #4
    162     umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row0)
    163     ext       v28.8b, v5.8b , v6.8b , #4
    164     umlsl     v12.8h, v29.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column3,row0)
    165     ext       v27.8b, v6.8b , v7.8b , #4
    166     umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
    167     ext       v26.8b, v7.8b , v7.8b , #4
    168 
    169     umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row1)
    170     umlsl     v18.8h, v26.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column3,row1)
    171 
    172     sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    173     sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    174     sqrshrun  v22.8b, v12.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    175     sqrshrun  v23.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
    176     sqrshrun  v24.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row1)
    177     sqrshrun  v25.8b, v18.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row1)
    178 
    179     st1       {v20.8b, v21.8b}, [x1], #16 ////Store dest row0
    180     st1       {v22.h}[0], [x1], x3
    181     st1       {v23.8b, v24.8b}, [x1], #16 ////Store dest row1
    182     st1       {v25.h}[0], [x1], x3
    183 
    184     subs      x14, x14, #2              //    decrement counter
    185 
    186     bne       filter_horz_loop
    187 
    188 
    189     // LDMFD sp!,{pc}
    190     ldp       x19, x20, [sp], #16
    191     pop_v_regs
    192     ret
    193 
    194 
    195 
    196 
    197 
    198 
    199 
    200 
    201 
    202 ///**
    203 //*******************************************************************************
    204 //*
    205 //* @brief
    206 //*   This function implements a two stage cascaded six tap filter. It
    207 //*    applies the six tap filter in the vertical direction on the
    208 //*    predictor values, followed by applying the same filter in the
    209 //*    horizontal direction on the output of the first stage. The six tap
    210 //*    filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
    211 //*    interpolation process"
     212 //*    (Filter runs for width = 17 and height = 17)
    213 //* @par Description:
    214 //*    The function interpolates
    215 //*    the predictors first in the vertical direction and then in the
    216 //*    horizontal direction to output the (1/2,1/2). The output of the first
    217 //*    stage of the filter is stored in the buffer pointed to by pi16_pred1(only in C)
    218 //*    in 16 bit precision.
    219 //*
    220 //*
    221 //* @param[in] pu1_src
    222 //*  UWORD8 pointer to the source
    223 //*
    224 //* @param[out] pu1_dst1
    225 //*  UWORD8 pointer to the destination(vertical filtered output)
    226 //*
    227 //* @param[out] pu1_dst2
     228 //*  UWORD8 pointer to the destination (output after applying the horizontal filter to the intermediate vertical output)
    229 //*
    230 //* @param[in] src_strd
    231 //*  integer source stride
    232 //*
    233 //* @param[in] dst_strd
    234 //*  integer destination stride of pu1_dst
    235 //*
    236 //* @param[in]pi16_pred1
    237 //*  Pointer to 16bit intermediate buffer(used only in c)
    238 //*
    239 //* @param[in] pi16_pred1_strd
    240 //*  integer destination stride of pi16_pred1
    241 //*
    242 //*
    243 //* @returns
    244 //*
    245 //* @remarks
    246 //*  None
    247 //*
    248 //*******************************************************************************
    249 //*/
    250 //void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src,
    251 //                                UWORD8 *pu1_dst1,
    252 //                                UWORD8 *pu1_dst2,
    253 //                                WORD32 src_strd,
    254 //                                WORD32 dst_strd,
     255 //                                WORD32 *pi16_pred1,/* Pointer to 16-bit intermediate buffer (used only in C) */
    256 //                                WORD32 pi16_pred1_strd)
    257 
    258 
    259 
    260 
    261         .global ih264e_sixtap_filter_2dvh_vert_av8
    262 
    263 ih264e_sixtap_filter_2dvh_vert_av8:
    264     // STMFD sp!,{x10,x11,x12,x14}
    265     push_v_regs
    266     stp       x19, x20, [sp, #-16]!
    267 
    268 ////x0 - pu1_ref
    269 ////x3 - u4_ref_width
    270 
    271     //// Load six rows for vertical interpolation
    272     lsl       x12, x3, #1
    273     sub       x0, x0, x12
    274     sub       x0, x0, #2
    275     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3
    276     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3
    277     ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3
    278     mov       x12, #5
    279     ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3
    280     mov       x14, #20
    281     ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3
    282     mov       v0.h[0], w12
    283     mov       v0.h[1], w14
    284     ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3
    285     movi      v1.8b, #20
    286 
    287 //// x12 - u2_buff1_width
    288 //// x14 - u2_buff2_width
    289     mov       x12, x4
    290     add       x11, x1, #16
    291 
    292     mov       x14, x12
    293 
    294     mov       x10, #3 //loop counter
    295     sub       x16 , x12, #8
    296     sub       x19, x14, #16
    297 filter_2dvh_loop:
    298 
    299     //// ////////////// ROW 1 ///////////////////////
    300 
    301 //// Process first vertical interpolated row
    302 //// each column is
    303     uaddl     v20.8h, v2.8b, v17.8b     //// a0 + a5                             (column1,row0)
    304     movi      v31.8b, #5
    305     umlal     v20.8h, v8.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    306     umlal     v20.8h, v11.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    307     umlsl     v20.8h, v5.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    308     umlsl     v20.8h, v14.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    309     mov       v21.d[0], v20.d[1]
    310 
    311     uaddl     v22.8h, v3.8b, v18.8b     //// a0 + a5                                (column2,row0)
    312     umlal     v22.8h, v9.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
    313     umlal     v22.8h, v12.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    314     umlsl     v22.8h, v6.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    315     umlsl     v22.8h, v15.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    316     ext       v30.8b, v20.8b , v21.8b , #4
    317     mov       v23.d[0], v22.d[1]
    318 
    319 
    320     uaddl     v24.8h, v4.8b, v19.8b     //// a0 + a5                                (column3,row0)
    321     ext       v29.8b, v20.8b , v21.8b , #6
    322     umlal     v24.8h, v10.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    323     umlal     v24.8h, v13.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    324     umlsl     v24.8h, v7.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    325     umlsl     v24.8h, v16.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    326     mov       v25.d[0], v24.d[1]
    327 
    328     sqrshrun  v2.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    329     ext       v31.8b, v21.8b , v22.8b , #2
    330     sqrshrun  v3.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    331     ext       v28.8b, v20.8b , v21.8b , #2
    332 
    333     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    334     ext       v31.8b, v22.8b , v23.8b , #2
    335     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    336     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    337     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    338     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    339     ext       v30.8b, v21.8b , v22.8b , #4
    340 
    341     sqrshrun  v4.8b, v24.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    342     ext       v29.8b, v21.8b , v22.8b , #6
    343 
    344     ext       v28.8b, v21.8b , v22.8b , #2
    345     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    346     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    347     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    348     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    349     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    350     ext       v31.8b, v23.8b , v24.8b , #2
    351     mov       v21.d[0], v20.d[1]
    352     ext       v2.8b, v2.8b , v3.8b , #2
    353     ext       v3.8b, v3.8b , v4.8b , #2
    354     ext       v4.8b, v4.8b , v4.8b , #2
    355 
    356     st1       {v2.8b, v3.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    357     st1       {v4.h}[0], [x11], x12     //// store row1 - 1,1/2 grid
    358 
    359     ext       v30.8b, v22.8b , v23.8b , #4
    360     ext       v29.8b, v22.8b , v23.8b , #6
    361 
    362     saddl     v2.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
    363     ext       v28.8b, v22.8b , v23.8b , #2
    364     smlal     v2.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
    365     smlal     v2.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
    366     smlsl     v2.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    367     smlsl     v2.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    368     ext       v31.8b, v24.8b , v25.8b , #2
    369 
    370     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    371     ext       v30.8b, v23.8b , v24.8b , #4
    372     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    373     ext       v29.8b, v23.8b , v24.8b , #6
    374 
    375     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    376     ext       v28.8b, v23.8b , v24.8b , #2
    377     ext       v31.8b, v25.8b , v25.8b , #2
    378     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    379     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    380     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    381     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    382     ext       v30.8b, v24.8b , v25.8b , #4
    383 
    384     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    385     ext       v29.8b, v24.8b , v25.8b , #6
    386 
    387     ext       v31.8b, v24.8b , v25.8b , #2
    388     shrn      v28.4h, v2.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    389 
    390     ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next Row data
    391     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    392     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    393     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    394     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    395     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    396     mov       v20.d[1], v21.d[0]
    397     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    398 
    399 
    400     ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
    401     ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
    402 
    403     ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
    404 
    405     ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
    406     //// ////////////// ROW 2 ///////////////////////
    407 
    408 //// Process first vertical interpolated row
    409 //// each column is
    410     uaddl     v20.8h, v5.8b, v2.8b      //// a0 + a5                             (column1,row0)
    411     movi      v31.8b, #5
    412     umlal     v20.8h, v11.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
    413     umlal     v20.8h, v14.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    414     umlsl     v20.8h, v8.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    415     umlsl     v20.8h, v17.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    416     mov       v21.d[0], v20.d[1]
    417 
    418     mov       v28.d[1], v29.d[0]
    419     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    420 
    421     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    422 
    423     uaddl     v22.8h, v6.8b, v3.8b      //// a0 + a5                                (column2,row0)
    424     umlal     v22.8h, v12.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
    425     umlal     v22.8h, v15.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    426     umlsl     v22.8h, v9.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    427     umlsl     v22.8h, v18.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    428     mov       v23.d[0], v22.d[1]
    429 
    430     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    431     ext       v30.8b, v20.8b , v21.8b , #4
    432 
    433     uaddl     v24.8h, v7.8b, v4.8b      //// a0 + a5                                (column3,row0)
    434     ext       v29.8b, v20.8b , v21.8b , #6
    435     umlal     v24.8h, v13.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    436     umlal     v24.8h, v16.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    437     umlsl     v24.8h, v10.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    438     umlsl     v24.8h, v19.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    439     mov       v25.d[0], v24.d[1]
    440 
    441     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    442     st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
    443 
    444     sqrshrun  v5.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    445     ext       v31.8b, v21.8b , v22.8b , #2
    446     sqrshrun  v6.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    447     ext       v28.8b, v20.8b , v21.8b , #2
    448 
    449     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    450     ext       v31.8b, v22.8b , v23.8b , #2
    451     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    452     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    453     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    454     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    455     ext       v30.8b, v21.8b , v22.8b , #4
    456 
    457     sqrshrun  v7.8b, v24.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    458     ext       v29.8b, v21.8b , v22.8b , #6
    459 
    460     ext       v28.8b, v21.8b , v22.8b , #2
    461     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    462     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    463     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    464     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    465     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    466     ext       v31.8b, v23.8b , v24.8b , #2
    467 
    468     ext       v5.8b, v5.8b , v6.8b , #2
    469     ext       v6.8b, v6.8b , v7.8b , #2
    470     ext       v7.8b, v7.8b , v7.8b , #2
    471 
    472     st1       {v5.8b, v6.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    473     st1       {v7.h}[0], [x11], x12     //// store row1 - 1,1/2 grid
    474 
    475     ext       v30.8b, v22.8b , v23.8b , #4
    476     ext       v29.8b, v22.8b , v23.8b , #6
    477 
    478     saddl     v6.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
    479     ext       v28.8b, v22.8b , v23.8b , #2
    480     smlal     v6.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
    481     smlal     v6.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
    482     smlsl     v6.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    483     smlsl     v6.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    484     ext       v31.8b, v24.8b , v25.8b , #2
    485 
    486     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    487     ext       v30.8b, v23.8b , v24.8b , #4
    488     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    489     ext       v29.8b, v23.8b , v24.8b , #6
    490 
    491     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    492     ext       v28.8b, v23.8b , v24.8b , #2
    493     ext       v31.8b, v25.8b , v25.8b , #2
    494     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    495     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    496     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    497     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    498     ext       v30.8b, v24.8b , v25.8b , #4
    499 
    500     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    501     ext       v29.8b, v24.8b , v25.8b , #6
    502 
    503     ext       v31.8b, v24.8b , v25.8b , #2
    504     shrn      v28.4h, v6.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    505 
    506     ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next Row data
    507     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    508     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    509     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    510     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    511     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    512     mov       v20.d[1], v21.d[0]
    513     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    514 
    515 
    516     ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
    517     ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
    518 
    519     ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
    520 
    521     ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
    522     //// ////////////// ROW 3 ///////////////////////
    523 
    524 //// Process first vertical interpolated row
    525 //// each column is
    526     uaddl     v20.8h, v8.8b, v5.8b      //// a0 + a5                             (column1,row0)
    527     movi      v31.8b, #5
    528     umlal     v20.8h, v14.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
    529     umlal     v20.8h, v17.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    530     umlsl     v20.8h, v11.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    531     umlsl     v20.8h, v2.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    532     mov       v21.d[0], v20.d[1]
    533 
    534     mov       v28.d[1], v29.d[0]
    535     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    536     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    537 
    538     uaddl     v22.8h, v9.8b, v6.8b      //// a0 + a5                                (column2,row0)
    539     umlal     v22.8h, v15.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
    540     umlal     v22.8h, v18.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    541     umlsl     v22.8h, v12.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    542     umlsl     v22.8h, v3.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    543     mov       v23.d[0], v22.d[1]
    544 
    545     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    546     ext       v30.8b, v20.8b , v21.8b , #4
    547 
    548     uaddl     v24.8h, v10.8b, v7.8b     //// a0 + a5                                (column3,row0)
    549     ext       v29.8b, v20.8b , v21.8b , #6
    550     umlal     v24.8h, v16.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    551     umlal     v24.8h, v19.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    552     umlsl     v24.8h, v13.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    553     umlsl     v24.8h, v4.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    554     mov       v25.d[0], v24.d[1]
    555 
    556     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    557     st1       { v28.h}[0], [x2], x19    //// store 1/2,1,2 grif values
    558 
    559     sqrshrun  v8.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    560     ext       v31.8b, v21.8b , v22.8b , #2
    561     sqrshrun  v9.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    562     ext       v28.8b, v20.8b , v21.8b , #2
    563 
    564     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    565     ext       v31.8b, v22.8b , v23.8b , #2
    566     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    567     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    568     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    569     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    570     ext       v30.8b, v21.8b , v22.8b , #4
    571 
    572     sqrshrun  v10.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    573     ext       v29.8b, v21.8b , v22.8b , #6
    574 
    575     ext       v28.8b, v21.8b , v22.8b , #2
    576     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    577     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    578     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    579     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    580     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    581     ext       v31.8b, v23.8b , v24.8b , #2
    582 
    583     ext       v8.8b, v8.8b , v9.8b , #2
    584     ext       v9.8b, v9.8b , v10.8b , #2
    585     ext       v10.8b, v10.8b , v10.8b , #2
    586 
    587     st1       {v8.8b, v9.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    588     st1       {v10.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
    589 
    590     ext       v30.8b, v22.8b , v23.8b , #4
    591     ext       v29.8b, v22.8b , v23.8b , #6
    592 
    593     saddl     v8.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
    594     ext       v28.8b, v22.8b , v23.8b , #2
    595     smlal     v8.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
    596     smlal     v8.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
    597     smlsl     v8.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    598     smlsl     v8.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    599     ext       v31.8b, v24.8b , v25.8b , #2
    600 
    601     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    602     ext       v30.8b, v23.8b , v24.8b , #4
    603     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    604     ext       v29.8b, v23.8b , v24.8b , #6
    605 
    606     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    607     ext       v28.8b, v23.8b , v24.8b , #2
    608     ext       v31.8b, v25.8b , v25.8b , #2
    609     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    610     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    611     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    612     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    613     ext       v30.8b, v24.8b , v25.8b , #4
    614 
    615     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    616     ext       v29.8b, v24.8b , v25.8b , #6
    617 
    618     ext       v31.8b, v24.8b , v25.8b , #2
    619     shrn      v28.4h, v8.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    620 
    621     ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next Row data
    622     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    623     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    624     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    625     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    626     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    627     mov       v20.d[1], v21.d[0]
    628     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    629 
    630 
    631     ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
    632     ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
    633 
    634     ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
    635 
    636     ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
    637     //// ////////////// ROW 4 ///////////////////////
    638 
    639 //// Process first vertical interpolated row
    640 //// each column is
    641     uaddl     v20.8h, v11.8b, v8.8b     //// a0 + a5                             (column1,row0)
    642     movi      v31.8b, #5
    643     umlal     v20.8h, v17.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
    644     umlal     v20.8h, v2.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    645     umlsl     v20.8h, v14.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    646     umlsl     v20.8h, v5.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    647     mov       v21.d[0], v20.d[1]
    648     mov       v28.d[1], v29.d[0]
    649     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    650     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    651 
    652     uaddl     v22.8h, v12.8b, v9.8b     //// a0 + a5                                (column2,row0)
    653     umlal     v22.8h, v18.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
    654     umlal     v22.8h, v3.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    655     umlsl     v22.8h, v15.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    656     umlsl     v22.8h, v6.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    657     mov       v23.d[0], v22.d[1]
    658 
    659     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    660     ext       v30.8b, v20.8b , v21.8b , #4
    661 
    662     uaddl     v24.8h, v13.8b, v10.8b    //// a0 + a5                                (column3,row0)
    663     ext       v29.8b, v20.8b , v21.8b , #6
    664     umlal     v24.8h, v19.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
    665     umlal     v24.8h, v4.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    666     umlsl     v24.8h, v16.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    667     umlsl     v24.8h, v7.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    668     mov       v25.d[0], v24.d[1]
    669 
    670     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    671     st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
    672 
    673     sqrshrun  v11.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    674     ext       v31.8b, v21.8b , v22.8b , #2
    675     sqrshrun  v12.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    676     ext       v28.8b, v20.8b , v21.8b , #2
    677 
    678     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    679     ext       v31.8b, v22.8b , v23.8b , #2
    680     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    681     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    682     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    683     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    684     ext       v30.8b, v21.8b , v22.8b , #4
    685 
    686     sqrshrun  v13.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    687     ext       v29.8b, v21.8b , v22.8b , #6
    688 
    689     ext       v28.8b, v21.8b , v22.8b , #2
    690     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    691     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    692     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    693     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    694     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    695     ext       v31.8b, v23.8b , v24.8b , #2
    696 
    697     ext       v11.8b, v11.8b , v12.8b , #2
    698     ext       v12.8b, v12.8b , v13.8b , #2
    699     ext       v13.8b, v13.8b , v13.8b , #2
    700 
    701     st1       {v11.8b, v12.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    702     st1       {v13.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
    703 
    704     ext       v30.8b, v22.8b , v23.8b , #4
    705     ext       v29.8b, v22.8b , v23.8b , #6
    706 
    707     saddl     v12.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
    708     ext       v28.8b, v22.8b , v23.8b , #2
    709     smlal     v12.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
    710     smlal     v12.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
    711     smlsl     v12.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    712     smlsl     v12.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    713     ext       v31.8b, v24.8b , v25.8b , #2
    714 
    715     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    716     ext       v30.8b, v23.8b , v24.8b , #4
    717     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    718     ext       v29.8b, v23.8b , v24.8b , #6
    719 
    720     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    721     ext       v28.8b, v23.8b , v24.8b , #2
    722     ext       v31.8b, v25.8b , v25.8b , #2
    723     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    724     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    725     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    726     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    727     ext       v30.8b, v24.8b , v25.8b , #4
    728 
    729     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    730     ext       v29.8b, v24.8b , v25.8b , #6
    731 
    732     ext       v31.8b, v24.8b , v25.8b , #2
    733     shrn      v28.4h, v12.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    734 
    735     ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3 //// Load next Row data
    736     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    737     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    738     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    739     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    740     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    741     mov       v20.d[1], v21.d[0]
    742     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    743 
    744 
    745     ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
    746     ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
    747 
    748     ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
    749 
    750     ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
    751     //// ////////////// ROW 5 ///////////////////////
    752 
    753 //// Process first vertical interpolated row
    754 //// each column is
    755     uaddl     v20.8h, v14.8b, v11.8b    //// a0 + a5                             (column1,row0)
    756     movi      v31.8b, #5
    757     umlal     v20.8h, v2.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    758     umlal     v20.8h, v5.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    759     umlsl     v20.8h, v17.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    760     umlsl     v20.8h, v8.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    761     mov       v21.d[0], v20.d[1]
    762     mov       v28.d[1], v29.d[0]
    763     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    764     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    765 
    766     uaddl     v22.8h, v15.8b, v12.8b    //// a0 + a5                                (column2,row0)
    767     umlal     v22.8h, v3.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
    768     umlal     v22.8h, v6.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    769     umlsl     v22.8h, v18.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    770     umlsl     v22.8h, v9.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    771     mov       v23.d[0], v22.d[1]
    772 
    773     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    774     ext       v30.8b, v20.8b , v21.8b , #4
    775 
    776     uaddl     v24.8h, v16.8b, v13.8b    //// a0 + a5                                (column3,row0)
    777     ext       v29.8b, v20.8b , v21.8b , #6
    778     umlal     v24.8h, v4.8b, v1.8b      //// a0 + a5 + 20a2                        (column3,row0)
    779     umlal     v24.8h, v7.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    780     umlsl     v24.8h, v19.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    781     umlsl     v24.8h, v10.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    782     mov       v25.d[0], v24.d[1]
    783 
    784     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    785     st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
    786 
    787     sqrshrun  v14.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    788     ext       v31.8b, v21.8b , v22.8b , #2
    789     sqrshrun  v15.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    790     ext       v28.8b, v20.8b , v21.8b , #2
    791 
    792     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    793     ext       v31.8b, v22.8b , v23.8b , #2
    794     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    795     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    796     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    797     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    798     ext       v30.8b, v21.8b , v22.8b , #4
    799 
    800     sqrshrun  v16.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    801     ext       v29.8b, v21.8b , v22.8b , #6
    802 
    803     ext       v28.8b, v21.8b , v22.8b , #2
    804     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    805     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    806     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    807     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    808     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    809     ext       v31.8b, v23.8b , v24.8b , #2
    810 
    811     ext       v14.8b, v14.8b , v15.8b , #2
    812     ext       v15.8b, v15.8b , v16.8b , #2
    813     ext       v16.8b, v16.8b , v16.8b , #2
    814 
    815     st1       {v14.8b, v15.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    816     st1       {v16.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
    817 
    818     ext       v30.8b, v22.8b , v23.8b , #4
    819     ext       v29.8b, v22.8b , v23.8b , #6
    820 
    821     saddl     v14.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
    822     ext       v28.8b, v22.8b , v23.8b , #2
    823     smlal     v14.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
    824     smlal     v14.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
    825     smlsl     v14.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    826     smlsl     v14.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    827     ext       v31.8b, v24.8b , v25.8b , #2
    828 
    829     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    830     ext       v30.8b, v23.8b , v24.8b , #4
    831     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    832     ext       v29.8b, v23.8b , v24.8b , #6
    833 
    834     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    835     ext       v28.8b, v23.8b , v24.8b , #2
    836     ext       v31.8b, v25.8b , v25.8b , #2
    837     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    838     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    839     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    840     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    841     ext       v30.8b, v24.8b , v25.8b , #4
    842 
    843     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    844     ext       v29.8b, v24.8b , v25.8b , #6
    845 
    846     ext       v31.8b, v24.8b , v25.8b , #2
    847     shrn      v28.4h, v14.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    848 
    849     ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3 //// Load next Row data
    850     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    851     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    852     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    853     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    854     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    855     mov       v20.d[1], v21.d[0]
    856     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    857 
    858 
    859     ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
    860     ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
    861 
    862     ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
    863 
    864     ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
    865     //// ////////////// ROW 6 ///////////////////////
    866 
    867 //// Process first vertical interpolated row
    868 //// each column is
    869 
    870     cmp       x10, #1                   //// if it 17 rows are complete skip
    871     beq       filter_2dvh_skip_row
    872     uaddl     v20.8h, v17.8b, v14.8b    //// a0 + a5                             (column1,row0)
    873     movi      v31.8b, #5
    874     umlal     v20.8h, v5.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
    875     umlal     v20.8h, v8.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
    876     umlsl     v20.8h, v2.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
    877     umlsl     v20.8h, v11.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
    878     mov       v21.d[0], v20.d[1]
    879     mov       v28.d[1], v29.d[0]
    880     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    881     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    882 
    883     uaddl     v22.8h, v18.8b, v15.8b    //// a0 + a5                                (column2,row0)
    884     umlal     v22.8h, v6.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
    885     umlal     v22.8h, v9.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
    886     umlsl     v22.8h, v3.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
    887     umlsl     v22.8h, v12.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    888     mov       v23.d[0], v22.d[1]
    889 
    890     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    891     ext       v30.8b, v20.8b , v21.8b , #4
    892 
    893     uaddl     v24.8h, v19.8b, v16.8b    //// a0 + a5                                (column3,row0)
    894     ext       v29.8b, v20.8b , v21.8b , #6
    895     umlal     v24.8h, v7.8b, v1.8b      //// a0 + a5 + 20a2                        (column3,row0)
    896     umlal     v24.8h, v10.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
    897     umlsl     v24.8h, v4.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
    898     umlsl     v24.8h, v13.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
    899     mov       v25.d[0], v24.d[1]
    900 
    901     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    902     st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
    903 
    904     sqrshrun  v17.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
    905     ext       v31.8b, v21.8b , v22.8b , #2
    906     sqrshrun  v18.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
    907     ext       v28.8b, v20.8b , v21.8b , #2
    908 
    909     saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
    910     ext       v31.8b, v22.8b , v23.8b , #2
    911     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
    912     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
    913     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
    914     smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
    915     ext       v30.8b, v21.8b , v22.8b , #4
    916 
    917     sqrshrun  v19.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
    918     ext       v29.8b, v21.8b , v22.8b , #6
    919 
    920     ext       v28.8b, v21.8b , v22.8b , #2
    921     saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
    922     smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
    923     smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
    924     smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
    925     smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
    926     ext       v31.8b, v23.8b , v24.8b , #2
    927 
    928     ext       v17.8b, v17.8b , v18.8b , #2
    929     ext       v18.8b, v18.8b , v19.8b , #2
    930     ext       v19.8b, v19.8b , v19.8b , #2
    931 
    932     st1       {v17.8b, v18.8b}, [x1], x12 //// store row1 - 1,1/2 grid
    933     st1       {v19.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
    934 
    935     ext       v30.8b, v22.8b , v23.8b , #4
    936     ext       v29.8b, v22.8b , v23.8b , #6
    937 
    938     saddl     v18.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
    939     ext       v28.8b, v22.8b , v23.8b , #2
    940     smlal     v18.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
    941     smlal     v18.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
    942     smlsl     v18.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
    943     smlsl     v18.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
    944     ext       v31.8b, v24.8b , v25.8b , #2
    945 
    946     shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
    947     ext       v30.8b, v23.8b , v24.8b , #4
    948     shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
    949     ext       v29.8b, v23.8b , v24.8b , #6
    950 
    951     saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
    952     ext       v28.8b, v23.8b , v24.8b , #2
    953     ext       v31.8b, v25.8b , v25.8b , #2
    954     smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
    955     smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
    956     smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
    957     smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
    958     ext       v30.8b, v24.8b , v25.8b , #4
    959 
    960     saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
    961     ext       v29.8b, v24.8b , v25.8b , #6
    962 
    963     ext       v31.8b, v24.8b , v25.8b , #2
    964     shrn      v28.4h, v18.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
    965 
    966     ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3 //// Load next Row data
    967     smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
    968     smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
    969     smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
    970     smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
    971     shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
    972     mov       v20.d[1], v21.d[0]
    973     sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
    974 
    975     mov       v28.d[1], v29.d[0]
    976     sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
    977     shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
    978 
    979     sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
    980 
    981     st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
    982     st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
    983 
    984     subs      x10, x10, #1              ////decrement loop counter
    985 
    986     bne       filter_2dvh_loop
    987 
    988 
    989 //// Process first vertical interpolated row
    990 //// each column is
    991     //// ////////////// ROW 13 ///////////////////////
    992 
    993 //// Process first vertical interpolated row
    994 //// each column is
    995 
    996     // LDMFD sp!,{x10,x11,x12,pc}
    997     ldp       x19, x20, [sp], #16
    998     pop_v_regs
    999     ret
   1000 
   1001 filter_2dvh_skip_row:                   //// exit tail, entered via the loop's "cmp x10, #1 / beq": write out the pending half,half row that is still held in v26/v28/v29/v22, then return
   1002     mov       v28.d[1], v29.d[0]        //// append set4 (low half of v29) after set3 (low half of v28) -> v28.8h = {set3, set4}
   1003     sqrshrun  v27.8b, v28.8h, #2        //// half,half grid set3,4: signed rounding shift right by 2, saturated and narrowed to unsigned bytes
   1004     shrn      v28.4h, v22.4s, #8        //// set5: shift the 32-bit sums right by 8 and narrow to 16 bits; the remaining shift by 2 (with rounding) is done next
   1005 
   1006     sqrshrun  v28.8b, v28.8h, #2        //// half,half grid set5: rounding shift by 2 and saturate to bytes (only the first halfword is stored below)
   1007 
   1008     st1       {v26.8b, v27.8b}, [x2], #16 //// store 16 bytes of the half,half grid row (v26 = set1,2 computed before the branch, v27 = set3,4); x2 += 16
   1009     st1       {v28.h}[0], [x2], x19     //// store the trailing 2 bytes (set5), then advance x2 by x19 (x19 is set up outside this chunk - presumably the remaining row stride, TODO confirm)
   1010     // LDMFD sp!,{x10,x11,x12,pc} -- appears to be left over from a 32-bit ARM version of this routine; the AArch64 restore sequence follows
   1011     ldp       x19, x20, [sp], #16       //// reload x19/x20 and release 16 bytes of stack (matching save is not visible here, presumably in the prologue)
   1012     pop_v_regs                          //// macro defined outside this chunk (file includes ih264_neon_macros.s); presumably restores the SIMD registers saved on entry - verify
   1013     ret
   1014 
   1015 
   1016 ///*****************************************
   1017