Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //******************************************************************************
     20 //* //file
     21 //*  ihevc_inter_pred_luma_horz.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions for inter prediction  interpolation.
     25 //* functions are hand-coded in aarch64 neon assembly (gnu assembler syntax)
     26 
     27 //*
     28 //*
     29 //* //author
     30 //*  parthiban v
     31 //*
     32 //* //par list of functions:
     33 //*
     34 //*  - ihevc_inter_pred_luma_horz()
     35 //*
     36 //* //remarks
     37 //*  none
     38 //*
     39 //*******************************************************************************
     40 //*/
     41 
     42 ///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
     43 ///* include reconstruction */
     44 //
     45 
     46 ///**
     47 //*******************************************************************************
     48 //*
     49 //* //brief
     50 //*     inter prediction luma filter for horizontal input
     51 //*
     52 //* //par description:
     53 //*    applies a horizontal 8-tap filter with coefficients pointed to by
     54 //*    'pi1_coeff' to the elements pointed to by 'pu1_src' and writes the result
     55 //*    to the location pointed to by 'pu1_dst'. the output is downshifted by 6
     56 //*    (with rounding) and clipped to 8 bits. assumptions: the function is
     57 //*    optimized assuming width is a multiple of 4 or 8 and height is a multiple of 2.
     58 //*
     59 //* //param[in] pu1_src
     60 //*  uword8 pointer to the source
     61 //*
     62 //* //param[out] pu1_dst
     63 //*  uword8 pointer to the destination
     64 //*
     65 //* //param[in] src_strd
     66 //*  integer source stride
     67 //*
     68 //* //param[in] dst_strd
     69 //*  integer destination stride
     70 //*
     71 //* //param[in] pi1_coeff
     72 //*  word8 pointer to the filter coefficients
     73 //*
     74 //* //param[in] ht
     75 //*  integer height of the array
     76 //*
     77 //* //param[in] wd
     78 //*  integer width of the array
     79 //*
     80 //* //returns
     81 //*
     82 //* //remarks
     83 //*  none
     84 //*
     85 //*******************************************************************************
     86 //*/
     87 
     88 //void ihevc_inter_pred_luma_horz (
     89 //                            uword8 *pu1_src,
     90 //                            uword8 *pu1_dst,
     91 //                            word32 src_strd,
     92 //                            word32 dst_strd,
     93 //                            word8 *pi1_coeff,
     94 //                            word32 ht,
     95 //                            word32 wd   )
     96 
     97 //**************variables vs registers*****************************************
     98 //    x0 => *pu1_src
     99 //    x1 => *pu1_dst
    100 //    x2 =>  src_strd
    101 //    x3 =>  dst_strd
    102 //    x4 => *pi1_coeff
    103 //    x5 =>  ht
    104 //    x6 =>  wd
    105 
    .text
    .align 4

    .include "ihevc_neon_macros.s"

    .globl ihevc_inter_pred_luma_horz_av8
    .type ihevc_inter_pred_luma_horz_av8, %function

//------------------------------------------------------------------------------
// void ihevc_inter_pred_luma_horz_av8(uword8 *pu1_src, uword8 *pu1_dst,
//                                     word32 src_strd, word32 dst_strd,
//                                     word8 *pi1_coeff, word32 ht, word32 wd)
//
// Target : AArch64, GNU assembler syntax.
//
// Horizontal 8-tap filter. For every output pixel the routine computes
//     acc = - |c0|*s[-3] + |c1|*s[-2] - |c2|*s[-1] + |c3|*s[0]
//           + |c4|*s[1]  - |c5|*s[2]  + |c6|*s[3]  - |c7|*s[4]
// in 16-bit lanes, then stores sat_u8((acc + 32) >> 6) (sqrshrun #6).
// The sign of each tap is fixed by the umlal/umlsl choice below; only the
// magnitudes of the eight coefficients are read from pi1_coeff.
//
// In  : x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
//       x4 = pi1_coeff (8 signed bytes), w5 = ht, w6 = wd
// Out : none (writes ht rows of wd bytes at pu1_dst)
//
// Two rows are produced per outer-loop iteration, so ht is expected to be
// even; an odd ht is rounded up to the next even row count. ht <= 0 returns
// immediately.
//
// Width dispatch (as implemented below):
//     wd <= 4          -> 4-wide path
//     5 <= wd <= 15    -> 8-wide path; wd == 12 is done as 8 + a 4-wide residual
//     wd >= 16         -> 16-wide path; wd == 24 is done as 16 + an 8-wide residual
//
// The 4-wide path loads 8 bytes per tap, i.e. it reads up to 4 bytes past the
// last source byte it actually needs.
//
// Register roles:
//     x1  row-0 destination pointer        x6  row-1 destination pointer
//     x12 row-0 source pointer (src - 3)   x4  row-1 source pointer
//     x5  columns left in the current row  x14 rows left
//     x7  original pu1_dst                 x10 columns handled by this pass
//     x8  dst advance after a row pair     x9  src advance after a row pair
//     x11 constant 1 (load post-index)     x15 constant -7 (16-wide path)
//     x16 ht, x17 wd (preserved copies)    x20 scratch
//     v24..v31  |coeff[0..7]| broadcast to all byte lanes
//
// Clobbers: x4-x12, x14-x17, v0-v7, v16-v31, flags. x19/x20 are saved here;
// push_v_regs/pop_v_regs come from ihevc_neon_macros.s and are assumed to
// save/restore the callee-saved SIMD registers (v8-v15 are written below) --
// TODO confirm against the macro file.
//------------------------------------------------------------------------------
ihevc_inter_pred_luma_horz_av8:

    push_v_regs
    stp         x19, x20, [sp, #-16]!       // x20 is scratch; the pair keeps sp 16-byte aligned

    // AAPCS64 leaves bits 63:32 of a 32-bit argument unspecified, so widen the
    // int arguments before they are used in 64-bit arithmetic.
    sxtw        x2, w2                      // src_strd
    sxtw        x3, w3                      // dst_strd
    sxtw        x16, w5                     // ht
    sxtw        x17, w6                     // wd

    ld1         {v0.8b}, [x4]               // coeff = vld1_s8(pi1_coeff)
    mov         x11, #1
    mov         x10, x17                    // columns for the first pass
    subs        x14, x16, #0                // x14 = rows left
    ble         end_loops                   // nothing to do for ht <= 0

    abs         v2.8b, v0.8b                // vabs_s8(coeff)

    dup         v24.8b, v2.b[0]             // coeffabs_0
    sub         x12, x0, #3                 // pu1_src - 3 (first tap position)
    dup         v25.8b, v2.b[1]             // coeffabs_1
    lsl         x9, x2, #1
    dup         v26.8b, v2.b[2]             // coeffabs_2
    sub         x9, x9, x10                 // 2*src_strd - wd
    dup         v27.8b, v2.b[3]             // coeffabs_3
    lsl         x8, x3, #1
    dup         v28.8b, v2.b[4]             // coeffabs_4
    sub         x8, x8, x10                 // 2*dst_strd - wd
    dup         v29.8b, v2.b[5]             // coeffabs_5
    dup         v30.8b, v2.b[6]             // coeffabs_6
    dup         v31.8b, v2.b[7]             // coeffabs_7

    mov         x7, x1                      // keep pu1_dst for the residual passes

    cmp         x10, #4
    ble         outer_loop_4

    // wd == 24: first pass covers 16 columns, so each row pair must skip the
    // 8 columns left for the residual pass.
    cmp         x10, #24
    mov         x20, #16
    csel        x10, x20, x10, eq
    add         x20, x8, #8
    csel        x8, x20, x8, eq
    add         x20, x9, #8
    csel        x9, x20, x9, eq

    cmp         x10, #16
    bge         outer_loop_16

    // wd == 12: first pass covers 8 columns, skip the 4 left for the residual.
    cmp         x10, #12
    add         x20, x8, #4
    csel        x8, x20, x8, eq
    add         x20, x9, #4
    csel        x9, x20, x9, eq
    b           outer_loop_8


//------------------------------------------------------------------------------
// 8-wide path
//------------------------------------------------------------------------------
outer_loop8_residual:                       // columns 16..23 of a 24-wide block
    sub         x12, x0, #3                 // pu1_src - 3
    mov         x1, x7
    mov         x14, x16                    // all ht rows again
    add         x1, x1, #16
    add         x12, x12, #16
    mov         x10, #8
    add         x8, x8, #8                  // 2*dst_strd - 8
    add         x9, x9, #8                  // 2*src_strd - 8

outer_loop_8:
    add         x6, x1, x3                  // pu1_dst + dst_strd
    add         x4, x12, x2                 // pu1_src + src_strd
    subs        x5, x10, #0                 // columns left in this row pair
    ble         end_inner_loop_8

inner_loop_8:
    // v0..v7   = row 0, 8 bytes starting at tap offsets 0..7
    // v12..v19 = row 1, same offsets
    ld1         {v0.2s}, [x12], x11
    ld1         {v1.2s}, [x12], x11
    ld1         {v2.2s}, [x12], x11
    ld1         {v3.2s}, [x12], x11
    ld1         {v4.2s}, [x12], x11
    umull       v8.8h, v1.8b, v25.8b        // acc0  = src[1] * coeffabs_1
    ld1         {v5.2s}, [x12], x11
    umlal       v8.8h, v3.8b, v27.8b        // acc0 += src[3] * coeffabs_3
    ld1         {v6.2s}, [x12], x11
    umlsl       v8.8h, v0.8b, v24.8b        // acc0 -= src[0] * coeffabs_0
    ld1         {v7.2s}, [x12], x11
    umlsl       v8.8h, v2.8b, v26.8b        // acc0 -= src[2] * coeffabs_2
    ld1         {v12.2s}, [x4], x11
    umlal       v8.8h, v4.8b, v28.8b        // acc0 += src[4] * coeffabs_4
    ld1         {v13.2s}, [x4], x11
    umlsl       v8.8h, v5.8b, v29.8b        // acc0 -= src[5] * coeffabs_5
    ld1         {v14.2s}, [x4], x11
    umlal       v8.8h, v6.8b, v30.8b        // acc0 += src[6] * coeffabs_6
    ld1         {v15.2s}, [x4], x11
    umlsl       v8.8h, v7.8b, v31.8b        // acc0 -= src[7] * coeffabs_7
    ld1         {v16.2s}, [x4], x11

    umull       v10.8h, v15.8b, v27.8b      // acc1  = src[3] * coeffabs_3
    ld1         {v17.2s}, [x4], x11
    umlsl       v10.8h, v14.8b, v26.8b      // acc1 -= src[2] * coeffabs_2
    ld1         {v18.2s}, [x4], x11
    umlal       v10.8h, v16.8b, v28.8b      // acc1 += src[4] * coeffabs_4
    ld1         {v19.2s}, [x4], x11
    umlsl       v10.8h, v17.8b, v29.8b      // acc1 -= src[5] * coeffabs_5
    sqrshrun    v20.8b, v8.8h, #6           // row 0: round, shift, saturate to u8
    umlal       v10.8h, v18.8b, v30.8b      // acc1 += src[6] * coeffabs_6
    umlsl       v10.8h, v19.8b, v31.8b      // acc1 -= src[7] * coeffabs_7
    st1         {v20.8b}, [x1], #8          // store row 0
    umlsl       v10.8h, v12.8b, v24.8b      // acc1 -= src[0] * coeffabs_0
    umlal       v10.8h, v13.8b, v25.8b      // acc1 += src[1] * coeffabs_1

    sqrshrun    v8.8b, v10.8h, #6           // row 1: round, shift, saturate to u8
    sub         x5, x5, #8                  // 8 columns done
    st1         {v8.8b}, [x6], #8           // store row 1
    cmp         x5, #4                      // a 4-column tail (wd == 12) is left to the residual
    bgt         inner_loop_8

end_inner_loop_8:
    subs        x14, x14, #2                // two rows done
    add         x12, x12, x9                // src += 2*src_strd - columns
    add         x1, x1, x8                  // dst += 2*dst_strd - columns
    bgt         outer_loop_8

    cmp         x17, #12                    // wd == 12: columns 8..11 still pending
    beq         outer_loop4_residual
    b           end_loops


//------------------------------------------------------------------------------
// 16-wide path (software pipelined: row-0 low half is accumulated one stage
// ahead of the loop body)
//------------------------------------------------------------------------------
outer_loop_16:
    mov         x15, #-7                    // "+8 then -7" load pairs step the tap offset by 1

    add         x6, x1, x3                  // pu1_dst + dst_strd
    add         x4, x12, x2                 // pu1_src + src_strd
    mov         x5, x10                     // columns left in this row pair
    add         x20, x12, x2, lsl #1
    prfm        PLDL1KEEP, [x20]
    // v(2k)/v(2k+1) for k = 0..3 and v(12+2(k-4))/v(13+2(k-4)) for k = 4..7
    // hold the low/high 8 bytes of the row at tap offset k.
    ld1         {v0.2s}, [x12], #8
    ld1         {v1.2s}, [x12], x15
    add         x20, x4, x2, lsl #1
    prfm        PLDL1KEEP, [x20]
    ld1         {v2.2s}, [x12], #8
    ld1         {v3.2s}, [x12], x15
    ld1         {v4.2s}, [x12], #8
    ld1         {v5.2s}, [x12], x15
    ld1         {v6.2s}, [x12], #8
    ld1         {v7.2s}, [x12], x15
    ld1         {v12.2s}, [x12], #8
    ld1         {v13.2s}, [x12], x15
    umull       v8.8h, v2.8b, v25.8b        // acc  = src[1] * coeffabs_1
    ld1         {v14.2s}, [x12], #8
    ld1         {v15.2s}, [x12], x15
    umlal       v8.8h, v6.8b, v27.8b        // acc += src[3] * coeffabs_3
    ld1         {v16.2s}, [x12], #8
    ld1         {v17.2s}, [x12], x15
    umlsl       v8.8h, v0.8b, v24.8b        // acc -= src[0] * coeffabs_0
    ld1         {v18.2s}, [x12], #8
    ld1         {v19.2s}, [x12], x15
    umlsl       v8.8h, v4.8b, v26.8b        // acc -= src[2] * coeffabs_2
    umlal       v8.8h, v12.8b, v28.8b       // acc += src[4] * coeffabs_4
    umlsl       v8.8h, v14.8b, v29.8b       // acc -= src[5] * coeffabs_5
    umlal       v8.8h, v16.8b, v30.8b       // acc += src[6] * coeffabs_6
    umlsl       v8.8h, v18.8b, v31.8b       // acc -= src[7] * coeffabs_7

inner_loop_16:
    // The flags from this subs stay live (no flag-setting instruction follows)
    // until "cmp x14, #0"; every "csel ..., eq" in between means
    // "this was the last 16-column group of the row pair".
    subs        x5, x5, #16
    umull       v20.8h, v3.8b, v25.8b       // row 0, high half

    add         x12, x12, #8                // the loads moved x12 by 8; make it 16
    umlsl       v20.8h, v1.8b, v24.8b

    sub         x20, x14, #2
    csel        x14, x20, x14, eq           // row pair finished -> two rows fewer
    umlal       v20.8h, v7.8b, v27.8b

    ld1         {v0.2s}, [x4], #8           // row 1 loads
    ld1         {v1.2s}, [x4], x15

    umlsl       v20.8h, v5.8b, v26.8b

    ld1         {v2.2s}, [x4], #8
    ld1         {v3.2s}, [x4], x15

    umlal       v20.8h, v13.8b, v28.8b

    ld1         {v4.2s}, [x4], #8
    ld1         {v5.2s}, [x4], x15
    umlal       v20.8h, v17.8b, v30.8b

    ld1         {v6.2s}, [x4], #8
    ld1         {v7.2s}, [x4], x15
    umlsl       v20.8h, v15.8b, v29.8b

    ld1         {v12.2s}, [x4], #8
    ld1         {v13.2s}, [x4], x15
    umlsl       v20.8h, v19.8b, v31.8b

    ld1         {v14.2s}, [x4], #8
    ld1         {v15.2s}, [x4], x15
    sqrshrun    v8.8b, v8.8h, #6            // row 0 low half: round, shift, saturate

    ld1         {v16.2s}, [x4], #8
    ld1         {v17.2s}, [x4], x15
    umull       v10.8h, v2.8b, v25.8b       // row 1, low half

    ld1         {v18.2s}, [x4], #8
    ld1         {v19.2s}, [x4], x15
    umlal       v10.8h, v6.8b, v27.8b

    add         x4, x4, #8                  // the loads moved x4 by 8; make it 16
    umlsl       v10.8h, v0.8b, v24.8b

    add         x20, x12, x9                // src += 2*src_strd - columns
    csel        x12, x20, x12, eq
    umlsl       v10.8h, v4.8b, v26.8b

    add         x20, x12, x2                // pu1_src + src_strd
    csel        x4, x20, x4, eq
    sqrshrun    v9.8b, v20.8h, #6           // row 0 high half

    umlal       v10.8h, v12.8b, v28.8b

    umlsl       v10.8h, v14.8b, v29.8b

    umlal       v10.8h, v16.8b, v30.8b

    umlsl       v10.8h, v18.8b, v31.8b

    umull       v22.8h, v3.8b, v25.8b       // row 1, high half

    umlsl       v22.8h, v1.8b, v24.8b

    st1         {v8.8b}, [x1], #8           // store row 0, 16 bytes
    st1         {v9.8b}, [x1], #8
    umlal       v22.8h, v7.8b, v27.8b

    add         x20, x1, x8                 // dst += 2*dst_strd - columns
    csel        x1, x20, x1, eq
    sqrshrun    v10.8b, v10.8h, #6          // row 1 low half

    umlsl       v22.8h, v5.8b, v26.8b

    add         x20, x12, x2, lsl #2
    prfm        PLDL1KEEP, [x20]
    umlal       v22.8h, v13.8b, v28.8b

    add         x20, x4, x2, lsl #2
    prfm        PLDL1KEEP, [x20]
    umlal       v22.8h, v17.8b, v30.8b

    umlsl       v22.8h, v15.8b, v29.8b

    cmp         x14, #0
    umlsl       v22.8h, v19.8b, v31.8b

    ble         epilog_16                   // no rows left (ble also terminates for odd ht)

    ld1         {v0.2s}, [x12], #8          // row 0 loads for the next group
    ld1         {v1.2s}, [x12], x15
    ld1         {v2.2s}, [x12], #8
    ld1         {v3.2s}, [x12], x15
    ld1         {v4.2s}, [x12], #8
    ld1         {v5.2s}, [x12], x15
    ld1         {v6.2s}, [x12], #8
    ld1         {v7.2s}, [x12], x15
    ld1         {v12.2s}, [x12], #8
    ld1         {v13.2s}, [x12], x15
    sqrshrun    v11.8b, v22.8h, #6          // row 1 high half
    umull       v8.8h, v2.8b, v25.8b        // acc  = src[1] * coeffabs_1
    ld1         {v14.2s}, [x12], #8
    ld1         {v15.2s}, [x12], x15
    umlal       v8.8h, v6.8b, v27.8b        // acc += src[3] * coeffabs_3
    ld1         {v16.2s}, [x12], #8
    ld1         {v17.2s}, [x12], x15
    umlsl       v8.8h, v0.8b, v24.8b        // acc -= src[0] * coeffabs_0
    ld1         {v18.2s}, [x12], #8
    ld1         {v19.2s}, [x12], x15
    umlsl       v8.8h, v4.8b, v26.8b        // acc -= src[2] * coeffabs_2
    umlal       v8.8h, v12.8b, v28.8b       // acc += src[4] * coeffabs_4
    cmp         x5, #0                      // row pair finished?
    umlsl       v8.8h, v14.8b, v29.8b       // acc -= src[5] * coeffabs_5
    csel        x5, x10, x5, eq             // yes: reload the column count
    umlal       v8.8h, v16.8b, v30.8b       // acc += src[6] * coeffabs_6
    st1         {v10.8b}, [x6], #8          // store row 1, 16 bytes
    st1         {v11.8b}, [x6], #8
    umlsl       v8.8h, v18.8b, v31.8b       // acc -= src[7] * coeffabs_7
    add         x20, x1, x3                 // pu1_dst + dst_strd
    csel        x6, x20, x6, eq
    b           inner_loop_16

epilog_16:
    sqrshrun    v11.8b, v22.8h, #6          // drain the pipeline: last row-1 group
    st1         {v10.8b}, [x6], #8
    st1         {v11.8b}, [x6], #8

    cmp         x17, #24                    // wd == 24: columns 16..23 still pending
    beq         outer_loop8_residual
    b           end_loops


//------------------------------------------------------------------------------
// 4-wide path: row 0 and row 1 share one vector (row 0 in the low 32 bits,
// row 1 in the high 32 bits)
//------------------------------------------------------------------------------
outer_loop4_residual:                       // columns 8..11 of a 12-wide block
    sub         x12, x0, #3                 // pu1_src - 3
    mov         x1, x7
    add         x1, x1, #8
    mov         x10, #4
    add         x12, x12, #8
    mov         x14, x16                    // all ht rows again
    add         x8, x8, #4                  // 2*dst_strd - 4
    add         x9, x9, #4                  // 2*src_strd - 4

outer_loop_4:
    add         x6, x1, x3                  // pu1_dst + dst_strd
    add         x4, x12, x2                 // pu1_src + src_strd

    subs        x5, x10, #0                 // columns left in this row pair
    ble         end_inner_loop_4

inner_loop_4:
    // For tap offset k, vk = { row0[k..k+3], row1[k..k+3] }.
    ld1         {v20.2s}, [x12], x11
    ld1         {v21.2s}, [x12], x11
    ld1         {v22.2s}, [x4], x11
    ld1         {v23.2s}, [x4], x11
    zip1        v0.2s, v20.2s, v22.2s
    zip1        v1.2s, v21.2s, v23.2s

    ld1         {v20.2s}, [x12], x11
    ld1         {v21.2s}, [x12], x11
    ld1         {v22.2s}, [x4], x11
    ld1         {v23.2s}, [x4], x11
    zip1        v2.2s, v20.2s, v22.2s
    zip1        v3.2s, v21.2s, v23.2s

    ld1         {v20.2s}, [x12], x11
    ld1         {v21.2s}, [x12], x11
    ld1         {v22.2s}, [x4], x11
    ld1         {v23.2s}, [x4], x11
    zip1        v4.2s, v20.2s, v22.2s
    zip1        v5.2s, v21.2s, v23.2s

    ld1         {v20.2s}, [x12], x11
    ld1         {v21.2s}, [x12], x11
    ld1         {v22.2s}, [x4], x11
    ld1         {v23.2s}, [x4], x11
    zip1        v6.2s, v20.2s, v22.2s
    zip1        v7.2s, v21.2s, v23.2s

    sub         x12, x12, #4                // 8 loads moved the pointers by 8; net step is 4
    sub         x4, x4, #4

    umull       v8.8h, v1.8b, v25.8b        // both rows are filtered at once
    umlsl       v8.8h, v0.8b, v24.8b
    umlsl       v8.8h, v2.8b, v26.8b
    umlal       v8.8h, v3.8b, v27.8b
    umlal       v8.8h, v4.8b, v28.8b
    umlsl       v8.8h, v5.8b, v29.8b
    umlal       v8.8h, v6.8b, v30.8b
    umlsl       v8.8h, v7.8b, v31.8b

    sqrshrun    v8.8b, v8.8h, #6            // round, shift, saturate to u8
    st1         {v8.s}[0], [x1], #4         // row 0 result (low 32 bits)
    st1         {v8.s}[1], [x6], #4         // row 1 result (high 32 bits)
    subs        x5, x5, #4                  // 4 columns done
    bgt         inner_loop_4

end_inner_loop_4:
    subs        x14, x14, #2                // two rows done
    add         x12, x12, x9                // src += 2*src_strd - columns
    add         x1, x1, x8                  // dst += 2*dst_strd - columns
    bgt         outer_loop_4


//------------------------------------------------------------------------------
// single exit: undo the prologue in reverse order
//------------------------------------------------------------------------------
end_loops:
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret

    .size ihevc_inter_pred_luma_horz_av8, . - ihevc_inter_pred_luma_horz_av8
    599 
    600 
    601 
    602 
    603 
    604 
    605 
    606