// arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//******************************************************************************
//* //file
//*  ihevc_inter_pred_filters_luma_vert_w16inp.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs
//*
//* //par list of functions:
//*
//*  - ihevc_inter_pred_filters_luma_vert_w16inp()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
///* include reconstruction */
//

///**
//*******************************************************************************
//*
//* //brief
//*    luma vertical filter for 16-bit input.
//*
//* //par description:
//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//*    the elements pointed to by 'pi2_src' and writes to the location pointed to
//*    by 'pu1_dst'. input is 16 bits; the filter output is downshifted by 12 and
//*    clipped to lie between 0 and 255. assumptions: the function is
//*    optimized considering the fact that width is a multiple of 4, and height
//*    is a multiple of 2.
//*
//* //param[in] pi2_src
//*  word16 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_luma_vert_w16inp(word16 *pi2_src,
//                                    uword8 *pu1_dst,
//                                    word32 src_strd,
//                                    word32 dst_strd,
//                                    word8 *pi1_coeff,
//                                    word32 ht,
//                                    word32 wd   )

     97 .text
     98 .align 4
     99 
    100 .include "ihevc_neon_macros.s"
    101 
    102 .globl ihevc_inter_pred_luma_vert_w16inp_av8
    103 
    104 .type ihevc_inter_pred_luma_vert_w16inp_av8, %function
    105 
    106 ihevc_inter_pred_luma_vert_w16inp_av8:
    107 
    108     // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
    109 
    110     stp         x19, x20,[sp,#-16]!
    111 
    112     mov         x15,x4 // pi1_coeff
    113     mov         x16,x5 // ht
    114     mov         x17,x6 // wd
    115 
    116     mov         x12,x15                     //load pi1_coeff
    117     mov         x6,x3
    118     mov         x5,x17                      //load wd
    119     ld1         {v0.8b},[x12]               //coeff = vld1_s8(pi1_coeff)
    120     lsl         x2, x2, #1
    121     sub         x12,x2,x2,lsl #2            //src_ctrd & pi1_coeff
    122     //abs  v0.8b, v0.8b                //vabs_s8(coeff)
    123     add         x0,x0,x12                   //x0->pu1_src    x12->pi1_coeff
    124     mov         x3,x16                      //load ht
    125     subs        x7,x3,#0                    //x3->ht
    126     //ble          end_loops            //end loop jump
    127     sxtl        v0.8h, v0.8b
    128     dup         v22.4h, v0.h[0]             //coeffabs_0 = vdup_lane_u8(coeffabs, 0)//
    129     dup         v23.4h, v0.h[1]             //coeffabs_1 = vdup_lane_u8(coeffabs, 1)//
    130     dup         v24.4h, v0.h[2]             //coeffabs_2 = vdup_lane_u8(coeffabs, 2)//
    131     dup         v25.4h, v0.h[3]             //coeffabs_3 = vdup_lane_u8(coeffabs, 3)//
    132     dup         v26.4h, v0.h[4]             //coeffabs_4 = vdup_lane_u8(coeffabs, 4)//
    133     dup         v27.4h, v0.h[5]             //coeffabs_5 = vdup_lane_u8(coeffabs, 5)//
    134     dup         v28.4h, v0.h[6]             //coeffabs_6 = vdup_lane_u8(coeffabs, 6)//
    135     dup         v29.4h, v0.h[7]             //coeffabs_7 = vdup_lane_u8(coeffabs, 7)//
    136 
    137     sub         x20,x5,x6,lsl #2            //x6->dst_strd    x5    ->wd
    138     neg         x9, x20
    139     sub         x20,x5,x2,lsl #2            //x2->src_strd
    140     neg         x8, x20
    141     sub         x8,x8,x5
    142     lsr         x3, x5, #2                  //divide by 4
    143     mul         x7, x7, x3                  //multiply height by width
    144     sub         x7, x7,#4                   //subtract by one for epilog
    145     mov         x4,x5                       //x5 ->wd
    146     //lsl x2, x2, #1
    147 
    148 prolog:
    149 
    150     add         x3,x0,x2                    //pu1_src_tmp += src_strd//
    151     ld1         {v1.4h},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
    152     ld1         {v0.4h},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
    153     subs        x4,x4,#4
    154     ld1         {v2.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
    155     smull       v19.4s, v1.4h, v23.4h       //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
    156     ld1         {v3.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
    157     smlal       v19.4s, v0.4h, v22.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
    158     ld1         {v4.4h},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
    159     smlal       v19.4s, v2.4h, v24.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
    160     ld1         {v5.4h},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
    161     smlal       v19.4s, v3.4h, v25.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
    162     ld1         {v6.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
    163     smlal       v19.4s, v4.4h, v26.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
    164     ld1         {v7.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
    165     smlal       v19.4s, v5.4h, v27.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
    166     smlal       v19.4s, v6.4h, v28.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
    167     smlal       v19.4s, v7.4h, v29.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
    168 
    169     ld1         {v16.4h},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
    170 
    171     smull       v20.4s, v2.4h, v23.4h       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
    172     add         x20,x0,x8,lsl #0
    173     csel        x0, x20, x0,le
    174     smlal       v20.4s, v1.4h, v22.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
    175     csel        x4, x5, x4,le               //x5 ->wd
    176     smlal       v20.4s, v3.4h, v24.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
    177     ld1         {v17.4h},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
    178     smlal       v20.4s, v4.4h, v25.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
    179     ld1         {v18.4h},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
    180     smlal       v20.4s, v5.4h, v26.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
    181     add         x3,x0,x2                    //pu1_src_tmp += src_strd//
    182     smlal       v20.4s, v6.4h, v27.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
    183     smlal       v20.4s, v7.4h, v28.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
    184     smlal       v20.4s, v16.4h, v29.4h      //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
    185     sqshrn      v19.4h, v19.4s,#6
    186 
    187     ld1         {v1.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
    188     smull       v21.4s, v3.4h, v23.4h
    189     ld1         {v0.4h},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
    190     smlal       v21.4s, v2.4h, v22.4h
    191     ld1         {v2.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
    192     smlal       v21.4s, v4.4h, v24.4h
    193     smlal       v21.4s, v5.4h, v25.4h
    194     smlal       v21.4s, v6.4h, v26.4h
    195     smlal       v21.4s, v7.4h, v27.4h
    196     smlal       v21.4s, v16.4h, v28.4h
    197     smlal       v21.4s, v17.4h, v29.4h
    198     add         x14,x1,x6
    199     sqshrn      v20.4h, v20.4s,#6
    200     sqrshrun    v19.8b, v19.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
    201 
    202     smull       v30.4s, v4.4h, v23.4h
    203     smlal       v30.4s, v3.4h, v22.4h
    204     smlal       v30.4s, v5.4h, v24.4h
    205     smlal       v30.4s, v6.4h, v25.4h
    206     ld1         {v3.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
    207     smlal       v30.4s, v7.4h, v26.4h
    208     ld1         {v4.4h},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
    209     smlal       v30.4s, v16.4h, v27.4h
    210     ld1         {v5.4h},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
    211     smlal       v30.4s, v17.4h, v28.4h
    212     ld1         {v6.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
    213     smlal       v30.4s, v18.4h, v29.4h
    214     ld1         {v7.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
    215 
    216     st1         {v19.s}[0],[x1],#4          //vst1_u8(pu1_dst,sto_res)//
    217     sqshrn      v21.4h, v21.4s,#6
    218     sqrshrun    v20.8b, v20.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
    219     add         x20,x1,x9
    220     csel        x1, x20, x1,le
    221 
    222     subs        x7,x7,#4
    223 
    224     blt         epilog_end                  //jumps to epilog_end
    225     beq         epilog                      //jumps to epilog
    226 
    227 kernel_8:
    228 
    229     smull       v19.4s, v1.4h, v23.4h       //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
    230     subs        x4,x4,#4
    231     smlal       v19.4s, v0.4h, v22.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
    232     add         x20,x0,x8,lsl #0
    233     csel        x0, x20, x0,le
    234     smlal       v19.4s, v2.4h, v24.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
    235     smlal       v19.4s, v3.4h, v25.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
    236     smlal       v19.4s, v4.4h, v26.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
    237     smlal       v19.4s, v5.4h, v27.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
    238     smlal       v19.4s, v6.4h, v28.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
    239     smlal       v19.4s, v7.4h, v29.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
    240     st1         {v20.s}[0],[x14],x6         //vst1_u8(pu1_dst_tmp,sto_res)//
    241 
    242     sqshrn      v30.4h, v30.4s,#6
    243     sqrshrun    v21.8b, v21.8h,#6
    244     ld1         {v16.4h},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
    245 
    246     smull       v20.4s, v2.4h, v23.4h       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
    247     smlal       v20.4s, v1.4h, v22.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
    248     smlal       v20.4s, v3.4h, v24.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
    249     smlal       v20.4s, v4.4h, v25.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
    250     smlal       v20.4s, v5.4h, v26.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
    251     smlal       v20.4s, v6.4h, v27.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
    252     st1         {v21.s}[0],[x14],x6
    253 
    254     smlal       v20.4s, v7.4h, v28.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
    255     ld1         {v17.4h},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
    256 
    257     smlal       v20.4s, v16.4h, v29.4h      //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
    258 
    259     sqshrn      v19.4h, v19.4s,#6
    260     sqrshrun    v30.8b, v30.8h,#6
    261 
    262     smull       v21.4s, v3.4h, v23.4h
    263     csel        x4, x5, x4,le               //x5 ->wd
    264 
    265     smlal       v21.4s, v2.4h, v22.4h
    266     ld1         {v18.4h},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
    267 
    268     smlal       v21.4s, v4.4h, v24.4h
    269     add         x3,x0,x2                    //pu1_src_tmp += src_strd//
    270 
    271     smlal       v21.4s, v5.4h, v25.4h
    272 
    273     smlal       v21.4s, v6.4h, v26.4h
    274     st1         {v30.s}[0],[x14],x6
    275 
    276     smlal       v21.4s, v7.4h, v27.4h
    277     ld1         {v1.4h},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
    278 
    279     smlal       v21.4s, v16.4h, v28.4h
    280     add         x14,x1,x6
    281 
    282     smlal       v21.4s, v17.4h, v29.4h
    283     ld1         {v0.4h},[x0],#8             //src_tmp1 = vld1_u8(pu1_src_tmp)//
    284 
    285     sqshrn      v20.4h, v20.4s,#6
    286     sqrshrun    v19.8b, v19.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
    287     ld1         {v2.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
    288 
    289     smull       v30.4s, v4.4h, v23.4h
    290     smlal       v30.4s, v3.4h, v22.4h
    291     smlal       v30.4s, v5.4h, v24.4h
    292     ld1         {v3.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
    293 
    294     smlal       v30.4s, v6.4h, v25.4h
    295     ld1         {v4.4h},[x3],x2             //src_tmp1 = vld1_u8(pu1_src_tmp)//
    296     smlal       v30.4s, v7.4h, v26.4h
    297     ld1         {v5.4h},[x3],x2             //src_tmp2 = vld1_u8(pu1_src_tmp)//
    298     smlal       v30.4s, v16.4h, v27.4h
    299     ld1         {v6.4h},[x3],x2             //src_tmp3 = vld1_u8(pu1_src_tmp)//
    300     smlal       v30.4s, v17.4h, v28.4h
    301     ld1         {v7.4h},[x3],x2             //src_tmp4 = vld1_u8(pu1_src_tmp)//
    302     smlal       v30.4s, v18.4h, v29.4h
    303     st1         {v19.s}[0],[x1],#4          //vst1_u8(pu1_dst,sto_res)//
    304 
    305     sqshrn      v21.4h, v21.4s,#6
    306     add         x20,x1,x9
    307     csel        x1, x20, x1,le
    308 
    309     sqrshrun    v20.8b, v20.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
    310     subs        x7,x7,#4
    311 
    312     bgt         kernel_8                    //jumps to kernel_8
    313 
    314 epilog:
    315 
    316     smull       v19.4s, v1.4h, v23.4h       //mul_res1 = vmull_u8(src_tmp2, coeffabs_1)//
    317     smlal       v19.4s, v0.4h, v22.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)//
    318     smlal       v19.4s, v2.4h, v24.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)//
    319     smlal       v19.4s, v3.4h, v25.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)//
    320     smlal       v19.4s, v4.4h, v26.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)//
    321     smlal       v19.4s, v5.4h, v27.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)//
    322     smlal       v19.4s, v6.4h, v28.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)//
    323     smlal       v19.4s, v7.4h, v29.4h       //mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)//
    324     st1         {v20.s}[0],[x14],x6
    325 
    326     sqshrn      v30.4h, v30.4s,#6
    327     sqrshrun    v21.8b, v21.8h,#6
    328 
    329     ld1         {v16.4h},[x3],x2            //src_tmp1 = vld1_u8(pu1_src_tmp)//
    330     smull       v20.4s, v2.4h, v23.4h       //mul_res2 = vmull_u8(src_tmp3, coeffabs_1)//
    331     smlal       v20.4s, v1.4h, v22.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)//
    332     smlal       v20.4s, v3.4h, v24.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)//
    333     smlal       v20.4s, v4.4h, v25.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)//
    334     smlal       v20.4s, v5.4h, v26.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)//
    335     smlal       v20.4s, v6.4h, v27.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)//
    336     smlal       v20.4s, v7.4h, v28.4h       //mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)//
    337     smlal       v20.4s, v16.4h, v29.4h      //mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)//
    338     st1         {v21.s}[0],[x14],x6
    339 
    340     sqshrn      v19.4h, v19.4s,#6
    341     sqrshrun    v30.8b, v30.8h,#6
    342 
    343     ld1         {v17.4h},[x3],x2            //src_tmp2 = vld1_u8(pu1_src_tmp)//
    344     smull       v21.4s, v3.4h, v23.4h
    345     smlal       v21.4s, v2.4h, v22.4h
    346     smlal       v21.4s, v4.4h, v24.4h
    347     smlal       v21.4s, v5.4h, v25.4h
    348     smlal       v21.4s, v6.4h, v26.4h
    349     smlal       v21.4s, v7.4h, v27.4h
    350     smlal       v21.4s, v16.4h, v28.4h
    351     smlal       v21.4s, v17.4h, v29.4h
    352     st1         {v30.s}[0],[x14],x6
    353     sqshrn      v20.4h, v20.4s,#6
    354     sqrshrun    v19.8b, v19.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
    355 
    356     ld1         {v18.4h},[x3],x2            //src_tmp3 = vld1_u8(pu1_src_tmp)//
    357     smull       v30.4s, v4.4h, v23.4h
    358     smlal       v30.4s, v3.4h, v22.4h
    359     smlal       v30.4s, v5.4h, v24.4h
    360     smlal       v30.4s, v6.4h, v25.4h
    361     smlal       v30.4s, v7.4h, v26.4h
    362     smlal       v30.4s, v16.4h, v27.4h
    363     smlal       v30.4s, v17.4h, v28.4h
    364     smlal       v30.4s, v18.4h, v29.4h
    365     sqshrn      v21.4h, v21.4s,#6
    366     sqrshrun    v20.8b, v20.8h,#6           //sto_res = vqmovun_s16(sto_res_tmp)//
    367 
    368     add         x14,x1,x6
    369     st1         {v19.s}[0],[x1],#4          //vst1_u8(pu1_dst,sto_res)//
    370 
    371 epilog_end:
    372     st1         {v20.s}[0],[x14],x6         //vst1_u8(pu1_dst_tmp,sto_res)//
    373     sqrshrun    v21.8b, v21.8h,#6
    374 
    375     st1         {v21.s}[0],[x14],x6
    376     sqshrn      v30.4h, v30.4s,#6
    377     sqrshrun    v30.8b, v30.8h,#6
    378 
    379     st1         {v30.s}[0],[x14],x6
    380 
    381 
    382 end_loops:
    383 
    384     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    385     ldp         x19, x20,[sp], #16
    386 
    387     ret
    388 
    389 
    390 
    391 
    392 
    393 
    394 
    395 
    396 
    397 
    398 
    399 
    400 
    401 
    402 
    403 
    404 
    405 
    406 
    407 
    408