Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_luma_mode2_neon.s
     22 //*
     23 //* @brief
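     24 //*  contains the function definition for chroma intra prediction, mode 2.
     25 //* the function is written in aarch64 neon assembly (not intrinsics) and uses
     26 //* gnu assembler directives (.globl, .type, .include); the original note says
     27 //* it can be compiled using rvct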
     28 //*
     29 //* @author
     30 //*  yogeswaran rs
     31 //*
     32 //* @par list of functions:
     33 //*
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* @brief
     44 //*    chroma intraprediction for mode 2
     45 //*
     46 //* @par description:
     47 //*
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
     60 //* @param[in] pi1_coeff
     61 //*  word8 pointer to the planar coefficients
     62 //*
     63 //* @param[in] nt
     64 //*  size of tranform block
     65 //*
     66 //* @param[in] mode
     67 //*  type of filtering
     68 //*
     69 //* @returns
     70 //*
     71 //* @remarks
     72 //*  none
     73 //*
     74 //*******************************************************************************
     75 //*/
     76 
     77 //void ihevc_intra_pred_chroma_mode2_av8(uword8 *pu1_ref,
     78 //                                       word32 src_strd,
     79 //                                       uword8 *pu1_dst,
     80 //                                       word32 dst_strd,
     81 //                                       word32 nt,
     82 //                                       word32 mode)
     83 //
     84 //**************variables vs registers*****************************************
     85 //x0 => *pu1_ref
     86 //x1 => src_strd
     87 //x2 => *pu1_dst
     88 //x3 => dst_strd
     89 
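     90 //the remaining arguments also arrive in registers, not on the stack
     91 //x4 => nt
     92 //x5 => mode (not read; x5 is reused to hold 4*dst_strd)
     93 //    pi1_coeff is not used by this function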
     94 
     95 .text
     96 .align 4
     97 .include "ihevc_neon_macros.s"
     98 
     99 
    100 
    101 .globl ihevc_intra_pred_chroma_mode2_av8
    102 // Fills an nt x nt block of two-byte (interleaved) samples at pu1_dst from pu1_ref:
    103 .type ihevc_intra_pred_chroma_mode2_av8, %function
    104 // row r, pair c is copied from byte offset 4*nt - 4 - 2*r - 2*c of pu1_ref (traced here for nt = 4, 8 and 16).
    105 ihevc_intra_pred_chroma_mode2_av8:
    106 // In: x0 = pu1_ref, x2 = pu1_dst, x3 = dst_strd (bytes), x4 = nt; x1 and x5 are overwritten without being read.
    107     // stmfd sp!, {x4-x12, x14}    //commented-out 32-bit ARM style register save, kept for reference
    108     push_v_regs                             // macro from ihevc_neon_macros.s (not shown); presumably saves the callee-saved SIMD registers (v8-v15 are used below) - confirm
    109     stp         x19, x20,[sp,#-16]!         // save x19/x20; x20 is used below as a scratch register
    110
    111     mov         x8,#-4                      // x8 = -4: post-index step applied to every ld2 source pointer
    112
    113     cmp         x4,#4
    114     beq         mode2_4                     // nt == 4 takes the dedicated small-block path
    115
    116     add         x0,x0,x4,lsl #2             // x0 = pu1_ref + 4*nt
    117
    118     sub         x0,x0,#0x12                 //x0 = pu1_ref + 4*nt - 18: source of row 0 of the first tile
    119     add         x10,x0,#-2                  // x10 = x0 - 2: source of row 1
    120
    121 prologue_cpy_32:                            // load and reverse the first tile (8 rows x 16 bytes)
    122
    123     ld2         {v0.8b, v1.8b},[x0],x8      // row 0: even bytes -> v0, odd bytes -> v1; x0 -= 4
    124
    125     mov         x11,x4                      // x11 = nt: destination-side counter, reduced by 8 per tile
    126     rev64       v16.8b,  v0.8b              // reverse the 8 bytes of each half, so st2 later writes the
    127     rev64       v17.8b,  v1.8b              // 8 two-byte pairs of the row in reverse order
    128
    129     ld2         {v2.8b, v3.8b},[x10],x8     // row 1 (2 bytes below row 0); x10 -= 4
    130     mov         x6, x2                      // x6 = destination of rows 0 and 4
    131
    132     ld2         {v4.8b, v5.8b},[x0],x8      // row 2
    133     ld2         {v6.8b, v7.8b},[x10],x8     // row 3
    134     lsr         x1, x4, #3                  // x1 = nt / 8
    135
    136     ld2         {v8.8b, v9.8b},[x0],x8      // row 4
    137     ld2         {v10.8b, v11.8b},[x10],x8   // row 5
    138     ld2         {v12.8b, v13.8b},[x0],x8    // row 6
    139     mul         x1, x4, x1                  // x1 = nt * (nt / 8): loop counter, reduced by 8 per tile
    140
    141     ld2         {v14.8b, v15.8b},[x10],x8   // row 7
    142     add         x7,x6,x3                    // x7 = destination of rows 1 and 5
    143
    144     rev64       v18.8b,  v2.8b
    145     rev64       v19.8b,  v3.8b
    146     lsl         x5, x3, #2                  // x5 = 4 * dst_strd: post-index step for the stores
    147
    148     rev64       v20.8b,  v4.8b
    149     rev64       v21.8b,  v5.8b
    150     add         x9,x7,x3                    // x9 = destination of rows 2 and 6
    151
    152     rev64       v22.8b,  v6.8b
    153     rev64       v23.8b,  v7.8b
    154
    155     rev64       v24.8b,  v8.8b
    156     rev64       v25.8b,  v9.8b
    157
    158     rev64       v26.8b,  v10.8b
    159     subs        x1,x1,#8                    // count the tile just loaded; Z is set when it is the only one
    160
    161     rev64       v27.8b,  v11.8b
    162
    163     rev64       v28.8b,  v12.8b
    164     rev64       v29.8b,  v13.8b
    165
    166     rev64       v30.8b,  v14.8b
    167     add         x14,x9,x3                   // x14 = destination of rows 3 and 7
    168     rev64       v31.8b,  v15.8b
    169
    170     beq         epilogue_mode2              // uses the flags of the subs on x1 above (nothing in between sets flags)
    171
    172     sub         x12,x4,#8                   // x12 = nt - 8: source-side counter, reduced by 8 per tile
    173
    174 kernel_mode2:                               // each pass stores the tile held in v16-v31 and loads/reverses the next one
    175
    176     st2         {v16.8b, v17.8b},[x6],x5    // row 0; x6 += 4 * dst_strd
    177     st2         {v18.8b, v19.8b},[x7],x5    // row 1
    178     subs        x11,x11,#8                  // flags consumed by the csel instructions on x2 and x11 below
    179     st2         {v20.8b, v21.8b},[x9],x5    // row 2
    180     st2         {v22.8b, v23.8b},[x14],x5   // row 3
    181     st2         {v24.8b, v25.8b},[x6],x5    // row 4; x6 is now 8 rows below the tile start
    182     add         x20,x2,#16
    183     csel        x2, x20, x2,gt              // counter still positive: next tile is 16 bytes to the right
    184     st2         {v26.8b, v27.8b},[x7],x5    // row 5
    185     st2         {v28.8b, v29.8b},[x9],x5    // row 6
    186     st2         {v30.8b, v31.8b},[x14],x5   // row 7
    187
    188     ld2         {v0.8b, v1.8b},[x0],x8      // next tile, row 0
    189     csel        x11, x4, x11,le             // counter exhausted: reload it with nt
    190
    191     ld2         {v2.8b, v3.8b},[x10],x8     // next tile, row 1
    192     ld2         {v4.8b, v5.8b},[x0],x8      // next tile, row 2
    193     add         x20, x2, x3, lsl #2
    194     csel        x2, x20, x2,le              // NOTE(review): whenever 'le' holds, this x2 is replaced by the csel on x2 further down
    195     ld2         {v6.8b, v7.8b},[x10],x8     // next tile, row 3
    196     rev64       v16.8b,  v0.8b
    197
    198     ld2         {v8.8b, v9.8b},[x0],x8      // next tile, row 4
    199     ld2         {v10.8b, v11.8b},[x10],x8   // next tile, row 5
    200     sub         x20, x6,#16
    201     csel        x2, x20, x2,le              // counter exhausted: x2 = 8 rows down and 16 bytes left of the tile just stored
    202     ld2         {v12.8b, v13.8b},[x0],x8    // next tile, row 6
    203     rev64       v17.8b,  v1.8b
    204     ld2         {v14.8b, v15.8b},[x10],x8   // next tile, row 7
    205
    206     subs        x12,x12,#8                  // flags consumed by the csel instructions on x0 and x12 below
    207     mov         x6, x2                      // destination pointers for the tile just loaded start here
    208     add         x20, x0, x4,lsl #1
    209     csel        x0, x20, x0,le              // source counter exhausted: x0 += 2*nt ...
    210     add         x7, x6, x3
    211
    212     rev64       v18.8b,  v2.8b
    213     sub         x20, x0, #16
    214     csel        x0, x20, x0,le              // ... and then x0 -= 16 (applied on top of the previous adjustment)
    215     rev64       v19.8b,  v3.8b
    216
    217     rev64       v20.8b,  v4.8b
    218     csel        x12, x4, x12,le             // reload the source-side counter with nt
    219     rev64       v21.8b,  v5.8b
    220
    221     rev64       v22.8b,  v6.8b
    222     add         x9, x7, x3
    223     rev64       v23.8b,  v7.8b
    224
    225     rev64       v24.8b,  v8.8b
    226     add         x10,x0,#-2                  // x10 = x0 - 2: source of the odd rows of the following tile
    227     rev64       v25.8b,  v9.8b
    228
    229     rev64       v26.8b,  v10.8b
    230     subs        x1, x1, #8                  // Z is set when the tile just loaded is the last one
    231     rev64       v27.8b,  v11.8b
    232
    233     rev64       v28.8b,  v12.8b
    234     rev64       v29.8b,  v13.8b
    235
    236     rev64       v30.8b,  v14.8b
    237     add         x14, x9, x3
    238     rev64       v31.8b,  v15.8b
    239
    240     bne         kernel_mode2                // uses the flags of the subs on x1 above
    241
    242 epilogue_mode2:                             // store the last tile held in v16-v31
    243
    244     st2         {v16.8b, v17.8b},[x6],x5    // row 0
    245     st2         {v18.8b, v19.8b},[x7],x5    // row 1
    246     st2         {v20.8b, v21.8b},[x9],x5    // row 2
    247     st2         {v22.8b, v23.8b},[x14],x5   // row 3
    248     st2         {v24.8b, v25.8b},[x6],x5    // row 4
    249     st2         {v26.8b, v27.8b},[x7],x5    // row 5
    250     st2         {v28.8b, v29.8b},[x9],x5    // row 6
    251     st2         {v30.8b, v31.8b},[x14],x5   // row 7
    252
    253     b           end_func
    254
    255 mode2_4:                                    // nt == 4: four rows of 8 bytes each, no tiling loop
    256
    257     lsl         x12,x4,#1                   // x12 = 2*nt
    258     add         x0,x0,x12
    259     sub         x0,x0,#2                    // x0 = pu1_ref + 2*nt - 2: source of row 0
    260
    261     ld2         {v12.8b, v13.8b},[x0],x8    // row 0: 16 bytes are loaded, only the first 8 are used; x0 -= 4
    262     shl         d0, d12,#32                 // move the 4 low even bytes to the upper half (lower half becomes zero)
    263     add         x10,x0,#2                   // x10 = source of row 1 (x0 has already been stepped back by 4)
    264     shl         d1, d13,#32                 // same for the odd bytes
    265
    266     rev64       v0.8b,  v0.8b               // the 4 bytes of interest land in the low half, reversed
    267     ld2         {v14.8b, v15.8b},[x10],x8   // row 1
    268     shl         d2, d14,#32
    269
    270     rev64       v1.8b,  v1.8b
    271     shl         d3, d15,#32
    272     zip1        v0.8b, v0.8b, v1.8b         // re-interleave the low halves: 4 pairs in reversed order
    273     zip2        v1.8b, v0.8b, v1.8b         // NOTE(review): reads the v0 already overwritten by zip1; v1 is not read again
    274     st1         {v0.8b},[x2],x3             // write row 0; x2 += dst_strd
    275
    276     rev64       v2.8b,  v2.8b
    277     ld2         {v16.8b, v17.8b},[x0],x8    // row 2
    278     shl         d4, d16,#32
    279     rev64       v3.8b,  v3.8b
    280     shl         d5, d17,#32
    281     zip1        v2.8b, v2.8b, v3.8b
    282     zip2        v3.8b, v2.8b, v3.8b         // NOTE(review): v3 is not read again
    283     rev64       v4.8b,  v4.8b
    284     rev64       v5.8b,  v5.8b
    285     st1         {v2.8b},[x2],x3             // write row 1
    286
    287
    288     ld2         {v18.8b, v19.8b},[x10],x8   // row 3
    289     shl         d6, d18,#32
    290
    291     zip1        v4.8b, v4.8b, v5.8b
    292     zip2        v5.8b, v4.8b, v5.8b         // NOTE(review): v5 is not read again
    293     shl         d7, d19,#32
    294     rev64       v6.8b,  v6.8b
    295     st1         {v4.8b},[x2],x3             // write row 2
    296
    297     rev64       v7.8b,  v7.8b
    298     zip1        v6.8b, v6.8b, v7.8b
    299     zip2        v7.8b, v6.8b, v7.8b         // NOTE(review): v7 is not read again
    300     st1         {v6.8b},[x2],x3             // write row 3
    301
    302 end_func:                                   // common exit for both paths
    303     // ldmfd sp!,{x4-x12,x15}                  //commented-out 32-bit ARM style register restore, kept for reference
    304     ldp         x19, x20,[sp],#16           // restore x19/x20 saved on entry
    305     pop_v_regs                              // macro from ihevc_neon_macros.s (not shown); presumably the counterpart of push_v_regs - confirm
    306     ret
    307 
    308 
    309 
    310 
    311 
    312 
    313