Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_luma_mode2_neon.s
     22 //*
//* @brief
//*  contains the function definition for luma intra prediction mode 2.
//*  the function is hand-written aarch64 neon assembly (gnu assembler
//*  syntax), not compiler intrinsics
//*
     28 //*
     29 //* @author
     30 //*  yogeswaran rs
     31 //*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_mode2_av8()
//*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
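//* @brief
//*    luma intra prediction for mode 2
//*
//* @par description:
//*    every output row is a run of nt reference samples from pu1_ref copied
//*    in reversed order; each successive row starts one sample lower in
//*    pu1_ref than the row above it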
     47 //*
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
//* @param[in] pi1_coeff
//*  word8 pointer to the planar coefficients (listed here but absent from
//*  the prototype below; the code does not use it)
//*
//* @param[in] nt
//*  size of transform block
     65 //*
     66 //* @param[in] mode
     67 //*  type of filtering
     68 //*
     69 //* @returns
     70 //*
     71 //* @remarks
     72 //*  none
     73 //*
     74 //*******************************************************************************
     75 //*/
     76 
     77 //void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
     78 //                                 word32 src_strd,
     79 //                                 uword8 *pu1_dst,
     80 //                                 word32 dst_strd,
     81 //                                 word32 nt,
     82 //                                 word32 mode)
     83 //
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode

//all six arguments arrive in registers (x0-x5); the function reads none of
//its arguments from the stack
     94 
     95 .text
     96 .align 4
     97 .include "ihevc_neon_macros.s"
     98 
     99 
    100 
    101 .globl ihevc_intra_pred_luma_mode2_av8
    102 
    103 .type ihevc_intra_pred_luma_mode2_av8, %function
    104 
    105 ihevc_intra_pred_luma_mode2_av8:
    106 
    107     // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
    108 
    109     stp         x19, x20,[sp,#-16]!
    110 
    111     mov         x8,#-2
    112 
    113     cmp         x4,#4
    114     beq         mode2_4
    115 
    116     add         x0,x0,x4,lsl #1
    117 
    118     sub         x0,x0,#9                    //src[1]
    119     sub         x10,x0,#1
    120 
    121 prologue_cpy_32:
    122 
    123     ld1         {v0.8b},[x0],x8
    124     mov         x11,x4
    125 
    126     ld1         {v1.8b},[x10],x8
    127     mov         x6, x2
    128 
    129     ld1         {v2.8b},[x0],x8
    130     ld1         {v3.8b},[x10],x8
    131     lsr         x1, x4, #3
    132 
    133     ld1         {v4.8b},[x0],x8
    134     ld1         {v5.8b},[x10],x8
    135     ld1         {v6.8b},[x0],x8
    136     mul         x1, x4, x1
    137 
    138     ld1         {v7.8b},[x10],x8
    139     add         x7,x6,x3
    140 
    141     rev64       v16.8b,  v0.8b
    142     rev64       v17.8b,  v1.8b
    143     lsl         x5, x3, #2
    144 
    145     rev64       v18.8b,  v2.8b
    146     rev64       v19.8b,  v3.8b
    147     add         x9,x7,x3
    148 
    149     rev64       v20.8b,  v4.8b
    150     subs        x1,x1,#8
    151 
    152     rev64       v21.8b,  v5.8b
    153     rev64       v22.8b,  v6.8b
    154     rev64       v23.8b,  v7.8b
    155     add         x14,x9,x3
    156 
    157     beq         epilogue_mode2
    158 
    159     sub         x12,x4,#8
    160 
    161 kernel_mode2:
    162 
    163     st1         {v16.8b},[x6],x5
    164     st1         {v17.8b},[x7],x5
    165     subs        x11,x11,#8
    166 
    167     st1         {v18.8b},[x9],x5
    168     add         x20,x2,#8
    169     csel        x2, x20, x2,gt
    170 
    171     st1         {v19.8b},[x14],x5
    172     st1         {v20.8b},[x6],x5
    173     csel        x11, x4, x11,le
    174 
    175     st1         {v21.8b},[x7],x5
    176     st1         {v22.8b},[x9],x5
    177     add         x20, x2, x3, lsl #2
    178     csel        x2, x20, x2,le
    179 
    180     st1         {v23.8b},[x14],x5
    181     ld1         {v0.8b},[x0],x8
    182     sub         x14,x4,#8
    183 
    184     ld1         {v1.8b},[x10],x8
    185     ld1         {v2.8b},[x0],x8
    186     add         x20, x2, #8
    187     csel        x2, x20, x2,le
    188 
    189     ld1         {v3.8b},[x10],x8
    190     ld1         {v4.8b},[x0],x8
    191     sub         x20, x6, x14
    192     csel        x2, x20, x2,le
    193 
    194     ld1         {v5.8b},[x10],x8
    195     subs        x12,x12,#8
    196 
    197     ld1         {v6.8b},[x0],x8
    198     mov         x6, x2
    199 
    200     ld1         {v7.8b},[x10],x8
    201     add         x20, x0, x4
    202     csel        x0, x20, x0,le
    203 
    204     rev64       v16.8b,  v0.8b
    205     add         x7, x6, x3
    206 
    207     rev64       v17.8b,  v1.8b
    208     sub         x20, x0, #8
    209     csel        x0, x20, x0,le
    210 
    211     rev64       v18.8b,  v2.8b
    212     csel        x12, x4, x12,le
    213 
    214     rev64       v19.8b,  v3.8b
    215     add         x9, x7, x3
    216 
    217     rev64       v20.8b,  v4.8b
    218     sub         x10,x0,#1
    219 
    220     rev64       v21.8b,  v5.8b
    221     subs        x1, x1, #8
    222 
    223     rev64       v22.8b,  v6.8b
    224     add         x14, x9, x3
    225 
    226     rev64       v23.8b,  v7.8b
    227 
    228     bne         kernel_mode2
    229 
    230 epilogue_mode2:
    231 
    232     st1         {v16.8b},[x6],x5
    233     st1         {v17.8b},[x7],x5
    234     st1         {v18.8b},[x9],x5
    235     st1         {v19.8b},[x14],x5
    236     st1         {v20.8b},[x6],x5
    237     st1         {v21.8b},[x7],x5
    238     st1         {v22.8b},[x9],x5
    239     st1         {v23.8b},[x14],x5
    240 
    241     b           end_func
    242 
    243 mode2_4:
    244 
    245     mov         x8,#-2
    246     sub         x0,x0,#1
    247     sub         x10,x0,#1
    248 
    249     ld1         {v0.8b},[x0],x8
    250     add         x5,x2,x3
    251     ld1         {v2.8b},[x10],x8
    252     add         x6,x5,x3
    253     ld1         {v4.8b},[x0]
    254     add         x7,x6,x3
    255     ld1         {v6.8b},[x10]
    256 
    257     rev64       v1.8b,  v0.8b
    258     rev64       v3.8b,  v2.8b
    259 
    260 
    261 
    262     st1         {v1.s}[0],[x2]
    263     rev64       v5.8b,  v4.8b
    264     st1         {v3.s}[0],[x5]
    265     rev64       v7.8b,  v6.8b
    266     st1         {v5.s}[0],[x6]
    267     st1         {v7.s}[0],[x7]
    268 
    269 end_func:
    270     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    271     ldp         x19, x20,[sp],#16
    272 
    273     ret
    274 
    275 
    276 
    277 
    278 
    279 
    280 
    281