Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_luma_mode_18_34_neon.s
     22 //*
     23 //* @brief
     24 //*  contains function definitions for luma intra prediction modes 18 and 34.
     25 //* functions are coded in neon assembly and can be compiled using
     26 
     27 //* rvct
     28 //*
     29 //* @author
     30 //*  yogeswaran rs
     31 //*
     32 //* @par list of functions:
     33 //*  - ihevc_intra_pred_luma_mode_18_34_av8()
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* @brief
     44 //*    luma intra prediction for modes 18 and 34
     45 //*
     46 //* @par description:
     47 //*
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
     60 //* @param[in] pi1_coeff
     61 //*  word8 pointer to the planar coefficients (stale entry: not in the prototype below and not read by the code)
     62 //*
     63 //* @param[in] nt
     64 //*  size of transform block
     65 //*
     66 //* @param[in] mode
     67 //*  intra prediction mode: 34 (0x22) or 18 (any value other than 34 takes the mode 18 path)
     68 //*
     69 //* @returns
     70 //*
     71 //* @remarks
     72 //*  none
     73 //*
     74 //*******************************************************************************
     75 //*/
     76 
     77 //void ihevc_intra_pred_luma_mode_18_34(uword8 *pu1_ref,
     78 //                                      word32 src_strd,
     79 //                                      uword8 *pu1_dst,
     80 //                                      word32 dst_strd,
     81 //                                      word32 nt,
     82 //                                      word32 mode)
     83 //
     84 //**************variables vs registers*****************************************
     85 //x0 => *pu1_ref
     86 //x1 => src_strd
     87 //x2 => *pu1_dst
     88 //x3 => dst_strd
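     89 //x4 => nt
     90 //x5 => mode
     91 //
     92 //note: an older comment here listed nt, mode and pi1_coeff as "stack contents from #40";
     93 //      this code reads nt and mode from x4 and x5 and loads no argument from the stack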
     94 
     95 .text
     96 .align 4
     97 .include "ihevc_neon_macros.s"
     98 
     99 
    100 
    101 .globl ihevc_intra_pred_luma_mode_18_34_av8
    102 
    103 .type ihevc_intra_pred_luma_mode_18_34_av8, %function
    104 
        //----------------------------------------------------------------------
        // ihevc_intra_pred_luma_mode_18_34_av8
        //
        // Fills the destination block by copying runs of consecutive bytes from
        // pu1_ref. Each destination row is one such run, and the start of the
        // run moves by one byte per row: +1 when mode == 0x22 (34), -1 for any
        // other mode value (the mode 18 path).
        //
        // In:   x0 = pu1_ref   x2 = pu1_dst   x3 = dst_strd
        //       x4 = nt        x5 = mode
        //       x1 (src_strd) is never read; x1 is overwritten and used as a
        //       loop counter.
        // Row 0 source address: pu1_ref + 2*nt + 2 (mode 34)
        //                       pu1_ref + 2*nt     (otherwise)
        //
        // Paths:
        //   nt == 4 : mode2_4, four rows of 4 bytes.
        //   else    : the block is handled as 8x8 tiles, left to right and then
        //             top to bottom. The loads of the next tile are interleaved
        //             with the stores of the current tile, so the instruction
        //             order inside the kernels matters.
        //             Assumes nt is a multiple of 8 here (all counters step by
        //             8 and are tested for exactly 0) - TODO confirm callers.
        //
        // Register roles on the nt != 4 path:
        //   x0   source base of the tile being loaded
        //   x8   source read pointer       x6  per-row source step (+1 / -1)
        //   x2   destination base of the tile being stored
        //   x10  destination write pointer (post-incremented by dst_strd)
        //   x12  columns left in the current tile row, store side
        //   x11  columns left in the current tile row, load side
        //   x14  nt - 8
        //   x1   nt * (nt >> 3), minus 8 per tile loaded; 0 ends the kernel
        //   x20  scratch holding the alternative value for each csel
        //   v0-v7  the eight 8-byte rows of one tile
        //
        // NOTE(review): the prototype comment declares nt, mode and dst_strd as
        // word32, but they are used here as full 64-bit x registers without any
        // extension - confirm callers pass them with valid upper bits.
        //----------------------------------------------------------------------
    105 ihevc_intra_pred_luma_mode_18_34_av8:
    106 
    107     // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
    108     push_v_regs                             // macro, presumably from the included ihevc_neon_macros.s (definition not visible here)
    109     stp         x19, x20,[sp,#-16]!         // save x19/x20; only x20 is referenced below (as scratch)
    110 
    111     cmp         x4,#4
    112     beq         mode2_4                     // nt == 4 takes the short path
    113 
    114     mov         x11,x4                      // load-side column counter = nt
    115     mov         x12,x4                      // store-side column counter = nt
    116     sub         x14,x4,#8                   // x14 = nt - 8, used to rewind at the end of a tile row
    117 
    118     add         x0,x0,x4,lsl #1             // x0 = pu1_ref + 2*nt
    119 
    120     cmp         x5,#0x22                    // mode == 34 ? flags stay live through the csel chain below
    121     mov         x10,x2                      // destination write pointer starts at pu1_dst
    122 
    123     add         x0,x0,#2                    // mode 34 start: pu1_ref + 2*nt + 2
    124     sub         x20,x0,#2
    125     csel        x0, x20, x0,ne              // mode != 34 start: pu1_ref + 2*nt
    126     mov         x20,#1
    127     csel        x6, x20, x6,eq              // mode 34: source moves +1 byte per row
    128     mov         x20,#-1
    129     csel        x6, x20, x6,ne              // otherwise: source moves -1 byte per row
    130     mov         x8,x0                       // source read pointer
    131 
    132 prologue_cpy_32:
        // Load the eight rows of the first tile into v0-v7 and compute the
        // tile counter in x1 between the loads.
    133 
    134     ld1         {v0.8b},[x8],x6
    135     lsr         x1, x4, #3                  // x1 = nt >> 3
    136     ld1         {v1.8b},[x8],x6
    137     mul         x1, x4, x1                  // x1 = nt * (nt >> 3); each tile accounts for 8
    138     ld1         {v2.8b},[x8],x6
    139     ld1         {v3.8b},[x8],x6
    140     subs        x1,x1,#8                    // first tile loaded; Z set when it is the only tile (nt == 8)
    141     ld1         {v4.8b},[x8],x6
    142     ld1         {v5.8b},[x8],x6
    143     ld1         {v6.8b},[x8],x6
    144 
    145     ld1         {v7.8b},[x8],x6
    146 
    147 
    148     beq         epilogue_mode2              // single tile: nothing more to load, just store it
    149     sub         x11,x11,#8                  // load side: first tile of the tile row is done
    150 
    151     cmp         x5,#0x22
    152     add         x20,x0,#8
    153     csel        x0, x20, x0,ne              // mode != 34: next tile's source base is 8 bytes further
    154     csel        x8, x0, x8,ne               // mode != 34: restart x8 there (the -1 steps moved it backwards);
        //                                             mode 34: x8 is already 8 bytes past x0 after the eight +1 steps
    155     bne         kernel_mode18
    156     //add        x8,x0,#8
    157 
    158 kernel_mode2:
        // mode == 34 kernel: store the tile held in v0-v7 while loading the
        // next one. Each vN is stored before it is reloaded.
    159     st1         {v0.8b},[x10],x3
    160     st1         {v1.8b},[x10],x3
    161     subs        x12,x12,#8                  // store side: Z set when this tile ends its tile row; flags live until the subs x11 below
    162     st1         {v2.8b},[x10],x3
    163     add         x20,x2,#8
    164     csel        x2, x20, x2,ne              // tile row not finished: next tile is 8 bytes to the right
    165     st1         {v3.8b},[x10],x3
    166 
    167     ld1         {v0.8b},[x8],x6
    168     st1         {v4.8b},[x10],x3
    169 
    170     st1         {v5.8b},[x10],x3
    171     ld1         {v1.8b},[x8],x6
    172     st1         {v6.8b},[x10],x3
    173     ld1         {v2.8b},[x8],x6
    174     st1         {v7.8b},[x10],x3            // x10 is now 8 destination rows below the tile base
    175 
    176     ld1         {v3.8b},[x8],x6
    177     sub         x20,x10,x14
    178     csel        x2, x20, x2,eq              // tile row finished: 8 rows down, nt - 8 bytes back to the left edge
    179     ld1         {v4.8b},[x8],x6
    180     mov         x10,x2                      // write pointer for the next tile
    181     ld1         {v5.8b},[x8],x6
    182     csel        x12, x4, x12,eq             // reload the store-side column counter at a tile-row boundary
    183     ld1         {v6.8b},[x8],x6
    184     subs        x11,x11,#8                  // load side: Z set when the tile just loaded ends its tile row
    185 
    186     ld1         {v7.8b},[x8],x6
    187 
    188     add         x20,x0,#8
    189     csel        x0, x20, x0,eq              // tile-row boundary: source base advances by 8 bytes
    190     csel        x11, x4, x11,eq             // and the load-side column counter is reloaded
    191     csel        x8, x0, x8,eq               // boundary: restart reads at the new base; otherwise x8 simply continues
    192 
    193     subs        x1, x1, #8                  // one more tile loaded
    194 
    195     bne         kernel_mode2
    196 
    197     b           epilogue_mode2              // last tile is in v0-v7, still to be stored
    198 
    199 kernel_mode18:
        // mode != 34 kernel: same store sequence as kernel_mode2; only the
        // source pointer update differs because x6 is -1 here.
    200     st1         {v0.8b},[x10],x3
    201     st1         {v1.8b},[x10],x3
    202     subs        x12,x12,#8                  // store side: Z set when this tile ends its tile row; flags live until the subs x11 below
    203     st1         {v2.8b},[x10],x3
    204     add         x20,x2,#8
    205     csel        x2, x20, x2,ne              // tile row not finished: next tile is 8 bytes to the right
    206     st1         {v3.8b},[x10],x3
    207 
    208     ld1         {v0.8b},[x8],x6
    209     st1         {v4.8b},[x10],x3
    210 
    211     st1         {v5.8b},[x10],x3
    212     ld1         {v1.8b},[x8],x6
    213 
    214     st1         {v6.8b},[x10],x3
    215     ld1         {v2.8b},[x8],x6
    216     st1         {v7.8b},[x10],x3            // x10 is now 8 destination rows below the tile base
    217 
    218     ld1         {v3.8b},[x8],x6
    219     sub         x20,x10,x14
    220     csel        x2, x20, x2,eq              // tile row finished: 8 rows down, nt - 8 bytes back to the left edge
    221     ld1         {v4.8b},[x8],x6
    222     mov         x10,x2                      // write pointer for the next tile
    223     ld1         {v5.8b},[x8],x6
    224     csel        x12, x4, x12,eq             // reload the store-side column counter at a tile-row boundary
    225     ld1         {v6.8b},[x8],x6
    226     subs        x11,x11,#8                  // load side: Z set when the tile just loaded ends its tile row
    227     ld1         {v7.8b},[x8],x6             // after eight -1 steps x8 = x0 - 8
    228 
    229     add         x20,x0,#8
    230     csel        x0, x20, x0,ne              // inside a tile row: next source base is 8 bytes further
    231     csel        x11, x4, x11,eq             // reload the load-side column counter at a boundary
    232     sub         x20,x8,x14
    233     csel        x0, x20, x0,eq              // boundary: x0 = x8 - (nt - 8), the source base for the next tile row
    234     subs        x1, x1, #8                  // one more tile loaded (the csel above used the earlier flags)
    235     mov         x8,x0                       // restart the read pointer at the new base
    236 
    237     bne         kernel_mode18
        //                                             falls through to the epilogue when x1 reaches 0
    238 
    239 
    240 epilogue_mode2:
        // Shared by both modes: store the last loaded tile, no further loads.
    241 
    242     st1         {v0.8b},[x10],x3
    243     st1         {v1.8b},[x10],x3
    244     st1         {v2.8b},[x10],x3
    245     st1         {v3.8b},[x10],x3
    246     st1         {v4.8b},[x10],x3
    247     st1         {v5.8b},[x10],x3
    248     st1         {v6.8b},[x10],x3
    249     st1         {v7.8b},[x10],x3
    250 
    251     b           end_func
    252 
    253 mode2_4:
        // nt == 4: four rows, each a 4-byte store from a fresh load.
        // NOTE(review): every ld1 below reads 8 bytes although only the low
        // 4 bytes (lane s[0]) are stored - confirm the reference buffer has
        // at least 4 readable bytes beyond the last sample actually used.
    254 
    255     add         x0,x0,#10                   // mode 34 start: pu1_ref + 10 (= 2*nt + 2 for nt == 4)
    256     cmp         x5,#0x22
    257     sub         x20,x0,#2
    258     csel        x0, x20, x0,ne              // mode != 34 start: pu1_ref + 8 (= 2*nt)
    259 
    260     mov         x20,#1
    261     csel        x8, x20, x8,eq              // mode 34: source moves +1 byte per row
    262     mov         x20,#-1
    263     csel        x8, x20, x8,ne              // otherwise: source moves -1 byte per row
    264 
    265     ld1         {v0.8b},[x0],x8
    266     st1         {v0.s}[0],[x2],x3           // row 0: store 4 bytes, then advance by dst_strd
    267 
    268     ld1         {v0.8b},[x0],x8
    269     st1         {v0.s}[0],[x2],x3           // row 1
    270 
    271     ld1         {v0.8b},[x0],x8
    272     st1         {v0.s}[0],[x2],x3           // row 2
    273 
    274     ld1         {v0.8b},[x0],x8
    275     st1         {v0.s}[0],[x2],x3           // row 3
    276 
    277 end_func:
    278     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    279     ldp         x19, x20,[sp],#16           // restore x19/x20 saved at entry
    280     pop_v_regs                              // counterpart of push_v_regs (macro definition not visible here)
    281     ret
    282 
    283 
    284 
    285 
    286 
    287 
    288 
    289