Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_chroma_ver_neon.s
     22 //*
     23 //* @brief
     24 //*  contains the function definition for chroma intra prediction in
     25 //*  vertical mode. the function is coded in armv8 (aarch64) neon assembly
     26 
     27 //*  and uses gnu-style assembler directives
     28 //*
     29 //* @author
     30 //*  yogeswaran rs
     31 //*
     32 //* @par list of functions:
     33 //*
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* @brief
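     44 //*    chroma intra prediction for the vertical mode
     45 //*
     46 //* @par description:
     47 //*    copies the 2 * nt bytes starting at pu1_ref[4 * nt + 2] into each of the nt rows of pu1_dst (nt = 4, 8 or 16)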
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
     60 //* @param[in] nt
     61 //*  size of transform block
     62 //*
     63 //* @param[in] mode
     64 //*  type of filtering
     65 //*
     66 //* @returns
     67 //*
     68 //* @remarks
     69 //*  none
     70 //*
     71 //*******************************************************************************
     72 //*/
     73 
     74 //void ihevc_intra_pred_chroma_ver(uword8 *pu1_ref,
     75 //        word32 src_strd,
     76 //        uword8 *pu1_dst,
     77 //        word32 dst_strd,
     78 //        word32 nt,
     79 //        word32 mode)
     80 //**************variables vs registers*****************************************
     81 //x0 => *pu1_ref
     82 //x1 => src_strd
     83 //x2 => *pu1_dst
     84 //x3 => dst_strd
     85 
     86 //the remaining arguments are also passed in registers
     87 //x4 => nt
     88 //x5 => mode (not read by this function)
     89 
     90 .text
     91 .align 4
     92 .include "ihevc_neon_macros.s"
     93 
     94 
     95 .globl ihevc_intra_pred_chroma_ver_av8
     96 
     97 .type ihevc_intra_pred_chroma_ver_av8, %function
     98 
     99 ihevc_intra_pred_chroma_ver_av8:
    100 
    101     // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
    102     push_v_regs
    103     stp         x19, x20,[sp,#-16]!
    104 
    105     lsl         x5, x4, #2                  //4nt
    106 
    107 
    108     cmp         x4, #8
    109     beq         blk_8
    110     blt         blk_4
    111 
    112 copy_16:
    113     add         x5, x5, #2                  //2nt+2
    114     add         x6, x0, x5                  //&src[2nt+1]
    115 
    116     add         x5, x2, x3                  //pu1_dst + dst_strd
    117     ld2         {v20.8b, v21.8b}, [x6],#16  //16 loads (col 0:15)
    118     add         x8, x5, x3
    119 
    120     add         x10, x8, x3
    121     ld2         {v22.8b, v23.8b}, [x6]      //16 loads (col 16:31)
    122     lsl         x11, x3, #2
    123 
    124     sub         x11, x11, #16
    125 
    126 
    127     st2         {v20.8b, v21.8b}, [x2],#16
    128     st2         {v20.8b, v21.8b}, [x5],#16
    129     st2         {v20.8b, v21.8b}, [x8],#16
    130     st2         {v20.8b, v21.8b}, [x10],#16
    131 
    132     st2         {v22.8b, v23.8b}, [x2], x11
    133     st2         {v22.8b, v23.8b}, [x5], x11
    134     st2         {v22.8b, v23.8b}, [x8], x11
    135     st2         {v22.8b, v23.8b}, [x10], x11
    136 
    137     subs        x4, x4, #4
    138 
    139 kernel_copy_16:
    140     st2         {v20.8b, v21.8b}, [x2],#16
    141     st2         {v20.8b, v21.8b}, [x5],#16
    142     st2         {v20.8b, v21.8b}, [x8],#16
    143     st2         {v20.8b, v21.8b}, [x10],#16
    144 
    145     st2         {v22.8b, v23.8b}, [x2], x11
    146     st2         {v22.8b, v23.8b}, [x5], x11
    147     st2         {v22.8b, v23.8b}, [x8], x11
    148     st2         {v22.8b, v23.8b}, [x10], x11
    149 
    150     subs        x4, x4, #4
    151 
    152 
    153     st2         {v20.8b, v21.8b}, [x2],#16
    154     st2         {v20.8b, v21.8b}, [x5],#16
    155     st2         {v20.8b, v21.8b}, [x8],#16
    156     st2         {v20.8b, v21.8b}, [x10],#16
    157 
    158     st2         {v22.8b, v23.8b}, [x2], x11
    159     st2         {v22.8b, v23.8b}, [x5], x11
    160     st2         {v22.8b, v23.8b}, [x8], x11
    161     st2         {v22.8b, v23.8b}, [x10], x11
    162 
    163     subs        x4, x4, #4
    164 
    165     st2         {v20.8b, v21.8b}, [x2],#16
    166     st2         {v20.8b, v21.8b}, [x5],#16
    167     st2         {v20.8b, v21.8b}, [x8],#16
    168     st2         {v20.8b, v21.8b}, [x10],#16
    169 
    170     st2         {v22.8b, v23.8b}, [x2], x11
    171     st2         {v22.8b, v23.8b}, [x5], x11
    172     st2         {v22.8b, v23.8b}, [x8], x11
    173     st2         {v22.8b, v23.8b}, [x10], x11
    174 
    175     subs        x4, x4, #4
    176     bne         kernel_copy_16
    177 
    178     b           end_func
    179 
    180 blk_8:
    181 
    182     add         x5, x5, #2                  //2nt+2
    183     add         x6, x0, x5                  //&src[2nt+1]
    184 
    185     add         x5, x2, x3                  //pu1_dst + dst_strd
    186     ld2         {v20.8b, v21.8b}, [x6],#16  //16 loads (col 0:15)
    187     add         x8, x5, x3
    188 
    189     add         x10, x8, x3
    190     ld2         {v22.8b, v23.8b}, [x6]      //16 loads (col 16:31)
    191 
    192     lsl         x11,x3,#2
    193 
    194     st2         {v20.8b, v21.8b}, [x2],x11
    195     st2         {v20.8b, v21.8b}, [x5],x11
    196     st2         {v20.8b, v21.8b}, [x8],x11
    197     st2         {v20.8b, v21.8b}, [x10],x11
    198 
    199     st2         {v20.8b, v21.8b}, [x2]
    200     st2         {v20.8b, v21.8b}, [x5]
    201     st2         {v20.8b, v21.8b}, [x8]
    202     st2         {v20.8b, v21.8b}, [x10]
    203 
    204     subs        x4, x4, #8
    205     beq         end_func
    206 
    207 blk_4:
    208 
    209     //lsl        x5, x4, #2            @4nt
    210     add         x5, x5, #2                  //2nt+2
    211     add         x6, x0, x5                  //&src[2nt+1]
    212 
    213     ld1         {v0.8b},[x6]
    214     add         x5, x2, x3                  //pu1_dst + dst_strd
    215 
    216     st1         {v0.8b},[x2]
    217     add         x8, x5, x3
    218     st1         {v0.8b},[x5]
    219     add         x10, x8, x3
    220     st1         {v0.8b},[x8]
    221     st1         {v0.8b},[x10]
    222 
    223 
    224 
    225 end_func:
    226     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    227     ldp         x19, x20,[sp],#16
    228     pop_v_regs
    229     ret
    230 
    231 
    232 
    233