Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_chroma_dc_neon.s
     22 //*
     23 //* @brief
     24 //*  contains function definitions for intra prediction dc filtering.
     25 //* functions are coded in armv8 (aarch64) neon assembly using gnu
     26 
     27 //* assembler syntax
     28 //*
     29 //* @author
     30 //*  yogeswaran rs
     31 //*
     32 //* @par list of functions:
     33 //*  - ihevc_intra_pred_chroma_dc_av8()
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* @brief
     44 //*    chroma intra prediction filter for dc input
     45 //*
     46 //* @par description:
     47 //*
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
     60 //* @param[in] pi1_coeff
     61 //*  word8 pointer to the planar coefficients
     62 //*
     63 //* @param[in] nt
     64 //*  size of transform block
     65 //*
     66 //* @param[in] mode
     67 //*  type of filtering
     68 //*
     69 //* @returns
     70 //*
     71 //* @remarks
     72 //*  none
     73 //*
     74 //*******************************************************************************
     75 //*/
     76 
     77 //void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
     78 //                                word32 src_strd,
     79 //                                uword8 *pu1_dst,
     80 //                                word32 dst_strd,
     81 //                                word32 nt,
     82 //                                word32 mode)
     83 //
     84 //**************variables vs registers*****************************************
     85 //x0 => *pu1_ref
     86 //x1 => src_strd
     87 //x2 => *pu1_dst
     88 //x3 => dst_strd
     89 
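     90 //x4 => nt
     91 //x5 => mode (never read; x5 is reused as scratch)
     92 //all six arguments arrive in registers; nothing is read from the
     93 //stack (there is no pi1_coeff argument in this function)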
     94 
     95 .text
     96 .align 4
     97 .include "ihevc_neon_macros.s"
     98 
     99 
    100 
    101 .globl ihevc_intra_pred_chroma_dc_av8
    102 
    103 .type ihevc_intra_pred_chroma_dc_av8, %function
    104 
    105 ihevc_intra_pred_chroma_dc_av8:
    106 
    107     // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
    108     push_v_regs
    109     stp         x19, x20,[sp,#-16]!
    110 
    111     mov         x9, #0
    112     mov         v17.s[0], w9
    113     mov         v17.s[1], w9
    114 
    115     clz         w5,w4                       //counts leading zeros
    116 
    117     add         x6, x0, x4,lsl #1           //&src[2nt]
    118     mov         v18.s[0], w9
    119     mov         v18.s[1], w9
    120     sub         x20, x5, #32                //log2nt
    121     neg         x5, x20
    122     add         x7, x0, x4, lsl #2          //&src[4nt]
    123     mov         x12,x5
    124     add         x8, x7, #2                  //&src[4nt+2]
    125 
    126     cmp         x4, #4
    127     beq         dc_4                        //nt=4 loop
    128 
    129 
    130 add_loop:
    131     ld2         {v30.8b, v31.8b}, [x6], #16 //load from src[nt]
    132     lsl         x10,x4,#1                   //2nt
    133 
    134     uaddlp      v2.4h,  v30.8b
    135     subs        x10, x10,#0x10
    136 
    137     ld2         {v26.8b, v27.8b}, [x8],#16  //load from src[2nt+1]
    138 
    139     uaddlp      v3.4h,  v31.8b
    140     uaddlp      v2.2s,  v2.4h
    141     uaddlp      v3.2s,  v3.4h
    142 
    143     uadalp      v17.1d,  v2.2s
    144 
    145     uadalp      v18.1d,  v3.2s
    146 
    147     uaddlp      v2.4h,  v26.8b
    148     uaddlp      v3.4h,  v27.8b
    149 
    150     uaddlp      v2.2s,  v2.4h
    151     uaddlp      v3.2s,  v3.4h
    152 
    153     uadalp      v17.1d,  v2.2s
    154     uadalp      v18.1d,  v3.2s
    155 
    156     beq         epil_add_loop
    157 
    158 core_loop_add:
    159     ld2         {v30.8b, v31.8b}, [x6],#16  //load from src[nt]
    160     uaddlp      v28.4h,  v30.8b
    161     uaddlp      v3.4h,  v31.8b
    162 
    163     ld2         {v26.8b, v27.8b}, [x8],#16  //load from src[2nt+1]
    164 
    165     uaddlp      v3.2s,  v3.4h
    166     uaddlp      v29.2s,  v28.4h
    167 
    168     uadalp      v18.1d,  v3.2s
    169     uadalp      v17.1d,  v29.2s
    170 
    171     uaddlp      v3.4h,  v27.8b
    172     uaddlp      v28.4h,  v26.8b
    173 
    174     uaddlp      v3.2s,  v3.4h
    175     uaddlp      v29.2s,  v28.4h
    176 
    177     uadalp      v18.1d,  v3.2s
    178     uadalp      v17.1d,  v29.2s
    179 
    180 
    181 epil_add_loop:
    182 
    183     smov        x1, v18.s[0]
    184     smov        x11, v17.s[0]
    185 
    186     add         x1,x1,x4
    187     add         x11,x11,x4
    188 
    189     lsr         x1,x1,x12
    190     lsr         x11,x11,x12
    191 
    192     dup         v17.8b,w1
    193     dup         v16.8b,w11
    194 
    195 prologue_cpy_32:
    196 
    197     add         x5, x2, x3
    198     subs        x9, x4, #8
    199     lsl         x6, x3, #2
    200     csel        x11, x6, x11,eq
    201     add         x8, x5, x3
    202     add         x10, x8, x3
    203 
    204     beq         epilogue_copy
    205 
    206     st2         {v16.8b, v17.8b}, [x2],#16
    207     sub         x6, x6, #16
    208 
    209     st2         {v16.8b, v17.8b}, [x5],#16
    210     st2         {v16.8b, v17.8b}, [x8],#16
    211     mov         x20,#16
    212     csel        x11, x20, x11,ne
    213     st2         {v16.8b, v17.8b}, [x10],#16
    214 
    215 
    216     st2         {v16.8b, v17.8b}, [x2], x6
    217     st2         {v16.8b, v17.8b}, [x5], x6
    218     st2         {v16.8b, v17.8b}, [x8], x6
    219     st2         {v16.8b, v17.8b}, [x10], x6
    220 
    221 kernel_copy:
    222     st2         {v16.8b, v17.8b}, [x2],#16
    223     st2         {v16.8b, v17.8b}, [x5],#16
    224     st2         {v16.8b, v17.8b}, [x8],#16
    225     st2         {v16.8b, v17.8b}, [x10],#16
    226 
    227     st2         {v16.8b, v17.8b}, [x2], x6
    228     st2         {v16.8b, v17.8b}, [x5], x6
    229     st2         {v16.8b, v17.8b}, [x8], x6
    230     st2         {v16.8b, v17.8b}, [x10], x6
    231 
    232     st2         {v16.8b, v17.8b}, [x2],#16
    233     st2         {v16.8b, v17.8b}, [x5],#16
    234     st2         {v16.8b, v17.8b}, [x8],#16
    235     st2         {v16.8b, v17.8b}, [x10],#16
    236 
    237     st2         {v16.8b, v17.8b}, [x2], x6
    238     st2         {v16.8b, v17.8b}, [x5], x6
    239     st2         {v16.8b, v17.8b}, [x8], x6
    240     st2         {v16.8b, v17.8b}, [x10], x6
    241 
    242 epilogue_copy:
    243     st2         {v16.8b, v17.8b}, [x2],x11
    244     st2         {v16.8b, v17.8b}, [x5],x11
    245     st2         {v16.8b, v17.8b}, [x8],x11
    246     st2         {v16.8b, v17.8b}, [x10],x11
    247 
    248     st2         {v16.8b, v17.8b}, [x2]
    249     st2         {v16.8b, v17.8b}, [x5]
    250     st2         {v16.8b, v17.8b}, [x8]
    251     st2         {v16.8b, v17.8b}, [x10]
    252     b           end_func
    253 
    254 dc_4:
    255     ld2         {v30.8b, v31.8b},[x6]       //load from src[nt]
    256     shl         d3, d30,#32
    257 
    258     ld2         {v26.8b, v27.8b},[x8]       //load from src[2nt+1]
    259     shl         d2, d31,#32
    260 
    261     uaddlp      v3.4h,  v3.8b
    262     uaddlp      v2.4h,  v2.8b
    263     uaddlp      v3.2s,  v3.4h
    264     uaddlp      v2.2s,  v2.4h
    265     uadalp      v17.1d,  v3.2s
    266     uadalp      v18.1d,  v2.2s
    267 
    268     shl         d3, d26,#32
    269     shl         d2, d27,#32
    270     uaddlp      v3.4h,  v3.8b
    271     uaddlp      v2.4h,  v2.8b
    272     uaddlp      v3.2s,  v3.4h
    273     uaddlp      v2.2s,  v2.4h
    274     uadalp      v17.1d,  v3.2s
    275     uadalp      v18.1d,  v2.2s
    276 
    277     smov        x10, v17.s[0]
    278     smov        x11, v18.s[0]
    279 
    280     add         x10,x10,x4
    281     add         x11,x11,x4
    282     lsr         x10,x10,x12
    283     lsr         x11,x11,x12
    284     orr         x10,x10,x11,lsl #8
    285     dup         v0.4h,w10
    286 
    287     st1         {v0.8b},[x2],x3
    288     st1         {v0.8b},[x2],x3
    289     st1         {v0.8b},[x2],x3
    290     st1         {v0.8b},[x2]
    291 
    292 end_func:
    293     // ldmfd sp!,{x4-x12,x15}     //reload the registers from sp
    294     ldp         x19, x20,[sp],#16
    295     pop_v_regs
    296     ret
    297 
    298 
    299 
    300 
    301