Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 ///**
     20 //*******************************************************************************
     21 //*
     22 //* //brief
     23 //*     inter-prediction luma copy function
     24 //*
     25 //* //par description:
     26 //*   copies the array of width 'wd' and height 'ht' from the location pointed
     27 //*   to by 'pu1_src' to the location pointed to by 'pu1_dst'
     28 //*
     29 //* //param[in] pu1_src
     30 //*  uword8 pointer to the source
     31 //*
     32 //* //param[out] pu1_dst
     33 //*  uword8 pointer to the destination
     34 //*
     35 //* //param[in] src_strd
     36 //*  integer source stride
     37 //*
     38 //* //param[in] dst_strd
     39 //*  integer destination stride
     40 //*
     41 //* //param[in] pi1_coeff
     42 //*  word8 pointer to the filter coefficients (not read by this copy routine)
     43 //*
     44 //* //param[in] ht
     45 //*  integer height of the array
     46 //*
     47 //* //param[in] wd
     48 //*  integer width of the array
     49 //*
     50 //* //returns
     51 //*
     52 //* //remarks
     53 //*  none
     54 //*
     55 //*******************************************************************************
     56 //*/
     57 //void ihevc_inter_pred_luma_copy (
     58 //                            uword8 *pu1_src,
     59 //                            uword8 *pu1_dst,
     60 //                            word32 src_strd,
     61 //                            word32 dst_strd,
     62 //                            word8 *pi1_coeff,
     63 //                            word32 ht,
     64 //                            word32 wd   )
     65 
     66 //**************variables vs registers*****************************************
     67 //    x0 => *pu1_src
     68 //    x1 => *pu1_dst
     69 //    x2 =>  src_strd
     70 //    x3 =>  dst_strd
     71 //    x11 =>  ht (arrives in x5)
     72 //    x16 =>  wd (arrives in x6)
     73 
     74 .text                                           // place what follows in the code section
     75 .align 4                                        // NOTE(review): GNU as for AArch64 reads this as 2^4 = 16-byte alignment - confirm for other assemblers
     76 
     77 .include "ihevc_neon_macros.s"                  // project macro file; its contents are not visible from here
     78 
     79 .globl ihevc_inter_pred_luma_copy_av8           // make the entry point visible to the linker
     80 
     81 .type ihevc_inter_pred_luma_copy_av8, %function // tag the symbol as a function
     82 
     83 ihevc_inter_pred_luma_copy_av8:
     84     // Copies a wd x ht block of bytes from pu1_src (x0) to pu1_dst (x1).
     85     // Each pass handles a band of four rows, walking across it in columns
     86     // of 16 or 8 bytes when wd is a multiple of that size and 4 bytes
     87     // otherwise, so ht is in effect rounded up to a multiple of 4.
     88     //
     89     // Leaf routine: pi1_coeff (x4) is not read, only x8-x11, x15, x16 and
     90     // v0-v3 are written besides x0-x3, and the stack is never touched.
     91     //
     92     // src_strd, dst_strd, ht and wd are 32-bit ints in the C prototype and
     93     // AAPCS64 leaves the upper halves of x2/x3/x5/x6 unspecified, so each
     94     // one is sign-extended before it is used in 64-bit arithmetic.
     95     sxtw        x2,w2                       //src_strd
     96     sxtw        x3,w3                       //dst_strd
     97     sxtw        x11,w5                      //x11 = ht
     98     sxtw        x16,w6                      //x16 = wd
     99     cmp         x11,#0                      //ht <= 0: nothing to copy
    100     ble         end_loops
    101     cmp         x16,#0                      //wd <= 0: nothing to copy
    102     ble         end_loops
    103     tst         x16,#15                     //wd a multiple of 16?
    104     beq         core_loop_wd_16
    105     tst         x16,#7                      //wd a multiple of 8?
    106     beq         core_loop_wd_8
    107 
    108 //--------------- 4 bytes per column, 4 rows per band -----------------------
    109     sub         x15,x16,#4                  //x15 = wd - 4, undoes the column walk
    110 
    111 outer_loop_wd_4:
    112     mov         x8,x16                      //x8 = bytes left across this band
    113 
    114 inner_loop_wd_4:
    115     ld1         {v0.s}[0],[x0]              //row 0: load 4 bytes
    116     add         x9,x0,x2                    //x9  = src of row 1 in this column
    117     add         x10,x1,x3                   //x10 = dst of row 1 in this column
    118     st1         {v0.s}[0],[x1]              //row 0: store 4 bytes
    119     ld1         {v0.s}[0],[x9],x2           //row 1: load, step src down one row
    120     add         x0,x0,#4                    //pu1_src += 4 (next column)
    121     st1         {v0.s}[0],[x10],x3          //row 1: store, step dst down one row
    122     ld1         {v0.s}[0],[x9],x2           //row 2
    123     subs        x8,x8,#4                    //bytes left -= 4, flags feed the bgt
    124     st1         {v0.s}[0],[x10],x3
    125     ld1         {v0.s}[0],[x9],x2           //row 3
    126     add         x1,x1,#4                    //pu1_dst += 4 (next column)
    127     st1         {v0.s}[0],[x10],x3
    128     bgt         inner_loop_wd_4
    129 
    130     // x9/x10 sit 4 rows down in the last column: step back to column 0
    131     subs        x11,x11,#4                  //ht -= 4
    132     sub         x0,x9,x15                   //pu1_src = start of the next band
    133     sub         x1,x10,x15                  //pu1_dst = start of the next band
    134     bgt         outer_loop_wd_4
    135 
    136 end_loops:
    137     ret
    138 
    139 //--------------- 8 bytes per column, 4 rows per band -----------------------
    140 core_loop_wd_8:
    141     sub         x15,x16,#8                  //x15 = wd - 8, undoes the column walk
    142 
    143 outer_loop_wd_8:
    144     mov         x8,x16                      //x8 = bytes left across this band
    145 
    146 inner_loop_wd_8:
    147     add         x9,x0,x2                    //x9  = src of row 1 in this column
    148     ld1         {v0.8b},[x0],#8             //row 0: load, pu1_src += 8
    149     add         x10,x1,x3                   //x10 = dst of row 1 in this column
    150     st1         {v0.8b},[x1],#8             //row 0: store, pu1_dst += 8
    151     ld1         {v1.8b},[x9],x2             //row 1
    152     st1         {v1.8b},[x10],x3
    153     subs        x8,x8,#8                    //bytes left -= 8, flags feed the bgt
    154     ld1         {v2.8b},[x9],x2             //row 2
    155     st1         {v2.8b},[x10],x3
    156     ld1         {v3.8b},[x9],x2             //row 3
    157     st1         {v3.8b},[x10],x3
    158     bgt         inner_loop_wd_8
    159 
    160     // x9/x10 sit 4 rows down in the last column: step back to column 0
    161     subs        x11,x11,#4                  //ht -= 4
    162     sub         x0,x9,x15                   //pu1_src = start of the next band
    163     sub         x1,x10,x15                  //pu1_dst = start of the next band
    164     bgt         outer_loop_wd_8
    165 
    166     ret
    167 
    168 //--------------- 16 bytes per column, 4 rows per band ----------------------
    169 core_loop_wd_16:
    170     sub         x15,x16,#16                 //x15 = wd - 16, undoes the column walk
    171 
    172 outer_loop_wd_16:
    173     mov         x8,x16                      //x8 = bytes left across this band
    174 
    175 inner_loop_wd_16:
    176     add         x9,x0,x2                    //x9  = src of row 1 in this column
    177     ld1         {v0.16b},[x0],#16           //row 0: load, pu1_src += 16
    178     add         x10,x1,x3                   //x10 = dst of row 1 in this column
    179     st1         {v0.16b},[x1],#16           //row 0: store, pu1_dst += 16
    180     ld1         {v1.16b},[x9],x2            //row 1
    181     st1         {v1.16b},[x10],x3
    182     subs        x8,x8,#16                   //bytes left -= 16, flags feed the bgt
    183     ld1         {v2.16b},[x9],x2            //row 2
    184     st1         {v2.16b},[x10],x3
    185     ld1         {v3.16b},[x9],x2            //row 3
    186     st1         {v3.16b},[x10],x3
    187     bgt         inner_loop_wd_16
    188 
    189     // x9/x10 sit 4 rows down in the last column: step back to column 0
    190     subs        x11,x11,#4                  //ht -= 4
    191     sub         x0,x9,x15                   //pu1_src = start of the next band
    192     sub         x1,x10,x15                  //pu1_dst = start of the next band
    193     bgt         outer_loop_wd_16
    194 
    195     ret
    196 
    197 
    198 
    199 
    200