Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_inter_pred_chroma_copy.s
     22 //*
     23 //* @brief
     24 //*  Contains function definitions for inter prediction  interpolation.
     25 //* Functions are coded in AArch64 (ARMv8) NEON assembly using GNU assembler
     26 //* syntax
     27 //*
     28 //* @author
     29 //*  Yogeswaran RS
     30 //*
     31 //* @par List of Functions:
     32 //*  - ihevc_inter_pred_chroma_copy_av8()
     33 //*
     34 //* @remarks
     35 //*  None
     36 //*
     37 //*******************************************************************************
     38 //*/
     39 ///**
     40 //*******************************************************************************
     41 //*
     42 //* @brief
     43 //*   Chroma interprediction filter for copy
     44 //*
     45 //* @par Description:
     46 //*    Copies the array of width 'wd' and height 'ht' from the  location pointed
     47 //*    by 'src' to the location pointed by 'dst'
     48 //*
     49 //* @param[in] pu1_src
     50 //*  UWORD8 pointer to the source
     51 //*
     52 //* @param[out] pu1_dst
     53 //*  UWORD8 pointer to the destination
     54 //*
     55 //* @param[in] src_strd
     56 //*  integer source stride
     57 //*
     58 //* @param[in] dst_strd
     59 //*  integer destination stride
     60 //*
     61 //* @param[in] pi1_coeff
     62 //*  WORD8 pointer to the filter coefficients (not read by this copy routine)
     63 //*
     64 //* @param[in] ht
     65 //*  integer height of the array
     66 //*
     67 //* @param[in] wd
     68 //*  integer width of the array in two-byte units (2 * wd bytes are copied per row)
     69 //*
     70 //* @returns
     71 //*
     72 //* @remarks
     73 //*  None
     74 //*
     75 //*******************************************************************************
     76 //*/
     77 
     78 //void ihevc_inter_pred_chroma_copy( UWORD8 *pu1_src,
     79 //                                   UWORD8 *pu1_dst,
     80 //                                   WORD32 src_strd,
     81 //                                   WORD32 dst_strd,
     82 //                                   WORD8 *pi1_coeff,
     83 //                                   WORD32 ht,
     84 //                                   WORD32 wd)
     85 //**************Variables Vs Registers*****************************************
     86 //x0 => *pu1_src
     87 //x1 => *pu1_dst
     88 //x2 =>  src_strd
     89 //x3 =>  dst_strd
     90 //x4 => *pi1_coeff
     91 //x5 =>  ht
     92 //x6 =>  wd
     93 
     //------------------------------------------------------------------------------
     // void ihevc_inter_pred_chroma_copy_av8(UWORD8 *pu1_src, UWORD8 *pu1_dst,
     //                                       WORD32 src_strd, WORD32 dst_strd,
     //                                       WORD8 *pi1_coeff, WORD32 ht, WORD32 wd)
     //
     // Copies ht rows of (2 * wd) bytes from pu1_src to pu1_dst. Returns
     // immediately when ht <= 0 or wd <= 0.
     //
     // ABI:      AAPCS64, leaf function, no stack usage.
     // In:       x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
     //           x4 = pi1_coeff (never dereferenced), w5 = ht, w6 = wd
     // Out:      none
     // Clobbers: x0-x8, x11, x12, v0-v3, condition flags
     //
     // Register roles after setup:
     //   x12     = bytes per row (2 * wd)
     //   x5      = rows still to copy in whole bands of four rows
     //   x8      = ht % 4; any non-zero value selects the two-row tail
     //   x4      = bytes still to copy in the current band (pi1_coeff is dropped)
     //   x7 / x6 = src / dst pointers walking rows 1..3 of the current column
     //   x11     = x12 - column width; rewinds x7 / x6 to the start of the next band
     //
     // The column width is picked from the row size in bytes: multiples of 16 use
     // 16-byte columns, multiples of 8 use 8-byte columns, anything else uses
     // 4-byte columns.
     //
     // Input assumptions visible in the code:
     //   - A non-zero ht % 4 always copies exactly two tail rows, so ht is
     //     expected to be even.
     //   - The narrowest column is 4 bytes, so the row size is expected to be a
     //     multiple of 4 bytes (wd even).
     //------------------------------------------------------------------------------

     .text
     .align 4                                    // GNU as on AArch64: 2^4 = 16-byte alignment

     .globl ihevc_inter_pred_chroma_copy_av8

     .type ihevc_inter_pred_chroma_copy_av8, %function

     ihevc_inter_pred_chroma_copy_av8:

         // The WORD32 arguments arrive in w-registers and AAPCS64 leaves the upper
         // 32 bits unspecified. Widen them before any 64-bit address or counter
         // arithmetic; this also makes negative strides behave correctly.
         SXTW        x2,w2                       //src_strd
         SXTW        x3,w3                       //dst_strd
         SXTW        x5,w5                       //ht
         SXTW        x6,w6                       //wd

         CMP         x5,#0
         BLE         END_LOOPS                   //nothing to do when ht <= 0
         LSL         x12,x6,#1                   //x12 = wd << 1 = bytes per row
         CMP         x12,#0
         BLE         END_LOOPS                   //nothing to do when wd <= 0
         AND         x8,x5,#3                    //x8 = ht % 4 (tail rows)
         SUB         x5,x5,x8                    //x5 = ht rounded down to a multiple of 4
         TST         x12,#15                     //row size a multiple of 16 bytes?
         BEQ         CORE_LOOP_WD_16
         TST         x12,#7                      //row size a multiple of 8 bytes?
         BEQ         CORE_LOOP_WD_8
         SUB         x11,x12,#4                  //rewind amount for 4-byte columns
         CBZ         x5,OUTER_LOOP_WD_4_HT_2     //ht < 4: only the two-row tail exists

     // ---- 4-byte columns, bands of four rows --------------------------------------
     OUTER_LOOP_WD_4:
         MOV         x4,x12                      //x4 = bytes left in this band

     INNER_LOOP_WD_4:
         LD1         {v0.s}[0],[x0]              //row 0: load 4 bytes
         ADD         x7,x0,x2                    //x7 = src, row 1 of this column
         ADD         x6,x1,x3                    //x6 = dst, row 1 of this column
         ST1         {v0.s}[0],[x1]              //row 0: store
         LD1         {v0.s}[0],[x7],x2           //row 1: load, step to row 2
         ADD         x0,x0,#4                    //pu1_src += 4 (next column)
         ST1         {v0.s}[0],[x6],x3           //row 1: store, step to row 2
         LD1         {v0.s}[0],[x7],x2           //row 2
         SUBS        x4,x4,#4                    //bytes left -= 4; flags feed the BGT below
         ST1         {v0.s}[0],[x6],x3
         LD1         {v0.s}[0],[x7],x2           //row 3; x7 ends on row 4 of this column
         ADD         x1,x1,#4                    //pu1_dst += 4 (next column)
         ST1         {v0.s}[0],[x6],x3
         BGT         INNER_LOOP_WD_4

         SUBS        x5,x5,#4                    //ht -= 4
         SUB         x0,x7,x11                   //pu1_src = first column of the next band
         SUB         x1,x6,x11                   //pu1_dst = first column of the next band
         BGT         OUTER_LOOP_WD_4
         CBNZ        x8,OUTER_LOOP_WD_4_HT_2     //tail rows pending

     END_LOOPS:
         RET

     // ---- 4-byte columns, two-row tail --------------------------------------------
     OUTER_LOOP_WD_4_HT_2:
         MOV         x4,x12                      //x4 = bytes left in the tail

     INNER_LOOP_WD_4_HT_2:
         LD1         {v0.s}[0],[x0]              //row 0: load 4 bytes
         ADD         x7,x0,x2                    //x7 = src, row 1
         ADD         x6,x1,x3                    //x6 = dst, row 1
         ST1         {v0.s}[0],[x1]              //row 0: store
         LD1         {v0.s}[0],[x7],x2           //row 1
         ADD         x0,x0,#4                    //pu1_src += 4
         ST1         {v0.s}[0],[x6],x3
         SUBS        x4,x4,#4                    //bytes left -= 4
         ADD         x1,x1,#4                    //pu1_dst += 4
         BGT         INNER_LOOP_WD_4_HT_2
         RET

     // ---- 8-byte columns, bands of four rows --------------------------------------
     CORE_LOOP_WD_8:
         SUB         x11,x12,#8                  //rewind amount for 8-byte columns
         CBZ         x5,OUTER_LOOP_WD_8_HT_2     //ht < 4: only the two-row tail exists

     OUTER_LOOP_WD_8:
         MOV         x4,x12                      //x4 = bytes left in this band

     INNER_LOOP_WD_8:
         ADD         x7,x0,x2                    //x7 = src, row 1 (taken before x0 advances)
         LD1         {v0.8b},[x0],#8             //row 0: load 8 bytes, pu1_src += 8
         ADD         x6,x1,x3                    //x6 = dst, row 1 (taken before x1 advances)
         ST1         {v0.8b},[x1],#8             //row 0: store, pu1_dst += 8
         LD1         {v1.8b},[x7],x2             //row 1
         ST1         {v1.8b},[x6],x3
         SUBS        x4,x4,#8                    //bytes left -= 8; flags feed the BGT below
         LD1         {v2.8b},[x7],x2             //row 2
         ST1         {v2.8b},[x6],x3
         LD1         {v3.8b},[x7],x2             //row 3; x7 ends on row 4 of this column
         ST1         {v3.8b},[x6],x3
         BGT         INNER_LOOP_WD_8

         SUBS        x5,x5,#4                    //ht -= 4
         SUB         x0,x7,x11                   //pu1_src = first column of the next band
         SUB         x1,x6,x11                   //pu1_dst = first column of the next band
         BGT         OUTER_LOOP_WD_8
         CBZ         x8,END_LOOPS                //no tail rows

     // ---- 8-byte columns, two-row tail --------------------------------------------
     OUTER_LOOP_WD_8_HT_2:
         MOV         x4,x12                      //x4 = bytes left in the tail

     INNER_LOOP_WD_8_HT_2:
         ADD         x7,x0,x2                    //x7 = src, row 1
         LD1         {v0.8b},[x0],#8             //row 0: load 8 bytes, pu1_src += 8
         ADD         x6,x1,x3                    //x6 = dst, row 1
         ST1         {v0.8b},[x1],#8             //row 0: store, pu1_dst += 8
         LD1         {v1.8b},[x7],x2             //row 1
         ST1         {v1.8b},[x6],x3
         SUBS        x4,x4,#8                    //bytes left -= 8
         BGT         INNER_LOOP_WD_8_HT_2        //walk the whole row, not just the first column
         RET

     // ---- 16-byte columns, bands of four rows -------------------------------------
     CORE_LOOP_WD_16:
         SUB         x11,x12,#16                 //rewind amount for 16-byte columns
         CBZ         x5,OUTER_LOOP_WD_16_HT_2    //ht < 4: only the two-row tail exists

     OUTER_LOOP_WD_16:
         MOV         x4,x12                      //x4 = bytes left in this band

     INNER_LOOP_WD_16:
         ADD         x7,x0,x2                    //x7 = src, row 1 (taken before x0 advances)
         LD1         {v0.16b},[x0],#16           //row 0: load 16 bytes, pu1_src += 16
         ADD         x6,x1,x3                    //x6 = dst, row 1 (taken before x1 advances)
         ST1         {v0.16b},[x1],#16           //row 0: store, pu1_dst += 16
         LD1         {v1.16b},[x7],x2            //row 1
         ST1         {v1.16b},[x6],x3
         SUBS        x4,x4,#16                   //bytes left -= 16; flags feed the BGT below
         LD1         {v2.16b},[x7],x2            //row 2
         ST1         {v2.16b},[x6],x3
         LD1         {v3.16b},[x7],x2            //row 3; x7 ends on row 4 of this column
         ST1         {v3.16b},[x6],x3
         BGT         INNER_LOOP_WD_16

         SUBS        x5,x5,#4                    //ht -= 4
         SUB         x0,x7,x11                   //pu1_src = first column of the next band
         SUB         x1,x6,x11                   //pu1_dst = first column of the next band
         BGT         OUTER_LOOP_WD_16
         CBZ         x8,END_LOOPS                //no tail rows

     // ---- 16-byte columns, two-row tail -------------------------------------------
     OUTER_LOOP_WD_16_HT_2:
         MOV         x4,x12                      //x4 = bytes left in the tail

     INNER_LOOP_WD_16_HT_2:
         ADD         x7,x0,x2                    //x7 = src, row 1
         LD1         {v0.16b},[x0],#16           //row 0: load 16 bytes, pu1_src += 16
         ADD         x6,x1,x3                    //x6 = dst, row 1
         ST1         {v0.16b},[x1],#16           //row 0: store, pu1_dst += 16
         LD1         {v1.16b},[x7],x2            //row 1
         ST1         {v1.16b},[x6],x3
         SUBS        x4,x4,#16                   //bytes left -= 16
         BGT         INNER_LOOP_WD_16_HT_2       //walk the whole row, not just the first column
         RET

     .size ihevc_inter_pred_chroma_copy_av8, .-ihevc_inter_pred_chroma_copy_av8
    255 
    256 
    257