Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
@/**
@*******************************************************************************
@*
@* @brief
@*     inter prediction luma function for copy
@*
@* @par description:
@*   copies the array of width 'wd' and height 'ht' from the location pointed
@*   to by 'src' to the location pointed to by 'dst'
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients (not read by the copy routine)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_luma_copy (
@                            uword8 *pu1_src,
@                            uword8 *pu1_dst,
@                            word32 src_strd,
@                            word32 dst_strd,
@                            word8 *pi1_coeff,
@                            word32 ht,
@                            word32 wd   )
     65 
     66 @**************variables vs registers*****************************************
     67 @   r0 => *pu1_src
     68 @   r1 => *pu1_dst
     69 @   r2 =>  src_strd
     70 @   r3 =>  dst_strd
     71 @   r7 =>  ht
     72 @   r12 => wd
     73 
     74 .text
     75 .align 4
     76 
     77 
     78 
     79 
     80 .globl ihevc_inter_pred_luma_copy_a9q
     81 
     82 .type ihevc_inter_pred_luma_copy_a9q, %function
     83 
     84 ihevc_inter_pred_luma_copy_a9q:
     85     stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
     86     ldr         r12,[sp,#48]                @loads wd
     87     ldr         r7,[sp,#44]                 @loads ht
     88     cmp         r7,#0                       @checks ht == 0
     89     ble         end_loops
     90     tst         r12,#15                     @checks wd for multiples for 4 & 8
     91     beq         core_loop_wd_16
     92     tst         r12,#7                      @checks wd for multiples for 4 & 8
     93     beq         core_loop_wd_8
     94     sub         r11,r12,#4
     95 
     96 outer_loop_wd_4:
     97     subs        r4,r12,#0                   @checks wd == 0
     98     ble         end_inner_loop_wd_4
     99 
    100 inner_loop_wd_4:
    101     vld1.32     {d0[0]},[r0]                @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    102     add         r5,r0,r2                    @pu1_src_tmp += src_strd
    103     add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    104     vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    105     vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    106     add         r0,r0,#4                    @pu1_src += 4
    107     vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    108     vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    109     subs        r4,r4,#4                    @(wd -4)
    110     vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    111     vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    112     add         r1,r1,#4                    @pu1_dst += 4
    113     vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    114 
    115     bgt         inner_loop_wd_4
    116 
    117 end_inner_loop_wd_4:
    118     subs        r7,r7,#4                    @ht - 4
    119     sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
    120     sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
    121     bgt         outer_loop_wd_4
    122 
    123 end_loops:
    124     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    125 
    126 
    127 core_loop_wd_8:
    128     sub         r11,r12,#8
    129 
    130 outer_loop_wd_8:
    131     subs        r4,r12,#0                   @checks wd
    132     ble         end_inner_loop_wd_8
    133 
    134 inner_loop_wd_8:
    135     add         r5,r0,r2                    @pu1_src_tmp += src_strd
    136     vld1.8      {d0},[r0]!                  @vld1_u8(pu1_src_tmp)
    137     add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    138     vst1.8      {d0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
    139     vld1.8      {d1},[r5],r2                @vld1_u8(pu1_src_tmp)
    140     vst1.8      {d1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    141     subs        r4,r4,#8                    @wd - 8(loop condition)
    142     vld1.8      {d2},[r5],r2                @vld1_u8(pu1_src_tmp)
    143     vst1.8      {d2},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    144     vld1.8      {d3},[r5],r2                @vld1_u8(pu1_src_tmp)
    145     vst1.8      {d3},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    146     bgt         inner_loop_wd_8
    147 
    148 end_inner_loop_wd_8:
    149     subs        r7,r7,#4                    @ht -= 4
    150     sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
    151     sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
    152     bgt         outer_loop_wd_8
    153 
    154     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    155 
    156 core_loop_wd_16:
    157     sub         r11,r12,#16
    158 
    159 outer_loop_wd_16:
    160     subs        r4,r12,#0                   @checks wd
    161     ble         end_inner_loop_wd_16
    162 
    163 inner_loop_wd_16:
    164     add         r5,r0,r2                    @pu1_src_tmp += src_strd
    165     vld1.8      {q0},[r0]!                  @vld1_u8(pu1_src_tmp)
    166     add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    167     vst1.8      {q0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
    168     vld1.8      {q1},[r5],r2                @vld1_u8(pu1_src_tmp)
    169     vst1.8      {q1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    170     subs        r4,r4,#16                   @wd - 8(loop condition)
    171     vld1.8      {q2},[r5],r2                @vld1_u8(pu1_src_tmp)
    172     vst1.8      {q2},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    173     vld1.8      {q3},[r5],r2                @vld1_u8(pu1_src_tmp)
    174     vst1.8      {q3},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    175     bgt         inner_loop_wd_16
    176 
    177 end_inner_loop_wd_16:
    178     subs        r7,r7,#4                    @ht -= 4
    179     sub         r0,r5,r11                   @pu1_src = pu1_src_tmp
    180     sub         r1,r6,r11                   @pu1_dst = pu1_dst_tmp
    181     bgt         outer_loop_wd_16
    182 
    183     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    184 
    185 
    186 
    187 
    188 
    189