Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_inter_pred_chroma_copy_neon.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for inter prediction  interpolation.
     25 @* functions are coded in arm neon assembly, not intrinsics; this file uses
     26 @* gnu assembler syntax ('@' comments, .globl/.type directives) rather than
     27 @* rvct armasm syntax
     28 @*
     29 @* @author
     30 @*  yogeswaran rs
     31 @*
     32 @* @par list of functions:
     33 @*
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
     43 @* @brief
     44 @*   chroma interprediction filter for copy
     45 @*
     46 @* @par description:
     47 @*    copies 'ht' rows of 2 * 'wd' bytes each (the code doubles 'wd' on
     48 @*    entry) from the location pointed by 'src' to the location pointed by 'dst'
     49 @*
     50 @* @param[in] pu1_src
     51 @*  uword8 pointer to the source
     52 @*
     53 @* @param[out] pu1_dst
     54 @*  uword8 pointer to the destination
     55 @*
     56 @* @param[in] src_strd
     57 @*  integer source stride
     58 @*
     59 @* @param[in] dst_strd
     60 @*  integer destination stride
     61 @*
     62 @* @param[in] pi1_coeff
     63 @*  word8 pointer to the filter coefficients (not read by the copy routine)
     64 @*
     65 @* @param[in] ht
     66 @*  integer height of the array
     67 @*
     68 @* @param[in] wd
     69 @*  integer width of the array
     70 @*
     71 @* @returns
     72 @*
     73 @* @remarks
     74 @*  none
     75 @*
     76 @*******************************************************************************
     77 @*/
     78 
     79 @void ihevc_inter_pred_chroma_copy( uword8 *pu1_src,
     80 @                                   uword8 *pu1_dst,
     81 @                                   word32 src_strd,
     82 @                                   word32 dst_strd,
     83 @                                   word8 *pi1_coeff,
     84 @                                   word32 ht,
     85 @                                   word32 wd)
     86 @**************variables vs registers*****************************************
     87 @               r0 => *pu1_src
     88 @               r1 => *pu1_dst
     89 @               r2 =>  src_strd
     90 @               r3 =>  dst_strd
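     91 @               [sp, #0] => *pi1_coeff (stack argument, never read)
     92 @               r7  =>  ht      (loaded from the stack after the register save)
     93 @               r12 =>  2 * wd  (wd loaded from the stack, then doubled)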
     94 
     95 .text
     96 .align 4
     97 
     98 
     99 
    100 
    .globl ihevc_inter_pred_chroma_copy_a9q

    .type ihevc_inter_pred_chroma_copy_a9q, %function

    @------------------------------------------------------------------------------
    @ void ihevc_inter_pred_chroma_copy_a9q(uword8 *pu1_src,    -> r0
    @                                       uword8 *pu1_dst,    -> r1
    @                                       word32 src_strd,    -> r2
    @                                       word32 dst_strd,    -> r3
    @                                       word8  *pi1_coeff,  -> [sp, #0] (never read)
    @                                       word32 ht,          -> [sp, #4]
    @                                       word32 wd)          -> [sp, #8]
    @
    @ Copies ht rows of (2 * wd) bytes from pu1_src to pu1_dst. Returns immediately
    @ when ht <= 0 or wd <= 0.
    @
    @ Strategy:
    @   * The row width in bytes (2 * wd) selects the column step: 16 bytes when it
    @     is a multiple of 16, 8 bytes when it is a multiple of 8, otherwise 4 bytes.
    @   * Rows are processed in bands of four. After the last band, ht % 4
    @     leftover rows are handled by a tail that copies exactly TWO rows.
    @
    @ Preconditions that the code relies on (not checked):
    @   * 2 * wd is a multiple of 4 (the narrowest path moves 4 bytes per step and
    @     rewinds the row pointers by 2 * wd - 4).
    @   * ht is even (a leftover of 1 or 3 rows is still treated as 2 rows).
    @
    @ Register roles inside the function:
    @   r0/r1  current source / destination position in the top row of the band
    @   r2/r3  source / destination stride
    @   r4     bytes still to copy in the current band (column counter)
    @   r5/r6  source / destination pointers walking down the rows of one column
    @   r7     rows left to process in bands of four
    @   r8     ht % 4 (non-zero => run the two-row tail)
    @   r11    (2 * wd) - column step, used to rewind to the start of the next band
    @   r12    2 * wd, the number of bytes per row
    @ NEON registers written: q0-q3 (d0-d7).
    @------------------------------------------------------------------------------
    ihevc_inter_pred_chroma_copy_a9q:
        stmfd       sp!, {r4-r12, r14}          @ save 10 registers (40 bytes); r14 is popped into pc on exit
        ldr         r12,[sp,#48]                @ wd: entry offset 8 + 40 bytes just pushed
        lsl         r12,r12,#1                  @ r12 = 2 * wd = bytes per row
        ldr         r7,[sp,#44]                 @ ht: entry offset 4 + 40 bytes just pushed
        cmp         r7,#0
        ble         end_loops                   @ ht <= 0: nothing to copy
        cmp         r12,#0
        ble         end_loops                   @ wd <= 0: nothing to copy
        and         r8,r7,#3                    @ r8 = ht % 4 (rows left after the 4-row bands)
        sub         r7,r7,r8                    @ r7 = ht rounded down to a multiple of 4
        tst         r12,#15                     @ row bytes a multiple of 16?
        beq         core_loop_wd_16
        tst         r12,#7                      @ row bytes a multiple of 8?
        beq         core_loop_wd_8

        @ ---------------- 4 bytes per step ----------------
        sub         r11,r12,#4                  @ rewind amount for the band pointers
        cmp         r7,#0
        beq         outer_loop_wd_4_ht_2        @ fewer than 4 rows: only the tail runs

    outer_loop_wd_4:
        mov         r4,r12                      @ restart the column counter for this band

    inner_loop_wd_4:
        vld1.32     {d0[0]},[r0]                @ row 0: load 4 bytes
        add         r5,r0,r2                    @ r5 = source, row 1 of this column
        add         r6,r1,r3                    @ r6 = destination, row 1 of this column
        vst1.32     {d0[0]},[r1]                @ row 0: store 4 bytes
        vld1.32     {d0[0]},[r5],r2             @ row 1: load, then step to row 2
        add         r0,r0,#4                    @ next column in the source
        vst1.32     {d0[0]},[r6],r3             @ row 1: store, then step to row 2
        vld1.32     {d0[0]},[r5],r2             @ row 2
        subs        r4,r4,#4                    @ 4 bytes of the row done; flags feed the bgt below
        vst1.32     {d0[0]},[r6],r3
        vld1.32     {d0[0]},[r5],r2             @ row 3
        add         r1,r1,#4                    @ next column in the destination
        vst1.32     {d0[0]},[r6],r3
        bgt         inner_loop_wd_4

        subs        r7,r7,#4                    @ one band of 4 rows finished
        sub         r0,r5,r11                   @ r5 is 4 rows below the last column: rewind to column 0
        sub         r1,r6,r11
        bgt         outer_loop_wd_4
        cmp         r8,#0
        bgt         outer_loop_wd_4_ht_2        @ leftover rows: run the two-row tail

    end_loops:
        ldmfd       sp!,{r4-r12,r15}            @ restore registers and return (saved r14 -> pc)


    outer_loop_wd_4_ht_2:
        mov         r4,r12                      @ column counter for the two-row tail

    inner_loop_wd_4_ht_2:
        vld1.32     {d0[0]},[r0]                @ row 0: load 4 bytes
        add         r5,r0,r2                    @ r5 = source, row 1
        add         r6,r1,r3                    @ r6 = destination, row 1
        vst1.32     {d0[0]},[r1]                @ row 0: store 4 bytes
        vld1.32     {d0[0]},[r5],r2             @ row 1
        add         r0,r0,#4                    @ next column in the source
        vst1.32     {d0[0]},[r6],r3
        subs        r4,r4,#4
        add         r1,r1,#4                    @ next column in the destination
        bgt         inner_loop_wd_4_ht_2
        b           end_loops

        @ ---------------- 8 bytes per step ----------------
    core_loop_wd_8:
        sub         r11,r12,#8                  @ rewind amount for the band pointers
        cmp         r7,#0
        beq         outer_loop_wd_8_ht_2        @ fewer than 4 rows: only the tail runs

    outer_loop_wd_8:
        mov         r4,r12                      @ restart the column counter for this band

    inner_loop_wd_8:
        add         r5,r0,r2                    @ r5 = source, row 1 (taken before r0 advances)
        vld1.8      {d0},[r0]!                  @ row 0: load 8 bytes, r0 += 8
        add         r6,r1,r3                    @ r6 = destination, row 1
        vst1.8      {d0},[r1]!                  @ row 0: store 8 bytes, r1 += 8
        vld1.8      {d1},[r5],r2                @ row 1
        vst1.8      {d1},[r6],r3
        subs        r4,r4,#8                    @ 8 bytes of the row done; flags feed the bgt below
        vld1.8      {d2},[r5],r2                @ row 2
        vst1.8      {d2},[r6],r3
        vld1.8      {d3},[r5],r2                @ row 3
        vst1.8      {d3},[r6],r3
        bgt         inner_loop_wd_8

        subs        r7,r7,#4                    @ one band of 4 rows finished
        sub         r0,r5,r11                   @ rewind to column 0 of the next band
        sub         r1,r6,r11
        bgt         outer_loop_wd_8
        cmp         r8,#0
        bgt         outer_loop_wd_8_ht_2        @ leftover rows: run the two-row tail
        b           end_loops

    outer_loop_wd_8_ht_2:
        mov         r4,r12                      @ column counter for the two-row tail

    inner_loop_wd_8_ht_2:
        add         r5,r0,r2                    @ r5 = source, row 1
        vld1.8      {d0},[r0]!                  @ row 0: load 8 bytes, r0 += 8
        add         r6,r1,r3                    @ r6 = destination, row 1
        vst1.8      {d0},[r1]!                  @ row 0: store 8 bytes, r1 += 8
        vld1.8      {d1},[r5],r2                @ row 1
        vst1.8      {d1},[r6],r3
        subs        r4,r4,#8                    @ keep going until the full row width is copied;
        bgt         inner_loop_wd_8_ht_2        @ without this only the first 8 bytes were copied
        b           end_loops

        @ ---------------- 16 bytes per step ----------------
    core_loop_wd_16:
        sub         r11,r12,#16                 @ rewind amount for the band pointers
        cmp         r7,#0
        beq         outer_loop_wd_16_ht_2       @ fewer than 4 rows: only the tail runs

    outer_loop_wd_16:
        mov         r4,r12                      @ restart the column counter for this band

    inner_loop_wd_16:
        add         r5,r0,r2                    @ r5 = source, row 1 (taken before r0 advances)
        vld1.8      {q0},[r0]!                  @ row 0: load 16 bytes, r0 += 16
        add         r6,r1,r3                    @ r6 = destination, row 1
        vst1.8      {q0},[r1]!                  @ row 0: store 16 bytes, r1 += 16
        vld1.8      {q1},[r5],r2                @ row 1
        vst1.8      {q1},[r6],r3
        subs        r4,r4,#16                   @ 16 bytes of the row done; flags feed the bgt below
        vld1.8      {q2},[r5],r2                @ row 2
        vst1.8      {q2},[r6],r3
        vld1.8      {q3},[r5],r2                @ row 3
        vst1.8      {q3},[r6],r3
        bgt         inner_loop_wd_16

        subs        r7,r7,#4                    @ one band of 4 rows finished
        sub         r0,r5,r11                   @ rewind to column 0 of the next band
        sub         r1,r6,r11
        bgt         outer_loop_wd_16
        cmp         r8,#0
        bgt         outer_loop_wd_16_ht_2       @ leftover rows: run the two-row tail
        b           end_loops

    outer_loop_wd_16_ht_2:
        mov         r4,r12                      @ column counter for the two-row tail

    inner_loop_wd_16_ht_2:
        add         r5,r0,r2                    @ r5 = source, row 1
        vld1.8      {q0},[r0]!                  @ row 0: load 16 bytes, r0 += 16
        add         r6,r1,r3                    @ r6 = destination, row 1
        vst1.8      {q0},[r1]!                  @ row 0: store 16 bytes, r1 += 16
        vld1.8      {q1},[r5],r2                @ row 1
        vst1.8      {q1},[r6],r3
        subs        r4,r4,#16                   @ keep going until the full row width is copied;
        bgt         inner_loop_wd_16_ht_2       @ without this only the first 16 bytes were copied

        ldmfd       sp!,{r4-r12,r15}            @ restore registers and return (saved r14 -> pc)

    .size ihevc_inter_pred_chroma_copy_a9q, .-ihevc_inter_pred_chroma_copy_a9q
    266 
    267 
    268 
    269 
    270 
    271