Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_intra_pred_chroma_ver_neon.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for chroma vertical intra prediction.
     25 @* functions are coded in neon assembly and can be assembled using
     26 
     27 @* the gnu assembler
     28 @*
     29 @* @author
     30 @*  yogeswaran rs
     31 @*
     32 @* @par list of functions:
     33 @*  - ihevc_intra_pred_chroma_ver_a9q()
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
     43 @* @brief
     44 @*    luma intraprediction filter for dc input
     45 @*
     46 @* @par description:
     47 @*
     48 @* @param[in] pu1_ref
     49 @*  uword8 pointer to the source
     50 @*
     51 @* @param[out] pu1_dst
     52 @*  uword8 pointer to the destination
     53 @*
     54 @* @param[in] src_strd
     55 @*  integer source stride
     56 @*
     57 @* @param[in] dst_strd
     58 @*  integer destination stride
     59 @*
     60 @* @param[in] nt
     61 @*  size of tranform block
     62 @*
     63 @* @param[in] mode
     64 @*  type of filtering
     65 @*
     66 @* @returns
     67 @*
     68 @* @remarks
     69 @*  none
     70 @*
     71 @*******************************************************************************
     72 @*/
     73 
     74 @void ihevc_intra_pred_chroma_ver(uword8 *pu1_ref,
     75 @        word32 src_strd,
     76 @        uword8 *pu1_dst,
     77 @        word32 dst_strd,
     78 @        word32 nt,
     79 @        word32 mode)
     80 @**************variables vs registers*****************************************
     81 @r0 => *pu1_ref
     82 @r1 => src_strd
     83 @r2 => *pu1_dst
     84 @r3 => dst_strd
     85 
     86 @stack contents from #40
     87 @   nt
     88 @   mode
     89 
     90 .text
     91 .align 4
     92 
     93 
     94 
     95 
     96 .globl ihevc_intra_pred_chroma_ver_a9q
     97 
     98 .type ihevc_intra_pred_chroma_ver_a9q, %function
     99 
    100 ihevc_intra_pred_chroma_ver_a9q:
    101 
    102     stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    103 
    104     ldr         r4,[sp,#40]                 @loads nt
    105     lsl         r5, r4, #2                  @4nt
    106 
    107 
    108     cmp         r4, #8
    109     beq         blk_8
    110     blt         blk_4
    111 
    112 copy_16:
    113     add         r5, r5, #2                  @2nt+2
    114     add         r6, r0, r5                  @&src[2nt+1]
    115 
    116     add         r5, r2, r3                  @pu1_dst + dst_strd
    117     vld2.8      {d20,d21}, [r6]!            @16 loads (col 0:15)
    118     add         r8, r5, r3
    119 
    120     add         r10, r8, r3
    121     vld2.8      {d22,d23}, [r6]             @16 loads (col 16:31)
    122     lsl         r11, r3, #2
    123 
    124     add         r11, r11, #0xfffffff0
    125 
    126 
    127     vst2.8      {d20,d21}, [r2]!
    128     vst2.8      {d20,d21}, [r5]!
    129     vst2.8      {d20,d21}, [r8]!
    130     vst2.8      {d20,d21}, [r10]!
    131 
    132     vst2.8      {d22,d23}, [r2], r11
    133     vst2.8      {d22,d23}, [r5], r11
    134     vst2.8      {d22,d23}, [r8], r11
    135     vst2.8      {d22,d23}, [r10], r11
    136 
    137     subs        r4, r4, #4
    138 
    139 kernel_copy_16:
    140     vst2.8      {d20,d21}, [r2]!
    141     vst2.8      {d20,d21}, [r5]!
    142     vst2.8      {d20,d21}, [r8]!
    143     vst2.8      {d20,d21}, [r10]!
    144 
    145     vst2.8      {d22,d23}, [r2], r11
    146     vst2.8      {d22,d23}, [r5], r11
    147     vst2.8      {d22,d23}, [r8], r11
    148     vst2.8      {d22,d23}, [r10], r11
    149 
    150     subs        r4, r4, #4
    151 
    152 
    153     vst2.8      {d20,d21}, [r2]!
    154     vst2.8      {d20,d21}, [r5]!
    155     vst2.8      {d20,d21}, [r8]!
    156     vst2.8      {d20,d21}, [r10]!
    157 
    158     vst2.8      {d22,d23}, [r2], r11
    159     vst2.8      {d22,d23}, [r5], r11
    160     vst2.8      {d22,d23}, [r8], r11
    161     vst2.8      {d22,d23}, [r10], r11
    162 
    163     subs        r4, r4, #4
    164 
    165     vst2.8      {d20,d21}, [r2]!
    166     vst2.8      {d20,d21}, [r5]!
    167     vst2.8      {d20,d21}, [r8]!
    168     vst2.8      {d20,d21}, [r10]!
    169 
    170     vst2.8      {d22,d23}, [r2], r11
    171     vst2.8      {d22,d23}, [r5], r11
    172     vst2.8      {d22,d23}, [r8], r11
    173     vst2.8      {d22,d23}, [r10], r11
    174 
    175     subs        r4, r4, #4
    176     bne         kernel_copy_16
    177 
    178     b           end_func
    179 
    180 blk_8:
    181 
    182     add         r5, r5, #2                  @2nt+2
    183     add         r6, r0, r5                  @&src[2nt+1]
    184 
    185     add         r5, r2, r3                  @pu1_dst + dst_strd
    186     vld2.8      {d20,d21}, [r6]!            @16 loads (col 0:15)
    187     add         r8, r5, r3
    188 
    189     add         r10, r8, r3
    190     vld2.8      {d22,d23}, [r6]             @16 loads (col 16:31)
    191 
    192     lsl         r11,r3,#2
    193 
    194     vst2.8      {d20,d21}, [r2],r11
    195     vst2.8      {d20,d21}, [r5],r11
    196     vst2.8      {d20,d21}, [r8],r11
    197     vst2.8      {d20,d21}, [r10],r11
    198 
    199     vst2.8      {d20,d21}, [r2]
    200     vst2.8      {d20,d21}, [r5]
    201     vst2.8      {d20,d21}, [r8]
    202     vst2.8      {d20,d21}, [r10]
    203 
    204     subs        r4, r4, #8
    205     beq         end_func
    206 
    207 blk_4:
    208 
    209     @lsl        r5, r4, #2          @4nt
    210     add         r5, r5, #2                  @2nt+2
    211     add         r6, r0, r5                  @&src[2nt+1]
    212 
    213     vld1.8      {d0},[r6]
    214     add         r5, r2, r3                  @pu1_dst + dst_strd
    215 
    216     vst1.8      {d0},[r2]
    217     add         r8, r5, r3
    218     vst1.8      {d0},[r5]
    219     add         r10, r8, r3
    220     vst1.8      {d0},[r8]
    221     vst1.8      {d0},[r10]
    222 
    223 
    224 
    225 end_func:
    226     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    227 
    228 
    229 
    230