Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_intra_pred_luma_mode2_neon.s
     22 @*
     23 @* @brief
     24 @*  contains the function definition for luma intra prediction mode 2.
     25 @* the function is coded in arm neon assembly and can be compiled using
     26 
     27 @* rvct
     28 @*
     29 @* @author
     30 @*  yogeswaran rs
     31 @*
     32 @* @par list of functions:
     33 @*
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
     43 @* @brief
     44 @*    luma intra prediction for mode 2
     45 @*
     46 @* @par description:
     47 @*
     48 @* @param[in] pu1_ref
     49 @*  uword8 pointer to the source
     50 @*
     51 @* @param[out] pu1_dst
     52 @*  uword8 pointer to the destination
     53 @*
     54 @* @param[in] src_strd
     55 @*  integer source stride
     56 @*
     57 @* @param[in] dst_strd
     58 @*  integer destination stride
     59 @*
     60 @* @param[in] pi1_coeff
     61 @*  word8 pointer to the planar coefficients (not in the prototype below; never read by this function)
     62 @*
     63 @* @param[in] nt
     64 @*  size of transform block
     65 @*
     66 @* @param[in] mode
     67 @*  type of filtering
     68 @*
     69 @* @returns
     70 @*
     71 @* @remarks
     72 @*  none
     73 @*
     74 @*******************************************************************************
     75 @*/
     76 
     77 @void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
     78 @                                 word32 src_strd,
     79 @                                 uword8 *pu1_dst,
     80 @                                 word32 dst_strd,
     81 @                                 word32 nt,
     82 @                                 word32 mode)
     83 @
     84 @**************variables vs registers*****************************************
     85 @r0 => *pu1_ref
     86 @r1 => src_strd
     87 @r2 => *pu1_dst
     88 @r3 => dst_strd
     89 
     90 @stack contents from #40
     91 @   nt
     92 @   mode
     93 @   pi1_coeff
     94 
     95 .text
     96 .align 4
     97 
     98 
     99 
    100 
    101 .globl ihevc_intra_pred_luma_mode2_a9q
    102 
    103 .type ihevc_intra_pred_luma_mode2_a9q, %function
    104 
    105 ihevc_intra_pred_luma_mode2_a9q:
    106 
    107     stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    108 
    109     ldr         r4,[sp,#40]                 @loads nt
    110     mov         r8,#-2
    111 
    112     cmp         r4,#4
    113     beq         mode2_4
    114 
    115     add         r0,r0,r4,lsl #1
    116 
    117     sub         r0,r0,#9                    @src[1]
    118     add         r10,r0,#-1
    119 
    120 prologue_cpy_32:
    121 
    122     vld1.8      {d0},[r0],r8
    123     mov         r11,r4
    124 
    125     vld1.8      {d1},[r10],r8
    126     mov         r6, r2
    127 
    128     vld1.8      {d2},[r0],r8
    129     vld1.8      {d3},[r10],r8
    130     lsr         r1, r4, #3
    131 
    132     vld1.8      {d4},[r0],r8
    133     vld1.8      {d5},[r10],r8
    134     vld1.8      {d6},[r0],r8
    135     mul         r1, r4, r1
    136 
    137     vld1.8      {d7},[r10],r8
    138     add         r7,r6,r3
    139 
    140     vrev64.8    d8,d0
    141     vrev64.8    d9,d1
    142     lsl         r5, r3, #2
    143 
    144     vrev64.8    d10,d2
    145     vrev64.8    d11,d3
    146     add         r9,r7,r3
    147 
    148     vrev64.8    d12,d4
    149     subs        r1,r1,#8
    150 
    151     vrev64.8    d13,d5
    152     vrev64.8    d14,d6
    153     vrev64.8    d15,d7
    154     add         r14,r9,r3
    155 
    156     beq         epilogue_mode2
    157 
    158     sub         r12,r4,#8
    159 
    160 kernel_mode2:
    161 
    162     vst1.8      {d8},[r6],r5
    163     vst1.8      {d9},[r7],r5
    164     subs        r11,r11,#8
    165 
    166     vst1.8      {d10},[r9],r5
    167     addgt       r2,r2,#8
    168 
    169     vst1.8      {d11},[r14],r5
    170     vst1.8      {d12},[r6],r5
    171     movle       r11,r4
    172 
    173     vst1.8      {d13},[r7],r5
    174     vst1.8      {d14},[r9],r5
    175     addle       r2, r2, r3, lsl #2
    176 
    177     vst1.8      {d15},[r14],r5
    178     vld1.8      {d0},[r0],r8
    179     sub         r14,r4,#8
    180 
    181     vld1.8      {d1},[r10],r8
    182     vld1.8      {d2},[r0],r8
    183     addle       r2, r2, #8
    184 
    185     vld1.8      {d3},[r10],r8
    186     vld1.8      {d4},[r0],r8
    187     suble       r2, r6, r14
    188 
    189     vld1.8      {d5},[r10],r8
    190     subs        r12,r12,#8
    191 
    192     vld1.8      {d6},[r0],r8
    193     mov         r6, r2
    194 
    195     vld1.8      {d7},[r10],r8
    196     addle       r0, r0, r4
    197 
    198     vrev64.8    d8,d0
    199     add         r7, r6, r3
    200 
    201     vrev64.8    d9,d1
    202     suble       r0, r0, #8
    203 
    204     vrev64.8    d10,d2
    205     movle       r12,r4
    206 
    207     vrev64.8    d11,d3
    208     add         r9, r7, r3
    209 
    210     vrev64.8    d12,d4
    211     add         r10,r0,#-1
    212 
    213     vrev64.8    d13,d5
    214     subs        r1, r1, #8
    215 
    216     vrev64.8    d14,d6
    217     add         r14, r9, r3
    218 
    219     vrev64.8    d15,d7
    220 
    221     bne         kernel_mode2
    222 
    223 epilogue_mode2:
    224 
    225     vst1.8      {d8},[r6],r5
    226     vst1.8      {d9},[r7],r5
    227     vst1.8      {d10},[r9],r5
    228     vst1.8      {d11},[r14],r5
    229     vst1.8      {d12},[r6],r5
    230     vst1.8      {d13},[r7],r5
    231     vst1.8      {d14},[r9],r5
    232     vst1.8      {d15},[r14],r5
    233 
    234     b           end_func
    235 
    236 mode2_4:
    237 
    238     mov         r8,#-2
    239     sub         r0,r0,#1
    240     add         r10,r0,#-1
    241 
    242     vld1.8      {d0},[r0],r8
    243     add         r5,r2,r3
    244     vld1.8      {d2},[r10],r8
    245     add         r6,r5,r3
    246     vld1.8      {d4},[r0]
    247     add         r7,r6,r3
    248     vld1.8      {d6},[r10]
    249 
    250     vrev64.8    d1,d0
    251     vrev64.8    d3,d2
    252 
    253 
    254 
    255     vst1.32     {d1[0]},[r2]
    256     vrev64.8    d5,d4
    257     vst1.32     {d3[0]},[r5]
    258     vrev64.8    d7,d6
    259     vst1.32     {d5[0]},[r6]
    260     vst1.32     {d7[0]},[r7]
    261 
    262 end_func:
    263     ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    264 
    265 
    266 
    267 
    268 
    269 
    270 
    271