Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_intra_pred_luma_mode2_neon.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for chroma intra prediction mode 2.
     25 @* functions are coded using neon assembly and can be compiled using
     26 
     27 @* rvct
     28 @*
     29 @* @author
     30 @*  yogeswaran rs
     31 @*
     32 @* @par list of functions:
     33 @*
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
     43 @* @brief
     44 @*    chroma intra prediction for mode 2
     45 @*
     46 @* @par description:
     47 @*
     48 @* @param[in] pu1_ref
     49 @*  uword8 pointer to the source
     50 @*
     51 @* @param[out] pu1_dst
     52 @*  uword8 pointer to the destination
     53 @*
     54 @* @param[in] src_strd
     55 @*  integer source stride
     56 @*
     57 @* @param[in] dst_strd
     58 @*  integer destination stride
     59 @*
     60 @* @param[in] pi1_coeff
     61 @*  word8 pointer to the planar coefficients (not part of the prototype below; never read by the code)
     62 @*
     63 @* @param[in] nt
     64 @*  size of transform block
     65 @*
     66 @* @param[in] mode
     67 @*  type of filtering
     68 @*
     69 @* @returns
     70 @*
     71 @* @remarks
     72 @*  none
     73 @*
     74 @*******************************************************************************
     75 @*/
     76 
     77 @void ihevc_intra_pred_chroma_mode2_a9q(uword8 *pu1_ref,
     78 @                                       word32 src_strd,
     79 @                                       uword8 *pu1_dst,
     80 @                                       word32 dst_strd,
     81 @                                       word32 nt,
     82 @                                       word32 mode)
     83 @
     84 @**************variables vs registers*****************************************
     85 @r0 => *pu1_ref
     86 @r1 => src_strd
     87 @r2 => *pu1_dst
     88 @r3 => dst_strd
     89 
     90 @stack contents from #40 (relative to sp right after the stmfd of r4-r12, r14)
     91 @   nt
     92 @   mode
     93 @   (pi1_coeff is not passed to this function)
     94 
     95 .text
     96 .align 4
     97 
     98 
     99 
    100 
    @ ----------------------------------------------------------------------
    @ ihevc_intra_pred_chroma_mode2_a9q
    @
    @ Fills an nt x nt block of interleaved byte pairs at pu1_dst.  Row y,
    @ pair x of the output is pair (2*nt - 2 - x - y) of pu1_ref: every row
    @ is a pair-reversed window of the reference, and the window slides one
    @ pair (2 bytes) towards lower addresses per row and per column.
    @
    @ Arguments (AAPCS; order as given in this file's prototype comment):
    @   r0        pu1_ref   source pointer
    @   r1        src_strd  never used as a stride; r1 is reused as a counter
    @   r2        pu1_dst   destination pointer
    @   r3        dst_strd  destination stride in bytes
    @   [sp, #0]  nt        block size.  4 takes the mode2_4 path; any other
    @                       value goes through the 8x8-tile loop, whose
    @                       counters step by 8 (assumes nt is a multiple of 8)
    @   [sp, #4]  mode      never read
    @
    @ Register roles on the tile path:
    @   r4   nt
    @   r8   -4, post-increment applied to both source pointers
    @   r0   source pointer for even rows of a tile
    @   r10  source pointer for odd rows (r0 - 2)
    @   r2   destination address of the current tile
    @   r6, r7, r9, r14   destination row pointers (rows 0..3, then 4..7)
    @   r5   4 * dst_strd, post-increment for the row pointers
    @   r11  columns left in the current tile row (destination side)
    @   r12  columns left in the current tile row (source side)
    @   r1   nt * (nt / 8): 8 per tile, decremented by 8 per tile loaded
    @
    @ Preserved: r4-r12, d8-d15 (both are saved and restored here because the
    @ body uses them and AAPCS makes them callee-saved).
    @ Clobbered: r0-r3, flags, d0-d7, d16-d31.
    @ Returns nothing.
    @ ----------------------------------------------------------------------

    .equ        MODE2_CORE_SAVE_BYTES, 40   @ r4-r12, r14: 10 registers * 4 bytes
    .equ        MODE2_NEON_SAVE_BYTES, 64   @ d8-d15: 8 registers * 8 bytes
    .equ        MODE2_NT_OFFSET, (MODE2_CORE_SAVE_BYTES + MODE2_NEON_SAVE_BYTES)

    .globl ihevc_intra_pred_chroma_mode2_a9q

    .type ihevc_intra_pred_chroma_mode2_a9q, %function

    ihevc_intra_pred_chroma_mode2_a9q:

        stmfd       sp!, {r4-r12, r14}          @ save core registers and the return address
        vpush       {d8-d15}                    @ d8-d15 are used as scratch below

        ldr         r4, [sp, #MODE2_NT_OFFSET]  @ r4 = nt (first stack argument)
        mov         r8, #-4                     @ source step: 2 pairs back per load

        cmp         r4, #4
        beq         mode2_4                     @ 4x4 block has its own path

        add         r0, r0, r4, lsl #2          @ r0 = pu1_ref + 4*nt

        sub         r0, r0, #0x12               @ 16-byte window ending at pair (2*nt - 2)
        sub         r10, r0, #2                 @ odd rows start one pair lower

    @ Load and reverse the first 8x8 tile.  vld2.8 splits the byte pairs into
    @ two registers, vrev64.8 reverses each one, and the vst2.8 stores later
    @ re-interleave them, so the pairs come out in reverse order.
    prologue_cpy_32:

        vld2.8      {d0,d1}, [r0], r8           @ row 0

        mov         r11, r4                     @ destination column counter = nt
        vrev64.8    d16, d0
        vrev64.8    d17, d1

        vld2.8      {d2,d3}, [r10], r8          @ row 1
        mov         r6, r2                      @ row 0 pointer = tile base

        vld2.8      {d4,d5}, [r0], r8           @ row 2
        vld2.8      {d6,d7}, [r10], r8          @ row 3
        lsr         r1, r4, #3                  @ r1 = nt / 8

        vld2.8      {d8,d9}, [r0], r8           @ row 4
        vld2.8      {d10,d11}, [r10], r8        @ row 5
        vld2.8      {d12,d13}, [r0], r8         @ row 6
        mul         r1, r4, r1                  @ r1 = nt * (nt / 8) = 8 * tile count

        vld2.8      {d14,d15}, [r10], r8        @ row 7
        add         r7, r6, r3                  @ row 1 pointer

        vrev64.8    d18, d2
        vrev64.8    d19, d3
        lsl         r5, r3, #2                  @ r5 = 4 * dst_strd

        vrev64.8    d20, d4
        vrev64.8    d21, d5
        add         r9, r7, r3                  @ row 2 pointer

        vrev64.8    d22, d6
        vrev64.8    d23, d7

        vrev64.8    d24, d8
        vrev64.8    d25, d9

        vrev64.8    d26, d10
        subs        r1, r1, #8                  @ one tile loaded

        vrev64.8    d27, d11

        vrev64.8    d28, d12
        vrev64.8    d29, d13

        vrev64.8    d30, d14
        add         r14, r9, r3                 @ row 3 pointer
        vrev64.8    d31, d15

        beq         epilogue_mode2              @ nt == 8: that was the only tile

        sub         r12, r4, #8                 @ source column counter

    @ Software-pipelined loop: store the tile prepared in d16-d31 while
    @ loading and reversing the next one.  NEON loads and stores do not touch
    @ the flags, so each gt/le below refers to the most recent subs.
    kernel_mode2:

        vst2.8      {d16,d17}, [r6], r5         @ row 0
        vst2.8      {d18,d19}, [r7], r5         @ row 1
        subs        r11, r11, #8                @ flags: any column left in this tile row?
        vst2.8      {d20,d21}, [r9], r5         @ row 2
        vst2.8      {d22,d23}, [r14], r5        @ row 3
        vst2.8      {d24,d25}, [r6], r5         @ row 4; r6 is now 8 rows below the tile
        addgt       r2, r2, #16                 @ same tile row: next tile is 8 pairs right
        vst2.8      {d26,d27}, [r7], r5         @ row 5
        vst2.8      {d28,d29}, [r9], r5         @ row 6
        vst2.8      {d30,d31}, [r14], r5        @ row 7

        vld2.8      {d0,d1}, [r0], r8
        movle       r11, r4                     @ tile row finished: reset column counter

        vld2.8      {d2,d3}, [r10], r8
        vld2.8      {d4,d5}, [r0], r8
        suble       r2, r6, r4, lsl #1          @ tile row finished: r2 = r6 - 2*nt ...
        vld2.8      {d6,d7}, [r10], r8
        vrev64.8    d16, d0

        vld2.8      {d8,d9}, [r0], r8
        vld2.8      {d10,d11}, [r10], r8
        addle       r2, r2, #16                 @ ... + 16 = column 0 of the next tile row
        vld2.8      {d12,d13}, [r0], r8
        vrev64.8    d17, d1
        vld2.8      {d14,d15}, [r10], r8

        subs        r12, r12, #8                @ flags: source side of the same test
        mov         r6, r2                      @ row 0 pointer of the next tile
        addle       r0, r0, r4, lsl #1          @ tile row finished: r0 += 2*nt ...
        add         r7, r6, r3

        vrev64.8    d18, d2
        suble       r0, r0, #16                 @ ... - 16: window for the next tile row
        vrev64.8    d19, d3

        vrev64.8    d20, d4
        movle       r12, r4                     @ reset source column counter
        vrev64.8    d21, d5

        vrev64.8    d22, d6
        add         r9, r7, r3
        vrev64.8    d23, d7

        vrev64.8    d24, d8
        sub         r10, r0, #2                 @ odd-row pointer follows r0
        vrev64.8    d25, d9

        vrev64.8    d26, d10
        subs        r1, r1, #8                  @ one more tile loaded; sets flags for bne
        vrev64.8    d27, d11

        vrev64.8    d28, d12
        vrev64.8    d29, d13

        vrev64.8    d30, d14
        add         r14, r9, r3
        vrev64.8    d31, d15

        bne         kernel_mode2

    @ Store the last tile left in d16-d31 by the prologue or the loop.
    epilogue_mode2:

        vst2.8      {d16,d17}, [r6], r5
        vst2.8      {d18,d19}, [r7], r5
        vst2.8      {d20,d21}, [r9], r5
        vst2.8      {d22,d23}, [r14], r5
        vst2.8      {d24,d25}, [r6], r5
        vst2.8      {d26,d27}, [r7], r5
        vst2.8      {d28,d29}, [r9], r5
        vst2.8      {d30,d31}, [r14], r5

        b           end_func

    @ nt == 4: four rows of four pairs (8 bytes each).  Each vld2.8 still reads
    @ 16 bytes; only the four lowest pairs are kept.  vshl.i64 #32 moves them
    @ to the top half, so after vrev64.8 they sit reversed in the low half,
    @ and vzip.8 re-interleaves the two halves into d0/d2/d4/d6.
    mode2_4:

        lsl         r12, r4, #1
        add         r0, r0, r12
        sub         r0, r0, #2                  @ r0 = pu1_ref + 2*nt - 2 (rows 0 and 2)

        vld2.8      {d12,d13}, [r0], r8         @ row 0
        vshl.i64    d0, d12, #32
        add         r10, r0, #2                 @ r0 already stepped by -4: rows 1 and 3
        vshl.i64    d1, d13, #32

        vrev64.8    d0, d0
        vld2.8      {d14,d15}, [r10], r8        @ row 1
        vshl.i64    d2, d14, #32

        vrev64.8    d1, d1
        vshl.i64    d3, d15, #32
        vzip.8      d0, d1
        vst1.8      {d0}, [r2], r3              @ store row 0

        vrev64.8    d2, d2
        vld2.8      {d16,d17}, [r0], r8         @ row 2
        vshl.i64    d4, d16, #32
        vrev64.8    d3, d3
        vshl.i64    d5, d17, #32
        vzip.8      d2, d3
        vrev64.8    d4, d4
        vrev64.8    d5, d5
        vst1.8      {d2}, [r2], r3              @ store row 1


        vld2.8      {d18,d19}, [r10], r8        @ row 3
        vshl.i64    d6, d18, #32

        vzip.8      d4, d5
        vshl.i64    d7, d19, #32
        vrev64.8    d6, d6
        vst1.8      {d4}, [r2], r3              @ store row 2

        vrev64.8    d7, d7
        vzip.8      d6, d7
        vst1.8      {d6}, [r2], r3              @ store row 3

    end_func:
        vpop        {d8-d15}                    @ restore callee-saved NEON registers
        ldmfd       sp!, {r4-r12, r15}          @ restore core registers; loading pc returns
    294 
    295 
    296 
    297 
    298 
    299 
    300