Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_intra_pred_chroma_dc_neon.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for intra prediction dc filtering.
     25 @* functions are coded using neon assembly instructions and can be compiled using
     26 
     27 @* rvct
     28 @*
     29 @* @author
     30 @*  yogeswaran rs
     31 @*
     32 @* @par list of functions:
     33 @*  - ihevc_intra_pred_chroma_dc_a9q()
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
     43 @* @brief
     44 @*    chroma intra prediction for dc mode
     45 @*
     46 @* @par description:
     47 @*
     48 @* @param[in] pu1_ref
     49 @*  uword8 pointer to the source
     50 @*
     51 @* @param[out] pu1_dst
     52 @*  uword8 pointer to the destination
     53 @*
     54 @* @param[in] src_strd
     55 @*  integer source stride
     56 @*
     57 @* @param[in] dst_strd
     58 @*  integer destination stride
     59 @*
     60 @* @param[in] pi1_coeff
     61 @*  word8 pointer to the planar coefficients (not in the prototype below and never loaded by the code)
     62 @*
     63 @* @param[in] nt
     64 @*  size of transform block
     65 @*
     66 @* @param[in] mode
     67 @*  type of filtering
     68 @*
     69 @* @returns
     70 @*
     71 @* @remarks
     72 @*  none
     73 @*
     74 @*******************************************************************************
     75 @*/
     76 
     77 @void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
     78 @                                word32 src_strd,
     79 @                                uword8 *pu1_dst,
     80 @                                word32 dst_strd,
     81 @                                word32 nt,
     82 @                                word32 mode)
     83 @
     84 @**************variables vs registers*****************************************
     85 @r0 => *pu1_ref
     86 @r1 => src_strd
     87 @r2 => *pu1_dst
     88 @r3 => dst_strd
     89 
     90 @stack contents from #40
     91 @   nt
     92 @   mode
     93 @   (pi1_coeff is listed in the header but is not in the prototype above and is never loaded)
     94 
     95 .text
     96 .align 4
     97 @ ihevc_intra_pred_chroma_dc_a9q: dc prediction for byte-interleaved two-component data.
     98 @ in:  r0 = pu1_ref, r2 = pu1_dst, r3 = dst_strd (bytes), [sp] = nt; src_strd and mode are never read
     99 @ out: nt rows of 2*nt bytes at pu1_dst, every byte pair = the two rounded component means
    100 @ straight-line paths only: nt == 4, nt == 8, and one more that moves 32 bytes per row (matches nt == 16)
    101 .globl ihevc_intra_pred_chroma_dc_a9q
    102 
    103 .type ihevc_intra_pred_chroma_dc_a9q, %function
    104 
    105 ihevc_intra_pred_chroma_dc_a9q:
    106 
    107     stmfd       sp!, {r4-r12, r14}          @push r4-r12 and lr: 10 registers, 40 bytes
    108 
    109     ldr         r4,[sp,#40]                 @r4 = nt (first stack argument, just above the 40 pushed bytes)
    110     mov         r9, #0                      @r9 = 0, source for clearing the accumulators
    111     vmov        d17, r9, r9                 @d17 = 0: running sum of the first component
    112 
    113     clz         r5, r4                      @counts leading zeros of nt
    114 
    115     add         r6, r0, r4,lsl #1           @r6 = &src[2nt], first reference run
    116     vmov        d18, r9, r9                 @d18 = 0: running sum of the second component
    117     rsb         r5, r5, #32                 @32 - clz(nt): log2(nt) + 1 for a power-of-two nt
    118     add         r7, r0, r4, lsl #2          @&src[4nt]
    119     mov         r12,r5                      @r12 = right-shift applied to both sums
    120     add         r8, r7, #2                  @r8 = &src[4nt+2], second reference run
    121 
    122     cmp         r4, #4
    123     beq         dc_4                        @nt == 4 has its own path
    124 
    125 @ first 16 bytes of each run; vld2 puts even bytes in the first register, odd bytes in the second
    126 add_loop:
    127     vld2.s8     {d30,d31}, [r6]!            @16 bytes from &src[2nt], de-interleaved; r6 += 16
    128     lsl         r10,r4,#1                   @2nt
    129 
    130     vpaddl.u8   d2, d30                     @8 x u8 -> 4 x u16 pairwise sums
    131     subs        r10, #0x10                  @2nt - 16: z is set when nt == 8, consumed by the beq below
    132 
    133     vld2.s8     {d26,d27}, [r8]!            @16 bytes from &src[4nt+2], de-interleaved; r8 += 16
    134 
    135     vpaddl.u8   d3, d31
    136     vpaddl.u16  d2, d2                      @4 x u16 -> 2 x u32
    137     vpaddl.u16  d3, d3
    138 
    139     vpadal.u32  d17, d2                     @d17 += sum of both u32 lanes
    140 
    141     vpadal.u32  d18, d3                     @d18 += sum of both u32 lanes
    142 
    143     vpaddl.u8   d2, d26                     @same reduction for the second run
    144     vpaddl.u8   d3, d27
    145 
    146     vpaddl.u16  d2, d2
    147     vpaddl.u16  d3, d3
    148 
    149     vpadal.u32  d17, d2
    150     vpadal.u32  d18, d3
    151 
    152     beq         epil_add_loop               @nt == 8: both 16-byte runs are fully summed
    153 @ next 16 bytes of each run; there is no branch back here, so this runs at most once
    154 core_loop_add:
    155     vld2.s8     {d30,d31}, [r6]!            @next 16 bytes of the first run
    156     vpaddl.u8   d28, d30
    157     vpaddl.u8   d3, d31
    158 
    159     vld2.s8     {d26,d27}, [r8]!            @next 16 bytes of the second run
    160 
    161     vpaddl.u16  d3, d3
    162     vpaddl.u16  d29, d28
    163 
    164     vpadal.u32  d18, d3                     @second component
    165     vpadal.u32  d17, d29                    @first component
    166 
    167     vpaddl.u8   d3, d27
    168     vpaddl.u8   d28, d26
    169 
    170     vpaddl.u16  d3, d3
    171     vpaddl.u16  d29, d28
    172 
    173     vpadal.u32  d18, d3
    174     vpadal.u32  d17, d29
    175 
    176 @ turn the two sums into rounded means and broadcast them
    177 epil_add_loop:
    178 
    179     vmov.32     r1,d18[0]                   @r1 = second-component sum (src_strd in r1 is overwritten)
    180     vmov.32     r11,d17[0]                  @r11 = first-component sum
    181 
    182     add         r1,r1,r4                    @+ nt: rounding term
    183     add         r11,r11,r4
    184 
    185     lsr         r1,r1,r12                   @(sum + nt) >> (32 - clz(nt))
    186     lsr         r11,r11,r12
    187 
    188     vdup.8      d17,r1                      @d17 = second-component mean in all 8 bytes
    189     vdup.8      d16,r11                     @d16 = first-component mean in all 8 bytes
    190 @ each vst2.8 {d16,d17} below writes 16 bytes: d16[i], d17[i] alternating
    191 prologue_cpy_32:
    192 
    193     add         r5, r2, r3                  @r5 = dst row 1
    194     subs        r9, r4, #8                  @z set when nt == 8; flags stay live through the movne below
    195     lsl         r6, r3, #2                  @r6 = 4 * dst_strd
    196     moveq       r11,r6                      @nt == 8: epilogue step is 4 rows
    197     add         r8, r5, r3                  @r8 = dst row 2
    198     add         r10, r8, r3                 @r10 = dst row 3
    199 
    200     beq         epilogue_copy               @nt == 8: 16-byte rows, the epilogue stores cover all 8
    201 
    202     vst2.8      {d16,d17}, [r2]!            @row 0, bytes 0-15; r2 += 16
    203     add         r6, r6, #0xfffffff0         @r6 = 4*dst_strd - 16 (adds -16): from mid-row to the row 4 below
    204 
    205     vst2.8      {d16,d17}, [r5]!            @row 1, bytes 0-15
    206     vst2.8      {d16,d17}, [r8]!            @row 2, bytes 0-15
    207     movne       r11,#16                     @nt != 8: epilogue step is 16 bytes, the second half of a row
    208     vst2.8      {d16,d17}, [r10]!           @row 3, bytes 0-15
    209 
    210 @ rows 0-3, bytes 16-31; each pointer then lands on the start of the row 4 below
    211     vst2.8      {d16,d17}, [r2], r6
    212     vst2.8      {d16,d17}, [r5], r6
    213     vst2.8      {d16,d17}, [r8], r6
    214     vst2.8      {d16,d17}, [r10], r6
    215 @ rows 4-7, then rows 8-11: two 16-byte stores per row, same stepping as above
    216 kernel_copy:
    217     vst2.8      {d16,d17}, [r2]!
    218     vst2.8      {d16,d17}, [r5]!
    219     vst2.8      {d16,d17}, [r8]!
    220     vst2.8      {d16,d17}, [r10]!
    221 
    222     vst2.8      {d16,d17}, [r2], r6
    223     vst2.8      {d16,d17}, [r5], r6
    224     vst2.8      {d16,d17}, [r8], r6
    225     vst2.8      {d16,d17}, [r10], r6
    226 
    227     vst2.8      {d16,d17}, [r2]!
    228     vst2.8      {d16,d17}, [r5]!
    229     vst2.8      {d16,d17}, [r8]!
    230     vst2.8      {d16,d17}, [r10]!
    231 
    232     vst2.8      {d16,d17}, [r2], r6
    233     vst2.8      {d16,d17}, [r5], r6
    234     vst2.8      {d16,d17}, [r8], r6
    235     vst2.8      {d16,d17}, [r10], r6
    236 @ nt == 8: rows 0-3, then rows 4-7. otherwise: first and second halves of the last four rows
    237 epilogue_copy:
    238     vst2.8      {d16,d17}, [r2],r11         @r11 = 4*dst_strd when nt == 8, else 16
    239     vst2.8      {d16,d17}, [r5],r11
    240     vst2.8      {d16,d17}, [r8],r11
    241     vst2.8      {d16,d17}, [r10],r11
    242 
    243     vst2.8      {d16,d17}, [r2]
    244     vst2.8      {d16,d17}, [r5]
    245     vst2.8      {d16,d17}, [r8]
    246     vst2.8      {d16,d17}, [r10]
    247     b           end_func
    248 @ nt == 4: sum 4 + 4 bytes per component, write 4 rows of 8 bytes
    249 dc_4:
    250     vld2.s8     {d30,d31},[r6]              @16 bytes from &src[2nt], de-interleaved; only bytes 0-3 of each register are summed
    251     vshl.i64    d3,d30,#32                  @drop bytes 4-7 of d30: bytes 0-3 move up, low half becomes zero
    252 
    253     vld2.s8     {d26,d27},[r8]              @16 bytes from &src[4nt+2], de-interleaved
    254     vshl.i64    d2,d31,#32                  @same masking for the second component
    255 
    256     vpaddl.u8   d3,d3                       @reduce to u16, then u32 lanes
    257     vpaddl.u8   d2,d2
    258     vpaddl.u16  d3,d3
    259     vpaddl.u16  d2,d2
    260     vpadal.u32  d17,d3                      @d17 += first-component bytes
    261     vpadal.u32  d18,d2                      @d18 += second-component bytes
    262 
    263     vshl.i64    d3,d26,#32                  @second run: keep bytes 0-3 of each register
    264     vshl.i64    d2,d27,#32
    265     vpaddl.u8   d3,d3
    266     vpaddl.u8   d2,d2
    267     vpaddl.u16  d3,d3
    268     vpaddl.u16  d2,d2
    269     vpadal.u32  d17,d3
    270     vpadal.u32  d18,d2
    271 
    272     vmov.32     r10,d17[0]                  @r10 = first-component sum
    273     vmov.32     r11,d18[0]                  @r11 = second-component sum
    274 
    275     add         r10,r10,r4                  @+ nt: rounding term
    276     add         r11,r11,r4
    277     lsr         r10,r10,r12                 @(sum + nt) >> (32 - clz(nt))
    278     lsr         r11,r11,r12
    279     orr         r10,r10,r11,lsl #8          @r10 = first mean | second mean << 8: one interleaved pair
    280     vdup.16     d0,r10                      @d0 = that pair repeated 4 times, one 8-byte row
    281 
    282     vst1.8      {d0},[r2],r3                @row 0; r2 += dst_strd
    283     vst1.8      {d0},[r2],r3                @row 1
    284     vst1.8      {d0},[r2],r3                @row 2
    285     vst1.8      {d0},[r2]                   @row 3
    286 
    287 end_func:
    288     ldmfd       sp!,{r4-r12,r15}            @pop r4-r12 and load the saved lr into pc (returns)
    289 
    290 
    291 
    292 
    293