Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_intra_pred_chroma_dc_neon.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for intra prediction dc filtering.
     25 @* functions are coded using neon assembly instructions and can be compiled using
     26 
     27 @* rvct
     28 @*
     29 @* @author
     30 @*  yogeswaran rs
     31 @*
     32 @* @par list of functions:
     33 @*
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
     43 @* @brief
     44 @*    chroma intra prediction filter for dc input
     45 @*
     46 @* @par description:
     47 @*
     48 @* @param[in] pu1_ref
     49 @*  uword8 pointer to the source
     50 @*
     51 @* @param[out] pu1_dst
     52 @*  uword8 pointer to the destination
     53 @*
     54 @* @param[in] src_strd
     55 @*  integer source stride
     56 @*
     57 @* @param[in] dst_strd
     58 @*  integer destination stride
     59 @*
     60 @* @param[in] pi1_coeff
     61 @*  word8 pointer to the planar coefficients
     62 @*
     63 @* @param[in] nt
     64 @*  size of transform block
     65 @*
     66 @* @param[in] mode
     67 @*  type of filtering
     68 @*
     69 @* @returns
     70 @*
     71 @* @remarks
     72 @*  none
     73 @*
     74 @*******************************************************************************
     75 @*/
     76 
     77 @void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
     78 @                                word32 src_strd,
     79 @                                uword8 *pu1_dst,
     80 @                                word32 dst_strd,
     81 @                                word32 nt,
     82 @                                word32 mode)
     83 @
     84 @**************variables vs registers*****************************************
     85 @r0 => *pu1_ref
     86 @r1 => src_strd
     87 @r2 => *pu1_dst
     88 @r3 => dst_strd
     89 
     90 @stack contents from #40
     91 @   nt
     92 @   mode
     93 @   pi1_coeff
     94 
     95 .equ    nt_offset,      40                  @ byte offset of nt from sp after the 10-register push below (10 * 4)
     96 
     97 .text
     98 .align 4
     99 @ ihevc_intra_pred_chroma_dc_a9q: fills an nt x nt block of byte-interleaved two-component samples at pu1_dst (r2, row pitch r3) with one dc value per component.
    100 @ each dc = (sum of the nt samples of that component at &pu1_ref[2nt] and the nt at &pu1_ref[4nt+2] + nt) >> (32 - clz(nt)).
    101 @ paths: nt == 4 (dc_4), nt == 8 (one 16-byte load per side, 8 rows of 16 bytes), any other nt is handled as nt == 16 (two loads per side, 16 rows of 32 bytes).
    102 @ src_strd (r1) is not used as a stride (r1 is reused as scratch when nt != 4) and mode is never read from the stack.
    103 .globl ihevc_intra_pred_chroma_dc_a9q
    104 
    105 .type ihevc_intra_pred_chroma_dc_a9q, %function
    106 
    107 ihevc_intra_pred_chroma_dc_a9q:
    108 
    109     stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr (10 words = 40 bytes)
    110 
    111     ldr         r4,[sp,#nt_offset]          @r4 = nt (read from just above the saved registers)
    112     mov         r9, #0                      @zero used to clear the accumulators
    113     vmov        d17, r9, r9                 @d17 = 0: accumulator for the even-byte component
    114 
    115     clz         r5, r4                      @r5 = number of leading zeros in nt
    116 
    117     add         r6, r0, r4,lsl #1           @r6 = &src[2nt]
    118     vmov        d18, r9, r9                 @d18 = 0: accumulator for the odd-byte component
    119     rsb         r5, r5, #32                 @r5 = 32 - clz(nt), i.e. log2(nt) + 1 when nt is a power of two
    120     add         r7, r0, r4, lsl #2          @r7 = &src[4nt]
    121     mov         r12,r5                      @r12 = shift amount for the final division
    122     add         r8, r7, #2                  @r8 = &src[4nt+2]
    123 
    124     cmp         r4, #4
    125     beq         dc_4                        @nt == 4 takes the dedicated path
    126 
    127 @ nt != 4: each step consumes 16 bytes from r6 and 16 bytes from r8
    128 add_loop:
    129     vld2.s8     {d30,d31}, [r6]!            @load 16 bytes from &src[2nt]: even bytes -> d30, odd bytes -> d31
    130     lsl         r10,r4,#1                   @2nt
    131 
    132     vpaddl.u8   d2, d30                     @even-byte samples: pairwise add to 16 bit
    133     subs        r10, #0x10                  @2nt - 16: Z set when nt == 8 (flags consumed by the beq below)
    134 
    135     vld2.s8     {d26,d27}, [r8]!            @load 16 bytes from &src[4nt+2]: even bytes -> d26, odd bytes -> d27
    136 
    137     vpaddl.u8   d3, d31                     @odd-byte samples: pairwise add to 16 bit
    138     vpaddl.u16  d2, d2                      @widen partial sums to 32 bit
    139     vpaddl.u16  d3, d3
    140 
    141     vpadal.u32  d17, d2                     @fold both 32-bit partial sums into the even accumulator
    142 
    143     vpadal.u32  d18, d3                     @fold both 32-bit partial sums into the odd accumulator
    144 
    145     vpaddl.u8   d2, d26                     @same reduction for the bytes loaded through r8
    146     vpaddl.u8   d3, d27
    147 
    148     vpaddl.u16  d2, d2
    149     vpaddl.u16  d3, d3
    150 
    151     vpadal.u32  d17, d2
    152     vpadal.u32  d18, d3
    153 
    154     beq         epil_add_loop               @nt == 8: both 16-byte runs are fully summed
    155 @ not a loop: executed once for the second 16 bytes on each side, then falls through
    156 core_loop_add:
    157     vld2.s8     {d30,d31}, [r6]!            @next 16 bytes after &src[2nt] (r6 was post-incremented)
    158     vpaddl.u8   d28, d30
    159     vpaddl.u8   d3, d31
    160 
    161     vld2.s8     {d26,d27}, [r8]!            @next 16 bytes after &src[4nt+2] (r8 was post-incremented)
    162 
    163     vpaddl.u16  d3, d3
    164     vpaddl.u16  d29, d28
    165 
    166     vpadal.u32  d18, d3                     @odd-byte accumulator
    167     vpadal.u32  d17, d29                    @even-byte accumulator
    168 
    169     vpaddl.u8   d3, d27
    170     vpaddl.u8   d28, d26
    171 
    172     vpaddl.u16  d3, d3
    173     vpaddl.u16  d29, d28
    174 
    175     vpadal.u32  d18, d3
    176     vpadal.u32  d17, d29
    177 
    178 
    179 epil_add_loop:
    180 
    181     vmov.32     r1,d18[0]                   @r1 = odd-byte sum (low word of d18)
    182     vmov.32     r11,d17[0]                  @r11 = even-byte sum (low word of d17)
    183 
    184     add         r1,r1,r4                    @add nt before the shift
    185     add         r11,r11,r4
    186 
    187     lsr         r1,r1,r12                   @odd-byte dc
    188     lsr         r11,r11,r12                 @even-byte dc
    189 
    190     vdup.8      d17,r1                      @d17 = odd-byte dc in every lane
    191     vdup.8      d16,r11                     @d16 = even-byte dc in every lane
    192 
    193 prologue_cpy_32:
    194 
    195     add         r5, r2, r3                  @r5 = dst row 1
    196     subs        r9, r4, #8                  @nt - 8: Z set when nt == 8 (flags used by moveq/beq/movne below)
    197     lsl         r6, r3, #2                  @r6 = 4 * dst_strd
    198     moveq       r11,r6                      @nt == 8: epilogue step is 4 rows down
    199     add         r8, r5, r3                  @r8 = dst row 2
    200     add         r10, r8, r3                 @r10 = dst row 3
    201 
    202     beq         epilogue_copy               @nt == 8: rows are 16 bytes wide, the epilogue stores all 8 rows
    203 
    204     vst2.8      {d16,d17}, [r2]!            @rows 0-3, first 16 bytes (vst2 interleaves d16/d17; writeback +16)
    205     add         r6, r6, #0xfffffff0         @r6 = 4 * dst_strd - 16 (0xfffffff0 is -16 modulo 2^32)
    206 
    207     vst2.8      {d16,d17}, [r5]!
    208     vst2.8      {d16,d17}, [r8]!
    209     movne       r11,#16                     @flags still from the subs above: epilogue step is 16 bytes
    210     vst2.8      {d16,d17}, [r10]!
    211 
    212 
    213     vst2.8      {d16,d17}, [r2], r6         @rows 0-3, second 16 bytes; pointers move to the start of rows 4-7
    214     vst2.8      {d16,d17}, [r5], r6
    215     vst2.8      {d16,d17}, [r8], r6
    216     vst2.8      {d16,d17}, [r10], r6
    217 
    218 kernel_copy:
    219     vst2.8      {d16,d17}, [r2]!            @rows 4-7, first 16 bytes
    220     vst2.8      {d16,d17}, [r5]!
    221     vst2.8      {d16,d17}, [r8]!
    222     vst2.8      {d16,d17}, [r10]!
    223 
    224     vst2.8      {d16,d17}, [r2], r6         @rows 4-7, second 16 bytes; pointers move to rows 8-11
    225     vst2.8      {d16,d17}, [r5], r6
    226     vst2.8      {d16,d17}, [r8], r6
    227     vst2.8      {d16,d17}, [r10], r6
    228 
    229     vst2.8      {d16,d17}, [r2]!            @rows 8-11, first 16 bytes
    230     vst2.8      {d16,d17}, [r5]!
    231     vst2.8      {d16,d17}, [r8]!
    232     vst2.8      {d16,d17}, [r10]!
    233 
    234     vst2.8      {d16,d17}, [r2], r6         @rows 8-11, second 16 bytes; pointers move to rows 12-15
    235     vst2.8      {d16,d17}, [r5], r6
    236     vst2.8      {d16,d17}, [r8], r6
    237     vst2.8      {d16,d17}, [r10], r6
    238 
    239 epilogue_copy:
    240     vst2.8      {d16,d17}, [r2],r11         @nt == 8: rows 0-3 (r11 = 4 * dst_strd); otherwise rows 12-15, first 16 bytes (r11 = 16)
    241     vst2.8      {d16,d17}, [r5],r11
    242     vst2.8      {d16,d17}, [r8],r11
    243     vst2.8      {d16,d17}, [r10],r11
    244 
    245     vst2.8      {d16,d17}, [r2]             @nt == 8: rows 4-7; otherwise rows 12-15, second 16 bytes
    246     vst2.8      {d16,d17}, [r5]
    247     vst2.8      {d16,d17}, [r8]
    248     vst2.8      {d16,d17}, [r10]
    249     b           end_func
    250 
    251 dc_4:
    252     vld2.s8     {d30,d31},[r6]              @load 16 bytes from &src[2nt]; only the low 4 bytes of d30/d31 are summed
    253     vshl.i64    d3,d30,#32                  @keep the low 4 even-byte samples (moved to the top half, rest zeroed)
    254 
    255     vld2.s8     {d26,d27},[r8]              @load 16 bytes from &src[4nt+2]; only the low 4 bytes of d26/d27 are summed
    256     vshl.i64    d2,d31,#32                  @keep the low 4 odd-byte samples
    257 
    258     vpaddl.u8   d3,d3                       @reduce the 4 kept samples: 8 -> 16 -> 32 bit
    259     vpaddl.u8   d2,d2
    260     vpaddl.u16  d3,d3
    261     vpaddl.u16  d2,d2
    262     vpadal.u32  d17,d3                      @even-byte accumulator
    263     vpadal.u32  d18,d2                      @odd-byte accumulator
    264 
    265     vshl.i64    d3,d26,#32                  @same treatment for the bytes loaded through r8
    266     vshl.i64    d2,d27,#32
    267     vpaddl.u8   d3,d3
    268     vpaddl.u8   d2,d2
    269     vpaddl.u16  d3,d3
    270     vpaddl.u16  d2,d2
    271     vpadal.u32  d17,d3
    272     vpadal.u32  d18,d2
    273 
    274     vmov.32     r10,d17[0]                  @r10 = even-byte sum
    275     vmov.32     r11,d18[0]                  @r11 = odd-byte sum
    276 
    277     add         r10,r10,r4                  @add nt before the shift
    278     add         r11,r11,r4
    279     lsr         r10,r10,r12                 @even-byte dc
    280     lsr         r11,r11,r12                 @odd-byte dc
    281     orr         r10,r10,r11,lsl #8          @r10 = even dc | (odd dc << 8)
    282     vdup.16     d0,r10                      @replicate the 16-bit pair across d0 (8 bytes)
    283 
    284     vst1.8      {d0},[r2],r3                @4 rows of 8 bytes, stepping by dst_strd
    285     vst1.8      {d0},[r2],r3
    286     vst1.8      {d0},[r2],r3
    287     vst1.8      {d0},[r2]
    288 
    289 end_func:
    290     ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return by loading the saved lr into pc
    291 
    292 
    293 
    294 
    295