Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 @**
     21 @******************************************************************************
     22 @* @file
     23 @*  ih264_inter_pred_chroma_a9q.s
     24 @*
     25 @* @brief
     26 @*  Contains function definitions for inter prediction  interpolation.
     27 @*
     28 @* @author
     29 @*  Ittiam
     30 @*
     31 @* @par List of Functions:
     32 @*
     33 @*  - ih264_inter_pred_chroma_a9q()
     34 @*
     35 @* @remarks
     36 @*  None
     37 @*
     38 @*******************************************************************************
     39 @*
     40 
     41 @* All the functions here are replicated from ih264_inter_pred_filters.c
     42 @
     43 
     44 @**
     45 @**
     46 @**
     47 @
     48 @**
     49 @*******************************************************************************
     50 @*
     51 @* @brief
     52 @*    Interprediction chroma filter
     53 @*
     54 @* @par Description:
     55 @*   Applies filtering to chroma samples as mentioned in
     56 @*    sec 8.4.2.2.2 titled "chroma sample interpolation process"
     57 @*
     58 @* @param[in] pu1_src
     59 @*  UWORD8 pointer to the source containing alternate U and V samples
     60 @*
     61 @* @param[out] pu1_dst
     62 @*  UWORD8 pointer to the destination
     63 @*
     64 @* @param[in] src_strd
     65 @*  integer source stride
     66 @*
     67 @* @param[in] dst_strd
     68 @*  integer destination stride
     69 @*
     70 @* @param[in] u1_dx
     71 @*  dx value where the sample is to be produced(refer sec 8.4.2.2.2 )
     72 @*
     73 @* @param[in] u1_dy
     74 @*  dy value where the sample is to be produced(refer sec 8.4.2.2.2 )
     75 @*
     76 @* @param[in] ht
     77 @*  integer height of the array
     78 @*
     79 @* @param[in] wd
     80 @*  integer width of the array
     81 @*
     82 @* @returns
     83 @*
     84 @* @remarks
     85 @*  None
     86 @*
     87 @*******************************************************************************
     88 @*
     89 
     90 @void ih264_inter_pred_chroma_a9q(UWORD8 *pu1_src,
     91 @                                 UWORD8 *pu1_dst,
     92 @                                 WORD32 src_strd,
     93 @                                 WORD32 dst_strd,
     94 @                                 UWORD8 u1_dx,
     95 @                                 UWORD8 u1_dy,
     96 @                                 WORD32 ht,
     97 @                                 WORD32 wd)
     98 @**************Variables Vs Registers*****************************************
     99 @   r0 => *pu1_src
    100 @   r1 => *pu1_dst
    101 @   r2 =>  src_strd
    102 @   r3 =>  dst_strd
    103 @   r4 =>  u1_dx
    104 @   r5 =>  u1_dy
    105 @   r6 =>  height
    106 @   r7 => width
    107 @
    108 .text
    109 .p2align 2                              @ align to 2^2 = 4 bytes
    110 
    111     .global ih264_inter_pred_chroma_a9q
    112 @ Entry: r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd; u1_dx, u1_dy, ht and wd are read from the stack.
    113 ih264_inter_pred_chroma_a9q:
    114 
    115     stmfd         sp!, {r4-r12, r14}    @ save r4-r12 and lr: 10 words = 40 bytes
    116     vstmdb        sp!, {d8-d15}         @ save NEON d8-d15: 8 x 8 = 64 bytes
    117     ldr           r4, [sp, #104]        @ r4 = u1_dx (first stack argument, 40 + 64 bytes above sp)
    118     ldr           r5, [sp, #108]        @ r5 = u1_dy
    119     ldr           r6, [sp, #112]        @ r6 = ht
    120     ldr           r7, [sp, #116]        @ r7 = wd
    121 @ Four bilinear weights; they sum to (8-dx+dx)*(8-dy+dy) = 64, which matches the rounding shift by 6 used below.
    122     rsb           r8, r4, #8            @ r8 = 8 - u1_dx
    123     rsb           r9, r5, #8            @ r9 = 8 - u1_dy
    124     mul           r10, r8, r9           @ r10 = (8 - dx) * (8 - dy)
    125     mul           r11, r4, r9           @ r11 = dx * (8 - dy)
    126 
    127     vdup.u8       d28, r10              @ d28 = weight of the upper-row sample at the same byte offset
    128     vdup.u8       d29, r11              @ d29 = weight of the upper-row sample 2 bytes to the right
    129 
    130     mul           r10, r8, r5           @ r10 = (8 - dx) * dy
    131     mul           r11, r4, r5           @ r11 = dx * dy
    132 
    133     vdup.u8       d30, r10              @ d30 = weight of the lower-row sample at the same byte offset
    134     vdup.u8       d31, r11              @ d31 = weight of the lower-row sample 2 bytes to the right
    135 
    136     subs          r12, r7, #2           @ wd == 2 ? (r12 is only a scratch destination for the compare)
    137     beq           loop_2                @ yes: 4 output bytes per row
    138     subs          r12, r7, #4           @ wd == 4 ?
    139     beq           loop_4                @ yes: 8 output bytes per row
    140 @ Any other wd falls through to loop_8, which writes 16 output bytes per row.
    141 loop_8:
    142     sub           r6, #1                @ r6 = ht - 1 passes of inner_loop_8; the last row is stored after the loop
    143     vld1.8        {d0, d1, d2}, [r0], r2 @ Load row0 (24 bytes read, bytes 0..17 used), r0 += src_strd
    144     vld1.8        {d5, d6, d7}, [r0], r2 @ Load row1 (24 bytes read, bytes 0..17 used), r0 += src_strd
    145     vext.8        d3, d0, d1, #2        @ d3 = row0 bytes 2..9  (right-hand neighbours of d0)
    146     vext.8        d8, d5, d6, #2        @ d8 = row1 bytes 2..9  (right-hand neighbours of d5)
    147 
    148     vmull.u8      q5, d0, d28           @ q5 = 16-bit weighted sum for output bytes 0..7
    149     vmlal.u8      q5, d5, d30
    150     vmlal.u8      q5, d3, d29
    151     vmlal.u8      q5, d8, d31
    152     vext.8        d9, d6, d7, #2        @ d9 = row1 bytes 10..17 (right-hand neighbours of d6)
    153     vext.8        d4, d1, d2, #2        @ d4 = row0 bytes 10..17 (right-hand neighbours of d1)
    154 @ NOTE(review): r6 is tested only after being decremented inside the loop, so this path assumes ht >= 2 -- confirm with callers.
    155 inner_loop_8:
    156     vmull.u8      q6, d6, d30           @ q6 = 16-bit weighted sum for output bytes 8..15
    157     vmlal.u8      q6, d1, d28
    158     vmlal.u8      q6, d9, d31
    159     vmlal.u8      q6, d4, d29
    160     vmov          d0, d5                @ lower row becomes the upper row of the next output row (bytes 0..7)
    161     vmov          d3, d8                @ ... and its 2-byte-shifted copy
    162 
    163     vqrshrun.s16  d14, q5, #6           @ d14 = saturate_u8((q5 + 32) >> 6): output bytes 0..7
    164     vmov          d1, d6                @ same hand-over for bytes 8..15
    165     vmov          d4, d9
    166 
    167     vld1.8        {d5, d6, d7}, [r0], r2 @ Load the next lower row, r0 += src_strd
    168     vqrshrun.s16  d15, q6, #6           @ d15 = saturate_u8((q6 + 32) >> 6): output bytes 8..15
    169 
    170     vext.8        d8, d5, d6, #2        @ d8 = new lower row bytes 2..9
    171     subs          r6, #1                @ count down rows; these flags feed the bne below (the NEON ops in between do not touch them)
    172     vext.8        d9, d6, d7, #2        @ d9 = new lower row bytes 10..17
    173     vst1.8        {q7}, [r1], r3        @ Store dest row (q7 = d14:d15, 16 bytes), r1 += dst_strd
    174 
    175     vmull.u8      q5, d0, d28           @ start bytes 0..7 of the next output row
    176     vmlal.u8      q5, d5, d30
    177     vmlal.u8      q5, d3, d29
    178     vmlal.u8      q5, d8, d31
    179     bne           inner_loop_8          @ loop while r6 != 0
    180 @ Loop exit: q5 already holds bytes 0..7 of the last row; compute bytes 8..15 and store it.
    181     vmull.u8      q6, d6, d30
    182     vmlal.u8      q6, d1, d28
    183     vmlal.u8      q6, d9, d31
    184     vmlal.u8      q6, d4, d29
    185 
    186     vqrshrun.s16  d14, q5, #6
    187     vqrshrun.s16  d15, q6, #6
    188 
    189     vst1.8        {q7}, [r1], r3        @ Store the last dest row
    190 
    191     b             end_func
    192 @ NOTE(review): like loop_8, this path tests r6 only after decrementing it in the loop, so it assumes ht >= 2 -- confirm with callers.
    193 loop_4:
    194     sub           r6, #1                @ r6 = ht - 1 passes of inner_loop_4; the last row is stored after the loop
    195     vld1.8        {d0, d1}, [r0], r2    @ Load row0 (16 bytes read, bytes 0..9 used), r0 += src_strd
    196     vld1.8        {d2, d3}, [r0], r2    @ Load row1 (16 bytes read, bytes 0..9 used), r0 += src_strd
    197     vext.8        d1, d0, d1, #2        @ d1 = row0 bytes 2..9 (overwrites the upper half of the load)
    198     vext.8        d3, d2, d3, #2        @ d3 = row1 bytes 2..9
    199 
    200     vmull.u8      q2, d2, d30           @ q2 = 16-bit weighted sum for output bytes 0..7
    201     vmlal.u8      q2, d0, d28
    202     vmlal.u8      q2, d3, d31
    203     vmlal.u8      q2, d1, d29
    204 
    205 inner_loop_4:
    206     subs          r6, #1                @ count down rows; these flags feed the bne at the bottom of the loop
    207     vmov          d0, d2                @ lower row becomes the upper row of the next output row
    208     vmov          d1, d3                @ ... and its 2-byte-shifted copy
    209 
    210     vld1.8        {d2, d3}, [r0], r2    @ Load the next lower row, r0 += src_strd
    211     vqrshrun.s16  d6, q2, #6            @ d6 = saturate_u8((q2 + 32) >> 6)
    212 
    213     vext.8        d3, d2, d3, #2        @ d3 = new lower row bytes 2..9
    214     vst1.8        {d6}, [r1], r3        @ Store dest row (8 bytes), r1 += dst_strd
    215 
    216     vmull.u8      q2, d0, d28           @ start the next output row
    217     vmlal.u8      q2, d2, d30
    218     vmlal.u8      q2, d1, d29
    219     vmlal.u8      q2, d3, d31
    220     bne           inner_loop_4          @ loop while r6 != 0
    221 
    222     vqrshrun.s16  d6, q2, #6            @ narrow the last row, computed in the final loop pass
    223     vst1.8        {d6}, [r1], r3        @ Store the last dest row
    224 
    225     b             end_func
    226 @ NOTE(review): each pass emits two rows and r6 is only tested for zero, so this path assumes an even, non-zero ht -- confirm with callers.
    227 loop_2:
    228     vld1.8        {d0}, [r0], r2        @ Load row0 (8 bytes read, bytes 0..5 used), r0 += src_strd
    229     vext.8        d1, d0, d0, #2        @ d1 = d0 rotated by 2 bytes; the stored lanes 0..3 see row0 bytes 2..5
    230     vld1.8        {d2}, [r0], r2        @ Load row1, r0 += src_strd
    231     vext.8        d3, d2, d2, #2        @ d3 = row1 rotated by 2 bytes
    232     vmull.u8      q2, d0, d28           @ q2 = weighted sum for output row0 (rows 0 and 1)
    233     vmlal.u8      q2, d1, d29
    234     vmlal.u8      q2, d2, d30
    235     vmlal.u8      q2, d3, d31
    236     vld1.8        {d6}, [r0]            @ Load row2 without advancing r0: it is reloaded as row0 on the next pass
    237     vqrshrun.s16  d4, q2, #6            @ d4 = saturate_u8((q2 + 32) >> 6)
    238     vext.8        d7, d6, d6, #2        @ d7 = row2 rotated by 2 bytes
    239     vst1.32       d4[0], [r1], r3       @ Store dest row0 (low 4 bytes only), r1 += dst_strd
    240     vmull.u8      q4, d2, d28           @ q4 = weighted sum for output row1 (rows 1 and 2)
    241     vmlal.u8      q4, d3, d29
    242     vmlal.u8      q4, d6, d30
    243     vmlal.u8      q4, d7, d31
    244     subs          r6, #2                @ two rows produced per pass
    245     vqrshrun.s16  d8, q4, #6            @ d8 = saturate_u8((q4 + 32) >> 6)
    246     vst1.32       d8[0], [r1], r3       @ Store dest row1 (low 4 bytes only), r1 += dst_strd
    247     bne           loop_2                @ repeat until r6 (rows remaining) reaches 0
    248 
    249 end_func:
    250     vldmia        sp!, {d8-d15}         @ Restore the NEON registers saved in the prologue
    251     ldmfd         sp!, {r4-r12, pc}     @ Restore r4-r12 and return by loading the saved lr into pc
    252 
    253