@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_horz_qpel_a9q.s
@*
@* @brief
@*  Contains the function definition for luma inter prediction with
@*  horizontal quarter-pel interpolation.
@*
@* @author
@*  Mohit
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_horz_qpel_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@*******************************************************************************
@*
@* @brief
@*     Quarter pel interprediction luma filter for horizontal input
@*
@* @par Description:
@* Applies a 6-tap horizontal filter and clips the result to 8 bits. The
@* filtered (half-pel) value is then averaged, with rounding, with the source
@* sample at pu1_src + (x_offset >> 1), where x_offset = dydx & 3, to give
@* the quarter-pel sample.
@* See sec 8.4.2.2.1 titled "Luma sample interpolation process"
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @param[in] pu1_tmp
@*  temporary buffer: UNUSED in this function
@*
@* @param[in] dydx
@*  x and y reference offset for qpel calculations; only the x offset
@*  (dydx & 3) is used by this function.
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@void ih264_inter_pred_luma_horz_qpel_a9q (
@                            UWORD8 *pu1_src,
@                            UWORD8 *pu1_dst,
@                            WORD32 src_strd,
@                            WORD32 dst_strd,
@                            WORD32 ht,
@                            WORD32 wd,
@                            UWORD8* pu1_tmp,
@                            UWORD32 dydx)

@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r5 =>  ht
@   r6 =>  wd
@   r7 =>  dydx, then pu1_src + (x_offset >> 1)

    .text
    .p2align 2

    .global ih264_inter_pred_luma_horz_qpel_a9q

@-----------------------------------------------------------------------------
@ void ih264_inter_pred_luma_horz_qpel_a9q(UWORD8 *pu1_src, UWORD8 *pu1_dst,
@                                          WORD32 src_strd, WORD32 dst_strd,
@                                          WORD32 ht, WORD32 wd,
@                                          UWORD8 *pu1_tmp, UWORD32 dydx)
@
@ Syntax : GNU as, ARM state, NEON.
@
@ In     : r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd
@          stack: ht, wd, pu1_tmp (not read), dydx
@
@ Work   : For every output pixel
@              half = clip8((a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 + 16) >> 5)
@              dst  = (half + ref + 1) >> 1
@          where a0..a5 are six consecutive source bytes starting at
@          pu1_src - 2 and ref is the source byte at pu1_src + (x_offset >> 1),
@          x_offset = dydx & 3.
@
@ Regs   : r5 = rows remaining, r6 = wd, r7 = pointer to the ref pixels,
@          d0 = 5 (x8), d1 = 20 (x8).
@          r4-r12 and d8-d15 are saved on entry and restored on exit.
@
@ Notes  : * wd == 8 and wd == 4 have dedicated loops; any other wd takes the
@            16-pixel-wide path.
@          * Two rows are produced per iteration, so ht is expected to be a
@            positive even number.  ht <= 0 returns without writing anything;
@            an odd ht is rounded up to the next even row count.
@          * Each source row is read wider than wd: 24 bytes (wd 16) or
@            16 bytes (wd 8 / 4) starting at pu1_src - 2, and 16 / 8 / 8 bytes
@            of ref pixels.  The caller's buffer must allow for this.
@-----------------------------------------------------------------------------

ih264_inter_pred_luma_horz_qpel_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save core registers      (10 * 4 = 40 bytes)
    vstmdb        sp!, {d8-d15}         @ save callee-saved NEON   ( 8 * 8 = 64 bytes)
    ldr           r5, [sp, #104]        @ r5 = ht    (first stack argument, above the 104 saved bytes)
    ldr           r6, [sp, #108]        @ r6 = wd
    ldr           r7, [sp, #116]        @ r7 = dydx  (pu1_tmp at #112 is never read)
    cmp           r5, #0                @ nothing to do for a non-positive height
    ble           end_func
    and           r7, r7, #3            @ x_offset = dydx & 3
    add           r7, r0, r7, lsr #1    @ r7 = pu1_src + (x_offset >> 1): pixels to average with
    sub           r0, r0, #2            @ r0 = pu1_src - 2: first tap of the 6-tap filter
    vmov.i8       d0, #5                @ filter coefficient 5
    vmov.i8       d1, #20               @ filter coefficient 20

    cmp           r6, #8                @ wd == 8 ?
    beq           loop_8

    cmp           r6, #4                @ wd == 4 ?
    beq           loop_4

loop_16:                                @ wd == 16: two rows of 16 pixels per iteration
    vld1.8        {d2, d3, d4}, [r0], r2 @ row0: 24 bytes starting at src - 2
    vext.8        d31, d2, d3, #5       @ a5                                     (cols 0-7,  row0)
    vld1.8        {d5, d6, d7}, [r0], r2 @ row1: 24 bytes starting at src - 2
    vext.8        d30, d3, d4, #5       @ a5                                     (cols 8-15, row0)
    vaddl.u8      q4, d31, d2           @ a0 + a5                                (cols 0-7,  row0)
    vext.8        d28, d5, d6, #5       @ a5                                     (cols 0-7,  row1)
    vaddl.u8      q5, d30, d3           @ a0 + a5                                (cols 8-15, row0)
    vext.8        d27, d6, d7, #5       @ a5                                     (cols 8-15, row1)
    vaddl.u8      q7, d28, d5           @ a0 + a5                                (cols 0-7,  row1)
    vext.8        d31, d2, d3, #2       @ a2                                     (cols 0-7,  row0)
    vaddl.u8      q8, d27, d6           @ a0 + a5                                (cols 8-15, row1)
    vext.8        d30, d3, d4, #2       @ a2                                     (cols 8-15, row0)
    vmlal.u8      q4, d31, d1           @ a0 + a5 + 20a2                         (cols 0-7,  row0)
    vext.8        d28, d5, d6, #2       @ a2                                     (cols 0-7,  row1)
    vmlal.u8      q5, d30, d1           @ a0 + a5 + 20a2                         (cols 8-15, row0)
    vext.8        d27, d6, d7, #2       @ a2                                     (cols 8-15, row1)
    vmlal.u8      q7, d28, d1           @ a0 + a5 + 20a2                         (cols 0-7,  row1)
    vext.8        d31, d2, d3, #3       @ a3                                     (cols 0-7,  row0)
    vmlal.u8      q8, d27, d1           @ a0 + a5 + 20a2                         (cols 8-15, row1)
    vext.8        d30, d3, d4, #3       @ a3                                     (cols 8-15, row0)
    vmlal.u8      q4, d31, d1           @ a0 + a5 + 20a2 + 20a3                  (cols 0-7,  row0)
    vext.8        d28, d5, d6, #3       @ a3                                     (cols 0-7,  row1)
    vmlal.u8      q5, d30, d1           @ a0 + a5 + 20a2 + 20a3                  (cols 8-15, row0)
    vext.8        d27, d6, d7, #3       @ a3                                     (cols 8-15, row1)
    vmlal.u8      q7, d28, d1           @ a0 + a5 + 20a2 + 20a3                  (cols 0-7,  row1)
    vext.8        d31, d2, d3, #1       @ a1                                     (cols 0-7,  row0)
    vmlal.u8      q8, d27, d1           @ a0 + a5 + 20a2 + 20a3                  (cols 8-15, row1)
    vext.8        d30, d3, d4, #1       @ a1                                     (cols 8-15, row0)
    vmlsl.u8      q4, d31, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1            (cols 0-7,  row0)
    vext.8        d28, d5, d6, #1       @ a1                                     (cols 0-7,  row1)
    vmlsl.u8      q5, d30, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1            (cols 8-15, row0)
    vext.8        d27, d6, d7, #1       @ a1                                     (cols 8-15, row1)
    vmlsl.u8      q7, d28, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1            (cols 0-7,  row1)
    vext.8        d31, d2, d3, #4       @ a4                                     (cols 0-7,  row0)
    vmlsl.u8      q8, d27, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1            (cols 8-15, row1)
    vext.8        d30, d3, d4, #4       @ a4                                     (cols 8-15, row0)
    vmlsl.u8      q4, d31, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4      (cols 0-7,  row0)
    vext.8        d28, d5, d6, #4       @ a4                                     (cols 0-7,  row1)
    vmlsl.u8      q5, d30, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4      (cols 8-15, row0)
    vext.8        d27, d6, d7, #4       @ a4                                     (cols 8-15, row1)
    vmlsl.u8      q7, d28, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4      (cols 0-7,  row1)
    vmlsl.u8      q8, d27, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4      (cols 8-15, row1)
    vld1.8        {d12, d13}, [r7], r2  @ q6 = ref pixels for row0 (byte elements: r7 is an arbitrary byte address)
    vqrshrun.s16  d20, q4, #5           @ clip8((sum + 16) >> 5)                 (cols 0-7,  row0)
    vqrshrun.s16  d21, q5, #5           @ clip8((sum + 16) >> 5)                 (cols 8-15, row0)
    vrhadd.u8     q10, q6, q10          @ qpel = (ref + half + 1) >> 1           (row0)
    vqrshrun.s16  d18, q7, #5           @ clip8((sum + 16) >> 5)                 (cols 0-7,  row1)
    vst1.8        {d20, d21}, [r1], r3  @ store dest row0
    vqrshrun.s16  d19, q8, #5           @ clip8((sum + 16) >> 5)                 (cols 8-15, row1)
    vld1.8        {d12, d13}, [r7], r2  @ q6 = ref pixels for row1
    vrhadd.u8     q9, q6, q9            @ qpel = (ref + half + 1) >> 1           (row1)
    vst1.8        {d18, d19}, [r1], r3  @ store dest row1
    subs          r5, r5, #2            @ two rows done
    bgt           loop_16               @ continue while rows remain (also terminates for odd ht)
    b             end_func

loop_8:                                 @ wd == 8: two rows of 8 pixels per iteration
    vld1.8        {d5, d6}, [r0], r2    @ row0: 16 bytes starting at src - 2
    vext.8        d28, d5, d6, #5       @ a5                                     (row0)
    vld1.8        {d2, d3}, [r0], r2    @ row1: 16 bytes starting at src - 2
    vext.8        d25, d5, d6, #2       @ a2                                     (row0)
    vext.8        d31, d2, d3, #5       @ a5                                     (row1)
    vext.8        d24, d5, d6, #3       @ a3                                     (row0)
    vext.8        d23, d5, d6, #1       @ a1                                     (row0)
    vext.8        d22, d5, d6, #4       @ a4                                     (row0)
    vaddl.u8      q7, d28, d5           @ a0 + a5                                (row0)
    vext.8        d29, d2, d3, #3       @ a3                                     (row1)
    vmlal.u8      q7, d25, d1           @ a0 + a5 + 20a2                         (row0)
    vmlal.u8      q7, d24, d1           @ a0 + a5 + 20a2 + 20a3                  (row0)
    vmlsl.u8      q7, d23, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1            (row0)
    vmlsl.u8      q7, d22, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4      (row0)
    vext.8        d30, d2, d3, #2       @ a2                                     (row1)
    vaddl.u8      q4, d31, d2           @ a0 + a5                                (row1)
    vext.8        d27, d2, d3, #1       @ a1                                     (row1)
    vext.8        d26, d2, d3, #4       @ a4                                     (row1)
    vmlal.u8      q4, d29, d1           @ a0 + a5 + 20a3                         (row1)
    vmlal.u8      q4, d30, d1           @ a0 + a5 + 20a3 + 20a2                  (row1)
    vmlsl.u8      q4, d27, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1            (row1)
    vmlsl.u8      q4, d26, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4      (row1)
    vqrshrun.s16  d18, q7, #5           @ clip8((sum + 16) >> 5)                 (row0)
    vld1.8        {d12}, [r7], r2       @ ref pixels for row0 (byte elements: r7 is an arbitrary byte address)
    vld1.8        {d13}, [r7], r2       @ ref pixels for row1
    vqrshrun.s16  d19, q4, #5           @ clip8((sum + 16) >> 5)                 (row1)
    vrhadd.u8     q9, q6, q9            @ qpel = (ref + half + 1) >> 1           (row0 in d18, row1 in d19)
    vst1.8        {d18}, [r1], r3       @ store dest row0
    vst1.8        {d19}, [r1], r3       @ store dest row1
    subs          r5, r5, #2            @ two rows done
    bgt           loop_8                @ continue while rows remain (also terminates for odd ht)
    b             end_func

loop_4:                                 @ wd == 4: two rows of 4 pixels per iteration
    vld1.8        {d5, d6}, [r0], r2    @ row0: 16 bytes starting at src - 2
    vext.8        d28, d5, d6, #5       @ a5                                     (row0)
    vld1.8        {d2, d3}, [r0], r2    @ row1: 16 bytes starting at src - 2
    vext.8        d25, d5, d6, #2       @ a2                                     (row0)
    vext.8        d31, d2, d3, #5       @ a5                                     (row1)
    vaddl.u8      q7, d28, d5           @ a0 + a5                                (row0)
    vext.8        d24, d5, d6, #3       @ a3                                     (row0)
    vext.8        d23, d5, d6, #1       @ a1                                     (row0)
    vext.8        d22, d5, d6, #4       @ a4                                     (row0)
    vext.8        d29, d2, d3, #3       @ a3                                     (row1)
    vmlal.u8      q7, d25, d1           @ a0 + a5 + 20a2                         (row0)
    vmlal.u8      q7, d24, d1           @ a0 + a5 + 20a2 + 20a3                  (row0)
    vmlsl.u8      q7, d23, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1            (row0)
    vmlsl.u8      q7, d22, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4      (row0)
    vaddl.u8      q4, d31, d2           @ a0 + a5                                (row1)
    vext.8        d30, d2, d3, #2       @ a2                                     (row1)
    vld1.8        {d12}, [r7], r2       @ ref pixels for row0 (byte elements: r7 is an arbitrary byte address)
    vld1.8        {d13}, [r7], r2       @ ref pixels for row1
    vext.8        d27, d2, d3, #1       @ a1                                     (row1)
    vext.8        d26, d2, d3, #4       @ a4                                     (row1)
    vmlal.u8      q4, d29, d1           @ a0 + a5 + 20a3                         (row1)
    vmlal.u8      q4, d30, d1           @ a0 + a5 + 20a3 + 20a2                  (row1)
    vmlsl.u8      q4, d27, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1            (row1)
    vmlsl.u8      q4, d26, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4      (row1)
    vqrshrun.s16  d18, q7, #5           @ clip8((sum + 16) >> 5)                 (row0)
    vqrshrun.s16  d19, q4, #5           @ clip8((sum + 16) >> 5)                 (row1)
    vrhadd.u8     q9, q6, q9            @ qpel = (ref + half + 1) >> 1           (row0 in d18, row1 in d19)
    vst1.32       {d18[0]}, [r1], r3    @ store the 4 valid bytes of dest row0
    vst1.32       {d19[0]}, [r1], r3    @ store the 4 valid bytes of dest row1

    subs          r5, r5, #2            @ two rows done
    bgt           loop_4                @ continue while rows remain; otherwise fall through

end_func:
    vldmia        sp!, {d8-d15}         @ restore callee-saved NEON registers
    ldmfd         sp!, {r4-r12, pc}     @ restore core registers and return
