Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 @**
     21 @******************************************************************************
     22 @* @file
     23 @*  ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
     24 @*
     25 @* @brief
     26 @*  Contains function definitions for inter prediction  interpolation.
     27 @*
     28 @* @author
     29 @*  Mohit
     30 @*
     31 @* @par List of Functions:
     32 @*
     33 @*  - ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q()
     34 @*
     35 @* @remarks
     36 @*  None
     37 @*
     38 @*******************************************************************************
     39 @*
     40 
     41 @* All the functions here are replicated from ih264_inter_pred_filters.c
     42 @
     43 
     44 @*******************************************************************************
     45 @*
     46 @* @brief
     47 @*   This function implements two six tap filters. It
     48 @*    applies the six tap filter in the horizontal direction on the
     49 @*    predictor values, then applies the same filter in the
     50 @*    vertical direction on the predictor values. It then averages these
     51 @*    two outputs to obtain quarter pel values in horizontal and vertical direction.
     52 @*    The six tap filtering operation is described in sec 8.4.2.2.1 titled
     53 @*    "Luma sample interpolation process"
     54 @*
     55 @* @par Description:
     56 @*    This function is called to obtain pixels lying at the following
     57 @*    location (1/4,1/4) or (3/4,1/4) or (1/4,3/4) or (3/4,3/4).
     58 @*    The function interpolates the predictors first in the horizontal direction
     59 @*    and then in the vertical direction, and then averages these two
     60 @*    values.
     61 @*
     62 @* @param[in] pu1_src
     63 @*  UWORD8 pointer to the source
     64 @*
     65 @* @param[out] pu1_dst
     66 @*  UWORD8 pointer to the destination
     67 @*
     68 @* @param[in] src_strd
     69 @*  integer source stride
     70 @*
     71 @* @param[in] dst_strd
     72 @*  integer destination stride
     73 @*
     74 @* @param[in] ht
     75 @*  integer height of the array
     76 @*
     77 @* @param[in] wd
     78 @*  integer width of the array
     79 @*
     80 @* @param[in] pu1_tmp: temporary buffer (never read by this implementation)
     81 @*
     82 @* @param[in] dydx: x and y reference offset for qpel calculations
     83 @*
     84 @* @returns
     85 @*
     86 @* @remarks
     87 @*  None
     88 @*
     89 @*******************************************************************************
     90 @*;
     91 
     92 @void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
     93 @                                UWORD8 *pu1_dst,
     94 @                                WORD32 src_strd,
     95 @                                WORD32 dst_strd,
     96 @                                WORD32 ht,
     97 @                                WORD32 wd,
     98 @                                UWORD8* pu1_tmp,
     99 @                                UWORD32 dydx)
    100 
    101 @**************Variables Vs Registers*****************************************
    102 @   r0 => *pu1_src
    103 @   r1 => *pu1_dst
    104 @   r2 =>  src_strd
    105 @   r3 =>  dst_strd
    106 @   r4 =>  ht
    107 @   r5 =>  wd
    108 @   r6 =>  dydx (reused after setup as pu1_pred_horz - 2)
    109 
    110 .text
    111 .p2align 2                          @ 2^2 = 4-byte alignment for the code that follows
    112 
    113     .global ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q
    114 
    115 ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q:
        @ Flow of the routine
        @   1. Save r4-r12, lr and d8-d15, then fetch ht, wd and dydx from the stack.
        @   2. Derive two source pointers from dydx:
        @        r7 = pu1_src + (x_offset >> 1) - 2 * src_strd   (input of the vertical filter)
        @        r6 = pu1_src + (y_offset >> 1) * src_strd - 2   (input of the horizontal filter)
        @   3. Dispatch on wd: 4 -> loop_4, 8 -> loop_8, anything else -> loop_16.
        @   4. Every loop iteration produces two output rows. For each row both a
        @      vertical and a horizontal 6-tap result are computed as
        @          a + f + 20 * (c + d) - 5 * (b + e)
        @      (a..f = six consecutive samples), narrowed with vqrshrun #5
        @      (rounding shift right by 5, saturated to unsigned 8 bit), and the
        @      two 8-bit results are combined with vrhadd (rounding average).
        @
        @ Register roles after setup
        @   r1  = destination pointer, post-incremented by dst_strd (r3) per stored row
        @   r4  = rows still to produce; decremented by 2 per iteration
        @   r6  = horizontal-filter source pointer, post-incremented by src_strd (r2)
        @   r7  = vertical-filter source pointer (top row of the 6-tap window)
        @   r11 = r6 + 8, source pointer for output columns 8..15 (loop_16 only)
        @   d30 = 20 in every byte lane, d31 = 5 in every byte lane
        @
        @ NOTE(review): the loops leave only when the row counter hits exactly zero
        @ (subs #2 / beq), so ht is assumed to be a positive even value - confirm
        @ against the callers.
    116 
    117     stmfd         sp!, {r4-r12, r14}    @ save r4-r12 and lr: 10 words = 40 bytes
    118     vstmdb        sp!, {d8-d15}         @ save d8-d15: 64 bytes, so stack arguments start at sp + 104
    119     ldr           r4, [sp, #104]        @ r4 = ht
    120     ldr           r5, [sp, #108]        @ r5 = wd
    121     ldr           r6, [sp, #116]        @ r6 = dydx ([sp, #112], the pu1_tmp slot, is never loaded)
    122     and           r7, r6, #3            @ r7 = dydx & 3 = x_offset
    123     add           r7, r0, r7, lsr #1    @pu1_pred_vert = pu1_src + (x_offset>>1)
    124 
    125     and           r6, r6, #12           @ keep bits 2-3 of dydx (y_offset, still shifted left by 2)
    126     lsr           r6, r6, #3            @ r6 = (dydx & 12) >> 3 = y_offset >> 1
    127     mul           r6, r2, r6            @ r6 = (y_offset >> 1) * src_strd
    128     add           r6, r0, r6            @pu1_pred_horz = pu1_src + (y_offset>>1)*src_strd
    129     sub           r7, r7, r2, lsl #1    @ r7 = pu1_pred_vert - 2*src_strd: top row of the vertical 6-tap window
    130     sub           r6, r6, #2            @ r6 = pu1_pred_horz - 2: leftmost column of the horizontal 6-tap window
    131     vmov.u8       d30, #20              @ d30 = 20 in every byte lane (weight of the two centre taps)
    132     vmov.u8       d31, #5               @ d31 = 5 in every byte lane (weight subtracted for taps 2 and 5)
    133 
    134     subs          r12, r5, #4           @ wd == 4 ? only the Z flag is consumed; r12 is a throwaway result
    135     beq           loop_4
    136     subs          r12, r5, #8           @ wd == 8 ? (same throwaway use of r12)
    137     beq           loop_8
        @ Any other wd falls through into the 16-pixel-wide path.
    138 
    139 loop_16:
        @ One iteration = two output rows of 16 pixels.
        @ Vertical input: q0..q5 are six consecutive source rows starting at r7
        @ (q6, the seventh row, is loaded later for the second output row).
        @ Low D halves (d0, d2, ...) cover columns 0..7, high halves columns 8..15.
    140     vld1.32       {q0}, [r7], r2        @ Vector load from src[0_0]
    141     vld1.32       {q1}, [r7], r2        @ Vector load from src[1_0]
    142     vld1.32       {q2}, [r7], r2        @ Vector load from src[2_0]
    143     vld1.32       {q3}, [r7], r2        @ Vector load from src[3_0]
    144     vld1.32       {q4}, [r7], r2        @ Vector load from src[4_0]
    145     add           r11, r6, #8           @ r11 = horizontal source for output columns 8..15
    146     vld1.32       {q5}, [r7], r2        @ Vector load from src[5_0]
    147     vld1.32       {q9}, [r6], r2        @ horz row0, col 0
        @ Vertical filter, output row 0, columns 0..7:
        @   q12 = d0 + d10 + 20 * (d4 + d6) - 5 * (d2 + d8)
    148     vaddl.u8      q12, d0, d10
    149     vmlal.u8      q12, d4, d30
    150     vmlal.u8      q12, d6, d30
    151     vmlsl.u8      q12, d2, d31
    152     vmlsl.u8      q12, d8, d31
        @ Horizontal taps: d18 already holds bytes 0..7 of the loaded 16 bytes;
        @ vext #k builds the 8-byte window starting at byte k. d19 is rewritten
        @ last because the other vext instructions still read its loaded value.
    153     vext.8        d23, d18, d19, #5
    154     vext.8        d20, d18, d19, #2
    155     vext.8        d21, d18, d19, #3
    156     vext.8        d22, d18, d19, #4
    157     vext.8        d19, d18, d19, #1
    158     vqrshrun.s16  d26, q12, #5          @ d26 = vertical result, row 0, columns 0..7
        @ Horizontal filter, output row 0, columns 0..7:
        @   q14 = d18 + d23 + 20 * (d20 + d21) - 5 * (d19 + d22)
    159     vaddl.u8      q14, d18, d23
    160     vmlal.u8      q14, d20, d30
    161     vmlal.u8      q14, d21, d30
    162     vmlsl.u8      q14, d19, d31
    163     vmlsl.u8      q14, d22, d31
    164     vld1.32       {q9}, [r11], r2       @ horz row 0, col 1
        @ Vertical filter, output row 0, columns 8..15 (high halves of q0..q5).
    165     vaddl.u8      q12, d1, d11
    166     vmlal.u8      q12, d5, d30
    167     vmlal.u8      q12, d7, d30
    168     vmlsl.u8      q12, d3, d31
    169     vmlsl.u8      q12, d9, d31
    170     vqrshrun.s16  d28, q14, #5          @ d28 = horizontal result, row 0, columns 0..7
    171     vext.8        d23, d18, d19, #5
    172     vext.8        d20, d18, d19, #2
    173     vext.8        d21, d18, d19, #3
    174     vext.8        d22, d18, d19, #4
    175     vext.8        d19, d18, d19, #1
    176     vqrshrun.s16  d27, q12, #5          @ d27 = vertical result, row 0, columns 8..15
    177     vld1.32       {q6}, [r7], r2        @ src[6_0]
    178 
        @ Horizontal filter, output row 0, columns 8..15.
    179     vaddl.u8      q12, d18, d23
    180     vmlal.u8      q12, d20, d30
    181     vmlal.u8      q12, d21, d30
    182     vmlsl.u8      q12, d19, d31
    183     vmlsl.u8      q12, d22, d31
    184 
        @ Vertical filter, output row 1, columns 0..7 (window moved down one row: q1..q6):
        @   q8 = d2 + d12 + 20 * (d6 + d8) - 5 * (d4 + d10)
    185     vaddl.u8      q8, d2, d12
    186     vmlal.u8      q8, d6, d30
    187     vmlal.u8      q8, d8, d30
    188     vmlsl.u8      q8, d4, d31
    189     vmlsl.u8      q8, d10, d31
    190     vqrshrun.s16  d29, q12, #5          @ d29 = horizontal result, row 0, columns 8..15
    191     vld1.32       {q9}, [r6], r2        @ horz row 1, col 0
    192 
        @ Vertical filter, output row 1, columns 8..15.
    193     vaddl.u8      q12, d3, d13
    194     vmlal.u8      q12, d7, d30
    195     vmlal.u8      q12, d9, d30
    196     vmlsl.u8      q12, d5, d31
    197     vmlsl.u8      q12, d11, d31
    198     vrhadd.u8     q14, q14, q13         @ row 0: rounding average of horizontal (q14) and vertical (q13) results
    199     vqrshrun.s16  d26, q8, #5           @ d26 = vertical result, row 1, columns 0..7 (q13 is free again)
    200     vext.8        d23, d18, d19, #5
    201     vext.8        d20, d18, d19, #2
    202     vext.8        d21, d18, d19, #3
    203     vext.8        d22, d18, d19, #4
    204     vst1.32       {q14}, [r1], r3       @ store row 0
    205     vext.8        d19, d18, d19, #1
    206     vqrshrun.s16  d27, q12, #5          @ d27 = vertical result, row 1, columns 8..15
    207 
        @ Horizontal filter, output row 1, columns 0..7.
    208     vaddl.u8      q14, d18, d23
    209     vmlal.u8      q14, d20, d30
    210     vmlal.u8      q14, d21, d30
    211     vmlsl.u8      q14, d19, d31
    212     vmlsl.u8      q14, d22, d31
    213 
    214     vld1.32       {q9}, [r11], r2       @ horz row 1, col 1
    215 
    216     vext.8        d23, d18, d19, #5
    217     vext.8        d20, d18, d19, #2
    218     vext.8        d21, d18, d19, #3
    219     vext.8        d22, d18, d19, #4
    220     vext.8        d19, d18, d19, #1
    221 
    222     vqrshrun.s16  d28, q14, #5          @ d28 = horizontal result, row 1, columns 0..7
        @ Horizontal filter, output row 1, columns 8..15.
    223     vaddl.u8      q12, d18, d23
    224     vmlal.u8      q12, d20, d30
    225     vmlal.u8      q12, d21, d30
    226     vmlsl.u8      q12, d19, d31
    227     vmlsl.u8      q12, d22, d31
    228 
    229     vqrshrun.s16  d29, q12, #5          @ d29 = horizontal result, row 1, columns 8..15
    230     vrhadd.u8     q14, q14, q13         @ row 1: rounding average of horizontal and vertical results
    231     vst1.32       {q14}, [r1], r3       @ store row 1
    232 
    233     subs          r4, r4, #2            @ 2 rows processed, decrement by 2
    234     subne         r7, r7 , r2, lsl #2   @ rows remain: rewind r7 by 4 rows ...
    235     subne         r7, r7, r2            @ ... plus 1 more; 7 rows were loaded, so the net advance is 2 rows
    236     beq           end_func              @ row counter reached zero: all rows written
    237 
    238     b             loop_16               @ otherwise produce the next two rows
    239 
    240 
    241 loop_8:
        @ One iteration = two output rows of 8 pixels.
        @ d0..d5 are six consecutive source rows starting at r7; d6 (seventh row)
        @ is loaded later for the second output row.
    242     vld1.32       d0, [r7], r2          @ Vector load from src[0_0]
    243     vld1.32       d1, [r7], r2          @ Vector load from src[1_0]
    244     vld1.32       d2, [r7], r2          @ Vector load from src[2_0]
    245     vld1.32       d3, [r7], r2          @ Vector load from src[3_0]
    246     vld1.32       d4, [r7], r2          @ Vector load from src[4_0]
    247     vld1.32       d5, [r7], r2          @ Vector load from src[5_0]
        @ Vertical filter, output row 0: q5 = d0 + d5 + 20 * (d2 + d3) - 5 * (d1 + d4)
    248     vaddl.u8      q5, d0, d5
    249     vmlal.u8      q5, d2, d30
    250     vmlal.u8      q5, d3, d30
    251     vmlsl.u8      q5, d1, d31
    252     vmlsl.u8      q5, d4, d31
    253     vld1.32       {q6}, [r6], r2        @horz row 0
        @ Horizontal taps from the 16 loaded bytes: d12 = bytes 0..7, vext #k = window
        @ starting at byte k (bytes 0..12 are consumed in total). d13 is rewritten
        @ last because the other vext instructions still read its loaded value.
    254     vext.8        d17, d12, d13, #5
    255     vext.8        d14, d12, d13, #2
    256     vext.8        d15, d12, d13, #3
    257     vext.8        d16, d12, d13, #4
    258     vext.8        d13, d12, d13, #1
    259     vqrshrun.s16  d26, q5, #5           @ d26 = vertical result, row 0
    260     vld1.32       d6, [r7], r2          @ src[6_0]
        @ Horizontal filter, output row 0: q5 = d12 + d17 + 20 * (d14 + d15) - 5 * (d13 + d16)
    261     vaddl.u8      q5, d12, d17
    262     vmlal.u8      q5, d14, d30
    263     vmlal.u8      q5, d15, d30
    264     vmlsl.u8      q5, d13, d31
    265     vmlsl.u8      q5, d16, d31
    266     vld1.32       {q6}, [r6], r2        @ horz row 1
        @ Vertical filter, output row 1 (window moved down one row: d1..d6).
    267     vaddl.u8      q9, d1, d6
    268     vmlal.u8      q9, d3, d30
    269     vmlal.u8      q9, d4, d30
    270     vmlsl.u8      q9, d2, d31
    271     vmlsl.u8      q9, d5, d31
    272     vqrshrun.s16  d28, q5, #5           @ d28 = horizontal result, row 0
    273     vext.8        d17, d12, d13, #5
    274     vext.8        d14, d12, d13, #2
    275     vext.8        d15, d12, d13, #3
    276     vext.8        d16, d12, d13, #4
    277     vext.8        d13, d12, d13, #1
    278     vqrshrun.s16  d27, q9, #5           @ d27 = vertical result, row 1
        @ Horizontal filter, output row 1.
    279     vaddl.u8      q5, d12, d17
    280     vmlal.u8      q5, d14, d30
    281     vmlal.u8      q5, d15, d30
    282     vmlsl.u8      q5, d13, d31
    283     vmlsl.u8      q5, d16, d31
    284     vqrshrun.s16  d29, q5, #5           @ d29 = horizontal result, row 1
    285     vrhadd.u8     q13, q13, q14         @ d26/d27 = rounding average of vertical (q13) and horizontal (q14) rows 0/1
    286     vst1.32       d26, [r1], r3         @ store row 0 (8 bytes)
    287     vst1.32       d27, [r1], r3         @ store row 1 (8 bytes)
    288 
    289     subs          r4, r4, #2            @ 2 rows processed, decrement by 2
    290     subne         r7, r7 , r2, lsl #2   @ rows remain: rewind r7 by 4 rows ...
    291     subne         r7, r7, r2            @ ... plus 1 more; 7 rows were loaded, so the net advance is 2 rows
    292     beq           end_func              @ row counter reached zero: all rows written
    293     b             loop_8                @ otherwise produce the next two rows
    294 
    295 loop_4:
        @ One iteration = two output rows of 4 pixels. Same register layout as
        @ loop_8, but the vertical input is loaded into 32-bit lane 0 only; the
        @ other lane keeps whatever it held before. The arithmetic still runs on
        @ all 8 byte lanes, and only lane 0 (4 bytes) of each result is stored.
    296     vld1.32       d0[0], [r7], r2       @ Vector load from src[0_0]
    297     vld1.32       d1[0], [r7], r2       @ Vector load from src[1_0]
    298     vld1.32       d2[0], [r7], r2       @ Vector load from src[2_0]
    299     vld1.32       d3[0], [r7], r2       @ Vector load from src[3_0]
    300     vld1.32       d4[0], [r7], r2       @ Vector load from src[4_0]
    301     vld1.32       d5[0], [r7], r2       @ Vector load from src[5_0]
        @ Vertical filter, output row 0: q5 = d0 + d5 + 20 * (d2 + d3) - 5 * (d1 + d4)
    302     vaddl.u8      q5, d0, d5
    303     vmlal.u8      q5, d2, d30
    304     vmlal.u8      q5, d3, d30
    305     vmlsl.u8      q5, d1, d31
    306     vmlsl.u8      q5, d4, d31
    307     vld1.32       {q6}, [r6], r2        @load for horz filter row 0
        @ Horizontal taps: byte-shifted windows of the loaded 16 bytes; d13 is
        @ rewritten last because the other vext instructions still read it.
    308     vext.8        d17, d12, d13, #5
    309     vext.8        d14, d12, d13, #2
    310     vext.8        d15, d12, d13, #3
    311     vext.8        d16, d12, d13, #4
    312     vext.8        d13, d12, d13, #1
    313     vqrshrun.s16  d26, q5, #5           @ d26 = vertical result, row 0
    314     vld1.32       d6[0], [r7], r2       @ Vector load from src[6_0]
        @ Horizontal filter, output row 0.
    315     vaddl.u8      q5, d12, d17
    316     vmlal.u8      q5, d14, d30
    317     vmlal.u8      q5, d15, d30
    318     vmlsl.u8      q5, d13, d31
    319     vmlsl.u8      q5, d16, d31
    320     vld1.32       {q6}, [r6], r2        @horz row 1
        @ Vertical filter, output row 1 (window moved down one row: d1..d6).
    321     vaddl.u8      q9, d1, d6
    322     vmlal.u8      q9, d3, d30
    323     vmlal.u8      q9, d4, d30
    324     vmlsl.u8      q9, d2, d31
    325     vmlsl.u8      q9, d5, d31
    326     vqrshrun.s16  d28, q5, #5           @ d28 = horizontal result, row 0
    327     vext.8        d17, d12, d13, #5
    328     vext.8        d14, d12, d13, #2
    329     vext.8        d15, d12, d13, #3
    330     vext.8        d16, d12, d13, #4
    331     vext.8        d13, d12, d13, #1
    332     vqrshrun.s16  d27, q9, #5           @ d27 = vertical result, row 1
        @ Horizontal filter, output row 1.
    333     vaddl.u8      q5, d12, d17
    334     vmlal.u8      q5, d14, d30
    335     vmlal.u8      q5, d15, d30
    336     vmlsl.u8      q5, d13, d31
    337     vmlsl.u8      q5, d16, d31
    338     vqrshrun.s16  d29, q5, #5           @ d29 = horizontal result, row 1
    339     vrhadd.u8     q13, q13, q14         @ d26/d27 = rounding average of vertical and horizontal rows 0/1
    340     vst1.32       d26[0], [r1], r3      @ store row 0 (lane 0 only: 4 bytes)
    341     vst1.32       d27[0], [r1], r3      @ store row 1 (lane 0 only: 4 bytes)
    342 
    343     subs          r4, r4, #2            @ 2 rows processed, decrement by 2
    344     subne         r7, r7 , r2, lsl #2   @ rows remain: rewind r7 by 4 rows ...
    345     subne         r7, r7, r2            @ ... plus 1 more; 7 rows were loaded, so the net advance is 2 rows
    346     beq           end_func              @ row counter reached zero: all rows written
    347     b             loop_4                @ otherwise produce the next two rows
    348 end_func:
    349     vldmia        sp!, {d8-d15}         @ Restore neon registers that were saved
    350     ldmfd         sp!, {r4-r12, pc}     @ restore r4-r12 and return by loading the saved lr into pc
    351 
    352 
    353