Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 @**
     21 @******************************************************************************
     22 @* @file
     23 @*  ih264_inter_pred_luma_vert_qpel_a9q.s
     24 @*
     25 @* @brief
     26 @*  Contains function definitions for inter prediction vertical quarter pel interpolation.
     27 @*
     28 @* @author
     29 @*  Mohit
     30 @*
     31 @* @par List of Functions:
     32 @*
     33 @*  - ih264_inter_pred_luma_vert_qpel_a9q()
     34 @*
     35 @* @remarks
     36 @*  None
     37 @*
     38 @*******************************************************************************
     39 @*
     40 
     41 @* All the functions here are replicated from ih264_inter_pred_filters.c
     42 @
     43 
     44 @*******************************************************************************
     45 @*
     46 @* @brief
     47 @*     Quarter pel interprediction luma filter for vertical input
     48 @*
     49 @* @par Description:
     50 @* Applies a 6 tap vertical filter. The output is clipped to 8 bits and then rounding-averaged with the source row selected by dydx
     51 @* sec 8.4.2.2.1 titled "Luma sample interpolation process"
     52 @*
     53 @* @param[in] pu1_src
     54 @*  UWORD8 pointer to the source
     55 @*
     56 @* @param[out] pu1_dst
     57 @*  UWORD8 pointer to the destination
     58 @*
     59 @* @param[in] src_strd
     60 @*  integer source stride
     61 @*
     62 @* @param[in] dst_strd
     63 @*  integer destination stride
     64 @*
     65 @* @param[in] ht
     66 @*  integer height of the array
     67 @*
     68 @* @param[in] wd
     69 @*  integer width of the array
     70 @*
     71 @* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
     72 @*
     73 @* @param[in] dydx: x and y reference offset for qpel calculations.
     74 @* @returns
     75 @*
     76 @ @remarks
     77 @*  None
     78 @*
     79 @*******************************************************************************
     80 @*
     81 
     82 @void ih264_inter_pred_luma_vert_qpel_a9q (
     83 @                            UWORD8 *pu1_src,
     84 @                            UWORD8 *pu1_dst,
     85 @                            WORD32 src_strd,
     86 @                            WORD32 dst_strd,
     87 @                            WORD32 ht,
     88 @                            WORD32 wd,
     89 @                            UWORD8* pu1_tmp,
     90 @                            UWORD32 dydx)
     91 
     92 @**************Variables Vs Registers*****************************************
     93 @   r0 => *pu1_src
     94 @   r1 => *pu1_dst
     95 @   r2 =>  src_strd
     96 @   r3 =>  dst_strd
     97 @   r5 =>  ht
     98 @   r6 =>  wd
     99 @   r7 =>  dydx
    100 
    101 .text
    102 .p2align 2
    103 
    104     .global ih264_inter_pred_luma_vert_qpel_a9q
    105 
    106 ih264_inter_pred_luma_vert_qpel_a9q:
    107     @ Entry: r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd; ht, wd, pu1_tmp and dydx are on the stack.
    108     push          {r4-r12, r14}         @ 10 core registers = 40 bytes
    109     vpush         {d8-d15}              @ 8 NEON d registers = 64 bytes, so the stack arguments start at sp + 104
    110     ldr           r5, [sp, #104]        @ r5 = ht
    111     ldr           r6, [sp, #108]        @ r6 = wd
    112     ldr           r7, [sp, #116]        @ r7 = dydx (the word at sp + 112, pu1_tmp, is never read)
    113     ubfx          r7, r7, #3, #1        @ r7 = (dydx & 12) >> 3, which is exactly bit 3 of dydx
    114     mla           r7, r2, r7, r0        @ r7 = pu1_src + r7 * src_strd: rows that get averaged with the filter output
    115     sub           r0, r0, r2, lsl #1    @ r0 = pu1_src - 2 * src_strd: first row under the 6 tap window
    116     vmov.u16      q11, #20              @ q11 = filter coefficient 20 in every 16-bit lane
    117     vmov.u16      q12, #5               @ q12 = filter coefficient 5 in every 16-bit lane
    118
    119     cmp           r6, #8
    120     beq           loop_8                @ wd == 8
    121     cmp           r6, #4
    122     beq           loop_4                @ wd == 4
    123
    124     @ Any other wd falls through into loop_16, which must directly follow this block.
    125
    126 
    127 loop_16:                                @ wd == 16: each pass writes 4 rows of 16 pixels
    128     @ s0..s8 are the nine 16-byte source rows read through r0 in this pass (s0 = the row r0 points at on entry).
    129     @ Row k = CLIP_U8((s[k] + s[k+5] + 20 * (s[k+2] + s[k+3]) - 5 * (s[k+1] + s[k+4]) + 16) >> 5),
    130     @ then rounding-averaged (vrhadd) with the next 16-byte row read through r7.
    131     @ Even d registers hold pixels 0-7 of a row ("lo"), odd d registers pixels 8-15 ("hi").
    132
    133     vld1.u32      {q0}, [r0], r2        @ q0 = s0
    134     vld1.u32      {q1}, [r0], r2        @ q1 = s1
    135     vld1.u32      {q2}, [r0], r2        @ q2 = s2
    136     vld1.u32      {q3}, [r0], r2        @ q3 = s3
    137     vld1.u32      {q4}, [r0], r2        @ q4 = s4
    138     vaddl.u8      q6, d4, d6            @ row 0 lo: q6 = s2 + s3
    139     vld1.u32      {q5}, [r0], r2        @ q5 = s5
    140     vaddl.u8      q7, d0, d10           @ row 0 lo: q7 = s0 + s5
    141     vaddl.u8      q8, d2, d8            @ row 0 lo: q8 = s1 + s4
    142     vmla.u16      q7, q6, q11           @ row 0 lo: q7 += 20 * q6
    143     vaddl.u8      q10, d1, d11          @ row 0 hi: q10 = s0 + s5
    144     vaddl.u8      q9, d5, d7            @ row 0 hi: q9 = s2 + s3
    145     vmla.u16      q10, q9, q11          @ row 0 hi: q10 += 20 * q9
    146     vld1.u32      {q0}, [r0], r2        @ q0 = s6 (s0 is no longer needed)
    147     vaddl.u8      q13, d3, d9           @ row 0 hi: q13 = s1 + s4
    148     vaddl.u8      q6, d6, d8            @ row 1 lo: q6 = s3 + s4
    149     vmls.u16      q7, q8, q12           @ row 0 lo: q7 -= 5 * q8
    150     vaddl.u8      q8, d2, d0            @ row 1 lo: q8 = s1 + s6
    151     vaddl.u8      q9, d4, d10           @ row 1 lo: q9 = s2 + s5
    152     vmla.u16      q8, q6, q11           @ row 1 lo: q8 += 20 * q6
    153     vmls.u16      q10, q13, q12         @ row 0 hi: q10 -= 5 * q13
    154     vaddl.u8      q13, d5, d11          @ row 1 hi: q13 = s2 + s5
    155     vaddl.u8      q6, d7, d9            @ row 1 hi: q6 = s3 + s4
    156     vqrshrun.s16  d30, q7, #5           @ row 0 lo: d30 = CLIP_U8((q7 + 16) >> 5)
    157     vaddl.u8      q7, d3, d1            @ row 1 hi: q7 = s1 + s6
    158     vld1.u32      {q1}, [r0], r2        @ q1 = s7 (s1 is no longer needed)
    159     vmla.u16      q7, q6, q11           @ row 1 hi: q7 += 20 * q6
    160     vmls.u16      q8, q9, q12           @ row 1 lo: q8 -= 5 * q9
    161     vqrshrun.s16  d31, q10, #5          @ row 0 hi: d31 = CLIP_U8((q10 + 16) >> 5)
    162     vld1.u32      {q10}, [r7], r2       @ averaging input for row 0
    163     vrhadd.u8     q15, q10, q15         @ row 0: rounding average gives the qpel value
    164     vaddl.u8      q9, d4, d2            @ row 2 lo: q9 = s2 + s7
    165     vaddl.u8      q6, d8, d10           @ row 2 lo: q6 = s4 + s5
    166
    167     vst1.u32      {q15}, [r1], r3       @ store row 0
    168     vmla.u16      q9, q6, q11           @ row 2 lo: q9 += 20 * q6
    169     vaddl.u8      q10, d6, d0           @ row 2 lo: q10 = s3 + s6
    170     vmls.u16      q7, q13, q12          @ row 1 hi: q7 -= 5 * q13
    171     vqrshrun.s16  d30, q8, #5           @ row 1 lo
    172     vaddl.u8      q6, d9, d11           @ row 2 hi: q6 = s4 + s5
    173     vaddl.u8      q8, d5, d3            @ row 2 hi: q8 = s2 + s7
    174     vaddl.u8      q13, d7, d1           @ row 2 hi: q13 = s3 + s6
    175     vmla.u16      q8, q6, q11           @ row 2 hi: q8 += 20 * q6
    176     vmls.u16      q9, q10, q12          @ row 2 lo: q9 -= 5 * q10
    177     vld1.u32      {q2}, [r0], r2        @ q2 = s8 (s2 is no longer needed)
    178
    179     vqrshrun.s16  d31, q7, #5           @ row 1 hi
    180     vld1.u32      {q7}, [r7], r2        @ averaging input for row 1
    181     vaddl.u8      q6, d10, d0           @ row 3 lo: q6 = s5 + s6
    182     vrhadd.u8     q15, q7, q15          @ row 1: rounding average gives the qpel value
    183     vaddl.u8      q7, d6, d4            @ row 3 lo: q7 = s3 + s8
    184     vaddl.u8      q10, d8, d2           @ row 3 lo: q10 = s4 + s7
    185     vmla.u16      q7, q6, q11           @ row 3 lo: q7 += 20 * q6
    186     vmls.u16      q8, q13, q12          @ row 2 hi: q8 -= 5 * q13
    187     vst1.u32      {q15}, [r1], r3       @ store row 1
    188     vqrshrun.s16  d30, q9, #5           @ row 2 lo
    189     vaddl.u8      q9, d7, d5            @ row 3 hi: q9 = s3 + s8
    190     vaddl.u8      q6, d11, d1           @ row 3 hi: q6 = s5 + s6
    191     vmla.u16      q9, q6, q11           @ row 3 hi: q9 += 20 * q6
    192     vaddl.u8      q13, d9, d3           @ row 3 hi: q13 = s4 + s7
    193     vmls.u16      q7, q10, q12          @ row 3 lo: q7 -= 5 * q10
    194     vqrshrun.s16  d31, q8, #5           @ row 2 hi
    195     vld1.u32      {q8}, [r7], r2        @ averaging input for row 2
    196     vmls.u16      q9, q13, q12          @ row 3 hi: q9 -= 5 * q13
    197     vrhadd.u8     q15, q8, q15          @ row 2: rounding average gives the qpel value
    198     vst1.u32      {q15}, [r1], r3       @ store row 2
    199     vqrshrun.s16  d30, q7, #5           @ row 3 lo
    200     vqrshrun.s16  d31, q9, #5           @ row 3 hi
    201     vld1.u32      {q9}, [r7], r2        @ averaging input for row 3
    202     vrhadd.u8     q15, q9, q15          @ row 3: rounding average gives the qpel value
    203     vst1.u32      {q15}, [r1], r3       @ store row 3
    204
    205     @ Every pass reloads all nine source rows, so nothing is pre-computed for a fifth row here.
    206     @ r0 has advanced nine rows; the next pass starts four rows lower, so rewind it by five rows (4 + 1).
    207     subs          r5, r5, #4            @ 4 rows done; Z is set when no rows remain
    208     subne         r0, r0 , r2, lsl #2
    209     subne         r0, r0, r2
    210     beq           end_func              @ all rows written
    211
    212     b             loop_16               @ more rows remain (ht was 8 or 16)
    213 
    214 
    215 loop_8:                                 @ wd == 8: each pass writes 4 rows of 8 pixels
    216     @ s0..s8 are the nine 8-byte source rows read through r0 in this pass, one d register per row.
    217     @ Row k = CLIP_U8((s[k] + s[k+5] + 20 * (s[k+2] + s[k+3]) - 5 * (s[k+1] + s[k+4]) + 16) >> 5), averaged with a row from r7.
    218     vld1.u32      d0, [r0], r2          @ d0 = s0
    219     vld1.u32      d1, [r0], r2          @ d1 = s1
    220     vld1.u32      d2, [r0], r2          @ d2 = s2
    221     vld1.u32      d3, [r0], r2          @ d3 = s3
    222     vld1.u32      d4, [r0], r2          @ d4 = s4
    223     vld1.u32      d5, [r0], r2          @ d5 = s5
    224
    225     vaddl.u8      q3, d2, d3            @ row 0: q3 = s2 + s3
    226     vaddl.u8      q4, d0, d5            @ row 0: q4 = s0 + s5
    227     vaddl.u8      q5, d1, d4            @ row 0: q5 = s1 + s4
    228     vmla.u16      q4, q3, q11           @ row 0: q4 += 20 * q3
    229     vld1.u32      d6, [r0], r2          @ d6 = s6 (q3 has been consumed)
    230     vaddl.u8      q7, d3, d4            @ row 1: q7 = s3 + s4
    231     vaddl.u8      q8, d1, d6            @ row 1: q8 = s1 + s6
    232     vaddl.u8      q9, d2, d5            @ row 1: q9 = s2 + s5
    233     vmls.u16      q4, q5, q12           @ row 0: q4 -= 5 * q5
    234     vmla.u16      q8, q7, q11           @ row 1: q8 += 20 * q7
    235     vld1.u32      d7, [r0], r2          @ d7 = s7
    236     vaddl.u8      q10, d4, d5           @ row 2: q10 = s4 + s5
    237     vaddl.u8      q6, d2, d7            @ row 2: q6 = s2 + s7
    238     vaddl.u8      q5, d3, d6            @ row 2: q5 = s3 + s6
    239     vmls.u16      q8, q9, q12           @ row 1: q8 -= 5 * q9
    240     vqrshrun.s16  d26, q4, #5           @ row 0: d26 = CLIP_U8((q4 + 16) >> 5)
    241     vmla.u16      q6, q10, q11          @ row 2: q6 += 20 * q10
    242     vld1.32       d8, [r7], r2          @ averaging input for row 0 (q4 is free again)
    243     vld1.32       d9, [r7], r2          @ averaging input for row 1
    244     vld1.u32      d0, [r0], r2          @ d0 = s8 (s0 is no longer needed)
    245     vaddl.u8      q7, d5, d6            @ row 3: q7 = s5 + s6
    246     vqrshrun.s16  d27, q8, #5           @ row 1: d27 = CLIP_U8((q8 + 16) >> 5)
    247     vrhadd.u8     q13, q4, q13          @ rows 0 and 1: rounding average gives the qpel values
    248     vaddl.u8      q10, d3, d0           @ row 3: q10 = s3 + s8
    249     vmls.u16      q6, q5, q12           @ row 2: q6 -= 5 * q5
    250     vst1.u32      d26, [r1], r3         @ store row 0
    251     vaddl.u8      q9, d4, d7            @ row 3: q9 = s4 + s7
    252     vmla.u16      q10, q7, q11          @ row 3: q10 += 20 * q7
    253     vst1.u32      d27, [r1], r3         @ store row 1
    254     vqrshrun.s16  d28, q6, #5           @ row 2: d28 = CLIP_U8((q6 + 16) >> 5)
    255     vmls.u16      q10, q9, q12          @ row 3: q10 -= 5 * q9
    256     vld1.32       d12, [r7], r2         @ averaging input for row 2 (q6 is free again)
    257     vld1.32       d13, [r7], r2         @ averaging input for row 3
    258     vqrshrun.s16  d29, q10, #5          @ row 3: d29 = CLIP_U8((q10 + 16) >> 5)
    259     vrhadd.u8     q14, q6, q14          @ rows 2 and 3: rounding average gives the qpel values
    260     vst1.u32      d28, [r1], r3         @ store row 2
    261     vst1.u32      d29, [r1], r3         @ store row 3
    262
    263     @ r0 has advanced nine rows; the next pass starts four rows lower, so rewind it by five rows (4 + 1).
    264     subs          r5, r5, #4            @ 4 rows done; Z is set when no rows remain
    265     subne         r0, r0 , r2, lsl #2
    266     subne         r0, r0, r2
    267     beq           end_func              @ all rows written
    268     b             loop_8                @ more rows remain (ht was 8 or 16)
    269 
    270 loop_4:                                 @ wd == 4: each pass writes 4 rows of 4 pixels
    271 @ s0..s8 are the nine source rows read through r0; only lane 0 (4 bytes) of a d register is loaded or stored,
    272 @ the upper lanes carry don't-care data. Row k uses the same 6 tap formula as the wider paths.
    273     vld1.u32      d0[0], [r0], r2       @ d0[0] = s0
    274     vld1.u32      d1[0], [r0], r2       @ d1[0] = s1
    275     vld1.u32      d2[0], [r0], r2       @ d2[0] = s2
    276     vld1.u32      d3[0], [r0], r2       @ d3[0] = s3
    277     vld1.u32      d4[0], [r0], r2       @ d4[0] = s4
    278     vld1.u32      d5[0], [r0], r2       @ d5[0] = s5
    279
    280     vaddl.u8      q3, d2, d3            @ row 0: q3 = s2 + s3
    281     vaddl.u8      q4, d0, d5            @ row 0: q4 = s0 + s5
    282     vaddl.u8      q5, d1, d4            @ row 0: q5 = s1 + s4
    283     vmla.u16      q4, q3, q11           @ row 0: q4 += 20 * q3
    284     vld1.u32      d6[0], [r0], r2       @ d6[0] = s6; lane load so only the 4 needed bytes are read
    285     vaddl.u8      q7, d3, d4            @ row 1: q7 = s3 + s4
    286     vaddl.u8      q8, d1, d6            @ row 1: q8 = s1 + s6
    287     vaddl.u8      q9, d2, d5            @ row 1: q9 = s2 + s5
    288     vmls.u16      q4, q5, q12           @ row 0: q4 -= 5 * q5
    289     vld1.u32      d7[0], [r0], r2       @ d7[0] = s7
    290     vmla.u16      q8, q7, q11           @ row 1: q8 += 20 * q7
    291     vaddl.u8      q10, d4, d5           @ row 2: q10 = s4 + s5
    292     vaddl.u8      q6, d2, d7            @ row 2: q6 = s2 + s7
    293     vaddl.u8      q5, d3, d6            @ row 2: q5 = s3 + s6
    294     vmls.u16      q8, q9, q12           @ row 1: q8 -= 5 * q9
    295     vqrshrun.s16  d26, q4, #5           @ row 0: d26 = CLIP_U8((q4 + 16) >> 5)
    296     vld1.u32      d8[0], [r7], r2       @ averaging input for row 0 (q4 is free again)
    297     vld1.u32      d9[0], [r7], r2       @ averaging input for row 1
    298     vmla.u16      q6, q10, q11          @ row 2: q6 += 20 * q10
    299     vld1.u32      d0[0], [r0], r2       @ d0[0] = s8 (s0 is no longer needed)
    300     vaddl.u8      q7, d5, d6            @ row 3: q7 = s5 + s6
    301     vqrshrun.s16  d27, q8, #5           @ row 1: d27 = CLIP_U8((q8 + 16) >> 5)
    302     vaddl.u8      q10, d3, d0           @ row 3: q10 = s3 + s8
    303     vrhadd.u8     q13, q13, q4          @ rows 0 and 1: rounding average gives the qpel values
    304     vmls.u16      q6, q5, q12           @ row 2: q6 -= 5 * q5
    305     vst1.u32      d26[0], [r1], r3      @ store row 0
    306     vaddl.u8      q9, d4, d7            @ row 3: q9 = s4 + s7
    307     vmla.u16      q10, q7, q11          @ row 3: q10 += 20 * q7
    308     vst1.u32      d27[0], [r1], r3      @ store row 1
    309     vqrshrun.s16  d28, q6, #5           @ row 2: d28 = CLIP_U8((q6 + 16) >> 5)
    310     vld1.u32      d12[0], [r7], r2      @ averaging input for row 2 (q6 is free again)
    311     vld1.u32      d13[0], [r7], r2      @ averaging input for row 3
    312
    313     vmls.u16      q10, q9, q12          @ row 3: q10 -= 5 * q9
    314     vqrshrun.s16  d29, q10, #5          @ row 3: d29 = CLIP_U8((q10 + 16) >> 5)
    315     vrhadd.u8     q14, q6, q14          @ rows 2 and 3: rounding average gives the qpel values
    316     vst1.u32      d28[0], [r1], r3      @ store row 2
    317     vst1.u32      d29[0], [r1], r3      @ store row 3
    318     @ r5 - 8 is zero only on the first pass when ht == 8 (the second pass leaves -8), so only that case loops.
    319     subs          r5, r5, #8
    320     subeq         r0, r0, r2, lsl #2    @ looping: rewind r0 by five of the nine rows read (4 + 1)
    321     subeq         r0, r0, r2
    322     beq           loop_4                @ second and last pass for ht == 8; otherwise fall through to end_func
    323 
    324 end_func:                               @ common exit: loop_16 and loop_8 branch here, loop_4 falls through
    325     vpop          {d8-d15}              @ restore the NEON registers saved on entry
    326     pop           {r4-r12, pc}          @ restore the core registers; loading pc returns to the caller
    327 
    328 
    329