Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 @**
     21 @******************************************************************************
     22 @* @file
     23 @*  ih264_inter_pred_luma_vert_a9q.s
     24 @*
     25 @* @brief
     26 @*  Contains function definitions for inter prediction  interpolation.
     27 @*
     28 @* @author
     29 @*  Ittiam
     30 @*
     31 @* @par List of Functions:
     32 @*
     33 @*  - ih264_inter_pred_luma_vert_a9q()
     34 @*
     35 @* @remarks
     36 @*  None
     37 @*
     38 @*******************************************************************************
     39 @*
     40 
     41 @* All the functions here are replicated from ih264_inter_pred_filters.c
     42 @
     43 
     44 @**
     45 @**
     46 @**
     47 @ *******************************************************************************
     48 @ *
     49 @ * @brief
     50 @ *    Interprediction luma filter for vertical input
     51 @ *
     52 @ * @par Description:
     53 @ *   Applies a 6 tap vertical filter. The output is clipped to 8 bits.
     54 @ *   See sec 8.4.2.2.1 titled "Luma sample interpolation process".
     55 @ *
     56 @ * @param[in] pu1_src
     57 @ *  UWORD8 pointer to the source
     58 @ *
     59 @ * @param[out] pu1_dst
     60 @ *  UWORD8 pointer to the destination
     61 @ *
     62 @ * @param[in] src_strd
     63 @ *  integer source stride
     64 @ *
     65 @ * @param[in] dst_strd
     66 @ *  integer destination stride
     67 @ *
     68 @ * @param[in] ht
     69 @ *  integer height of the array
     70 @ *
     71 @ * @param[in] wd
     72 @ *  integer width of the array
     73 @ *
     74 @ * @returns
     75 @ *
     76 @ * @remarks
     77 @ *  None
     78 @ *
     79 @ *******************************************************************************
     80 
     81 @void ih264_inter_pred_luma_vert (
     82 @                            UWORD8 *pu1_src,
     83 @                            UWORD8 *pu1_dst,
     84 @                            WORD32 src_strd,
     85 @                            WORD32 dst_strd,
     86 @                            WORD32 ht,
     87 @                            WORD32 wd   )
     88 
     89 @**************Variables Vs Registers*****************************************
     90 @   r0 => *pu1_src
     91 @   r1 => *pu1_dst
     92 @   r2 =>  src_strd
     93 @   r3 =>  dst_strd
     94 @   r5 =>  ht
     95 @   r6 =>  wd
     96 
     97 .text
     98 .p2align 2
     99 
    100 
    101     .global ih264_inter_pred_luma_vert_a9q
    102 
    103 ih264_inter_pred_luma_vert_a9q:
    104 @ In: r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd; ht and wd are
    105 @ read from the stack into r5 and r6.
    106 @ Stack frame: 40 bytes of core registers plus 64 bytes of d8-d15, so the
    107 @ first stack argument (ht) is found at [sp, #104].
    108 @ Each output row is (a + f) + 20 * (c + d) - 5 * (b + e), rounded, shifted
    109 @ right by 5 and saturated to 8 bits, where a..f are six consecutive source
    110 @ rows. Every pass of a loop below loads 9 source rows (row 0..8 in the
    111 @ comments) and stores 4 output rows (out0..out3).
    112
    113     stmfd         sp!, {r4-r12, r14}    @ save 10 core registers (40 bytes)
    114     vstmdb        sp!, {d8-d15}         @ save 8 NEON d registers (64 bytes)
    115     ldr           r5, [sp, #104]        @ r5 = ht
    116     sub           r0, r0, r2, lsl #1    @ r0 = pu1_src - 2 * src_strd (row 0)
    117     ldr           r6, [sp, #108]        @ r6 = wd
    118     vmov.u16      q11, #20              @ q11 = filter coeff 20 in all lanes
    119
    120     subs          r12, r6, #8           @ wd == 8 -> loop_8
    121     vmov.u16      q12, #5               @ q12 = filter coeff 5 in all lanes
    122     beq           loop_8
    123
    124     subs          r12, r6, #4           @ wd == 4 -> loop_4
    125     beq           loop_4
    126
    127 loop_16:                                @ any other wd: 16 columns, lo = 0-7, hi = 8-15
    128
    129     vld1.u32      {q0}, [r0], r2        @ q0 = row 0
    130     vld1.u32      {q1}, [r0], r2        @ q1 = row 1
    131     vld1.u32      {q2}, [r0], r2        @ q2 = row 2
    132     vld1.u32      {q3}, [r0], r2        @ q3 = row 3
    133     vld1.u32      {q4}, [r0], r2        @ q4 = row 4
    134     vaddl.u8      q6, d4, d6            @ out0 lo: row 2 + row 3
    135     vld1.u32      {q5}, [r0], r2        @ q5 = row 5
    136
    137     vaddl.u8      q7, d0, d10           @ out0 lo: row 0 + row 5
    138     vaddl.u8      q8, d2, d8            @ out0 lo: row 1 + row 4
    139     vmla.u16      q7, q6, q11           @ out0 lo += 20 * (row 2 + row 3)
    140     vaddl.u8      q10, d1, d11          @ out0 hi: row 0 + row 5
    141     vaddl.u8      q9, d5, d7            @ out0 hi: row 2 + row 3
    142     vmla.u16      q10, q9, q11          @ out0 hi += 20 * (row 2 + row 3)
    143     vld1.u32      {q0}, [r0], r2        @ q0 = row 6 (row 0 is fully consumed)
    144     vaddl.u8      q13, d3, d9           @ out0 hi: row 1 + row 4
    145     vaddl.u8      q6, d6, d8            @ out1 lo: row 3 + row 4
    146     vmls.u16      q7, q8, q12           @ out0 lo -= 5 * (row 1 + row 4)
    147     vaddl.u8      q8, d2, d0            @ out1 lo: row 1 + row 6
    148     vaddl.u8      q9, d4, d10           @ out1 lo: row 2 + row 5
    149     vmla.u16      q8, q6, q11           @ out1 lo += 20 * (row 3 + row 4)
    150     vmls.u16      q10, q13, q12         @ out0 hi -= 5 * (row 1 + row 4)
    151     vaddl.u8      q13, d5, d11          @ out1 hi: row 2 + row 5
    152     vaddl.u8      q6, d7, d9            @ out1 hi: row 3 + row 4
    153     vqrshrun.s16  d30, q7, #5           @ d30 = CLIP_U8((out0 lo + 16) >> 5)
    154     vaddl.u8      q7, d3, d1            @ out1 hi: row 1 + row 6
    155     vld1.u32      {q1}, [r0], r2        @ q1 = row 7 (row 1 is fully consumed)
    156     vmla.u16      q7, q6, q11           @ out1 hi += 20 * (row 3 + row 4)
    157     vmls.u16      q8, q9, q12           @ out1 lo -= 5 * (row 2 + row 5)
    158     vqrshrun.s16  d31, q10, #5          @ d31 = CLIP_U8((out0 hi + 16) >> 5)
    159     vaddl.u8      q9, d4, d2            @ out2 lo: row 2 + row 7
    160     vaddl.u8      q6, d8, d10           @ out2 lo: row 4 + row 5
    161
    162     vst1.u32      {q15}, [r1], r3       @ store out0
    163     vmla.u16      q9, q6, q11           @ out2 lo += 20 * (row 4 + row 5)
    164     vaddl.u8      q10, d6, d0           @ out2 lo: row 3 + row 6
    165     vmls.u16      q7, q13, q12          @ out1 hi -= 5 * (row 2 + row 5)
    166     vqrshrun.s16  d30, q8, #5           @ d30 = out1 lo, rounded and clipped
    167     vaddl.u8      q6, d9, d11           @ out2 hi: row 4 + row 5
    168     vaddl.u8      q8, d5, d3            @ out2 hi: row 2 + row 7
    169     vaddl.u8      q13, d7, d1           @ out2 hi: row 3 + row 6
    170     vmla.u16      q8, q6, q11           @ out2 hi += 20 * (row 4 + row 5)
    171     vmls.u16      q9, q10, q12          @ out2 lo -= 5 * (row 3 + row 6)
    172     vld1.u32      {q2}, [r0], r2        @ q2 = row 8 (row 2 is fully consumed)
    173
    174     vqrshrun.s16  d31, q7, #5           @ d31 = out1 hi, rounded and clipped
    175     vaddl.u8      q6, d10, d0           @ out3 lo: row 5 + row 6
    176     vaddl.u8      q7, d6, d4            @ out3 lo: row 3 + row 8
    177     vaddl.u8      q10, d8, d2           @ out3 lo: row 4 + row 7
    178     vmla.u16      q7, q6, q11           @ out3 lo += 20 * (row 5 + row 6)
    179     vmls.u16      q8, q13, q12          @ out2 hi -= 5 * (row 3 + row 6)
    180     vst1.u32      {q15}, [r1], r3       @ store out1
    181     vqrshrun.s16  d30, q9, #5           @ d30 = out2 lo, rounded and clipped
    182     vaddl.u8      q9, d7, d5            @ out3 hi: row 3 + row 8
    183     vaddl.u8      q6, d11, d1           @ out3 hi: row 5 + row 6
    184     vmla.u16      q9, q6, q11           @ out3 hi += 20 * (row 5 + row 6)
    185     vaddl.u8      q13, d9, d3           @ out3 hi: row 4 + row 7
    186     vmls.u16      q7, q10, q12          @ out3 lo -= 5 * (row 4 + row 7)
    187
    188     vqrshrun.s16  d31, q8, #5           @ d31 = out2 hi, rounded and clipped
    189     vmls.u16      q9, q13, q12          @ out3 hi -= 5 * (row 4 + row 7)
    190     vst1.u32      {q15}, [r1], r3       @ store out2
    191     vqrshrun.s16  d30, q7, #5           @ d30 = out3 lo, rounded and clipped
    192     vqrshrun.s16  d31, q9, #5           @ d31 = out3 hi, rounded and clipped
    193     vst1.u32      {q15}, [r1], r3       @ store out3
    194
    195     subs          r5, r5, #4            @ 4 rows processed, decrement by 4
    196     subne         r0, r0, r2, lsl #2    @ more rows: step back 5 of the 9 rows
    197     subne         r0, r0, r2            @ read, so the next pass starts 4 lower
    198     bne           loop_16               @ loop while rows remain
    199     b             end_func
    200
    201 loop_8:
    202 @ 8 columns per row: each d register holds one complete source row
    203
    204     vld1.u32      d0, [r0], r2          @ d0 = row 0
    205     vld1.u32      d1, [r0], r2          @ d1 = row 1
    206     vld1.u32      d2, [r0], r2          @ d2 = row 2
    207     vld1.u32      d3, [r0], r2          @ d3 = row 3
    208     vld1.u32      d4, [r0], r2          @ d4 = row 4
    209     vld1.u32      d5, [r0], r2          @ d5 = row 5
    210
    211     vaddl.u8      q3, d2, d3            @ out0: row 2 + row 3 (q3 = d6:d7)
    212     vaddl.u8      q4, d0, d5            @ out0: row 0 + row 5
    213     vaddl.u8      q5, d1, d4            @ out0: row 1 + row 4
    214     vmla.u16      q4, q3, q11           @ out0 += 20 * (row 2 + row 3)
    215     vld1.u32      d6, [r0], r2          @ d6 = row 6 (q3 was consumed above)
    216     vaddl.u8      q7, d3, d4            @ out1: row 3 + row 4
    217     vaddl.u8      q8, d1, d6            @ out1: row 1 + row 6
    218     vaddl.u8      q9, d2, d5            @ out1: row 2 + row 5
    219     vmls.u16      q4, q5, q12           @ out0 -= 5 * (row 1 + row 4)
    220     vmla.u16      q8, q7, q11           @ out1 += 20 * (row 3 + row 4)
    221     vld1.u32      d7, [r0], r2          @ d7 = row 7
    222     vaddl.u8      q10, d4, d5           @ out2: row 4 + row 5
    223     vaddl.u8      q6, d2, d7            @ out2: row 2 + row 7
    224     vaddl.u8      q5, d3, d6            @ out2: row 3 + row 6
    225     vmls.u16      q8, q9, q12           @ out1 -= 5 * (row 2 + row 5)
    226     vqrshrun.s16  d26, q4, #5           @ d26 = CLIP_U8((out0 + 16) >> 5)
    227     vmla.u16      q6, q10, q11          @ out2 += 20 * (row 4 + row 5)
    228     vld1.u32      d0, [r0], r2          @ d0 = row 8 (row 0 is fully consumed)
    229     vaddl.u8      q7, d5, d6            @ out3: row 5 + row 6
    230     vqrshrun.s16  d27, q8, #5           @ d27 = out1, rounded and clipped
    231     vaddl.u8      q10, d3, d0           @ out3: row 3 + row 8
    232     vmls.u16      q6, q5, q12           @ out2 -= 5 * (row 3 + row 6)
    233     vst1.u32      d26, [r1], r3         @ store out0
    234     vaddl.u8      q9, d4, d7            @ out3: row 4 + row 7
    235     vmla.u16      q10, q7, q11          @ out3 += 20 * (row 5 + row 6)
    236     vst1.u32      d27, [r1], r3         @ store out1
    237     vqrshrun.s16  d28, q6, #5           @ d28 = out2, rounded and clipped
    238     vst1.u32      d28, [r1], r3         @ store out2
    239     vmls.u16      q10, q9, q12          @ out3 -= 5 * (row 4 + row 7)
    240     vqrshrun.s16  d29, q10, #5          @ d29 = out3, rounded and clipped
    241     vst1.u32      d29, [r1], r3         @ store out3
    242
    243     subs          r5, r5, #4            @ 4 rows processed, decrement by 4
    244     subne         r0, r0, r2, lsl #2    @ more rows: step back 5 of the 9 rows
    245     subne         r0, r0, r2            @ read, so the next pass starts 4 lower
    246     bne           loop_8                @ loop while rows remain
    247     b             end_func
    248
    249 loop_4:
    250 @ 4 columns per row: only lane 0 (4 bytes) of each d register is loaded and
    251 @ stored; the other lanes are computed on but never written to memory.
    252
    253     vld1.u32      d0[0], [r0], r2       @ d0[0] = row 0
    254     vld1.u32      d1[0], [r0], r2       @ d1[0] = row 1
    255     vld1.u32      d2[0], [r0], r2       @ d2[0] = row 2
    256     vld1.u32      d3[0], [r0], r2       @ d3[0] = row 3
    257     vld1.u32      d4[0], [r0], r2       @ d4[0] = row 4
    258     vld1.u32      d5[0], [r0], r2       @ d5[0] = row 5
    259
    260     vaddl.u8      q3, d2, d3            @ out0: row 2 + row 3 (q3 = d6:d7)
    261     vaddl.u8      q4, d0, d5            @ out0: row 0 + row 5
    262     vaddl.u8      q5, d1, d4            @ out0: row 1 + row 4
    263     vmla.u16      q4, q3, q11           @ out0 += 20 * (row 2 + row 3)
    264     vld1.u32      d6[0], [r0], r2       @ d6[0] = row 6 (q3 was consumed above)
    265     vaddl.u8      q7, d3, d4            @ out1: row 3 + row 4
    266     vaddl.u8      q8, d1, d6            @ out1: row 1 + row 6
    267     vaddl.u8      q9, d2, d5            @ out1: row 2 + row 5
    268     vmls.u16      q4, q5, q12           @ out0 -= 5 * (row 1 + row 4)
    269     vld1.u32      d7[0], [r0], r2       @ d7[0] = row 7
    270     vmla.u16      q8, q7, q11           @ out1 += 20 * (row 3 + row 4)
    271     vaddl.u8      q10, d4, d5           @ out2: row 4 + row 5
    272     vaddl.u8      q6, d2, d7            @ out2: row 2 + row 7
    273     vaddl.u8      q5, d3, d6            @ out2: row 3 + row 6
    274     vmls.u16      q8, q9, q12           @ out1 -= 5 * (row 2 + row 5)
    275     vqrshrun.s16  d26, q4, #5           @ d26 = CLIP_U8((out0 + 16) >> 5)
    276     vmla.u16      q6, q10, q11          @ out2 += 20 * (row 4 + row 5)
    277     vld1.u32      d0[0], [r0], r2       @ d0[0] = row 8 (row 0 is fully consumed)
    278     vaddl.u8      q7, d5, d6            @ out3: row 5 + row 6
    279     vqrshrun.s16  d27, q8, #5           @ d27 = out1, rounded and clipped
    280     vaddl.u8      q10, d3, d0           @ out3: row 3 + row 8
    281     vmls.u16      q6, q5, q12           @ out2 -= 5 * (row 3 + row 6)
    282     vst1.u32      d26[0], [r1], r3      @ store out0 (4 bytes)
    283     vaddl.u8      q9, d4, d7            @ out3: row 4 + row 7
    284     vmla.u16      q10, q7, q11          @ out3 += 20 * (row 5 + row 6)
    285     vst1.u32      d27[0], [r1], r3      @ store out1 (4 bytes)
    286     vqrshrun.s16  d28, q6, #5           @ d28 = out2, rounded and clipped
    287     vst1.u32      d28[0], [r1], r3      @ store out2 (4 bytes)
    288     vmls.u16      q10, q9, q12          @ out3 -= 5 * (row 4 + row 7)
    289     vqrshrun.s16  d29, q10, #5          @ d29 = out3, rounded and clipped
    290     vst1.u32      d29[0], [r1], r3      @ store out3 (4 bytes)
    291
    292     subs          r5, r5, #4            @ 4 rows processed, decrement by 4
    293     subne         r0, r0, r2, lsl #2    @ more rows: step back 5 of the 9 rows
    294     subne         r0, r0, r2            @ read, so the next pass starts 4 lower
    295     bne           loop_4                @ loop while rows remain, else fall through
    296
    297 end_func:
    298     vldmia        sp!, {d8-d15}         @ restore the saved NEON registers
    299     ldmfd         sp!, {r4-r12, pc}     @ restore core registers and return
    300 
    301 
    302