Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 @**
     21 @******************************************************************************
     22 @* @file
     23 @*  ih264_inter_pred_luma_bilinear_a9q.s
     24 @*
     25 @* @brief
     26 @*  Contains function definitions for inter prediction  interpolation.
     27 @*
     28 @* @author
     29 @* Ittiam
     30 @*
     31 @* @par List of Functions:
     32 @*
     33 @*  - ih264_inter_pred_luma_bilinear_a9q()
     34 @*
     35 @* @remarks
     36 @*  None
     37 @*
     38 @*******************************************************************************
     39 @*
     40 
     41 @* All the functions here are replicated from ih264_inter_pred_filters.c
     42 @
     43 
     44 @**
     45 @**
     46 @**
     47 @ *******************************************************************************
     48 @ *  function:ih264_inter_pred_luma_bilinear
     49 @ *
     50 @* @brief
     51 @*    This routine applies the bilinear filter to the predictors.
     52 @*    The  filtering operation is described in
     53 @*    sec 8.4.2.2.1 titled "Luma sample interpolation process"
     54 @*
     55 @* @par Description:
     56 @\note
     57 @*    This function is called to obtain pixels lying at the following
     58 @*    locations: (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2), (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
     59 @*    Each output pixel is the rounded average, (a + b + 1) >> 1, of the co-located pixels of the two input arrays.
     60 @*
     61 @*
     62 @* @param[in] pu1_src1:
     63 @*  UWORD8 Pointer to the buffer containing the first input array.
     64 @*
     65 @* @param[in] pu1_src2:
     66 @*  UWORD8 Pointer to the buffer containing the second input array.
     67 @*
     68 @* @param[out] pu1_dst
     69 @*  UWORD8 pointer to the destination where the output of bilinear filter is stored.
     70 @*
     71 @* @param[in] src_strd1
     72 @*  Stride of the first input buffer
     73 @*
     74 @* @param[in] src_strd2
     75 @*  Stride of the second input buffer
     76 @*
     77 @* @param[in] dst_strd
     78 @*  integer destination stride of pu1_dst
     79 @*
     80 @* @param[in] ht
     81 @*  integer height of the array
     82 @*
     83 @* @param[in] wd
     84 @*  integer width of the array
     85 @*
     86 @* @returns
     87 @*
     88 @* @remarks
     89 @*  None
     90 @*
     91 @*******************************************************************************
     92 @*
     93 
     94 @void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
     95 @                                   UWORD8 *pu1_src2,
     96 @                                   UWORD8 *pu1_dst,
     97 @                                   WORD32 src_strd1,
     98 @                                   WORD32 src_strd2,
     99 @                                   WORD32 dst_strd,
    100 @                                   WORD32 height,
    101 @                                   WORD32 width)
    102 @
    103 @**************Variables Vs Registers*****************************************
    104 @   r0 => *pu1_src1
    105 @   r1 => *pu1_src2
    106 @   r2 => *pu1_dst
    107 @   r3 =>  src_strd1
    108 @   r4 =>  src_strd2
    109 @   r5 =>  dst_strd
    110 @   r6 =>  height
    111 @   r7 => width
    112 @
    113 .text
    114 .p2align 2
    115 
    .global ih264_inter_pred_luma_bilinear_a9q

@------------------------------------------------------------------------------
@ void ih264_inter_pred_luma_bilinear_a9q(UWORD8 *pu1_src1, UWORD8 *pu1_src2,
@                                         UWORD8 *pu1_dst,
@                                         WORD32 src_strd1, WORD32 src_strd2,
@                                         WORD32 dst_strd,
@                                         WORD32 ht, WORD32 wd)
@
@ Writes dst[y][x] = (src1[y][x] + src2[y][x] + 1) >> 1 for a wd x ht block.
@
@ In:    r0 = pu1_src1, r1 = pu1_src2, r2 = pu1_dst, r3 = src_strd1
@        stack (in order) = src_strd2, dst_strd, ht, wd
@ Regs:  r4 = src_strd2, r5 = dst_strd, r6 = ht, r7 = wd, r12 = scratch
@ Saved: r4-r12, lr and d8-d15 are pushed on entry and restored on exit.
@
@ Dispatch (fully unrolled, no loops despite the label names):
@   wd == 4     -> loop_4  : 4 rows, then 4 more unless ht == 4
@   wd == 8     -> loop_8  : 4 rows, +4 unless ht == 4, +8 unless ht == 8
@   otherwise   -> loop_16 : 8 rows, then 8 more unless ht == 8
@ No other width/height validation is performed.
@
@ Loads, widening adds, narrowing shifts and stores of neighbouring rows are
@ interleaved by hand; keep the relative order of dependent instructions when
@ editing.
@------------------------------------------------------------------------------
ih264_inter_pred_luma_bilinear_a9q:

    stmfd         sp!, {r4-r12, r14}    @ push 10 core registers (40 bytes)
    vstmdb        sp!, {d8-d15}         @ push d8-d15 (64 bytes) -> stack args now at sp + 104
    ldr           r4, [sp, #104]        @ r4 = src_strd2
    ldr           r5, [sp, #108]        @ r5 = dst_strd
    ldr           r6, [sp, #112]        @ r6 = ht
    ldr           r7, [sp, #116]        @ r7 = wd

    subs          r12, r7, #4           @ wd == 4 ?
    beq           loop_4
    subs          r12, r7, #8           @ wd == 8 ?
    beq           loop_8

loop_16:                                @ 16-pixel-wide rows (any wd other than 4 or 8 lands here)

    @ ---- rows 0-7 ----
    vld1.8        {q0}, [r0], r3        @ src1 row0
    vld1.8        {q2}, [r1], r4        @ src2 row0
    vld1.8        {q1}, [r0], r3        @ src1 row1
    vaddl.u8      q10, d0, d4           @ row0 low half : src1 + src2 (16-bit)
    vld1.8        {q3}, [r1], r4        @ src2 row1
    vaddl.u8      q11, d1, d5           @ row0 high half
    vld1.8        {q4}, [r0], r3        @ src1 row2
    vaddl.u8      q12, d2, d6           @ row1 low half
    vld1.8        {q5}, [r0], r3        @ src1 row3
    vaddl.u8      q13, d3, d7           @ row1 high half
    vld1.8        {q6}, [r1], r4        @ src2 row2
    vaddl.u8      q8, d8, d12           @ row2 low half
    vld1.8        {q7}, [r1], r4        @ src2 row3
    vaddl.u8      q9, d9, d13           @ row2 high half
    vqrshrun.s16  d28, q10, #1          @ (sum + 1) >> 1, narrowed to u8
    vqrshrun.s16  d29, q11, #1
    vaddl.u8      q10, d10, d14         @ row3 low half
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q14}, [r2], r5       @ dst row0
    vaddl.u8      q11, d11, d15         @ row3 high half
    vst1.8        {q15}, [r2], r5       @ dst row1
    vqrshrun.s16  d28, q8, #1
    vld1.8        {q0}, [r0], r3        @ src1 row4
    vqrshrun.s16  d29, q9, #1
    vld1.8        {q1}, [r0], r3        @ src1 row5
    vqrshrun.s16  d30, q10, #1
    vld1.8        {q2}, [r1], r4        @ src2 row4
    vqrshrun.s16  d31, q11, #1
    vld1.8        {q3}, [r1], r4        @ src2 row5
    vaddl.u8      q10, d0, d4           @ row4 low half
    vst1.8        {q14}, [r2], r5       @ dst row2
    vaddl.u8      q13, d3, d7           @ row5 high half
    vst1.8        {q15}, [r2], r5       @ dst row3
    vaddl.u8      q11, d1, d5           @ row4 high half
    vld1.8        {q4}, [r0], r3        @ src1 row6
    vaddl.u8      q12, d2, d6           @ row5 low half
    vld1.8        {q5}, [r0], r3        @ src1 row7
    vqrshrun.s16  d28, q10, #1
    vld1.8        {q6}, [r1], r4        @ src2 row6
    vqrshrun.s16  d29, q11, #1
    vld1.8        {q7}, [r1], r4        @ src2 row7
    vaddl.u8      q8, d8, d12           @ row6 low half
    vaddl.u8      q9, d9, d13           @ row6 high half
    vaddl.u8      q10, d10, d14         @ row7 low half
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q14}, [r2], r5       @ dst row4
    vaddl.u8      q11, d11, d15         @ row7 high half
    vst1.8        {q15}, [r2], r5       @ dst row5
    vqrshrun.s16  d28, q8, #1
    vqrshrun.s16  d30, q10, #1
    vqrshrun.s16  d29, q9, #1
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q14}, [r2], r5       @ dst row6
    subs          r12, r6, #8           @ ht == 8 ?
    vst1.8        {q15}, [r2], r5       @ dst row7

    beq           end_func              @ done when ht == 8

    @ ---- rows 8-15 ----
    @ The src2 row8 load is issued only after the ht == 8 exit so that an
    @ 8-row block never reads a row beyond the end of the second source.
    vld1.8        {q2}, [r1], r4        @ src2 row8
    vld1.8        {q0}, [r0], r3        @ src1 row8
    vaddl.u8      q10, d0, d4           @ row8 low half
    vld1.8        {q1}, [r0], r3        @ src1 row9
    vaddl.u8      q11, d1, d5           @ row8 high half
    vld1.8        {q3}, [r1], r4        @ src2 row9
    vqrshrun.s16  d28, q10, #1
    vld1.8        {q4}, [r0], r3        @ src1 row10
    vqrshrun.s16  d29, q11, #1
    vld1.8        {q5}, [r0], r3        @ src1 row11
    vaddl.u8      q12, d2, d6           @ row9 low half
    vld1.8        {q6}, [r1], r4        @ src2 row10
    vaddl.u8      q13, d3, d7           @ row9 high half
    vld1.8        {q7}, [r1], r4        @ src2 row11
    vaddl.u8      q8, d8, d12           @ row10 low half
    vaddl.u8      q9, d9, d13           @ row10 high half
    vaddl.u8      q10, d10, d14         @ row11 low half
    vqrshrun.s16  d30, q12, #1
    vst1.8        {q14}, [r2], r5       @ dst row8
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q15}, [r2], r5       @ dst row9
    vqrshrun.s16  d28, q8, #1
    vld1.8        {q0}, [r0], r3        @ src1 row12
    vaddl.u8      q11, d11, d15         @ row11 high half
    vld1.8        {q1}, [r0], r3        @ src1 row13
    vqrshrun.s16  d29, q9, #1
    vld1.8        {q2}, [r1], r4        @ src2 row12
    vqrshrun.s16  d30, q10, #1
    vld1.8        {q3}, [r1], r4        @ src2 row13
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q14}, [r2], r5       @ dst row10
    vaddl.u8      q10, d0, d4           @ row12 low half
    vst1.8        {q15}, [r2], r5       @ dst row11
    vaddl.u8      q11, d1, d5           @ row12 high half
    vld1.8        {q4}, [r0], r3        @ src1 row14
    vaddl.u8      q13, d3, d7           @ row13 high half
    vld1.8        {q5}, [r0], r3        @ src1 row15
    vaddl.u8      q12, d2, d6           @ row13 low half
    vld1.8        {q6}, [r1], r4        @ src2 row14
    vaddl.u8      q8, d8, d12           @ row14 low half
    vld1.8        {q7}, [r1], r4        @ src2 row15
    vaddl.u8      q9, d9, d13           @ row14 high half
    vqrshrun.s16  d28, q10, #1
    vqrshrun.s16  d29, q11, #1
    vaddl.u8      q10, d10, d14         @ row15 low half
    vst1.8        {q14}, [r2], r5       @ dst row12
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vaddl.u8      q11, d11, d15         @ row15 high half
    vst1.8        {q15}, [r2], r5       @ dst row13
    vqrshrun.s16  d28, q8, #1
    vqrshrun.s16  d29, q9, #1
    vqrshrun.s16  d30, q10, #1
    vst1.8        {q14}, [r2], r5       @ dst row14
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q15}, [r2], r5       @ dst row15
    b             end_func



loop_8:                                 @ 8-pixel-wide rows (wd == 8)
    @ ---- rows 0-3 ----
    vld1.8        {d0}, [r0], r3        @ src1 row0
    vld1.8        {d4}, [r1], r4        @ src2 row0
    vld1.8        {d1}, [r0], r3        @ src1 row1
    vaddl.u8      q10, d0, d4           @ row0 : src1 + src2 (16-bit)
    vld1.8        {d5}, [r1], r4        @ src2 row1
    vld1.8        {d2}, [r0], r3        @ src1 row2
    vqrshrun.s16  d28, q10, #1          @ (sum + 1) >> 1, narrowed to u8
    vld1.8        {d6}, [r1], r4        @ src2 row2
    vaddl.u8      q11, d1, d5           @ row1
    vld1.8        {d3}, [r0], r3        @ src1 row3
    vaddl.u8      q12, d2, d6           @ row2
    vst1.8        {d28}, [r2], r5       @ dst row0
    vqrshrun.s16  d29, q11, #1
    vld1.8        {d7}, [r1], r4        @ src2 row3
    vqrshrun.s16  d30, q12, #1
    vst1.8        {d29}, [r2], r5       @ dst row1
    vaddl.u8      q13, d3, d7           @ row3
    vst1.8        {d30}, [r2], r5       @ dst row2
    vqrshrun.s16  d31, q13, #1
    subs          r12, r6, #4           @ ht == 4 ?
    vst1.8        {d31}, [r2], r5       @ dst row3
    beq           end_func              @ done when ht == 4

    @ ---- rows 4-7 ----
    vld1.8        {d12}, [r1], r4       @ src2 row4
    vld1.8        {d8}, [r0], r3        @ src1 row4
    vld1.8        {d9}, [r0], r3        @ src1 row5
    vaddl.u8      q8, d8, d12           @ row4
    vld1.8        {d13}, [r1], r4       @ src2 row5
    vld1.8        {d10}, [r0], r3       @ src1 row6
    vaddl.u8      q9, d9, d13           @ row5
    vld1.8        {d14}, [r1], r4       @ src2 row6
    vqrshrun.s16  d28, q8, #1
    vld1.8        {d11}, [r0], r3       @ src1 row7
    vqrshrun.s16  d29, q9, #1
    vst1.8        {d28}, [r2], r5       @ dst row4
    vaddl.u8      q10, d10, d14         @ row6
    vst1.8        {d29}, [r2], r5       @ dst row5
    vqrshrun.s16  d30, q10, #1
    vld1.8        {d15}, [r1], r4       @ src2 row7
    vaddl.u8      q11, d11, d15         @ row7
    vst1.8        {d30}, [r2], r5       @ dst row6
    vqrshrun.s16  d31, q11, #1
    subs          r12, r6, #8           @ ht == 8 ?
    vst1.8        {d31}, [r2], r5       @ dst row7
    beq           end_func              @ done when ht == 8

    @ ---- rows 8-15 ----
    vld1.8        {d0}, [r0], r3        @ src1 row8
    vld1.8        {d4}, [r1], r4        @ src2 row8
    vld1.8        {d1}, [r0], r3        @ src1 row9
    vaddl.u8      q10, d0, d4           @ row8
    vld1.8        {d5}, [r1], r4        @ src2 row9
    vld1.8        {d2}, [r0], r3        @ src1 row10
    vaddl.u8      q11, d1, d5           @ row9
    vld1.8        {d6}, [r1], r4        @ src2 row10
    vqrshrun.s16  d28, q10, #1
    vld1.8        {d3}, [r0], r3        @ src1 row11
    vaddl.u8      q12, d2, d6           @ row10
    vld1.8        {d7}, [r1], r4        @ src2 row11
    vqrshrun.s16  d29, q11, #1
    vld1.8        {d8}, [r0], r3        @ src1 row12
    vaddl.u8      q13, d3, d7           @ row11
    vst1.8        {d28}, [r2], r5       @ dst row8
    vqrshrun.s16  d30, q12, #1
    vld1.8        {d12}, [r1], r4       @ src2 row12
    vqrshrun.s16  d31, q13, #1
    vst1.8        {d29}, [r2], r5       @ dst row9
    vaddl.u8      q8, d8, d12           @ row12
    vld1.8        {d9}, [r0], r3        @ src1 row13
    vqrshrun.s16  d28, q8, #1
    vld1.8        {d13}, [r1], r4       @ src2 row13
    vld1.8        {d10}, [r0], r3       @ src1 row14
    vaddl.u8      q9, d9, d13           @ row13
    vld1.8        {d11}, [r0], r3       @ src1 row15
    vld1.8        {d14}, [r1], r4       @ src2 row14
    vqrshrun.s16  d29, q9, #1
    vld1.8        {d15}, [r1], r4       @ src2 row15
    vaddl.u8      q10, d10, d14         @ row14
    vst1.8        {d30}, [r2], r5       @ dst row10
    vaddl.u8      q11, d11, d15         @ row15
    vst1.8        {d31}, [r2], r5       @ dst row11
    vqrshrun.s16  d30, q10, #1
    vst1.8        {d28}, [r2], r5       @ dst row12
    vqrshrun.s16  d31, q11, #1
    vst1.8        {d29}, [r2], r5       @ dst row13
    vst1.8        {d30}, [r2], r5       @ dst row14
    vst1.8        {d31}, [r2], r5       @ dst row15

    b             end_func



loop_4:                                 @ 4-pixel-wide rows (wd == 4)
    @ Only lane 0 (4 bytes) of each d register is loaded and stored; the
    @ arithmetic runs on the whole register but the upper lanes are never
    @ written to memory.
    @ ---- rows 0-3 ----
    vld1.32       d0[0], [r0], r3       @ src1 row0
    vld1.32       d4[0], [r1], r4       @ src2 row0
    vld1.32       d1[0], [r0], r3       @ src1 row1
    vaddl.u8      q10, d0, d4           @ row0 : src1 + src2 (16-bit)
    vld1.32       d5[0], [r1], r4       @ src2 row1
    vld1.32       d2[0], [r0], r3       @ src1 row2
    vqrshrun.s16  d28, q10, #1          @ (sum + 1) >> 1, narrowed to u8
    vld1.32       d6[0], [r1], r4       @ src2 row2
    vaddl.u8      q11, d1, d5           @ row1
    vld1.32       d3[0], [r0], r3       @ src1 row3
    vaddl.u8      q12, d2, d6           @ row2
    vst1.32       d28[0], [r2], r5      @ dst row0
    vqrshrun.s16  d29, q11, #1
    vld1.32       d7[0], [r1], r4       @ src2 row3
    vqrshrun.s16  d30, q12, #1
    vst1.32       d29[0], [r2], r5      @ dst row1
    vaddl.u8      q13, d3, d7           @ row3
    vst1.32       d30[0], [r2], r5      @ dst row2
    vqrshrun.s16  d31, q13, #1
    subs          r12, r6, #4           @ ht == 4 ?
    vst1.32       d31[0], [r2], r5      @ dst row3
    beq           end_func              @ done when ht == 4

    @ ---- rows 4-7 ----
    vld1.32       d12[0], [r1], r4      @ src2 row4
    vld1.32       d8[0], [r0], r3       @ src1 row4
    vld1.32       d9[0], [r0], r3       @ src1 row5
    vaddl.u8      q8, d8, d12           @ row4
    vld1.32       d13[0], [r1], r4      @ src2 row5
    vld1.32       d10[0], [r0], r3      @ src1 row6
    vaddl.u8      q9, d9, d13           @ row5
    vld1.32       d14[0], [r1], r4      @ src2 row6
    vqrshrun.s16  d28, q8, #1
    vld1.32       d11[0], [r0], r3      @ src1 row7
    vqrshrun.s16  d29, q9, #1
    vst1.32       d28[0], [r2], r5      @ dst row4
    vaddl.u8      q10, d10, d14         @ row6
    vst1.32       d29[0], [r2], r5      @ dst row5
    vqrshrun.s16  d30, q10, #1
    vld1.32       d15[0], [r1], r4      @ src2 row7
    vaddl.u8      q11, d11, d15         @ row7
    vst1.32       d30[0], [r2], r5      @ dst row6
    vqrshrun.s16  d31, q11, #1
    vst1.32       d31[0], [r2], r5      @ dst row7

end_func:                               @ common exit for every path

    vldmia        sp!, {d8-d15}         @ restore d8-d15
    ldmfd         sp!, {r4-r12, pc}     @ restore core registers; the saved lr is popped into pc to return
    397 
    398 
    399