Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 @**
     21 @******************************************************************************
     22 @* @file
     23 @*  ih264_default_weighted_pred_a9q.s
     24 @*
     25 @* @brief
     26 @*  Contains function definitions for default weighted prediction.
     27 @*
     28 @* @author
     29 @*  Kaushik Senthoor R
     30 @*
     31 @* @par List of Functions:
     32 @*
     33 @*  - ih264_default_weighted_pred_luma_a9q()
     34 @*  - ih264_default_weighted_pred_chroma_a9q()
     35 @*
     36 @* @remarks
     37 @*  None
     38 @*
     39 @*******************************************************************************
     40 @*
     41 @*******************************************************************************
     42 @* @function
     43 @*  ih264_default_weighted_pred_luma_a9q()
     44 @*
     45 @* @brief
     46 @*  This routine performs the default weighted prediction as described in sec
     47 @* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
     48 @*
     49 @* @par Description:
     50 @*  This function gets two ht x wd blocks, calculates their rounded-average and
     51 @* stores it in the destination block.
     52 @*
     53 @* @param[in] pu1_src1:
     54 @*  UWORD8 Pointer to the buffer containing the first input block.
     55 @*
     56 @* @param[in] pu1_src2:
     57 @*  UWORD8 Pointer to the buffer containing the second input block.
     58 @*
     59 @* @param[out] pu1_dst
     60 @*  UWORD8 pointer to the destination where the output block is stored.
     61 @*
     62 @* @param[in] src_strd1
     63 @*  Stride of the first input buffer
     64 @*
     65 @* @param[in] src_strd2
     66 @*  Stride of the second input buffer
     67 @*
     68 @* @param[in] dst_strd
     69 @*  Stride of the destination buffer
     70 @*
     71 @* @param[in] ht
     72 @*  integer height of the array
     73 @*
     74 @* @param[in] wd
     75 @*  integer width of the array
     76 @*
     77 @* @returns
     78 @*  None
     79 @*
     80 @* @remarks
     81 @*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
     82 @*
     83 @*******************************************************************************
     84 @*
     85 @void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1,
     86 @                                          UWORD8 *pu1_src2,
     87 @                                          UWORD8 *pu1_dst,
     88 @                                          WORD32 src_strd1,
     89 @                                          WORD32 src_strd2,
     90 @                                          WORD32 dst_strd,
     91 @                                          WORD32 ht,
     92 @                                          WORD32 wd)
     93 @
     94 @**************Variables Vs Registers*****************************************
     95 @   r0      => pu1_src1
     96 @   r1      => pu1_src2
     97 @   r2      => pu1_dst
     98 @   r3      => src_strd1
     99 @   [sp]    => src_strd2 (r4)
    100 @   [sp+4]  => dst_strd  (r5)
    101 @   [sp+8]  => ht        (r6)
    102 @   [sp+12] => wd        (r7)
    103 @
    .text
    .p2align 2

    .global ih264_default_weighted_pred_luma_a9q

@-------------------------------------------------------------------------------
@ ih264_default_weighted_pred_luma_a9q
@
@ Writes the rounded average of two byte blocks to the destination block:
@     dst[y][x] = (src1[y][x] + src2[y][x] + 1) >> 1          (vrhadd.u8)
@
@ On entry:
@   r0 = pu1_src1, r1 = pu1_src2, r2 = pu1_dst, r3 = src_strd1
@   [sp] = src_strd2, [sp+4] = dst_strd, [sp+8] = ht, [sp+12] = wd
@ Working registers:
@   r4 = src_strd2, r5 = dst_strd, r6 = rows still to process, r7 = wd
@ Dispatch on wd:
@   16        -> loop_16 (16 bytes per row, 8 rows per pass)
@   8         -> loop_8  ( 8 bytes per row, 4 rows per pass)
@   otherwise -> loop_4  ( 4 bytes per row, 4 rows per pass)
@ Each loop tests the row counter only after a complete pass, so one pass
@ always executes; ht is expected to be a positive multiple of the pass height.
@ Clobbers: r0-r3 and flags; q0-q3 on every path; q8-q15 on the wd == 16 path.
@ r4-r7 are restored on return. d8-d15 (q4-q7) are used, and therefore
@ saved/restored, only on the wd == 16 path.
@-------------------------------------------------------------------------------
ih264_default_weighted_pred_luma_a9q:

    stmfd         sp!, {r4-r7, r14}     @save r4-r7 and lr (20 bytes), stack args now start at sp+20
    ldr           r7, [sp, #32]         @Load wd
    ldr           r4, [sp, #20]         @Load src_strd2
    ldr           r5, [sp, #24]         @Load dst_strd
    cmp           r7, #16
    ldr           r6, [sp, #28]         @Load ht (ldr leaves the flags from cmp intact)
    beq           save_neon_16          @branch if wd is 16
    cmp           r7, #8
    beq           loop_8                @branch if wd is 8

loop_4:                                 @each iteration processes four rows

    vld1.32       d0[0], [r0], r3       @load row 1 in source 1
    vld1.32       d0[1], [r0], r3       @load row 2 in source 1
    vld1.32       d2[0], [r1], r4       @load row 1 in source 2
    vld1.32       d2[1], [r1], r4       @load row 2 in source 2

    vld1.32       d1[0], [r0], r3       @load row 3 in source 1
    vld1.32       d1[1], [r0], r3       @load row 4 in source 1
    vrhadd.u8     d0, d0, d2            @rounded average of rows 1 and 2
    vld1.32       d3[0], [r1], r4       @load row 3 in source 2
    vld1.32       d3[1], [r1], r4       @load row 4 in source 2

    subs          r6, r6, #4            @decrement ht by 4 (NEON ops below keep these flags)
    vst1.32       d0[0], [r2], r5       @store row 1 in destination
    vst1.32       d0[1], [r2], r5       @store row 2 in destination
    vrhadd.u8     d1, d1, d3            @rounded average of rows 3 and 4
    vst1.32       d1[0], [r2], r5       @store row 3 in destination
    vst1.32       d1[1], [r2], r5       @store row 4 in destination

    bgt           loop_4                @if greater than 0 repeat the loop again

    b             end_loops

loop_8:                                 @each iteration processes four rows

    vld1.8        d0, [r0], r3          @load row 1 in source 1
    vld1.8        d4, [r1], r4          @load row 1 in source 2
    vld1.8        d1, [r0], r3          @load row 2 in source 1
    vld1.8        d5, [r1], r4          @load row 2 in source 2
    vld1.8        d2, [r0], r3          @load row 3 in source 1
    vrhadd.u8     q0, q0, q2            @rounded average of rows 1 and 2 (d0:d1 with d4:d5)
    vld1.8        d6, [r1], r4          @load row 3 in source 2
    vld1.8        d3, [r0], r3          @load row 4 in source 1
    vrhadd.u8     d2, d2, d6            @rounded average of row 3
    vld1.8        d7, [r1], r4          @load row 4 in source 2

    subs          r6, r6, #4            @decrement ht by 4
    vst1.8        d0, [r2], r5          @store row 1 in destination
    vrhadd.u8     d3, d3, d7            @rounded average of row 4
    vst1.8        d1, [r2], r5          @store row 2 in destination
    vst1.8        d2, [r2], r5          @store row 3 in destination
    vst1.8        d3, [r2], r5          @store row 4 in destination

    bgt           loop_8                @if greater than 0 repeat the loop again

    b             end_loops

save_neon_16:                           @wd == 16 is the only path that needs q4-q7

    vpush         {d8-d15}              @q4-q7 (d8-d15) are callee-saved, preserve them

loop_16:                                @each iteration processes eight rows

    vld1.8        {q0}, [r0], r3        @load row 1 in source 1
    vld1.8        {q8}, [r1], r4        @load row 1 in source 2
    vld1.8        {q1}, [r0], r3        @load row 2 in source 1
    vld1.8        {q9}, [r1], r4        @load row 2 in source 2
    vrhadd.u8     q0, q0, q8            @rounded average of row 1
    vld1.8        {q2}, [r0], r3        @load row 3 in source 1
    vld1.8        {q10}, [r1], r4       @load row 3 in source 2
    vrhadd.u8     q1, q1, q9            @rounded average of row 2
    vld1.8        {q3}, [r0], r3        @load row 4 in source 1
    vld1.8        {q11}, [r1], r4       @load row 4 in source 2
    vrhadd.u8     q2, q2, q10           @rounded average of row 3
    vld1.8        {q4}, [r0], r3        @load row 5 in source 1
    vld1.8        {q12}, [r1], r4       @load row 5 in source 2
    vrhadd.u8     q3, q3, q11           @rounded average of row 4
    vld1.8        {q5}, [r0], r3        @load row 6 in source 1
    vld1.8        {q13}, [r1], r4       @load row 6 in source 2
    vrhadd.u8     q4, q4, q12           @rounded average of row 5
    vld1.8        {q6}, [r0], r3        @load row 7 in source 1
    vld1.8        {q14}, [r1], r4       @load row 7 in source 2
    vrhadd.u8     q5, q5, q13           @rounded average of row 6
    vld1.8        {q7}, [r0], r3        @load row 8 in source 1
    vld1.8        {q15}, [r1], r4       @load row 8 in source 2

    vrhadd.u8     q6, q6, q14           @rounded average of row 7
    vst1.8        {q0}, [r2], r5        @store row 1 in destination
    vst1.8        {q1}, [r2], r5        @store row 2 in destination
    vrhadd.u8     q7, q7, q15           @rounded average of row 8
    vst1.8        {q2}, [r2], r5        @store row 3 in destination
    vst1.8        {q3}, [r2], r5        @store row 4 in destination
    subs          r6, r6, #8            @decrement ht by 8
    vst1.8        {q4}, [r2], r5        @store row 5 in destination
    vst1.8        {q5}, [r2], r5        @store row 6 in destination
    vst1.8        {q6}, [r2], r5        @store row 7 in destination
    vst1.8        {q7}, [r2], r5        @store row 8 in destination

    bgt           loop_16               @if greater than 0 repeat the loop again

    vpop          {d8-d15}              @restore q4-q7 saved at save_neon_16

end_loops:

    ldmfd         sp!, {r4-r7, r15}     @restore r4-r7 and return (saved lr is popped into pc)
    213 
    214 
    215 @*******************************************************************************
    216 @* @function
    217 @*  ih264_default_weighted_pred_chroma_a9q()
    218 @*
    219 @* @brief
    220 @*  This routine performs the default weighted prediction as described in sec
    221 @* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
    222 @*
    223 @* @par Description:
    224 @*  This function gets two ht x wd blocks, calculates their rounded-average and
    225 @* stores it in the destination block for U and V.
    226 @*
    227 @* @param[in] pu1_src1:
    228 @*  UWORD8 Pointer to the buffer containing the first input block.
    229 @*
    230 @* @param[in] pu1_src2:
    231 @*  UWORD8 Pointer to the buffer containing the second input block.
    232 @*
    233 @* @param[out] pu1_dst
    234 @*  UWORD8 pointer to the destination where the output block is stored.
    235 @*
    236 @* @param[in] src_strd1
    237 @*  Stride of the first input buffer
    238 @*
    239 @* @param[in] src_strd2
    240 @*  Stride of the second input buffer
    241 @*
    242 @* @param[in] dst_strd
    243 @*  Stride of the destination buffer
    244 @*
    245 @* @param[in] ht
    246 @*  integer height of the array
    247 @*
    248 @* @param[in] wd
    249 @*  integer width of the array
    250 @*
    251 @* @returns
    252 @*  None
    253 @*
    254 @* @remarks
    255 @*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
    256 @*
    257 @*******************************************************************************
    258 @*
    259 @void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1,
    260 @                                            UWORD8 *pu1_src2,
    261 @                                            UWORD8 *pu1_dst,
    262 @                                            WORD32 src_strd1,
    263 @                                            WORD32 src_strd2,
    264 @                                            WORD32 dst_strd,
    265 @                                            WORD32 ht,
    266 @                                            WORD32 wd)
    267 @
    268 @**************Variables Vs Registers*****************************************
    269 @   r0      => pu1_src1
    270 @   r1      => pu1_src2
    271 @   r2      => pu1_dst
    272 @   r3      => src_strd1
    273 @   [sp]    => src_strd2 (r4)
    274 @   [sp+4]  => dst_strd  (r5)
    275 @   [sp+8]  => ht        (r6)
    276 @   [sp+12] => wd        (r7)
    277 @
    278 
    279 
    .global ih264_default_weighted_pred_chroma_a9q

@-------------------------------------------------------------------------------
@ ih264_default_weighted_pred_chroma_a9q
@
@ Writes the rounded average of two byte blocks to the destination block:
@     dst[y][x] = (src1[y][x] + src2[y][x] + 1) >> 1          (vrhadd.u8)
@ Each row is 2 * wd bytes wide (presumably interleaved U/V samples - confirm
@ against the caller).
@
@ On entry:
@   r0 = pu1_src1, r1 = pu1_src2, r2 = pu1_dst, r3 = src_strd1
@   [sp] = src_strd2, [sp+4] = dst_strd, [sp+8] = ht, [sp+12] = wd
@ Working registers:
@   r4 = src_strd2, r5 = dst_strd, r6 = rows still to process, r7 = wd
@ Dispatch on wd:
@   8         -> loop_8_uv (16 bytes per row, 4 rows per pass)
@   4         -> loop_4_uv ( 8 bytes per row, 2 rows per pass)
@   otherwise -> loop_2_uv ( 4 bytes per row, 2 rows per pass)
@ Each loop tests the row counter only after a complete pass, so one pass
@ always executes; ht is expected to be a positive multiple of the pass height.
@ Clobbers: r0-r3 and flags; q0-q1 on every path; q2-q3 and q8-q11 on the
@ wd == 8 path. r4-r7 are restored on return. Only caller-saved NEON registers
@ are used, so d8-d15 never need to be saved.
@-------------------------------------------------------------------------------
ih264_default_weighted_pred_chroma_a9q:

    stmfd         sp!, {r4-r7, r14}     @save r4-r7 and lr (20 bytes), stack args now start at sp+20
    ldr           r7, [sp, #32]         @Load wd
    ldr           r4, [sp, #20]         @Load src_strd2
    ldr           r5, [sp, #24]         @Load dst_strd
    cmp           r7, #8
    ldr           r6, [sp, #28]         @Load ht (ldr leaves the flags from cmp intact)
    beq           loop_8_uv             @branch if wd is 8
    cmp           r7, #4
    beq           loop_4_uv             @branch if wd is 4

loop_2_uv:                              @each iteration processes two rows

    vld1.32       d0[0], [r0], r3       @load row 1 in source 1
    vld1.32       d0[1], [r0], r3       @load row 2 in source 1

    vld1.32       d1[0], [r1], r4       @load row 1 in source 2
    vld1.32       d1[1], [r1], r4       @load row 2 in source 2

    vrhadd.u8     d0, d0, d1            @rounded average of rows 1 and 2

    subs          r6, r6, #2            @decrement ht by 2 (NEON ops below keep these flags)
    vst1.32       d0[0], [r2], r5       @store row 1 in destination
    vst1.32       d0[1], [r2], r5       @store row 2 in destination

    bgt           loop_2_uv             @if greater than 0 repeat the loop again

    b             end_loops_uv

loop_4_uv:                              @each iteration processes two rows

    vld1.8        d0, [r0], r3          @load row 1 in source 1
    vld1.8        d2, [r1], r4          @load row 1 in source 2
    vld1.8        d1, [r0], r3          @load row 2 in source 1
    vrhadd.u8     d0, d0, d2            @rounded average of row 1
    vld1.8        d3, [r1], r4          @load row 2 in source 2

    vrhadd.u8     d1, d1, d3            @rounded average of row 2
    vst1.8        d0, [r2], r5          @store row 1 in destination
    subs          r6, r6, #2            @decrement ht by 2
    vst1.8        d1, [r2], r5          @store row 2 in destination

    bgt           loop_4_uv             @if greater than 0 repeat the loop again

    b             end_loops_uv

loop_8_uv:                              @each iteration processes four rows

    @ Source 2 rows live in q8-q11 (caller-saved) rather than q4-q7, so no
    @ vpush/vpop of d8-d15 is required anywhere in this function.
    vld1.8        {q0}, [r0], r3        @load row 1 in source 1
    vld1.8        {q8}, [r1], r4        @load row 1 in source 2
    vld1.8        {q1}, [r0], r3        @load row 2 in source 1
    vrhadd.u8     q0, q0, q8            @rounded average of row 1
    vld1.8        {q9}, [r1], r4        @load row 2 in source 2
    vld1.8        {q2}, [r0], r3        @load row 3 in source 1
    vrhadd.u8     q1, q1, q9            @rounded average of row 2
    vld1.8        {q10}, [r1], r4       @load row 3 in source 2
    vld1.8        {q3}, [r0], r3        @load row 4 in source 1
    vrhadd.u8     q2, q2, q10           @rounded average of row 3
    vld1.8        {q11}, [r1], r4       @load row 4 in source 2

    vst1.8        {q0}, [r2], r5        @store row 1 in destination
    vrhadd.u8     q3, q3, q11           @rounded average of row 4
    vst1.8        {q1}, [r2], r5        @store row 2 in destination
    subs          r6, r6, #4            @decrement ht by 4
    vst1.8        {q2}, [r2], r5        @store row 3 in destination
    vst1.8        {q3}, [r2], r5        @store row 4 in destination

    bgt           loop_8_uv             @if greater than 0 repeat the loop again

end_loops_uv:

    ldmfd         sp!, {r4-r7, r15}     @restore r4-r7 and return (saved lr is popped into pc)
    357 
    358 
    359