Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 @**
     21 @******************************************************************************
     22 @* @file
     23 @*  ih264_weighted_pred_a9q.s
     24 @*
     25 @* @brief
     26 @*  Contains function definitions for weighted prediction.
     27 @*
     28 @* @author
     29 @*  Kaushik Senthoor R
     30 @*
     31 @* @par List of Functions:
     32 @*
     33 @*  - ih264_weighted_pred_luma_a9q()
     34 @*  - ih264_weighted_pred_chroma_a9q()
     35 @*
     36 @* @remarks
     37 @*  None
     38 @*
     39 @*******************************************************************************
     40 @*
     41 @*******************************************************************************
     42 @* @function
     43 @*  ih264_weighted_pred_luma_a9q()
     44 @*
     45 @* @brief
     46 @*  This routine performs the weighted prediction (explicit weight, rounding
     47 @* shift and offset) as described in sec 8.4.2.3.2 titled "Weighted sample
     47 @* prediction process" for luma.
     48 @*
     49 @* @par Description:
     50 @*  This function gets a ht x wd block, calculates the weighted sample, rounds
     51 @* off, adds offset and stores it in the destination block.
     52 @*
     53 @* @param[in] pu1_src:
     54 @*  UWORD8 Pointer to the buffer containing the input block.
     55 @*
     56 @* @param[out] pu1_dst
     57 @*  UWORD8 pointer to the destination where the output block is stored.
     58 @*
     59 @* @param[in] src_strd
     60 @*  Stride of the input buffer
     61 @*
     62 @* @param[in] dst_strd
     63 @*  Stride of the destination buffer
     64 @*
     65 @* @param[in] log_wd
     66 @*  number of bits to be rounded off
     67 @*
     68 @* @param[in] wt
     69 @*  weight for the weighted prediction
     70 @*
     71 @* @param[in] ofst
     72 @*  offset used after rounding off
     73 @*
     74 @* @param[in] ht
     75 @*  integer height of the array
     76 @*
     77 @* @param[in] wd
     78 @*  integer width of the array
     79 @*
     80 @* @returns
     81 @*  None
     82 @*
     83 @* @remarks
     84 @*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
     85 @*
     86 @*******************************************************************************
     87 @*
     88 @void ih264_weighted_pred_luma_a9q(UWORD8 *pu1_src,
     89 @                                  UWORD8 *pu1_dst,
     90 @                                  WORD32 src_strd,
     91 @                                  WORD32 dst_strd,
     92 @                                  WORD32 log_wd,
     93 @                                  WORD32 wt,
     94 @                                  WORD32 ofst,
     95 @                                  WORD32 ht,
     96 @                                  WORD32 wd)
     97 @
     98 @**************Variables Vs Registers*****************************************
     99 @   r0      => pu1_src
    100 @   r1      => pu1_dst
    101 @   r2      => src_strd
    102 @   r3      => dst_strd
    103 @   [sp]    => log_wd (r4)
    104 @   [sp+4]  => wt     (r5)
    105 @   [sp+8]  => ofst   (r6)
    106 @   [sp+12] => ht     (r7)
    107 @   [sp+16] => wd     (r8)
    108 @
    109 .text
    110 .p2align 2
    111 
    112     .global ih264_weighted_pred_luma_a9q
    113 
    114 ih264_weighted_pred_luma_a9q:
    115 
    116     stmfd         sp!, {r4-r9, r14}     @stack stores the values of the arguments
    117     ldr           r5, [sp, #32]         @Load wt
    118     ldr           r4, [sp, #28]         @Load log_wd in r4
    119     ldr           r6, [sp, #36]         @Load ofst
    120     ldr           r7, [sp, #40]         @Load ht
    121     ldr           r8, [sp, #44]         @Load wd
    122     vpush         {d8-d15}
    123 
    124     vdup.16       d2, r5                @D2 = wt (16-bit)
    125     neg           r9, r4                @r9 = -log_wd
    126     vdup.8        d3, r6                @D3 = ofst (8-bit)
    127     cmp           r8, #16               @check if wd is 16
    128     vdup.16       q0, r9                @Q0 = -log_wd (16-bit)
    129     beq           loop_16               @branch if wd is 16
    130 
    131     cmp           r8, #8                @check if wd is 8
    132     beq           loop_8                @branch if wd is 8
    133 
    134 loop_4:                                 @each iteration processes four rows
    135 
    136     vld1.32       d4[0], [r0], r2       @load row 1 in source
    137     vld1.32       d4[1], [r0], r2       @load row 2 in source
    138     vld1.32       d6[0], [r0], r2       @load row 3 in source
    139     vld1.32       d6[1], [r0], r2       @load row 4 in source
    140 
    141     vmovl.u8      q2, d4                @converting rows 1,2 to 16-bit
    142     vmovl.u8      q3, d6                @converting rows 3,4 to 16-bit
    143 
    144     vmul.s16      q2, q2, d2[0]         @weight mult. for rows 1,2
    145     vmul.s16      q3, q3, d2[0]         @weight mult. for rows 3,4
    146 
    147     subs          r7, r7, #4            @decrement ht by 4
    148     vrshl.s16     q2, q2, q0            @rounds off the weighted samples from rows 1,2
    149     vrshl.s16     q3, q3, q0            @rounds off the weighted samples from rows 3,4
    150 
    151     vaddw.s8      q2, q2, d3            @adding offset for rows 1,2
    152     vaddw.s8      q3, q3, d3            @adding offset for rows 3,4
    153 
    154     vqmovun.s16   d4, q2                @saturating rows 1,2 to unsigned 8-bit
    155     vqmovun.s16   d6, q3                @saturating rows 3,4 to unsigned 8-bit
    156 
    157     vst1.32       d4[0], [r1], r3       @store row 1 in destination
    158     vst1.32       d4[1], [r1], r3       @store row 2 in destination
    159     vst1.32       d6[0], [r1], r3       @store row 3 in destination
    160     vst1.32       d6[1], [r1], r3       @store row 4 in destination
    161 
    162     bgt           loop_4                @if greater than 0 repeat the loop again
    163 
    164     b             end_loops
    165 
    166 loop_8:                                 @each iteration processes four rows
    167 
    168     vld1.8        d4, [r0], r2          @load row 1 in source
    169     vld1.8        d6, [r0], r2          @load row 2 in source
    170     vld1.8        d8, [r0], r2          @load row 3 in source
    171     vmovl.u8      q2, d4                @converting row 1 to 16-bit
    172     vld1.8        d10, [r0], r2         @load row 4 in source
    173     vmovl.u8      q3, d6                @converting row 2 to 16-bit
    174 
    175     vmovl.u8      q4, d8                @converting row 3 to 16-bit
    176     vmul.s16      q2, q2, d2[0]         @weight mult. for row 1
    177     vmovl.u8      q5, d10               @converting row 4 to 16-bit
    178     vmul.s16      q3, q3, d2[0]         @weight mult. for row 2
    179     vmul.s16      q4, q4, d2[0]         @weight mult. for row 3
    180     vmul.s16      q5, q5, d2[0]         @weight mult. for row 4
    181 
    182     vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1
    183     vrshl.s16     q3, q3, q0            @rounds off the weighted samples from row 2
    184     vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 3
    185     vaddw.s8      q2, q2, d3            @adding offset for row 1
    186     vrshl.s16     q5, q5, q0            @rounds off the weighted samples from row 4
    187     vaddw.s8      q3, q3, d3            @adding offset for row 2
    188 
    189     vaddw.s8      q4, q4, d3            @adding offset for row 3
    190     vqmovun.s16   d4, q2                @saturating row 1 to unsigned 8-bit
    191     vaddw.s8      q5, q5, d3            @adding offset for row 4
    192     vqmovun.s16   d6, q3                @saturating row 2 to unsigned 8-bit
    193     vqmovun.s16   d8, q4                @saturating row 3 to unsigned 8-bit
    194     vqmovun.s16   d10, q5               @saturating row 4 to unsigned 8-bit
    195 
    196     vst1.8        d4, [r1], r3          @store row 1 in destination
    197     vst1.8        d6, [r1], r3          @store row 2 in destination
    198     subs          r7, r7, #4            @decrement ht by 4
    199     vst1.8        d8, [r1], r3          @store row 3 in destination
    200     vst1.8        d10, [r1], r3         @store row 4 in destination
    201 
    202     bgt           loop_8                @if greater than 0 repeat the loop again
    203 
    204     b             end_loops
    205 
    206 loop_16:                                @each iteration processes two rows
    207 
    208     vld1.8        {q2}, [r0], r2        @load row 1 in source
    209     vld1.8        {q3}, [r0], r2        @load row 2 in source
    210     vmovl.u8      q6, d4                @converting row 1L to 16-bit
    211     vld1.8        {q4}, [r0], r2        @load row 3 in source
    212     vmovl.u8      q7, d5                @converting row 1H to 16-bit
    213     vld1.8        {q5}, [r0], r2        @load row 4 in source
    214 
    215     vmovl.u8      q8, d6                @converting row 2L to 16-bit
    216     vmul.s16      q6, q6, d2[0]         @weight mult. for row 1L
    217     vmovl.u8      q9, d7                @converting row 2H to 16-bit
    218     vmul.s16      q7, q7, d2[0]         @weight mult. for row 1H
    219     vmovl.u8      q10, d8               @converting row 3L to 16-bit
    220     vmul.s16      q8, q8, d2[0]         @weight mult. for row 2L
    221     vmovl.u8      q11, d9               @converting row 3H to 16-bit
    222     vmul.s16      q9, q9, d2[0]         @weight mult. for row 2H
    223     vmovl.u8      q12, d10              @converting row 4L to 16-bit
    224     vmul.s16      q10, q10, d2[0]       @weight mult. for row 3L
    225     vmovl.u8      q13, d11              @converting row 4H to 16-bit
    226     vmul.s16      q11, q11, d2[0]       @weight mult. for row 3H
    227 
    228     vmul.s16      q12, q12, d2[0]       @weight mult. for row 4L
    229     vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 1L
    230     vmul.s16      q13, q13, d2[0]       @weight mult. for row 4H
    231 
    232     vrshl.s16     q7, q7, q0            @rounds off the weighted samples from row 1H
    233     vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 2L
    234     vaddw.s8      q6, q6, d3            @adding offset for row 1L
    235     vrshl.s16     q9, q9, q0            @rounds off the weighted samples from row 2H
    236     vaddw.s8      q7, q7, d3            @adding offset for row 1H
    237     vqmovun.s16   d4, q6                @saturating row 1L to unsigned 8-bit
    238     vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 3L
    239     vaddw.s8      q8, q8, d3            @adding offset for row 2L
    240     vqmovun.s16   d5, q7                @saturating row 1H to unsigned 8-bit
    241     vrshl.s16     q11, q11, q0          @rounds off the weighted samples from row 3H
    242     vaddw.s8      q9, q9, d3            @adding offset for row 2H
    243     vqmovun.s16   d6, q8                @saturating row 2L to unsigned 8-bit
    244     vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 4L
    245     vaddw.s8      q10, q10, d3          @adding offset for row 3L
    246     vqmovun.s16   d7, q9                @saturating row 2H to unsigned 8-bit
    247     vrshl.s16     q13, q13, q0          @rounds off the weighted samples from row 4H
    248     vaddw.s8      q11, q11, d3          @adding offset for row 3H
    249 
    250     vqmovun.s16   d8, q10               @saturating row 3L to unsigned 8-bit
    251     vaddw.s8      q12, q12, d3          @adding offset for row 4L
    252     vqmovun.s16   d9, q11               @saturating row 3H to unsigned 8-bit
    253     vaddw.s8      q13, q13, d3          @adding offset for row 4H
    254 
    255     vqmovun.s16   d10, q12              @saturating row 4L to unsigned 8-bit
    256     vst1.8        {q2}, [r1], r3        @store row 1 in destination
    257     vqmovun.s16   d11, q13              @saturating row 4H to unsigned 8-bit
    258     vst1.8        {q3}, [r1], r3        @store row 2 in destination
    259     subs          r7, r7, #4            @decrement ht by 4
    260     vst1.8        {q4}, [r1], r3        @store row 3 in destination
    261     vst1.8        {q5}, [r1], r3        @store row 4 in destination
    262 
    263     bgt           loop_16               @if greater than 0 repeat the loop again
    264 
    265 end_loops:
    266 
    267     vpop          {d8-d15}
    268     ldmfd         sp!, {r4-r9, r15}     @Reload the registers from sp
    269 
    270 
    271 @*******************************************************************************
    272 @* @function
    273 @*  ih264_weighted_pred_chroma_a9q()
    274 @*
    275 @* @brief
    276 @*  This routine performs the weighted prediction (explicit weight, rounding
    277 @* shift and offset) as described in sec 8.4.2.3.2 titled "Weighted sample
    277 @* prediction process" for chroma.
    278 @*
    279 @* @par Description:
    280 @*  This function gets a ht x wd block, calculates the weighted sample, rounds
    281 @* off, adds offset and stores it in the destination block for U and V.
    282 @*
    283 @* @param[in] pu1_src:
    284 @*  UWORD8 Pointer to the buffer containing the input block.
    285 @*
    286 @* @param[out] pu1_dst
    287 @*  UWORD8 pointer to the destination where the output block is stored.
    288 @*
    289 @* @param[in] src_strd
    290 @*  Stride of the input buffer
    291 @*
    292 @* @param[in] dst_strd
    293 @*  Stride of the destination buffer
    294 @*
    295 @* @param[in] log_wd
    296 @*  number of bits to be rounded off
    297 @*
    298 @* @param[in] wt
    299 @*  weights for the weighted prediction for U and V
    300 @*
    301 @* @param[in] ofst
    302 @*  offsets used after rounding off for U and V
    303 @*
    304 @* @param[in] ht
    305 @*  integer height of the array
    306 @*
    307 @* @param[in] wd
    308 @*  integer width of the array
    309 @*
    310 @* @returns
    311 @*  None
    312 @*
    313 @* @remarks
    314 @*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
    315 @*
    316 @*******************************************************************************
    317 @*
    318 @void ih264_weighted_pred_chroma_a9q(UWORD8 *pu1_src,
    319 @                                    UWORD8 *pu1_dst,
    320 @                                    WORD32 src_strd,
    321 @                                    WORD32 dst_strd,
    322 @                                    WORD32 log_wd,
    323 @                                    WORD32 wt,
    324 @                                    WORD32 ofst,
    325 @                                    WORD32 ht,
    326 @                                    WORD32 wd)
    327 @
    328 @**************Variables Vs Registers*****************************************
    329 @   r0      => pu1_src
    330 @   r1      => pu1_dst
    331 @   r2      => src_strd
    332 @   r3      => dst_strd
    333 @   [sp]    => log_wd (r4)
    334 @   [sp+4]  => wt     (r5)
    335 @   [sp+8]  => ofst   (r6)
    336 @   [sp+12] => ht     (r7)
    337 @   [sp+16] => wd     (r8)
    338 @
    339 
    340 
    .global ih264_weighted_pred_chroma_a9q

@-------------------------------------------------------------------------------
@ ih264_weighted_pred_chroma_a9q
@
@ Per 8-bit source sample:
@     dst = saturate_u8( round_shift_right(src * w, log_wd) + o )
@ where w and o alternate from byte to byte along a row:
@     even byte positions use wt[15:0]  and ofst[7:0]
@     odd  byte positions use wt[31:16] and ofst[15:8]
@ (the parameter documentation names these the U and V weight/offset).
@ The shift is done with VRSHL and a negative count, i.e. a rounding right
@ shift; a log_wd of 0 leaves the product unchanged.
@
@ In (registers):   r0 = pu1_src   r1 = pu1_dst   r2 = src_strd   r3 = dst_strd
@ In (stack):       log_wd, wt, ofst, ht, wd  (in that order, from [sp] at entry)
@
@ Register roles after the prologue:
@     r4 = log_wd   r5 = wt   r6 = ofst   r7 = rows left   r8 = wd
@     r9 = -log_wd  (first used as a temporary pointer to the stack arguments)
@     q0 = -log_wd in every s16 lane
@     q1 = wt replicated per 32-bit lane (two alternating s16 weights)
@     d4 = low 16 bits of ofst replicated per 16-bit lane (two alternating s8)
@
@ Width dispatch: wd == 8 moves 16 bytes per row, wd == 4 moves 8 bytes per
@ row, and every other value of wd takes the 4-bytes-per-row path. The 16-byte
@ path consumes four rows per iteration, the other two consume two rows. All
@ loops test the row counter at the bottom, so the body always runs at least
@ once.
@
@ Saved/restored: r4-r9, d8-d15.  Modified: r0, r1 (advanced row by row),
@ q0-q3, q8-q14, condition flags.
@-------------------------------------------------------------------------------
ih264_weighted_pred_chroma_a9q:

    push        {r4-r9, lr}             @ 7 words pushed => stack args start at sp+28
    add         r9, sp, #28             @ r9 -> log_wd (first stack argument)
    ldmia       r9, {r4-r8}             @ r4=log_wd r5=wt r6=ofst r7=ht r8=wd
    vpush       {d8-d15}                @ q4-q7 are written by the wider paths

    rsb         r9, r4, #0              @ r9 = -log_wd
    vdup.32     q1, r5                  @ q1 = {wt lo16, wt hi16} repeated
    vdup.16     d4, r6                  @ d4 = {ofst byte0, ofst byte1} repeated
    cmp         r8, #8                  @ wd == 8 ?
    vdup.16     q0, r9                  @ q0 = -log_wd (s16 lanes)
    beq         .Lwp_chroma_w8

    cmp         r8, #4                  @ wd == 4 ?
    beq         .Lwp_chroma_w4

@ ---- 4 bytes per row, two rows per pass (both rows packed into d6) ----
.Lwp_chroma_w2:

    vld1.32     {d6[0]}, [r0], r2       @ row 1 -> d6 low word
    vld1.32     {d6[1]}, [r0], r2       @ row 2 -> d6 high word

    vmovl.u8    q3, d6                  @ rows 1,2: u8 -> 16 bit

    vmul.s16    q3, q3, q1              @ rows 1,2: * alternating weights

    vrshl.s16   q3, q3, q0              @ rows 1,2: rounding shift right by log_wd

    vaddw.s8    q3, q3, d4              @ rows 1,2: + alternating offsets

    vqmovun.s16 d6, q3                  @ rows 1,2: clamp to 0..255

    subs        r7, r7, #2              @ two rows consumed
    vst1.32     {d6[0]}, [r1], r3       @ write row 1
    vst1.32     {d6[1]}, [r1], r3       @ write row 2

    bgt         .Lwp_chroma_w2          @ more rows left

    b           .Lwp_chroma_done

@ ---- 8 bytes per row, two rows per pass (one row per Q register) ----
.Lwp_chroma_w4:

    vld1.8      {d6}, [r0], r2          @ fetch row 1
    vld1.8      {d8}, [r0], r2          @ fetch row 2

    vmovl.u8    q3, d6                  @ row 1: u8 -> 16 bit
    vmovl.u8    q4, d8                  @ row 2: u8 -> 16 bit

    vmul.s16    q3, q3, q1              @ row 1: * alternating weights
    vmul.s16    q4, q4, q1              @ row 2: * alternating weights

    subs        r7, r7, #2              @ two rows consumed; flags are read by bgt below
    vrshl.s16   q3, q3, q0              @ row 1: rounding shift right by log_wd
    vrshl.s16   q4, q4, q0              @ row 2: rounding shift right by log_wd

    vaddw.s8    q3, q3, d4              @ row 1: + alternating offsets
    vaddw.s8    q4, q4, d4              @ row 2: + alternating offsets

    vqmovun.s16 d6, q3                  @ row 1: clamp to 0..255
    vqmovun.s16 d8, q4                  @ row 2: clamp to 0..255

    vst1.8      {d6}, [r1], r3          @ write row 1
    vst1.8      {d8}, [r1], r3          @ write row 2

    bgt         .Lwp_chroma_w4          @ more rows left

    b           .Lwp_chroma_done

@ ---- 16 bytes per row, four rows per pass (lo/hi halves widened separately) ----
.Lwp_chroma_w8:

    vld1.8      {d6, d7}, [r0], r2      @ fetch row 1 (q3)
    vld1.8      {d8, d9}, [r0], r2      @ fetch row 2 (q4)
    vmovl.u8    q7, d6                  @ row 1 lo: u8 -> 16 bit
    vld1.8      {d10, d11}, [r0], r2    @ fetch row 3 (q5)
    vmovl.u8    q8, d7                  @ row 1 hi: u8 -> 16 bit
    vld1.8      {d12, d13}, [r0], r2    @ fetch row 4 (q6)

    vmul.s16    q7, q7, q1              @ row 1 lo: * alternating weights
    vmovl.u8    q9, d8                  @ row 2 lo: u8 -> 16 bit
    vmul.s16    q8, q8, q1              @ row 1 hi: * alternating weights
    vmovl.u8    q10, d9                 @ row 2 hi: u8 -> 16 bit
    vmul.s16    q9, q9, q1              @ row 2 lo: * alternating weights
    vmovl.u8    q11, d10                @ row 3 lo: u8 -> 16 bit
    vmul.s16    q10, q10, q1            @ row 2 hi: * alternating weights
    vmovl.u8    q12, d11                @ row 3 hi: u8 -> 16 bit
    vmul.s16    q11, q11, q1            @ row 3 lo: * alternating weights
    vmovl.u8    q13, d12                @ row 4 lo: u8 -> 16 bit
    vmul.s16    q12, q12, q1            @ row 3 hi: * alternating weights
    vmovl.u8    q14, d13                @ row 4 hi: u8 -> 16 bit

    vmul.s16    q13, q13, q1            @ row 4 lo: * alternating weights
    vrshl.s16   q7, q7, q0              @ row 1 lo: rounding shift right by log_wd
    vmul.s16    q14, q14, q1            @ row 4 hi: * alternating weights

    vrshl.s16   q8, q8, q0              @ row 1 hi: rounding shift
    vrshl.s16   q9, q9, q0              @ row 2 lo: rounding shift
    vaddw.s8    q7, q7, d4              @ row 1 lo: + alternating offsets
    vrshl.s16   q10, q10, q0            @ row 2 hi: rounding shift
    vaddw.s8    q8, q8, d4              @ row 1 hi: + alternating offsets
    vqmovun.s16 d6, q7                  @ row 1 lo: clamp to 0..255
    vrshl.s16   q11, q11, q0            @ row 3 lo: rounding shift
    vaddw.s8    q9, q9, d4              @ row 2 lo: + alternating offsets
    vqmovun.s16 d7, q8                  @ row 1 hi: clamp to 0..255
    vrshl.s16   q12, q12, q0            @ row 3 hi: rounding shift
    vaddw.s8    q10, q10, d4            @ row 2 hi: + alternating offsets
    vqmovun.s16 d8, q9                  @ row 2 lo: clamp to 0..255
    vrshl.s16   q13, q13, q0            @ row 4 lo: rounding shift
    vaddw.s8    q11, q11, d4            @ row 3 lo: + alternating offsets
    vqmovun.s16 d9, q10                 @ row 2 hi: clamp to 0..255
    vrshl.s16   q14, q14, q0            @ row 4 hi: rounding shift
    vaddw.s8    q12, q12, d4            @ row 3 hi: + alternating offsets

    vqmovun.s16 d10, q11                @ row 3 lo: clamp to 0..255
    vaddw.s8    q13, q13, d4            @ row 4 lo: + alternating offsets
    vqmovun.s16 d11, q12                @ row 3 hi: clamp to 0..255
    vaddw.s8    q14, q14, d4            @ row 4 hi: + alternating offsets

    vqmovun.s16 d12, q13                @ row 4 lo: clamp to 0..255
    vst1.8      {d6, d7}, [r1], r3      @ write row 1
    vqmovun.s16 d13, q14                @ row 4 hi: clamp to 0..255
    vst1.8      {d8, d9}, [r1], r3      @ write row 2
    subs        r7, r7, #4              @ four rows consumed
    vst1.8      {d10, d11}, [r1], r3    @ write row 3
    vst1.8      {d12, d13}, [r1], r3    @ write row 4

    bgt         .Lwp_chroma_w8          @ more rows left

.Lwp_chroma_done:

    vpop        {d8-d15}                @ restore the saved NEON registers
    pop         {r4-r9, pc}             @ restore core registers and return
    478 
    479 
    480