Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 @**
     21 @******************************************************************************
     22 @* @file
     23 @*  ih264_weighted_bi_pred_a9q.s
     24 @*
     25 @* @brief
     26 @*  Contains function definitions for weighted biprediction.
     27 @*
     28 @* @author
     29 @*  Kaushik Senthoor R
     30 @*
     31 @* @par List of Functions:
     32 @*
     33 @*  - ih264_weighted_bi_pred_luma_a9q()
     34 @*  - ih264_weighted_bi_pred_chroma_a9q()
     35 @*
     36 @* @remarks
     37 @*  None
     38 @*
     39 @*******************************************************************************
     40 @*
     41 @*******************************************************************************
     42 @* @function
     43 @*  ih264_weighted_bi_pred_luma_a9q()
     44 @*
     45 @* @brief
     46 @*  This routine performs the weighted biprediction as described in sec
     47 @* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
     48 @*
     49 @* @par Description:
     50 @*  This function gets two ht x wd blocks, calculates the weighted samples,
     51 @* rounds off, adds offset and stores it in the destination block.
     52 @*
     53 @* @param[in] pu1_src1
     54 @*  UWORD8 Pointer to the buffer containing the input block 1.
     55 @*
     56 @* @param[in] pu1_src2
     57 @*  UWORD8 Pointer to the buffer containing the input block 2.
     58 @*
     59 @* @param[out] pu1_dst
     60 @*  UWORD8 pointer to the destination where the output block is stored.
     61 @*
     62 @* @param[in] src_strd1
     63 @*  Stride of the input buffer 1
     64 @*
     65 @* @param[in] src_strd2
     66 @*  Stride of the input buffer 2
     67 @*
     68 @* @param[in] dst_strd
     69 @*  Stride of the destination buffer
     70 @*
     71 @* @param[in] log_wd
     72 @*  number of bits to be rounded off
     73 @*
     74 @* @param[in] wt1
     75 @*  weight for the weighted prediction
     76 @*
     77 @* @param[in] wt2
     78 @*  weight for the weighted prediction
     79 @*
     80 @* @param[in] ofst1
     81 @*  offset 1 used after rounding off
     82 @*
     83 @* @param[in] ofst2
     84 @*  offset 2 used after rounding off
     85 @*
     86 @* @param[in] ht
     87 @*  integer height of the array
     88 @*
     89 @* @param[in] wd
     90 @*  integer width of the array
     91 @*
     92 @* @returns
     93 @*  None
     94 @*
     95 @* @remarks
     96 @*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
     97 @*
     98 @*******************************************************************************
     99 @*
    100 @void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
    101 @                                     UWORD8 *pu1_src2,
    102 @                                     UWORD8 *pu1_dst,
    103 @                                     WORD32 src_strd1,
    104 @                                     WORD32 src_strd2,
    105 @                                     WORD32 dst_strd,
    106 @                                     WORD32 log_wd,
    107 @                                     WORD32 wt1,
    108 @                                     WORD32 wt2,
    109 @                                     WORD32 ofst1,
    110 @                                     WORD32 ofst2,
    111 @                                     WORD32 ht,
    112 @                                     WORD32 wd)
    113 @
    114 @**************Variables Vs Registers*****************************************
    115 @   r0      => pu1_src1
    116 @   r1      => pu1_src2
    117 @   r2      => pu1_dst
    118 @   r3      => src_strd1
    119 @   [sp]    => src_strd2 (r4)
    120 @   [sp+4]  => dst_strd  (r5)
    121 @   [sp+8]  => log_wd    (r6)
    122 @   [sp+12] => wt1       (r7)
    123 @   [sp+16] => wt2       (r8)
    124 @   [sp+20] => ofst1     (r9)
    125 @   [sp+24] => ofst2     (r10)
    126 @   [sp+28] => ht        (r11)
    127 @   [sp+32] => wd        (r12)
    128 @
    129 .text
    130 .p2align 2
    131 
    132     .global ih264_weighted_bi_pred_luma_a9q
    133 
    134 ih264_weighted_bi_pred_luma_a9q:
    135 @ Setup: fetch stack arguments, build Q0 (shift), D2 (weights), D3 (offset), then pick a loop by wd
    136     stmfd         sp!, {r4-r12, r14}    @save r4-r12 and lr (40 bytes); stack arguments now start at sp+40
    137     ldr           r6, [sp, #48]         @Load log_wd in r6
    138     ldr           r7, [sp, #52]         @Load wt1 in r7
    139     ldr           r8, [sp, #56]         @Load wt2 in r8
    140     ldr           r9, [sp, #60]         @Load ofst1 in r9
    141 
    142     add           r6, r6, #1            @r6  = log_wd + 1
    143     sxth          r7, r7                @sign-extend 16-bit wt1 to 32-bit (it is multiplied as a 16-bit lane)
    144     ldr           r4, [sp, #40]         @Load src_strd2 in r4
    145     ldr           r5, [sp, #44]         @Load dst_strd in r5
    146     sxtb          r9, r9                @sign-extend 8-bit ofst1 to 32-bit
    147     neg           r10, r6               @r10 = -(log_wd + 1)
    148     ldr           r11, [sp, #68]        @Load ht in r11
    149     ldr           r12, [sp, #72]        @Load wd in r12
    150     vdup.16       q0, r10               @Q0  = -(log_wd + 1) in every 16-bit lane (negative => right shift in vrshl)
    151     add           r9, r9, #1            @r9 = ofst1 + 1
    152 
    153     ldr           r10, [sp, #64]        @Load ofst2 in r10 (must be read before vpush moves sp)
    154     sxth          r8, r8                @sign-extend 16-bit wt2 to 32-bit (it is multiplied as a 16-bit lane)
    155     cmp           r12, #16              @check if wd is 16 (flags survive until the beq below)
    156     vpush         {d8-d15}              @save d8-d15 (q4-q7 are used as scratch in the loops)
    157     sxtb          r10, r10              @sign-extend 8-bit ofst2 to 32-bit
    158     add           r9, r9, r10           @r9 = ofst1 + ofst2 + 1
    159     vmov          d2, r7, r8            @D2 = {wt1(32-bit), wt2(32-bit)}; 16-bit lanes d2[0] = wt1, d2[2] = wt2
    160     asr           r9, r9, #1            @r9 = ofst = (ofst1 + ofst2 + 1) >> 1
    161     vdup.8        d3, r9                @D3 = ofst (8-bit), widened with sign by vaddw.s8 later
    162     beq           loop_16               @branch if wd is 16
    163 
    164     cmp           r12, #8               @check if wd is 8
    165     beq           loop_8                @branch if wd is 8
    166 
    167 loop_4:                                 @each iteration processes four rows of 4 pixels
    168 
    169     vld1.32       d4[0], [r0], r3       @load row 1 in source 1
    170     vld1.32       d4[1], [r0], r3       @load row 2 in source 1
    171     vld1.32       d6[0], [r1], r4       @load row 1 in source 2
    172     vld1.32       d6[1], [r1], r4       @load row 2 in source 2
    173 
    174     vmovl.u8      q2, d4                @converting rows 1,2 in source 1 to 16-bit
    175     vld1.32       d8[0], [r0], r3       @load row 3 in source 1
    176     vld1.32       d8[1], [r0], r3       @load row 4 in source 1
    177     vmovl.u8      q3, d6                @converting rows 1,2 in source 2 to 16-bit
    178     vld1.32       d10[0], [r1], r4      @load row 3 in source 2
    179     vld1.32       d10[1], [r1], r4      @load row 4 in source 2
    180 
    181     vmovl.u8      q4, d8                @converting rows 3,4 in source 1 to 16-bit
    182     vmovl.u8      q5, d10               @converting rows 3,4 in source 2 to 16-bit
    183 
    184     vmul.s16      q2, q2, d2[0]         @weight 1 mult. for rows 1,2
    185     vmla.s16      q2, q3, d2[2]         @weight 2 mult. for rows 1,2
    186     vmul.s16      q4, q4, d2[0]         @weight 1 mult. for rows 3,4
    187     vmla.s16      q4, q5, d2[2]         @weight 2 mult. for rows 3,4
    188 
    189     subs          r11, r11, #4          @decrement ht by 4 (flags are consumed by the bgt below)
    190     vrshl.s16     q2, q2, q0            @rounds off the weighted samples from rows 1,2
    191     vrshl.s16     q4, q4, q0            @rounds off the weighted samples from rows 3,4
    192 
    193     vaddw.s8      q2, q2, d3            @adding offset for rows 1,2
    194     vaddw.s8      q4, q4, d3            @adding offset for rows 3,4
    195 
    196     vqmovun.s16   d4, q2                @saturating rows 1,2 to unsigned 8-bit
    197     vqmovun.s16   d8, q4                @saturating rows 3,4 to unsigned 8-bit
    198 
    199     vst1.32       d4[0], [r2], r5       @store row 1 in destination
    200     vst1.32       d4[1], [r2], r5       @store row 2 in destination
    201     vst1.32       d8[0], [r2], r5       @store row 3 in destination
    202     vst1.32       d8[1], [r2], r5       @store row 4 in destination
    203 
    204     bgt           loop_4                @if greater than 0 repeat the loop again
    205 
    206     b             end_loops
    207 
    208 loop_8:                                 @each iteration processes four rows of 8 pixels
    209 
    210     vld1.8        d4, [r0], r3          @load row 1 in source 1
    211     vld1.8        d6, [r1], r4          @load row 1 in source 2
    212     vld1.8        d8, [r0], r3          @load row 2 in source 1
    213     vld1.8        d10, [r1], r4         @load row 2 in source 2
    214     vmovl.u8      q2, d4                @converting row 1 in source 1 to 16-bit
    215     vld1.8        d12, [r0], r3         @load row 3 in source 1
    216     vld1.8        d14, [r1], r4         @load row 3 in source 2
    217     vmovl.u8      q3, d6                @converting row 1 in source 2 to 16-bit
    218     vld1.8        d16, [r0], r3         @load row 4 in source 1
    219     vld1.8        d18, [r1], r4         @load row 4 in source 2
    220 
    221     vmovl.u8      q4, d8                @converting row 2 in source 1 to 16-bit
    222     vmovl.u8      q5, d10               @converting row 2 in source 2 to 16-bit
    223 
    224     vmul.s16      q2, q2, d2[0]         @weight 1 mult. for row 1
    225     vmla.s16      q2, q3, d2[2]         @weight 2 mult. for row 1
    226     vmovl.u8      q6, d12               @converting row 3 in source 1 to 16-bit
    227     vmovl.u8      q7, d14               @converting row 3 in source 2 to 16-bit
    228     vmul.s16      q4, q4, d2[0]         @weight 1 mult. for row 2
    229     vmla.s16      q4, q5, d2[2]         @weight 2 mult. for row 2
    230     vmovl.u8      q8, d16               @converting row 4 in source 1 to 16-bit
    231     vmovl.u8      q9, d18               @converting row 4 in source 2 to 16-bit
    232 
    233     vmul.s16      q6, q6, d2[0]         @weight 1 mult. for row 3
    234     vmla.s16      q6, q7, d2[2]         @weight 2 mult. for row 3
    235     vmul.s16      q8, q8, d2[0]         @weight 1 mult. for row 4
    236     vmla.s16      q8, q9, d2[2]         @weight 2 mult. for row 4
    237 
    238     vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1
    239     vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 2
    240     vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 3
    241     vaddw.s8      q2, q2, d3            @adding offset for row 1
    242     vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 4
    243     vaddw.s8      q4, q4, d3            @adding offset for row 2
    244 
    245     vaddw.s8      q6, q6, d3            @adding offset for row 3
    246     vqmovun.s16   d4, q2                @saturating row 1 to unsigned 8-bit
    247     vaddw.s8      q8, q8, d3            @adding offset for row 4
    248     vqmovun.s16   d8, q4                @saturating row 2 to unsigned 8-bit
    249 
    250     vqmovun.s16   d12, q6               @saturating row 3 to unsigned 8-bit
    251     vqmovun.s16   d16, q8               @saturating row 4 to unsigned 8-bit
    252 
    253     vst1.8        d4, [r2], r5          @store row 1 in destination
    254     vst1.8        d8, [r2], r5          @store row 2 in destination
    255     subs          r11, r11, #4          @decrement ht by 4
    256     vst1.8        d12, [r2], r5         @store row 3 in destination
    257     vst1.8        d16, [r2], r5         @store row 4 in destination
    258 
    259     bgt           loop_8                @if greater than 0 repeat the loop again
    260 
    261     b             end_loops
    262 
    263 loop_16:                                @each iteration processes four rows of 16 pixels (L = low 8, H = high 8)
    264 
    265     vld1.8        {q2}, [r0], r3        @load row 1 in source 1
    266     vld1.8        {q3}, [r1], r4        @load row 1 in source 2
    267     vld1.8        {q4}, [r0], r3        @load row 2 in source 1
    268     vld1.8        {q5}, [r1], r4        @load row 2 in source 2
    269     vmovl.u8      q10, d4               @converting row 1L in source 1 to 16-bit
    270     vld1.8        {q6}, [r0], r3        @load row 3 in source 1
    271     vld1.8        {q7}, [r1], r4        @load row 3 in source 2
    272     vmovl.u8      q11, d6               @converting row 1L in source 2 to 16-bit
    273     vld1.8        {q8}, [r0], r3        @load row 4 in source 1
    274     vld1.8        {q9}, [r1], r4        @load row 4 in source 2
    275 
    276     vmovl.u8      q2, d5                @converting row 1H in source 1 to 16-bit
    277     vmovl.u8      q3, d7                @converting row 1H in source 2 to 16-bit
    278 
    279     vmul.s16      q10, q10, d2[0]       @weight 1 mult. for row 1L
    280     vmla.s16      q10, q11, d2[2]       @weight 2 mult. for row 1L
    281     vmovl.u8      q12, d8               @converting row 2L in source 1 to 16-bit
    282     vmovl.u8      q13, d10              @converting row 2L in source 2 to 16-bit
    283 
    284     vmul.s16      q2, q2, d2[0]         @weight 1 mult. for row 1H
    285     vmla.s16      q2, q3, d2[2]         @weight 2 mult. for row 1H
    286     vmovl.u8      q4, d9                @converting row 2H in source 1 to 16-bit
    287     vmovl.u8      q5, d11               @converting row 2H in source 2 to 16-bit
    288 
    289     vmul.s16      q12, q12, d2[0]       @weight 1 mult. for row 2L
    290     vmla.s16      q12, q13, d2[2]       @weight 2 mult. for row 2L
    291     vmovl.u8      q14, d12              @converting row 3L in source 1 to 16-bit
    292     vmovl.u8      q15, d14              @converting row 3L in source 2 to 16-bit
    293 
    294     vmul.s16      q4, q4, d2[0]         @weight 1 mult. for row 2H
    295     vmla.s16      q4, q5, d2[2]         @weight 2 mult. for row 2H
    296     vmovl.u8      q6, d13               @converting row 3H in source 1 to 16-bit
    297     vmovl.u8      q7, d15               @converting row 3H in source 2 to 16-bit
    298 
    299     vmul.s16      q14, q14, d2[0]       @weight 1 mult. for row 3L
    300     vmla.s16      q14, q15, d2[2]       @weight 2 mult. for row 3L
    301     vmovl.u8      q11, d16              @converting row 4L in source 1 to 16-bit
    302     vmovl.u8      q3, d18               @converting row 4L in source 2 to 16-bit
    303 
    304     vmul.s16      q6, q6, d2[0]         @weight 1 mult. for row 3H
    305     vmla.s16      q6, q7, d2[2]         @weight 2 mult. for row 3H
    306     vmovl.u8      q8, d17               @converting row 4H in source 1 to 16-bit
    307     vmovl.u8      q9, d19               @converting row 4H in source 2 to 16-bit
    308 
    309     vmul.s16      q11, q11, d2[0]       @weight 1 mult. for row 4L
    310     vmla.s16      q11, q3, d2[2]        @weight 2 mult. for row 4L
    311     vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 1L
    312 
    313     vmul.s16      q8, q8, d2[0]         @weight 1 mult. for row 4H
    314     vmla.s16      q8, q9, d2[2]         @weight 2 mult. for row 4H
    315     vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1H
    316 
    317     vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 2L
    318     vaddw.s8      q10, q10, d3          @adding offset for row 1L
    319     vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 2H
    320     vaddw.s8      q2, q2, d3            @adding offset for row 1H
    321     vrshl.s16     q14, q14, q0          @rounds off the weighted samples from row 3L
    322     vaddw.s8      q12, q12, d3          @adding offset for row 2L
    323     vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 3H
    324     vaddw.s8      q4, q4, d3            @adding offset for row 2H
    325     vrshl.s16     q11, q11, q0          @rounds off the weighted samples from row 4L
    326     vaddw.s8      q14, q14, d3          @adding offset for row 3L
    327     vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 4H
    328     vaddw.s8      q6, q6, d3            @adding offset for row 3H
    329 
    330     vqmovun.s16   d26, q10              @saturating row 1L to unsigned 8-bit (q13 = d26:d27 holds row 1)
    331     vaddw.s8      q11, q11, d3          @adding offset for row 4L
    332     vqmovun.s16   d27, q2               @saturating row 1H to unsigned 8-bit
    333     vaddw.s8      q8, q8, d3            @adding offset for row 4H
    334 
    335     vqmovun.s16   d10, q12              @saturating row 2L to unsigned 8-bit (q5 = d10:d11 holds row 2)
    336     vqmovun.s16   d11, q4               @saturating row 2H to unsigned 8-bit
    337     vqmovun.s16   d30, q14              @saturating row 3L to unsigned 8-bit (q15 = d30:d31 holds row 3)
    338     vqmovun.s16   d31, q6               @saturating row 3H to unsigned 8-bit
    339     vst1.8        {q13}, [r2], r5       @store row 1 in destination
    340     vqmovun.s16   d14, q11              @saturating row 4L to unsigned 8-bit (q7 = d14:d15 holds row 4)
    341     vqmovun.s16   d15, q8               @saturating row 4H to unsigned 8-bit
    342 
    343     vst1.8        {q5}, [r2], r5        @store row 2 in destination
    344     subs          r11, r11, #4          @decrement ht by 4
    345     vst1.8        {q15}, [r2], r5       @store row 3 in destination
    346     vst1.8        {q7}, [r2], r5        @store row 4 in destination
    347 
    348     bgt           loop_16               @if greater than 0 repeat the loop again
    349 
    350 end_loops:
    351 
    352     vpop          {d8-d15}              @restore d8-d15
    353     ldmfd         sp!, {r4-r12, r15}    @restore r4-r12 and return (saved lr is popped into pc)
    354 
    355 
    356 @*******************************************************************************
    357 @* @function
    358 @*  ih264_weighted_bi_pred_chroma_a9q()
    359 @*
    360 @* @brief
    361 @*  This routine performs the weighted biprediction as described in sec
    362 @* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
    363 @*
    364 @* @par Description:
    365 @*  This function gets two ht x wd blocks, calculates the weighted samples,
    366 @* rounds off, adds offset and stores it in the destination block for U and V.
    367 @*
    368 @* @param[in] pu1_src1
    369 @*  UWORD8 Pointer to the buffer containing the input block 1.
    370 @*
    371 @* @param[in] pu1_src2
    372 @*  UWORD8 Pointer to the buffer containing the input block 2.
    373 @*
    374 @* @param[out] pu1_dst
    375 @*  UWORD8 pointer to the destination where the output block is stored.
    376 @*
    377 @* @param[in] src_strd1
    378 @*  Stride of the input buffer 1
    379 @*
    380 @* @param[in] src_strd2
    381 @*  Stride of the input buffer 2
    382 @*
    383 @* @param[in] dst_strd
    384 @*  Stride of the destination buffer
    385 @*
    386 @* @param[in] log_wd
    387 @*  number of bits to be rounded off
    388 @*
    389 @* @param[in] wt1
    390 @*  weights for the weighted prediction in U and V
    391 @*
    392 @* @param[in] wt2
    393 @*  weights for the weighted prediction in U and V
    394 @*
    395 @* @param[in] ofst1
    396 @*  offset 1 used after rounding off for U and V
    397 @*
    398 @* @param[in] ofst2
    399 @*  offset 2 used after rounding off for U and V
    400 @*
    401 @* @param[in] ht
    402 @*  integer height of the array
    403 @*
    404 @* @param[in] wd
    405 @*  integer width of the array
    406 @*
    407 @* @returns
    408 @*  None
    409 @*
    410 @* @remarks
    411 @*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
    412 @*
    413 @*******************************************************************************
    414 @*
    415 @void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1,
    416 @                                       UWORD8 *pu1_src2,
    417 @                                       UWORD8 *pu1_dst,
    418 @                                       WORD32 src_strd1,
    419 @                                       WORD32 src_strd2,
    420 @                                       WORD32 dst_strd,
    421 @                                       WORD32 log_wd,
    422 @                                       WORD32 wt1,
    423 @                                       WORD32 wt2,
    424 @                                       WORD32 ofst1,
    425 @                                       WORD32 ofst2,
    426 @                                       WORD32 ht,
    427 @                                       WORD32 wd)
    428 @
    429 @**************Variables Vs Registers*****************************************
    430 @   r0      => pu1_src1
    431 @   r1      => pu1_src2
    432 @   r2      => pu1_dst
    433 @   r3      => src_strd1
    434 @   [sp]    => src_strd2 (r4)
    435 @   [sp+4]  => dst_strd  (r5)
    436 @   [sp+8]  => log_wd    (r6)
    437 @   [sp+12] => wt1       (r7)
    438 @   [sp+16] => wt2       (r8)
    439 @   [sp+20] => ofst1     (r9)
    440 @   [sp+24] => ofst2     (r10)
    441 @   [sp+28] => ht        (r11)
    442 @   [sp+32] => wd        (r12)
    443 @
    444 
    445 
    446     .global ih264_weighted_bi_pred_chroma_a9q
    447 
    448 ih264_weighted_bi_pred_chroma_a9q:
    449 @ Setup: fetch stack arguments, build Q0 (shift), Q1/Q2 (weights), Q3 (offsets), then pick a loop by wd
    450     stmfd         sp!, {r4-r12, r14}    @save r4-r12 and lr (40 bytes); stack arguments now start at sp+40
    451 
    452     ldr           r6, [sp, #48]         @Load log_wd in r6
    453     ldr           r7, [sp, #52]         @Load wt1 in r7
    454     ldr           r8, [sp, #56]         @Load wt2 in r8
    455     add           r6, r6, #1            @r6  = log_wd + 1
    456     ldr           r9, [sp, #60]         @Load ofst1 in r9
    457     ldr           r10, [sp, #64]        @Load ofst2 in r10
    458 
    459     neg           r12, r6               @r12 = -(log_wd + 1)
    460     ldr           r4, [sp, #40]         @Load src_strd2 in r4
    461     ldr           r5, [sp, #44]         @Load dst_strd in r5
    462     vdup.16       q0, r12               @Q0  = -(log_wd + 1) in every 16-bit lane (negative => right shift in vrshl)
    463 
    464     ldr           r11, [sp, #68]        @Load ht in r11
    465     vdup.32       q1, r7                @Q1 = wt1 in every 32-bit lane; presumably {wt1_u, wt1_v} as 16-bit halves - confirm with caller
    466     ldr           r12, [sp, #72]        @Load wd in r12 (last stack read; must precede vpush, which moves sp)
    467     vdup.32       q2, r8                @Q2 = wt2 in every 32-bit lane; presumably {wt2_u, wt2_v} as 16-bit halves - confirm with caller
    468     asr           r7, r9, #8            @r7 = ofst1 >> 8 (ofst1_v is taken from bits 15:8)
    469     asr           r8, r10, #8           @r8 = ofst2 >> 8 (ofst2_v is taken from bits 15:8)
    470     vpush         {d8-d15}              @save d8-d15 (q4-q7 are used as scratch in the loops)
    471     sxtb          r9, r9                @sign-extend 8-bit ofst1_u to 32-bit
    472     sxtb          r10, r10              @sign-extend 8-bit ofst2_u to 32-bit
    473     sxtb          r7, r7                @sign-extend 8-bit ofst1_v to 32-bit
    474     sxtb          r8, r8                @sign-extend 8-bit ofst2_v to 32-bit
    475 
    476     add           r9, r9, #1            @r9 = ofst1_u + 1
    477     add           r7, r7, #1            @r7 = ofst1_v + 1
    478     add           r9, r9, r10           @r9 = ofst1_u + ofst2_u + 1
    479     add           r7, r7, r8            @r7 = ofst1_v + ofst2_v + 1
    480     asr           r9, r9, #1            @r9 = ofst_u = (ofst1_u + ofst2_u + 1) >> 1
    481     asr           r7, r7, #1            @r7 = ofst_v = (ofst1_v + ofst2_v + 1) >> 1
    482     cmp           r12, #8               @check if wd is 8 (flags survive until the beq below)
    483     pkhbt         r9, r9, r7, lsl #16   @r9 = {ofst_u(16-bit) in low half, ofst_v(16-bit) in high half}
    484     vdup.32       q3, r9                @Q3 = {ofst_u(16-bit), ofst_v(16-bit)} repeated in every 32-bit lane
    485     beq           loop_8_uv             @branch if wd is 8
    486 
    487     cmp           r12, #4               @check if wd is 4
    488     beq           loop_4_uv             @branch if wd is 4
    489 
    490 loop_2_uv:                              @each iteration processes two rows (4 bytes per row)
    491 
    492     vld1.32       d8[0], [r0], r3       @load row 1 in source 1
    493     vld1.32       d8[1], [r0], r3       @load row 2 in source 1
    494     vld1.32       d10[0], [r1], r4      @load row 1 in source 2
    495     vld1.32       d10[1], [r1], r4      @load row 2 in source 2
    496 
    497     vmovl.u8      q4, d8                @converting rows 1,2 in source 1 to 16-bit
    498     vmovl.u8      q5, d10               @converting rows 1,2 in source 2 to 16-bit
    499 
    500     vmul.s16      q4, q4, q1            @weight 1 mult. for rows 1,2
    501     vmla.s16      q4, q5, q2            @weight 2 mult. for rows 1,2
    502 
    503     vrshl.s16     q4, q4, q0            @rounds off the weighted samples from rows 1,2
    504 
    505     vadd.s16      q4, q4, q3            @adding offset for rows 1,2
    506 
    507     vqmovun.s16   d8, q4                @saturating rows 1,2 to unsigned 8-bit
    508 
    509     vst1.32       d8[0], [r2], r5       @store row 1 in destination
    510     vst1.32       d8[1], [r2], r5       @store row 2 in destination
    511 
    512     subs          r11, r11, #2          @decrement ht by 2
    513     bgt           loop_2_uv             @if greater than 0 repeat the loop again
    514 
    515     b             end_loops_uv
    516 
    517 loop_4_uv:                              @each iteration processes two rows (8 bytes per row)
    518 
    519     vld1.8        d8, [r0], r3          @load row 1 in source 1
    520     vld1.8        d10, [r1], r4         @load row 1 in source 2
    521     vmovl.u8      q4, d8                @converting row 1 in source 1 to 16-bit
    522     vld1.8        d12, [r0], r3         @load row 2 in source 1
    523     vmovl.u8      q5, d10               @converting row 1 in source 2 to 16-bit
    524     vld1.8        d14, [r1], r4         @load row 2 in source 2
    525 
    526     vmovl.u8      q6, d12               @converting row 2 in source 1 to 16-bit
    527     vmul.s16      q4, q4, q1            @weight 1 mult. for row 1
    528     vmla.s16      q4, q5, q2            @weight 2 mult. for row 1
    529     vmovl.u8      q7, d14               @converting row 2 in source 2 to 16-bit
    530 
    531     vmul.s16      q6, q6, q1            @weight 1 mult. for row 2
    532     vmla.s16      q6, q7, q2            @weight 2 mult. for row 2
    533 
    534     subs          r11, r11, #2          @decrement ht by 2 (flags are consumed by the bgt below)
    535     vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 1
    536     vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 2
    537     vadd.s16      q4, q4, q3            @adding offset for row 1
    538     vadd.s16      q6, q6, q3            @adding offset for row 2
    539 
    540     vqmovun.s16   d8, q4                @saturating row 1 to unsigned 8-bit
    541     vqmovun.s16   d12, q6               @saturating row 2 to unsigned 8-bit
    542 
    543     vst1.8        d8, [r2], r5          @store row 1 in destination
    544     vst1.8        d12, [r2], r5         @store row 2 in destination
    545 
    546     bgt           loop_4_uv             @if greater than 0 repeat the loop again
    547 
    548     b             end_loops_uv
    549 
    550 loop_8_uv:                              @each iteration processes four rows (16 bytes per row; L = low 8, H = high 8)
    551 
    552     vld1.8        {q4}, [r0], r3        @load row 1 in source 1
    553     vld1.8        {q5}, [r1], r4        @load row 1 in source 2
    554     vld1.8        {q6}, [r0], r3        @load row 2 in source 1
    555     vld1.8        {q7}, [r1], r4        @load row 2 in source 2
    556     vmovl.u8      q12, d8               @converting row 1L in source 1 to 16-bit
    557     vld1.8        {q8}, [r0], r3        @load row 3 in source 1
    558     vld1.8        {q9}, [r1], r4        @load row 3 in source 2
    559     vmovl.u8      q13, d10              @converting row 1L in source 2 to 16-bit
    560     vld1.8        {q10}, [r0], r3       @load row 4 in source 1
    561     vld1.8        {q11}, [r1], r4       @load row 4 in source 2
    562 
    563     vmovl.u8      q4, d9                @converting row 1H in source 1 to 16-bit
    564     vmovl.u8      q5, d11               @converting row 1H in source 2 to 16-bit
    565 
    566     vmul.s16      q12, q12, q1          @weight 1 mult. for row 1L
    567     vmla.s16      q12, q13, q2          @weight 2 mult. for row 1L
    568     vmovl.u8      q14, d12              @converting row 2L in source 1 to 16-bit
    569     vmovl.u8      q15, d14              @converting row 2L in source 2 to 16-bit
    570 
    571     vmul.s16      q4, q4, q1            @weight 1 mult. for row 1H
    572     vmla.s16      q4, q5, q2            @weight 2 mult. for row 1H
    573     vmovl.u8      q6, d13               @converting row 2H in source 1 to 16-bit
    574     vmovl.u8      q7, d15               @converting row 2H in source 2 to 16-bit
    575 
    576     vmul.s16      q14, q14, q1          @weight 1 mult. for row 2L
    577     vmla.s16      q14, q15, q2          @weight 2 mult. for row 2L
    578     vmovl.u8      q13, d16              @converting row 3L in source 1 to 16-bit
    579     vmovl.u8      q5, d18               @converting row 3L in source 2 to 16-bit
    580 
    581     vmul.s16      q6, q6, q1            @weight 1 mult. for row 2H
    582     vmla.s16      q6, q7, q2            @weight 2 mult. for row 2H
    583     vmovl.u8      q8, d17               @converting row 3H in source 1 to 16-bit
    584     vmovl.u8      q9, d19               @converting row 3H in source 2 to 16-bit
    585 
    586     vmul.s16      q13, q13, q1          @weight 1 mult. for row 3L
    587     vmla.s16      q13, q5, q2           @weight 2 mult. for row 3L
    588     vmovl.u8      q15, d20              @converting row 4L in source 1 to 16-bit
    589     vmovl.u8      q7, d22               @converting row 4L in source 2 to 16-bit
    590 
    591     vmul.s16      q8, q8, q1            @weight 1 mult. for row 3H
    592     vmla.s16      q8, q9, q2            @weight 2 mult. for row 3H
    593     vmovl.u8      q10, d21              @converting row 4H in source 1 to 16-bit
    594     vmovl.u8      q11, d23              @converting row 4H in source 2 to 16-bit
    595 
    596     vmul.s16      q15, q15, q1          @weight 1 mult. for row 4L
    597     vmla.s16      q15, q7, q2           @weight 2 mult. for row 4L
    598     vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 1L
    599 
    600     vmul.s16      q10, q10, q1          @weight 1 mult. for row 4H
    601     vmla.s16      q10, q11, q2          @weight 2 mult. for row 4H
    602     vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 1H
    603 
    604     vrshl.s16     q14, q14, q0          @rounds off the weighted samples from row 2L
    605     vadd.s16      q12, q12, q3          @adding offset for row 1L
    606     vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 2H
    607     vadd.s16      q4, q4, q3            @adding offset for row 1H
    608     vrshl.s16     q13, q13, q0          @rounds off the weighted samples from row 3L
    609     vadd.s16      q14, q14, q3          @adding offset for row 2L
    610     vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 3H
    611     vadd.s16      q6, q6, q3            @adding offset for row 2H
    612     vrshl.s16     q15, q15, q0          @rounds off the weighted samples from row 4L
    613     vadd.s16      q13, q13, q3          @adding offset for row 3L
    614     vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 4H
    615     vadd.s16      q8, q8, q3            @adding offset for row 3H
    616 
    617     vqmovun.s16   d10, q12              @saturating row 1L to unsigned 8-bit (q5 = d10:d11 holds row 1)
    618     vadd.s16      q15, q15, q3          @adding offset for row 4L
    619     vqmovun.s16   d11, q4               @saturating row 1H to unsigned 8-bit
    620     vadd.s16      q10, q10, q3          @adding offset for row 4H
    621 
    622     vqmovun.s16   d18, q14              @saturating row 2L to unsigned 8-bit (q9 = d18:d19 holds row 2)
    623     vqmovun.s16   d19, q6               @saturating row 2H to unsigned 8-bit
    624     vqmovun.s16   d14, q13              @saturating row 3L to unsigned 8-bit (q7 = d14:d15 holds row 3)
    625     vqmovun.s16   d15, q8               @saturating row 3H to unsigned 8-bit
    626     vst1.8        {q5}, [r2], r5        @store row 1 in destination
    627     vqmovun.s16   d22, q15              @saturating row 4L to unsigned 8-bit (q11 = d22:d23 holds row 4)
    628     vqmovun.s16   d23, q10              @saturating row 4H to unsigned 8-bit
    629 
    630     vst1.8        {q9}, [r2], r5        @store row 2 in destination
    631     subs          r11, r11, #4          @decrement ht by 4
    632     vst1.8        {q7}, [r2], r5        @store row 3 in destination
    633     vst1.8        {q11}, [r2], r5       @store row 4 in destination
    634 
    635     bgt           loop_8_uv             @if greater than 0 repeat the loop again
    636 
    637 end_loops_uv:
    638 
    639     vpop          {d8-d15}              @restore d8-d15
    640     ldmfd         sp!, {r4-r12, r15}    @restore r4-r12 and return (saved lr is popped into pc)
    641 
    642 
    643