Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 //******************************************************************************
     22 //* @file
     23 //*  ih264_default_weighted_pred_av8.s
     24 //*
     25 //* @brief
     26 //*  Contains function definitions for default weighted prediction.
     27 //*
     28 //* @author
     29 //*  Kaushik Senthoor R
     30 //*
     31 //* @par List of Functions:
     32 //*
     33 //*  - ih264_default_weighted_pred_luma_av8()
     34 //*  - ih264_default_weighted_pred_chroma_av8()
     35 //*
     36 //* @remarks
     37 //*  None
     38 //*
     39 //*******************************************************************************
     40 //*/
     41 //*******************************************************************************
     42 //* @function
     43 //*  ih264_default_weighted_pred_luma_av8()
     44 //*
     45 //* @brief
     46 //*  This routine performs the default weighted prediction as described in sec
     47 //* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
     48 //*
     49 //* @par Description:
     50 //*  This function gets two ht x wd blocks, calculates their rounded-average and
     51 //* stores it in the destination block.
     52 //*
     53 //* @param[in] puc_src1:
     54 //*  UWORD8 Pointer to the buffer containing the first input block.
     55 //*
     56 //* @param[in] puc_src2:
     57 //*  UWORD8 Pointer to the buffer containing the second input block.
     58 //*
     59 //* @param[out] puc_dst
     60 //*  UWORD8 pointer to the destination where the output block is stored.
     61 //*
     62 //* @param[in] src_strd1
     63 //*  Stride of the first input buffer
     64 //*
     65 //* @param[in] src_strd2
     66 //*  Stride of the second input buffer
     67 //*
     68 //* @param[in] dst_strd
     69 //*  Stride of the destination buffer
     70 //*
     71 //* @param[in] ht
     72 //*  integer height of the array
     73 //*
     74 //* @param[in] wd
     75 //*  integer width of the array
     76 //*
     77 //* @returns
     78 //*  None
     79 //*
     80 //* @remarks
     81 //*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
     82 //*
     83 //*******************************************************************************
     84 //*/
     85 //void ih264_default_weighted_pred_luma_av8(UWORD8 *puc_src1,
     86 //                                          UWORD8 *puc_src2,
     87 //                                          UWORD8 *puc_dst,
     88 //                                          WORD32 src_strd1,
     89 //                                          WORD32 src_strd2,
     90 //                                          WORD32 dst_strd,
     91 //                                          WORD32 ht,
     92 //                                          WORD32 wd)
     93 //
     94 //**************Variables Vs Registers*****************************************
     95 //    x0      => puc_src1
     96 //    x1      => puc_src2
     97 //    x2      => puc_dst
     98 //    w3      => src_strd1
     99 //    w4      => src_strd2
    100 //    w5      => dst_strd
    101 //    w6      => ht
    102 //    w7      => wd
    103 //
    104 .text
    105 .p2align 2
    106 .include "ih264_neon_macros.s"
    107 
    108 
    109 
    .global ih264_default_weighted_pred_luma_av8

//------------------------------------------------------------------------------
// void ih264_default_weighted_pred_luma_av8(UWORD8 *puc_src1, UWORD8 *puc_src2,
//                                           UWORD8 *puc_dst,
//                                           WORD32 src_strd1, WORD32 src_strd2,
//                                           WORD32 dst_strd,
//                                           WORD32 ht, WORD32 wd)
//
// Writes the rounded average of two byte blocks to the destination:
//     dst[y][x] = (src1[y][x] + src2[y][x] + 1) >> 1        (URHADD)
//
// In:    x0 = puc_src1, x1 = puc_src2, x2 = puc_dst
//        w3 = src_strd1, w4 = src_strd2, w5 = dst_strd   (signed 32-bit)
//        w6 = ht, w7 = wd
// Out:   nothing
// Clobb: x0-x6, v0-v7, v16-v23, condition flags
// Stack: not used
//
// Notes:
//  - wd == 16 and wd == 8 have dedicated paths; any other wd falls through to
//    the 4-wide path.
//  - Each loop tests the row counter after its body, so one full group of rows
//    (4 rows for the 4- and 8-wide paths, 8 rows for the 16-wide path) is
//    always processed; ht must be a positive multiple of the group size.
//  - Within an iteration every load is issued before the first store.
//  - Only caller-saved registers are touched, so nothing has to be spilled.
//------------------------------------------------------------------------------
ih264_default_weighted_pred_luma_av8:

    sxtw      x3, w3                    //strides are used as 64-bit post-index
    sxtw      x4, w4                    //offsets, so sign-extend them once
    sxtw      x5, w5
    cmp       w7, #16
    beq       loop_16                   //wd == 16
    cmp       w7, #8
    beq       loop_8                    //wd == 8

loop_4:                                 //4 bytes per row, four rows per pass

    //two 4-byte rows are packed into the low 64 bits of one register
    ld1       {v0.s}[0], [x0], x3       //src1 row 1
    ld1       {v0.s}[1], [x0], x3       //src1 row 2
    ld1       {v2.s}[0], [x1], x4       //src2 row 1
    ld1       {v2.s}[1], [x1], x4       //src2 row 2
    ld1       {v1.s}[0], [x0], x3       //src1 row 3
    ld1       {v1.s}[1], [x0], x3       //src1 row 4
    urhadd    v0.8b, v0.8b, v2.8b       //rows 1-2: (a + b + 1) >> 1
    ld1       {v3.s}[0], [x1], x4       //src2 row 3
    ld1       {v3.s}[1], [x1], x4       //src2 row 4
    subs      w6, w6, #4                //ht -= 4 (flags consumed by bgt below)
    st1       {v0.s}[0], [x2], x5       //store dst row 1
    st1       {v0.s}[1], [x2], x5       //store dst row 2
    urhadd    v1.8b, v1.8b, v3.8b       //rows 3-4
    st1       {v1.s}[0], [x2], x5       //store dst row 3
    st1       {v1.s}[1], [x2], x5       //store dst row 4
    bgt       loop_4                    //more rows left
    b         end_loops

loop_8:                                 //8 bytes per row, four rows per pass

    ld1       {v0.8b}, [x0], x3         //src1 row 1
    ld1       {v4.8b}, [x1], x4         //src2 row 1
    ld1       {v1.8b}, [x0], x3         //src1 row 2
    ld1       {v5.8b}, [x1], x4         //src2 row 2
    ld1       {v2.8b}, [x0], x3         //src1 row 3
    urhadd    v0.8b, v0.8b, v4.8b       //row 1
    urhadd    v1.8b, v1.8b, v5.8b       //row 2
    ld1       {v6.8b}, [x1], x4         //src2 row 3
    ld1       {v3.8b}, [x0], x3         //src1 row 4
    urhadd    v2.8b, v2.8b, v6.8b       //row 3
    ld1       {v7.8b}, [x1], x4         //src2 row 4
    subs      w6, w6, #4                //ht -= 4
    st1       {v0.8b}, [x2], x5         //store dst row 1
    urhadd    v3.8b, v3.8b, v7.8b       //row 4
    st1       {v1.8b}, [x2], x5         //store dst row 2
    st1       {v2.8b}, [x2], x5         //store dst row 3
    st1       {v3.8b}, [x2], x5         //store dst row 4
    bgt       loop_8                    //more rows left
    b         end_loops

loop_16:                                //16 bytes per row, eight rows per pass

    //one Q register per row: src1 rows in v0-v7, src2 rows in v16-v23
    ld1       {v0.16b}, [x0], x3        //src1 row 1
    ld1       {v16.16b}, [x1], x4       //src2 row 1
    ld1       {v1.16b}, [x0], x3        //src1 row 2
    ld1       {v17.16b}, [x1], x4       //src2 row 2
    urhadd    v0.16b, v0.16b, v16.16b   //row 1
    ld1       {v2.16b}, [x0], x3        //src1 row 3
    ld1       {v18.16b}, [x1], x4       //src2 row 3
    urhadd    v1.16b, v1.16b, v17.16b   //row 2
    ld1       {v3.16b}, [x0], x3        //src1 row 4
    ld1       {v19.16b}, [x1], x4       //src2 row 4
    urhadd    v2.16b, v2.16b, v18.16b   //row 3
    ld1       {v4.16b}, [x0], x3        //src1 row 5
    ld1       {v20.16b}, [x1], x4       //src2 row 5
    urhadd    v3.16b, v3.16b, v19.16b   //row 4
    ld1       {v5.16b}, [x0], x3        //src1 row 6
    ld1       {v21.16b}, [x1], x4       //src2 row 6
    urhadd    v4.16b, v4.16b, v20.16b   //row 5
    ld1       {v6.16b}, [x0], x3        //src1 row 7
    ld1       {v22.16b}, [x1], x4       //src2 row 7
    urhadd    v5.16b, v5.16b, v21.16b   //row 6
    ld1       {v7.16b}, [x0], x3        //src1 row 8
    ld1       {v23.16b}, [x1], x4       //src2 row 8
    urhadd    v6.16b, v6.16b, v22.16b   //row 7
    st1       {v0.16b}, [x2], x5        //store dst row 1
    st1       {v1.16b}, [x2], x5        //store dst row 2
    urhadd    v7.16b, v7.16b, v23.16b   //row 8
    st1       {v2.16b}, [x2], x5        //store dst row 3
    st1       {v3.16b}, [x2], x5        //store dst row 4
    subs      w6, w6, #8                //ht -= 8
    st1       {v4.16b}, [x2], x5        //store dst row 5
    st1       {v5.16b}, [x2], x5        //store dst row 6
    st1       {v6.16b}, [x2], x5        //store dst row 7
    st1       {v7.16b}, [x2], x5        //store dst row 8
    bgt       loop_16                   //more rows left

end_loops:

    ret
    217 
    218 
    219 //*******************************************************************************
    220 //* @function
    221 //*  ih264_default_weighted_pred_chroma_av8()
    222 //*
    223 //* @brief
    224 //*  This routine performs the default weighted prediction as described in sec
    225 //* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
    226 //*
    227 //* @par Description:
    228 //*  This function gets two ht x wd blocks, calculates their rounded-average and
    229 //* stores it in the destination block for U and V.
    230 //*
    231 //* @param[in] puc_src1:
    232 //*  UWORD8 Pointer to the buffer containing the first input block.
    233 //*
    234 //* @param[in] puc_src2:
    235 //*  UWORD8 Pointer to the buffer containing the second input block.
    236 //*
    237 //* @param[out] puc_dst
    238 //*  UWORD8 pointer to the destination where the output block is stored.
    239 //*
    240 //* @param[in] src_strd1
    241 //*  Stride of the first input buffer
    242 //*
    243 //* @param[in] src_strd2
    244 //*  Stride of the second input buffer
    245 //*
    246 //* @param[in] dst_strd
    247 //*  Stride of the destination buffer
    248 //*
    249 //* @param[in] ht
    250 //*  integer height of the array
    251 //*
    252 //* @param[in] wd
    253 //*  integer width of the array
    254 //*
    255 //* @returns
    256 //*  None
    257 //*
    258 //* @remarks
    259 //*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
    260 //*
    261 //*******************************************************************************
    262 //*/
    263 //void ih264_default_weighted_pred_chroma_av8(UWORD8 *puc_src1,
    264 //                                            UWORD8 *puc_src2,
    265 //                                            UWORD8 *puc_dst,
    266 //                                            WORD32 src_strd1,
    267 //                                            WORD32 src_strd2,
    268 //                                            WORD32 dst_strd,
    269 //                                            WORD32 ht,
    270 //                                            WORD32 wd)
    271 //
    272 //**************Variables Vs Registers*****************************************
    273 //    x0      => puc_src1
    274 //    x1      => puc_src2
    275 //    x2      => puc_dst
    276 //    w3      => src_strd1
    277 //    w4      => src_strd2
    278 //    w5      => dst_strd
    279 //    w6      => ht
    280 //    w7      => wd
    281 //
    282 
    283 
    284 
    285 
    .global ih264_default_weighted_pred_chroma_av8

//------------------------------------------------------------------------------
// void ih264_default_weighted_pred_chroma_av8(UWORD8 *puc_src1, UWORD8 *puc_src2,
//                                             UWORD8 *puc_dst,
//                                             WORD32 src_strd1, WORD32 src_strd2,
//                                             WORD32 dst_strd,
//                                             WORD32 ht, WORD32 wd)
//
// Writes the rounded average of two byte blocks to the destination:
//     dst[y][i] = (src1[y][i] + src2[y][i] + 1) >> 1        (URHADD)
// Each row covers 2 * wd bytes (4, 8 or 16 bytes for wd = 2, 4 or 8).
// NOTE(review): presumably the rows hold interleaved U/V samples - confirm
// against the caller.
//
// In:    x0 = puc_src1, x1 = puc_src2, x2 = puc_dst
//        w3 = src_strd1, w4 = src_strd2, w5 = dst_strd   (signed 32-bit)
//        w6 = ht, w7 = wd
// Out:   nothing
// Clobb: x0-x6, v0-v7, condition flags
// Stack: not used
//
// Notes:
//  - wd == 8 and wd == 4 have dedicated paths; any other wd falls through to
//    the wd == 2 path.
//  - Each loop tests the row counter after its body, so one full group of rows
//    (2 rows for wd 2 and 4, 4 rows for wd 8) is always processed; ht must be
//    a positive multiple of the group size.
//  - Within an iteration every load is issued before the first store.
//  - Only caller-saved registers are touched, so nothing has to be spilled.
//------------------------------------------------------------------------------
ih264_default_weighted_pred_chroma_av8:

    sxtw      x3, w3                    //strides are used as 64-bit post-index
    sxtw      x4, w4                    //offsets, so sign-extend them once
    sxtw      x5, w5
    cmp       w7, #8
    beq       loop_8_uv                 //wd == 8
    cmp       w7, #4
    beq       loop_4_uv                 //wd == 4

loop_2_uv:                              //4 bytes per row, two rows per pass

    //both 4-byte rows are packed into the low 64 bits of one register
    ld1       {v0.s}[0], [x0], x3       //src1 row 1
    ld1       {v0.s}[1], [x0], x3       //src1 row 2
    ld1       {v1.s}[0], [x1], x4       //src2 row 1
    ld1       {v1.s}[1], [x1], x4       //src2 row 2
    urhadd    v0.8b, v0.8b, v1.8b       //rows 1-2: (a + b + 1) >> 1
    subs      w6, w6, #2                //ht -= 2 (flags consumed by bgt below)
    st1       {v0.s}[0], [x2], x5       //store dst row 1
    st1       {v0.s}[1], [x2], x5       //store dst row 2
    bgt       loop_2_uv                 //more rows left
    b         end_loops_uv

loop_4_uv:                              //8 bytes per row, two rows per pass

    ld1       {v0.8b}, [x0], x3         //src1 row 1
    ld1       {v2.8b}, [x1], x4         //src2 row 1
    ld1       {v1.8b}, [x0], x3         //src1 row 2
    urhadd    v0.8b, v0.8b, v2.8b       //row 1
    ld1       {v3.8b}, [x1], x4         //src2 row 2
    urhadd    v1.8b, v1.8b, v3.8b       //row 2
    st1       {v0.8b}, [x2], x5         //store dst row 1
    subs      w6, w6, #2                //ht -= 2
    st1       {v1.8b}, [x2], x5         //store dst row 2
    bgt       loop_4_uv                 //more rows left
    b         end_loops_uv

loop_8_uv:                              //16 bytes per row, four rows per pass

    //one Q register per row: src1 rows in v0-v3, src2 rows in v4-v7
    ld1       {v0.16b}, [x0], x3        //src1 row 1
    ld1       {v4.16b}, [x1], x4        //src2 row 1
    ld1       {v1.16b}, [x0], x3        //src1 row 2
    urhadd    v0.16b, v0.16b, v4.16b    //row 1
    ld1       {v5.16b}, [x1], x4        //src2 row 2
    ld1       {v2.16b}, [x0], x3        //src1 row 3
    urhadd    v1.16b, v1.16b, v5.16b    //row 2
    ld1       {v6.16b}, [x1], x4        //src2 row 3
    ld1       {v3.16b}, [x0], x3        //src1 row 4
    urhadd    v2.16b, v2.16b, v6.16b    //row 3
    ld1       {v7.16b}, [x1], x4        //src2 row 4
    st1       {v0.16b}, [x2], x5        //store dst row 1
    urhadd    v3.16b, v3.16b, v7.16b    //row 4
    st1       {v1.16b}, [x2], x5        //store dst row 2
    subs      w6, w6, #4                //ht -= 4
    st1       {v2.16b}, [x2], x5        //store dst row 3
    st1       {v3.16b}, [x2], x5        //store dst row 4
    bgt       loop_8_uv                 //more rows left

end_loops_uv:
    ret
    356 
    357 
    358 
    359