Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 //******************************************************************************
     22 //* @file
     23 //*  ih264_intra_pred_chroma.s
     24 //*
     25 //* @brief
     26 //*  Contains function definitions for intra chroma prediction .
     27 //*
     28 //* @author
     29 //*  Ittiam
     30 //*
     31 //* @par List of Functions:
     32 //*
     33 //*  - ih264_intra_pred_chroma_8x8_mode_vert_av8()
     34 //*  - ih264_intra_pred_chroma_8x8_mode_horz_av8()
     35 //*  - ih264_intra_pred_chroma_8x8_mode_dc_av8()
     36 //*  - ih264_intra_pred_chroma_8x8_mode_plane_av8()
     37 //*
     38 //* @remarks
     39 //*  None
     40 //*
     41 //*******************************************************************************
     42 //*/
     43 
     44 ///* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
     45 //
     46 
     47 ///**
     48 ///**
     49 ///**
     50 //
     51 
     52 
     53 .text
     54 .p2align 2
     55 .include "ih264_neon_macros.s"
     56 
     57 .extern ih264_gai1_intrapred_chroma_plane_coeffs1
     58 .extern ih264_gai1_intrapred_chroma_plane_coeffs2
     59 
     60 
     61 
     62 ///**
     63 //*******************************************************************************
     64 //*
     65 //*ih264_intra_pred_chroma_8x8_mode_dc
     66 //*
     67 //* @brief
     68 //*     Perform Intra prediction for  chroma_8x8 mode:DC
     69 //*
     70 //* @par Description:
     71 //*    Perform Intra prediction for  chroma_8x8 mode:DC ,described in sec 8.3.4.1
     72 //*
     73 //* @param[in] pu1_src
     74 //*  UWORD8 pointer to the source containing alternate U and V samples
     75 //*
     76 //* @param[out] pu1_dst
     77 //*  UWORD8 pointer to the destination with alternate U and V samples
     78 //*
     79 //* @param[in] src_strd
     80 //*  integer source stride
     81 //*
     82 //* @param[in] dst_strd
     83 //*  integer destination stride
     84 //*
     85 //* @param[in] ui_neighboravailability
     86 //*  availability of neighbouring pixels
     87 //*
     88 //* @returns
     89 //*
     90 //* @remarks
     91 //*  None
     92 //*
     93 //*******************************************************************************/
     94 //void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
     95 //                                        UWORD8 *pu1_dst,
     96 //                                        WORD32 src_strd,
     97 //                                        WORD32 dst_strd,
     98 //                                        WORD32 ui_neighboravailability)
     99 
    100 //**************Variables Vs Registers*****************************************
    101 //    x0 => *pu1_src
    102 //    x1 => *pu1_dst
    103 //    x2 =>  src_strd
    104 //    x3 =>  dst_strd
    105 //   x4 =>  ui_neighboravailability
    106 
    107 
    108 
    .global ih264_intra_pred_chroma_8x8_mode_dc_av8

// DC prediction for an 8x8 block of interleaved U/V samples (16 bytes/row).
//
// Register roles, as used by the code below:
//    x0  pu1_src  - left neighbours are read from bytes 0..15, top
//                   neighbours from bytes 18..33
//    x1  pu1_dst  - advanced by dst_strd after every 16-byte row store
//    w3  dst_strd - 32-bit int, sign-extended into x3 on entry
//    w4  ui_neighboravailability - only bits 0 and 2 are examined (mask 5):
//                   value 1 -> left only, value 4 -> top only, 5 -> both
//    v20          - prediction written to rows 0..3
//    v21          - prediction written to rows 4..7
// x2 (src_strd) is never read.
//
// Summation trick used in every branch: after uxtl each .4s lane holds one
// widened U/V pair (U in bits 15:0, V in bits 31:16).  Two pairwise .4s adds
// leave sum(U) in h[0] and sum(V) in h[1]; the per-channel sums stay below
// 65536, so nothing carries from the U half into the V half.
//
// NOTE(review): only v0-v6, v20, v21 and v23 are touched here, none of which
// are callee-saved; push_v_regs/pop_v_regs are kept because the macro bodies
// (ih264_neon_macros.s) are not visible from this file.

ih264_intra_pred_chroma_8x8_mode_dc_av8:

    push_v_regs

    // dst_strd arrives as a 32-bit int in w3.  AAPCS64 leaves bits 63:32 of
    // x3 unspecified, so widen it before x3 is used as a post-index offset.
    sxtw      x3, w3

    // 5 (0b101) is not encodable as a logical immediate, so it is
    // materialised in a caller-saved scratch register first.
    mov       w9, #5
    ands      w6, w4, w9
    beq       none_available
    cmp       w6, #1
    beq       left_only_available
    cmp       w6, #4
    beq       top_only_available

all_available:
    ld1       {v0.8b, v1.8b}, [x0]          // v0 = src[0..7], v1 = src[8..15]
    add       x6, x0, #18
    ld1       {v2.8b, v3.8b}, [x6]          // v2 = src[18..25], v3 = src[26..33]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s          // v0.h[0..1] = U/V sums of src[0..7]
    addp      v1.4s, v1.4s , v1.4s          // v1.h[0..1] = U/V sums of src[8..15]
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s
    addp      v2.4s, v2.4s , v2.4s          // v2.h[0..1] = U/V sums of src[18..25]
    addp      v3.4s, v3.4s , v3.4s          // v3.h[0..1] = U/V sums of src[26..33]
    rshrn     v5.8b, v0.8h, #2              // (sum + 2) >> 2, bytes 0/1 = U/V DC
    dup       v21.8h, v5.h[0]               // rows 4..7: DC of src[0..7]
    rshrn     v6.8b, v3.8h, #2
    dup       v20.8h, v6.h[0]               // rows 0..3: DC of src[26..33]
    add       v1.8h, v1.8h, v2.8h
    rshrn     v1.8b, v1.8h, #3              // (sum of 8 samples + 4) >> 3
    dup       v23.8h, v1.h[0]
    mov       v20.d[0], v23.d[0]            // rows 0..3, left half: left+top average
    add       v0.8h, v0.8h, v3.8h
    rshrn     v0.8b, v0.8h, #3
    dup       v23.8h, v0.h[0]
    mov       v21.d[1], v23.d[0]            // rows 4..7, right half: left+top average
    b         store
left_only_available:
    ld1       {v0.8b, v1.8b}, [x0]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v20.8h , v1.h[0]              // rows 0..3: DC of src[8..15]
    dup       v21.8h, v0.h[0]               // rows 4..7: DC of src[0..7]
    b         store

top_only_available:
    add       x6, x0, #18
    ld1       {v0.8b, v1.8b}, [x6]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v20.8h , v0.h[0]              // DC of src[18..25]
    dup       v21.8h, v1.h[0]               // DC of src[26..33]
    mov       v20.d[1], v21.d[1]            // v20 = [first-half DC | second-half DC]
    mov       v21.d[0], v20.d[0]            // v21 = same layout as v20
    b         store
none_available:
    mov       w15, #128                     // no usable neighbour: fill with 128
    dup       v20.16b, w15
    dup       v21.16b, w15


store:

    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
end_func:

    pop_v_regs
    ret
    205 
    206 
    207 
    208 
    209 
    210 ///******************************************************************************
    211 
    212 
    213 ///**
    214 //*******************************************************************************
    215 //*
    216 //*ih264_intra_pred_chroma_8x8_mode_horz
    217 //*
    218 //* @brief
    219 //*  Perform Intra prediction for  chroma_8x8 mode:Horizontal
    220 //*
    221 //* @par Description:
    222 //*   Perform Intra prediction for  chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
    223 //*
    224 //* @param[in] pu1_src
    225 //* UWORD8 pointer to the source containing alternate U and V samples
    226 //*
    227 //* @param[out] pu1_dst
    228 //*  UWORD8 pointer to the destination with alternate U and V samples
    229 //*
    230 //* @param[in] src_strd
    231 //*  integer source stride
    232 //*
    233 //* @param[in] dst_strd
    234 //*  integer destination stride
    235 //*
    236 //* @param[in] ui_neighboravailability
    237 //* availability of neighbouring pixels(Not used in this function)
    238 //*
    239 //* @returns
    240 //*
    241 //* @remarks
    242 //*  None
    243 //*
    244 //*******************************************************************************
    245 //*/
    246 //void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
    247 //                                         UWORD8 *pu1_dst,
    248 //                                         WORD32 src_strd,
    249 //                                         WORD32 dst_strd,
    250 //                                         WORD32 ui_neighboravailability)
    251 //**************Variables Vs Registers*****************************************
    252 //    x0 => *pu1_src
    253 //    x1 => *pu1_dst
    254 //    x2 =>  src_strd
    255 //    x3 =>  dst_strd
    256 //   x4 =>  ui_neighboravailability
    257 
    258 
    .global ih264_intra_pred_chroma_8x8_mode_horz_av8

// Horizontal prediction for an 8x8 block of interleaved U/V samples.
//
// Register roles, as used by the code below:
//    x0  pu1_src  - 16 bytes (eight U/V halfword pairs) are loaded from here
//    x1  pu1_dst  - advanced by dst_strd after every 16-byte row store
//    w3  dst_strd - 32-bit int, sign-extended into x3 on entry
// x2 (src_strd) and x4 (ui_neighboravailability) are never read.
//
// Output row r is halfword (7 - r) of the loaded vector replicated eight
// times, i.e. the neighbour pairs are consumed from the last one to the
// first.
//
// v10-v15 are callee-saved under AAPCS64 (low 64 bits); push_v_regs and
// pop_v_regs presumably spill and reload them - the macro bodies live in
// ih264_neon_macros.s and are not visible here.

ih264_intra_pred_chroma_8x8_mode_horz_av8:



    push_v_regs

    // dst_strd arrives as a 32-bit int in w3.  AAPCS64 leaves bits 63:32 of
    // x3 unspecified, so widen it before x3 is used as a post-index offset.
    sxtw      x3, w3

    ld1       {v0.8h}, [x0]

    // The dups are interleaved with the stores of earlier rows.
    dup       v10.8h, v0.h[7]               // row 0
    dup       v11.8h, v0.h[6]               // row 1
    dup       v12.8h, v0.h[5]               // row 2
    dup       v13.8h, v0.h[4]               // row 3
    st1       {v10.8h}, [x1], x3
    dup       v14.8h, v0.h[3]               // row 4
    st1       {v11.8h}, [x1], x3
    dup       v15.8h, v0.h[2]               // row 5
    st1       {v12.8h}, [x1], x3
    dup       v16.8h, v0.h[1]               // row 6
    st1       {v13.8h}, [x1], x3
    dup       v17.8h, v0.h[0]               // row 7
    st1       {v14.8h}, [x1], x3
    st1       {v15.8h}, [x1], x3
    st1       {v16.8h}, [x1], x3
    st1       {v17.8h}, [x1], x3


    pop_v_regs
    ret
    288 
    289 
    290 
    291 
    292 
    293 
    294 ///**
    295 //*******************************************************************************
    296 //*
    297 //*ih264_intra_pred_chroma_8x8_mode_vert
    298 //*
    299 //* @brief
    300 //*   Perform Intra prediction for  chroma_8x8 mode:vertical
    301 //*
    302 //* @par Description:
    303 //*Perform Intra prediction for  chroma_8x8 mode:vertical ,described in sec 8.3.4.3
    304 //*
    305 //* @param[in] pu1_src
    306 //* UWORD8 pointer to the source containing alternate U and V samples
    307 //*
    308 //* @param[out] pu1_dst
    309 //*   UWORD8 pointer to the destination with alternate U and V samples
    310 //*
    311 //* @param[in] src_strd
    312 //*  integer source stride
    313 //*
    314 //* @param[in] dst_strd
    315 //*  integer destination stride
    316 //*
    317 //* @param[in] ui_neighboravailability
    318 //* availability of neighbouring pixels(Not used in this function)
    319 //*
    320 //* @returns
    321 //*
    322 //* @remarks
    323 //*  None
    324 //*
    325 //*******************************************************************************
    326 //void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
    327 //                                        UWORD8 *pu1_dst,
    328 //                                        WORD32 src_strd,
    329 //                                        WORD32 dst_strd,
    330 //                                        WORD32 ui_neighboravailability)
    331 
    332 //**************Variables Vs Registers*****************************************
    333 //    x0 => *pu1_src
    334 //    x1 => *pu1_dst
    335 //    x2 =>  src_strd
    336 //    x3 =>  dst_strd
    337 //   x4 =>  ui_neighboravailability
    338 
    339 
    .global ih264_intra_pred_chroma_8x8_mode_vert_av8

// Vertical prediction for an 8x8 block of interleaved U/V samples.
//
// Register roles, as used by the code below:
//    x0  pu1_src  - the 16 bytes at offset 18 are loaded (x0 is advanced)
//    x1  pu1_dst  - advanced by dst_strd after every 16-byte row store
//    w3  dst_strd - 32-bit int, sign-extended into x3 on entry
// x2 (src_strd) and x4 (ui_neighboravailability) are never read.
//
// The same 16 bytes are written to each of the eight output rows.
//
// NOTE(review): only v0 and v1 are touched here, neither of which is
// callee-saved; push_v_regs/pop_v_regs are kept because the macro bodies
// (ih264_neon_macros.s) are not visible from this file.

ih264_intra_pred_chroma_8x8_mode_vert_av8:

    push_v_regs

    // dst_strd arrives as a 32-bit int in w3.  AAPCS64 leaves bits 63:32 of
    // x3 unspecified, so widen it before x3 is used as a post-index offset.
    sxtw      x3, w3

    add       x0, x0, #18
    ld1       {v0.8b, v1.8b}, [x0]          // v0:v1 = src[18..33]

    st1       {v0.8b, v1.8b}, [x1], x3      // row 0
    st1       {v0.8b, v1.8b}, [x1], x3      // row 1
    st1       {v0.8b, v1.8b}, [x1], x3      // row 2
    st1       {v0.8b, v1.8b}, [x1], x3      // row 3
    st1       {v0.8b, v1.8b}, [x1], x3      // row 4
    st1       {v0.8b, v1.8b}, [x1], x3      // row 5
    st1       {v0.8b, v1.8b}, [x1], x3      // row 6
    st1       {v0.8b, v1.8b}, [x1], x3      // row 7

    pop_v_regs
    ret
    360 
    361 
    362 
    363 
    364 ///******************************************************************************
    365 
    366 
    367 ///**
    368 //*******************************************************************************
    369 //*
    370 //*ih264_intra_pred_chroma_8x8_mode_plane
    371 //*
    372 //* @brief
    373 //*   Perform Intra prediction for  chroma_8x8 mode:PLANE
    374 //*
    375 //* @par Description:
    376 //*  Perform Intra prediction for  chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
    377 //*
    378 //* @param[in] pu1_src
    379 //*  UWORD8 pointer to the source containing alternate U and V samples
    380 //*
    381 //* @param[out] pu1_dst
    382 //*  UWORD8 pointer to the destination with alternate U and V samples
    383 //*
    384 //* @param[in] src_strd
    385 //*  integer source stride
    386 //*
    387 //* @param[in] dst_strd
    388 //*  integer destination stride
    389 //*
    390 //* @param[in] ui_neighboravailability
    391 //*  availability of neighbouring pixels
    392 //*
    393 //* @returns
    394 //*
    395 //* @remarks
    396 //*  None
    397 //*
    398 //*******************************************************************************/
    399 //void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
    400 //                                        UWORD8 *pu1_dst,
    401 //                                        WORD32 src_strd,
    402 //                                        WORD32 dst_strd,
    403 //                                        WORD32 ui_neighboravailability)
    404 
    405 //**************Variables Vs Registers*****************************************
    406 //    x0 => *pu1_src
    407 //    x1 => *pu1_dst
    408 //    x2 =>  src_strd
    409 //    x3 =>  dst_strd
    410 //   x4 =>  ui_neighboravailability
    411 
    .global ih264_intra_pred_chroma_8x8_mode_plane_av8

// Plane prediction for an 8x8 block of interleaved U/V samples.
//
// Register roles, as used by the code below:
//    x0  pu1_src  - neighbour bytes 0..7, 10..17, 16..23 and 26..33 are
//                   loaded as vectors; bytes 0, 1, 32 and 33 as scalars
//    x1  pu1_dst  - advanced by dst_strd after every 16-byte row store
//    w3  dst_strd - 32-bit int, sign-extended into x3 on entry
//    x12          - address of the coefficient tables (fetched via the GOT)
// x2 (src_strd) and x4 (ui_neighboravailability) are never read.
//
// Outline (U and V are processed side by side in alternating lanes):
//   1. two weighted gradient sums per channel, one from src[0..17] and one
//      from src[16..33]
//   2. slope = (34 * gradient + 32) >> 6 for each of them
//   3. offset a = 16 * (src[0 or 1] + src[32 or 33])
//   4. out = saturate_u8((a + slopeX * kx + slopeY * ky + 16) >> 5), with
//      kx / ky taken from ih264_gai1_intrapred_chroma_plane_coeffs2
// NOTE(review): the table contents are not visible here; presumably coeffs1
// holds the weights 1..4 duplicated per U/V lane and coeffs2 the position
// offsets -3..4 - confirm against the table definitions.
//
// v8-v15 are callee-saved under AAPCS64 (low 64 bits) and several of them
// are used below; push_v_regs/pop_v_regs presumably spill and reload them -
// the macro bodies live in ih264_neon_macros.s and are not visible here.

ih264_intra_pred_chroma_8x8_mode_plane_av8:

    push_v_regs

    // dst_strd arrives as a 32-bit int in w3.  AAPCS64 leaves bits 63:32 of
    // x3 unspecified, so widen it before x3 is used as a post-index offset.
    sxtw      x3, w3

    // ---- 1. gradient sums ------------------------------------------------
    ld1       {v0.2s}, [x0]                 // src[0..7]
    add       x10, x0, #10
    ld1       {v1.2s}, [x10]                // src[10..17]
    add       x10, x10, #6
    rev64     v5.4h, v0.4h                  // src[0..7] with U/V pair order reversed
    ld1       {v2.2s}, [x10], #8            // src[16..23]
    add       x10, x10, #2
    rev64     v7.4h, v2.4h                  // src[16..23] with U/V pair order reversed
    ld1       {v3.2s}, [x10]                // src[26..33]
    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs1
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs1]
    usubl     v10.8h, v5.8b, v1.8b          // differences within src[0..17]
    ld1       {v8.8b, v9.8b}, [x12]         // 16 bytes of weights into v8 / v9
    mov       v8.d[1], v9.d[0]              // v8 = eight halfword weights
    usubl     v12.8h, v3.8b, v7.8b          // differences within src[16..33]
    mul       v14.8h, v10.8h , v8.8h
    mul       v16.8h, v12.8h , v8.8h
    uzp1      v15.8h, v14.8h, v16.8h        // even lanes: U terms of both products
    uzp2      v16.8h, v14.8h, v16.8h        // odd lanes:  V terms of both products
    mov       v14.16b, v15.16b              // v14 low = U terms from src[0..17]
    mov       v15.d[0], v14.d[1]            // v15 low = U terms from src[16..33]
    mov       v17.d[0], v16.d[1]            // v17 low = V terms from src[16..33]
                                            // v16 low = V terms from src[0..17]
    addp      v14.4h, v14.4h, v14.4h
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h
    addp      v14.4h, v14.4h, v14.4h        // h[0] of each = sum of its four terms
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h

    // ---- 2. slopes: (34 * sum + 32) >> 6 ---------------------------------
    mov       x6, #34
    dup       v18.8h, w6
    smull     v22.4s, v14.4h, v18.4h
    smull     v24.4s, v15.4h, v18.4h
    smull     v26.4s, v16.4h, v18.4h
    smull     v28.4s, v17.4h, v18.4h
    rshrn     v10.4h, v22.4s, #6            // U slope from src[0..17]
    rshrn     v12.4h, v24.4s, #6            // U slope from src[16..33]
    rshrn     v13.4h, v26.4s, #6            // V slope from src[0..17]
    rshrn     v14.4h, v28.4s, #6            // V slope from src[16..33]

    // ---- 3. offset a = 16 * (src[0/1] + src[32/33]) ----------------------
    // ldrb zero-extends into the full x register, so no extension is needed.
    ldrb      w6, [x0], #1                  // src[0]
    add       x10, x0, #31                  // x0 already advanced: x10 = src + 32
    ldrb      w8, [x0], #1                  // src[1]
    ldrb      w7, [x10], #1                 // src[32]
    ldrb      w9, [x10], #1                 // src[33]
    add       x6, x6, x7
    add       x8, x8, x9
    lsl       x6, x6, #4
    lsl       x8, x8, #4
    dup       v0.8h, w6
    dup       v2.8h, w8
    dup       v4.8h, v12.h[0]
    dup       v6.8h, v10.h[0]
    dup       v24.8h, v14.h[0]
    dup       v26.8h, v13.h[0]
    // Interleave the U and V values so lanes alternate U, V, U, V, ...
    zip1      v5.8h, v4.8h, v24.8h
    zip2      v24.8h, v4.8h, v24.8h
    mov       v4.16b, v5.16b                // v4 = src[16..33] slopes (multiplied by kx)
    zip1      v7.8h, v6.8h, v26.8h
    zip2      v26.8h, v6.8h, v26.8h
    mov       v6.16b, v7.16b                // v6 = src[0..17] slopes (multiplied by ky)
    zip1      v1.8h, v0.8h, v2.8h
    zip2      v2.8h, v0.8h, v2.8h
    mov       v0.16b, v1.16b                // v0 = offsets a

    // ---- 4. evaluate the plane row by row --------------------------------
    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs2
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs2]

    ld1       {v8.2s, v9.2s}, [x12]
    mov       v8.d[1], v9.d[0]              // v8 = eight halfword coefficients k[0..7]
    mov       v10.16b, v8.16b
    mov       v22.16b, v8.16b               // v22 keeps k[] for the per-row dups
    zip1      v9.8h, v8.8h, v10.8h          // k[0..3], each duplicated for U and V
    zip2      v10.8h, v8.8h, v10.8h         // k[4..7], each duplicated for U and V
    mov       v8.16b, v9.16b
    mul       v12.8h, v4.8h , v8.8h
    mul       v16.8h, v4.8h , v10.8h
    add       v12.8h, v0.8h , v12.8h        // a + slope * k[x], columns 0..3
    add       v16.8h, v0.8h , v16.8h        // a + slope * k[x], columns 4..7
    // Each row adds v6 * k[row] to both halves, then narrows with a
    // rounding, unsigned-saturating shift right by 5.
    dup       v20.8h, v22.h[0]
    mul       v4.8h, v6.8h , v20.8h
    dup       v30.8h, v22.h[1]
    mul       v18.8h, v6.8h , v20.8h
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    add       v2.8h, v12.8h , v14.8h
    sqrshrun  v28.8b, v24.8h, #5
    add       v26.8h, v16.8h , v8.8h
    sqrshrun  v29.8b, v0.8h, #5
    dup       v20.8h, v22.h[2]
    st1       {v28.8b, v29.8b}, [x1], x3    // row 0
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h , v20.8h
    mul       v18.8h, v6.8h , v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 1
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    dup       v30.8h, v22.h[3]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 2
    add       v2.8h, v12.8h , v14.8h
    add       v26.8h, v16.8h , v8.8h
    dup       v20.8h, v22.h[4]
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h , v20.8h
    mul       v18.8h, v6.8h , v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 3
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    dup       v30.8h, v22.h[5]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 4
    add       v2.8h, v12.8h , v14.8h
    add       v26.8h, v16.8h , v8.8h
    dup       v20.8h, v22.h[6]
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h , v20.8h
    mul       v18.8h, v6.8h , v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 5
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    dup       v30.8h, v22.h[7]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 6
    add       v2.8h, v12.8h , v14.8h
    add       v26.8h, v16.8h , v8.8h
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    st1       {v28.8b, v29.8b}, [x1], x3    // row 7

end_func_plane:

    pop_v_regs
    ret
    572 
    573 
    574 
    575