Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 //******************************************************************************
     22 //* @file
     23 //*  ih264_intra_pred_chroma.s
     24 //*
     25 //* @brief
     26 //*  Contains function definitions for intra chroma prediction.
     27 //*
     28 //* @author
     29 //*  Ittiam
     30 //*
     31 //* @par List of Functions:
     32 //*
     33 //*  - ih264_intra_pred_chroma_8x8_mode_dc_av8()
     34 //*  - ih264_intra_pred_chroma_8x8_mode_horz_av8()
     35 //*  - ih264_intra_pred_chroma_8x8_mode_vert_av8()
     36 //*  - ih264_intra_pred_chroma_8x8_mode_plane_av8()
     37 //*
     38 //* @remarks
     39 //*  None
     40 //*
     41 //*******************************************************************************
     42 //*/
     43 
     44 ///* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
     45 //
     46 
     47 ///**
     48 ///**
     49 ///**
     50 //
     51 
     52 
     53 .text
     54 .p2align 2
     55 .include "ih264_neon_macros.s"
     56 
     57 .extern ih264_gai1_intrapred_chroma_plane_coeffs1
     58 .extern ih264_gai1_intrapred_chroma_plane_coeffs2
     59 
     60 
     61 
     62 ///**
     63 //*******************************************************************************
     64 //*
     65 //*ih264_intra_pred_chroma_8x8_mode_dc
     66 //*
     67 //* @brief
     68 //*     Perform Intra prediction for  chroma_8x8 mode:DC
     69 //*
     70 //* @par Description:
     71 //*    Perform Intra prediction for  chroma_8x8 mode:DC ,described in sec 8.3.4.1
     72 //*
     73 //* @param[in] pu1_src
     74 //*  UWORD8 pointer to the source containing alternate U and V samples
     75 //*
     76 //* @param[out] pu1_dst
     77 //*  UWORD8 pointer to the destination with alternate U and V samples
     78 //*
     79 //* @param[in] src_strd
     80 //*  integer source stride
     81 //*
     82 //* @param[in] dst_strd
     83 //*  integer destination stride
     84 //*
     85 //* @param[in] ui_neighboravailability
     86 //*  availability of neighbouring pixels
     87 //*
     88 //* @returns
     89 //*
     90 //* @remarks
     91 //*  None
     92 //*
     93 //*******************************************************************************/
     94 //void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
     95 //                                        UWORD8 *pu1_dst,
     96 //                                        WORD32 src_strd,
     97 //                                        WORD32 dst_strd,
     98 //                                        WORD32 ui_neighboravailability)
     99 
    100 //**************Variables Vs Registers*****************************************
    101 //    x0 => *pu1_src
    102 //    x1 => *pu1_dst
    103 //    w2 =>  src_strd
    104 //    w3 =>  dst_strd
    105 //    w4 =>  ui_neighboravailability
    106 
    107 
    108 
    .global ih264_intra_pred_chroma_8x8_mode_dc_av8

ih264_intra_pred_chroma_8x8_mode_dc_av8:

    // DC prediction for an interleaved U/V chroma block: writes 8 rows of
    // 16 bytes to pu1_dst, advancing by dst_strd after every row.
    //
    // Inputs
    //   x0 : pu1_src. Bytes 0..15 are read when bit 0 of the availability
    //        word is set (the "left" paths); bytes 18..33 are read when bit 2
    //        is set (the "top" paths).
    //   x1 : pu1_dst
    //   w3 : dst_strd (signed 32-bit)
    //   w4 : ui_neighboravailability; only bits 0 and 2 are examined
    //   w2 (src_strd) is not used.
    //
    // Every path leaves two 16-byte row patterns for `store`:
    //   v20 : pattern written to rows 0..3
    //   v21 : pattern written to rows 4..7
    // Within each pattern d[0] covers output bytes 0..7 and d[1] bytes 8..15.
    //
    // Registers written: x1, x3, w5, x6, w15, v0-v3, v5, v6, v20, v21, v23.
    // None of these is callee-saved under AAPCS64, so nothing is spilled.
    // NOTE(review): the previous version wrapped the body in
    // push_v_regs/pop_v_regs (macros from ih264_neon_macros.s, not visible
    // here) and saved x19/x20 only to hold the mask constant. The macros are
    // assumed to be a plain balanced save/restore pair - confirm.

    sxtw      x3, w3                        // widen dst_strd for the post-indexed stores

    mov       w5, #5                        // mask for bits 0 and 2; 5 is not encodable as a logical immediate
    ands      w6, w4, w5
    b.eq      none_available                // neither bit set
    cmp       w6, #1
    b.eq      left_only_available           // only bit 0 set
    cmp       w6, #4
    b.eq      top_only_available            // only bit 2 set
                                            // both bits set: fall through

all_available:
    ld1       {v0.8b, v1.8b}, [x0]          // v0 = src[0..7],   v1 = src[8..15]
    add       x6, x0, #18
    ld1       {v2.8b, v3.8b}, [x6]          // v2 = src[18..25], v3 = src[26..33]

    // Widen to halfwords, then pairwise-add on 32-bit lanes. Each 32-bit lane
    // holds one (even byte, odd byte) pair as two halfwords, so the even and
    // odd byte streams are summed separately (4 * 255 cannot carry between
    // the halves). After two rounds every lane holds the two 4-sample sums.
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s           // v0 = sums over src[0..7]
    addp      v1.4s, v1.4s, v1.4s           // v1 = sums over src[8..15]
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s, v2.4s
    addp      v3.4s, v3.4s, v3.4s
    addp      v2.4s, v2.4s, v2.4s           // v2 = sums over src[18..25]
    addp      v3.4s, v3.4s, v3.4s           // v3 = sums over src[26..33]

    rshrn     v5.8b, v0.8h, #2              // (sum(src[0..7]) + 2) >> 2
    dup       v21.8h, v5.h[0]               // rows 4..7, both halves (d[1] replaced below)
    rshrn     v6.8b, v3.8h, #2              // (sum(src[26..33]) + 2) >> 2
    dup       v20.8h, v6.h[0]               // rows 0..3, both halves (d[0] replaced below)

    add       v1.8h, v1.8h, v2.8h           // sum(src[8..15]) + sum(src[18..25])
    rshrn     v1.8b, v1.8h, #3              // (... + 4) >> 3
    dup       v23.8h, v1.h[0]
    mov       v20.d[0], v23.d[0]            // rows 0..3, bytes 0..7

    add       v0.8h, v0.8h, v3.8h           // sum(src[0..7]) + sum(src[26..33])
    rshrn     v0.8b, v0.8h, #3              // (... + 4) >> 3
    dup       v23.8h, v0.h[0]
    mov       v21.d[1], v23.d[0]            // rows 4..7, bytes 8..15
    b         store

left_only_available:
    ld1       {v0.8b, v1.8b}, [x0]          // v0 = src[0..7], v1 = src[8..15]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    rshrn     v0.8b, v0.8h, #2              // (sum(src[0..7])  + 2) >> 2
    rshrn     v1.8b, v1.8h, #2              // (sum(src[8..15]) + 2) >> 2
    dup       v20.8h, v1.h[0]               // rows 0..3 come from src[8..15]
    dup       v21.8h, v0.h[0]               // rows 4..7 come from src[0..7]
    b         store

top_only_available:
    add       x6, x0, #18
    ld1       {v0.8b, v1.8b}, [x6]          // v0 = src[18..25], v1 = src[26..33]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    rshrn     v0.8b, v0.8h, #2              // (sum(src[18..25]) + 2) >> 2
    rshrn     v1.8b, v1.8h, #2              // (sum(src[26..33]) + 2) >> 2
    dup       v20.8h, v0.h[0]
    dup       v21.8h, v1.h[0]
    mov       v20.d[1], v21.d[1]            // v20 = { src[18..25] value, src[26..33] value }
    mov       v21.d[0], v20.d[0]            // v21 = same pattern: all 8 rows are identical
    b         store

none_available:
    mov       w15, #128                     // every output byte is 128 when neither bit is set
    dup       v20.16b, w15
    dup       v21.16b, w15


store:

    st1       {v20.16b}, [x1], x3           // rows 0..3
    st1       {v20.16b}, [x1], x3
    st1       {v20.16b}, [x1], x3
    st1       {v20.16b}, [x1], x3
    st1       {v21.16b}, [x1], x3           // rows 4..7
    st1       {v21.16b}, [x1], x3
    st1       {v21.16b}, [x1], x3
    st1       {v21.16b}, [x1], x3
end_func:

    ret
    206 
    207 
    208 
    209 
    210 
    211 ///******************************************************************************
    212 
    213 
    214 ///**
    215 //*******************************************************************************
    216 //*
    217 //*ih264_intra_pred_chroma_8x8_mode_horz
    218 //*
    219 //* @brief
    220 //*  Perform Intra prediction for  chroma_8x8 mode:Horizontal
    221 //*
    222 //* @par Description:
    223 //*   Perform Intra prediction for  chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
    224 //*
    225 //* @param[in] pu1_src
    226 //* UWORD8 pointer to the source containing alternate U and V samples
    227 //*
    228 //* @param[out] pu1_dst
    229 //*  UWORD8 pointer to the destination with alternate U and V samples
    230 //*
    231 //* @param[in] src_strd
    232 //*  integer source stride
    233 //*
    234 //* @param[in] dst_strd
    235 //*  integer destination stride
    236 //*
    237 //* @param[in] ui_neighboravailability
    238 //* availability of neighbouring pixels (not used in this function)
    239 //*
    240 //* @returns
    241 //*
    242 //* @remarks
    243 //*  None
    244 //*
    245 //*******************************************************************************
    246 //*/
    247 //void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
    248 //                                         UWORD8 *pu1_dst,
    249 //                                         WORD32 src_strd,
    250 //                                         WORD32 dst_strd,
    251 //                                         WORD32 ui_neighboravailability)
    252 //**************Variables Vs Registers*****************************************
    253 //    x0 => *pu1_src
    254 //    x1 => *pu1_dst
    255 //    w2 =>  src_strd
    256 //    w3 =>  dst_strd
    257 //    w4 =>  ui_neighboravailability
    258 
    259 
    .global ih264_intra_pred_chroma_8x8_mode_horz_av8

ih264_intra_pred_chroma_8x8_mode_horz_av8:

    // Horizontal prediction for an interleaved U/V chroma block.
    //
    // Reads src[0..15] as eight halfwords (one two-byte sample pair each) and
    // writes 8 rows of 16 bytes to pu1_dst. Row r is halfword (7 - r)
    // replicated eight times, i.e. the input pairs are consumed last-to-first.
    //
    // Inputs
    //   x0 : pu1_src
    //   x1 : pu1_dst
    //   w3 : dst_strd (signed 32-bit)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not used.
    //
    // Registers written: x1, x3, v0, v16-v23 - all caller-saved under AAPCS64.
    // The previous version used the callee-saved v10-v15 as temporaries, which
    // is what made a SIMD save/restore necessary; using v16-v23 removes that
    // need, so the stack is no longer touched.
    // NOTE(review): the removed push_v_regs/pop_v_regs macros come from
    // ih264_neon_macros.s (not visible here) and are assumed to be a plain
    // balanced save/restore pair - confirm.

    sxtw      x3, w3                        // widen dst_strd for the post-indexed stores
    ld1       {v0.8h}, [x0]                 // src[0..15]

    // Broadcasts are interleaved with the stores, as in the original schedule.
    dup       v16.8h, v0.h[7]
    dup       v17.8h, v0.h[6]
    dup       v18.8h, v0.h[5]
    dup       v19.8h, v0.h[4]
    st1       {v16.8h}, [x1], x3            // row 0 <- pair 7
    dup       v20.8h, v0.h[3]
    st1       {v17.8h}, [x1], x3            // row 1 <- pair 6
    dup       v21.8h, v0.h[2]
    st1       {v18.8h}, [x1], x3            // row 2 <- pair 5
    dup       v22.8h, v0.h[1]
    st1       {v19.8h}, [x1], x3            // row 3 <- pair 4
    dup       v23.8h, v0.h[0]
    st1       {v20.8h}, [x1], x3            // row 4 <- pair 3
    st1       {v21.8h}, [x1], x3            // row 5 <- pair 2
    st1       {v22.8h}, [x1], x3            // row 6 <- pair 1
    st1       {v23.8h}, [x1], x3            // row 7 <- pair 0

    ret
    290 
    291 
    292 
    293 
    294 
    295 
    296 ///**
    297 //*******************************************************************************
    298 //*
    299 //*ih264_intra_pred_chroma_8x8_mode_vert
    300 //*
    301 //* @brief
    302 //*   Perform Intra prediction for  chroma_8x8 mode:vertical
    303 //*
    304 //* @par Description:
    305 //*Perform Intra prediction for  chroma_8x8 mode:vertical ,described in sec 8.3.4.3
    306 //*
    307 //* @param[in] pu1_src
    308 //* UWORD8 pointer to the source containing alternate U and V samples
    309 //*
    310 //* @param[out] pu1_dst
    311 //*   UWORD8 pointer to the destination with alternate U and V samples
    312 //*
    313 //* @param[in] src_strd
    314 //*  integer source stride
    315 //*
    316 //* @param[in] dst_strd
    317 //*  integer destination stride
    318 //*
    319 //* @param[in] ui_neighboravailability
    320 //* availability of neighbouring pixels (not used in this function)
    321 //*
    322 //* @returns
    323 //*
    324 //* @remarks
    325 //*  None
    326 //*
    327 //*******************************************************************************
    328 //void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
    329 //                                        UWORD8 *pu1_dst,
    330 //                                        WORD32 src_strd,
    331 //                                        WORD32 dst_strd,
    332 //                                        WORD32 ui_neighboravailability)
    333 
    334 //**************Variables Vs Registers*****************************************
    335 //    x0 => *pu1_src
    336 //    x1 => *pu1_dst
    337 //    w2 =>  src_strd
    338 //    w3 =>  dst_strd
    339 //    w4 =>  ui_neighboravailability
    340 
    341 
    .global ih264_intra_pred_chroma_8x8_mode_vert_av8

ih264_intra_pred_chroma_8x8_mode_vert_av8:

    // Vertical prediction for an interleaved U/V chroma block: copies the
    // 16 bytes at src[18..33] into each of 8 destination rows.
    //
    // Inputs
    //   x0 : pu1_src (advanced by 18 inside the routine)
    //   x1 : pu1_dst
    //   w3 : dst_strd (signed 32-bit)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not used.
    //
    // Registers written: x0, x1, x3, v0, v1 - all caller-saved under AAPCS64,
    // so no SIMD registers need to be preserved and the stack is not touched.
    // NOTE(review): the removed push_v_regs/pop_v_regs macros come from
    // ih264_neon_macros.s (not visible here) and are assumed to be a plain
    // balanced save/restore pair - confirm.

    sxtw      x3, w3                        // widen dst_strd for the post-indexed stores

    add       x0, x0, #18
    ld1       {v0.8b, v1.8b}, [x0]          // v0 = src[18..25], v1 = src[26..33]

    st1       {v0.8b, v1.8b}, [x1], x3      // rows 0..7: the same 16 bytes each time
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3

    ret
    363 
    364 
    365 
    366 
    367 ///******************************************************************************
    368 
    369 
    370 ///**
    371 //*******************************************************************************
    372 //*
    373 //*ih264_intra_pred_chroma_8x8_mode_plane
    374 //*
    375 //* @brief
    376 //*   Perform Intra prediction for  chroma_8x8 mode:PLANE
    377 //*
    378 //* @par Description:
    379 //*  Perform Intra prediction for  chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
    380 //*
    381 //* @param[in] pu1_src
    382 //*  UWORD8 pointer to the source containing alternate U and V samples
    383 //*
    384 //* @param[out] pu1_dst
    385 //*  UWORD8 pointer to the destination with alternate U and V samples
    386 //*
    387 //* @param[in] src_strd
    388 //*  integer source stride
    389 //*
    390 //* @param[in] dst_strd
    391 //*  integer destination stride
    392 //*
    393 //* @param[in] ui_neighboravailability
    394 //*  availability of neighbouring pixels
    395 //*
    396 //* @returns
    397 //*
    398 //* @remarks
    399 //*  None
    400 //*
    401 //*******************************************************************************/
    402 //void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
    403 //                                        UWORD8 *pu1_dst,
    404 //                                        WORD32 src_strd,
    405 //                                        WORD32 dst_strd,
    406 //                                        WORD32 ui_neighboravailability)
    407 
    408 //**************Variables Vs Registers*****************************************
    409 //    x0 => *pu1_src
    410 //    x1 => *pu1_dst
    411 //    w2 =>  src_strd
    412 //    w3 =>  dst_strd
    413 //    w4 =>  ui_neighboravailability
    414 
    .global ih264_intra_pred_chroma_8x8_mode_plane_av8
ih264_intra_pred_chroma_8x8_mode_plane_av8:

    // Plane prediction for an interleaved U/V chroma block: writes 8 rows of
    // 16 bytes to pu1_dst. Even bytes and odd bytes of the input (the two
    // interleaved components) are processed independently but in the same
    // vector lanes.
    //
    // Inputs
    //   x0 : pu1_src; bytes 0..7, 10..17, 16..23 and 26..33 are read, plus
    //        bytes 0, 1, 32 and 33 individually
    //   x1 : pu1_dst
    //   w3 : dst_strd (signed 32-bit)
    //   w2 (src_strd) and w4 (ui_neighboravailability) are not used.
    //
    // Outline (per component):
    //   g1 = (34 * sum(w[i] * (rev(src[0..7])[i] - src[10..17][i])) + 32) >> 6
    //   g2 = (34 * sum(w[i] * (src[26..33][i] - rev(src[16..23])[i])) + 32) >> 6
    //   a  = (src[0 or 1] + src[32 or 33]) << 4
    //   out[row][col] = sat_u8((a + g2 * k[col] + g1 * k[row] + 16) >> 5)
    // where w comes from ih264_gai1_intrapred_chroma_plane_coeffs1, k from
    // ih264_gai1_intrapred_chroma_plane_coeffs2, and "rev" reverses the order
    // of the four two-byte pairs. Both tables are defined outside this file;
    // each is read here as 16 bytes forming eight halfwords.
    //
    // v8-v15 are used as temporaries, hence push_v_regs/pop_v_regs
    // (macros from ih264_neon_macros.s, not visible here; presumably they
    // preserve the callee-saved SIMD registers - confirm).
    // No callee-saved general-purpose register is used, so the former
    // x19/x20 save/restore has been dropped, as has an unused
    // `sub x5, x3, #8`.

    push_v_regs
    sxtw      x3, w3                        // widen dst_strd for the post-indexed stores

    // ---- weighted neighbour differences --------------------------------
    ld1       {v0.2s}, [x0]                 // src[0..7]
    add       x10, x0, #10
    ld1       {v1.2s}, [x10]                // src[10..17]
    add       x10, x10, #6                  // x10 = src + 16
    rev64     v5.4h, v0.4h                  // src[0..7] with its four pairs reversed
    ld1       {v2.2s}, [x10], #8            // src[16..23]; x10 = src + 24
    add       x10, x10, #2                  // x10 = src + 26
    rev64     v7.4h, v2.4h                  // src[16..23] with its four pairs reversed
    ld1       {v3.2s}, [x10]                // src[26..33]
    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs1
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs1]
    usubl     v10.8h, v5.8b, v1.8b          // rev(src[0..7]) - src[10..17]
    ld1       {v8.8b, v9.8b}, [x12]         // load the 16-byte weight table into v8/v9
    mov       v8.d[1], v9.d[0]              // v8 = the table as eight halfword weights
    usubl     v12.8h, v3.8b, v7.8b          // src[26..33] - rev(src[16..23])
    mul       v14.8h, v10.8h, v8.8h
    mul       v16.8h, v12.8h, v8.8h

    // De-interleave the two components and reduce each group of four
    // products to a single sum in lane 0.
    uzp1      v15.8h, v14.8h, v16.8h        // even lanes of both products
    uzp2      v16.8h, v14.8h, v16.8h        // odd lanes of both products
    mov       v14.16b, v15.16b
    mov       v15.d[0], v14.d[1]            // v14/v15 low: even lanes of 1st / 2nd product
    mov       v17.d[0], v16.d[1]            // v16/v17 low: odd lanes of 1st / 2nd product
    addp      v14.4h, v14.4h, v14.4h
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h
    addp      v14.4h, v14.4h, v14.4h
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h

    // Scale every sum: (34 * sum + 32) >> 6.
    mov       x6, #34
    dup       v18.8h, w6
    smull     v22.4s, v14.4h, v18.4h
    smull     v24.4s, v15.4h, v18.4h
    smull     v26.4s, v16.4h, v18.4h
    smull     v28.4s, v17.4h, v18.4h
    rshrn     v10.4h, v22.4s, #6            // g1, even-byte component
    rshrn     v12.4h, v24.4s, #6            // g2, even-byte component
    rshrn     v13.4h, v26.4s, #6            // g1, odd-byte component
    rshrn     v14.4h, v28.4s, #6            // g2, odd-byte component

    // ---- constant term a = (src[0|1] + src[32|33]) << 4 -----------------
    ldrb      w6, [x0], #1                  // src[0]
    add       x10, x0, #31                  // x0 is already src + 1, so x10 = src + 32
    ldrb      w8, [x0], #1                  // src[1]
    ldrb      w7, [x10], #1                 // src[32]
    ldrb      w9, [x10], #1                 // src[33]
    add       w6, w6, w7
    add       w8, w8, w9
    lsl       w6, w6, #4
    lsl       w8, w8, #4
    dup       v0.8h, w6
    dup       v2.8h, w8

    // Broadcast the gradients and re-interleave the two components so the
    // lane order matches the interleaved output layout.
    dup       v4.8h, v12.h[0]
    dup       v6.8h, v10.h[0]
    dup       v24.8h, v14.h[0]
    dup       v26.8h, v13.h[0]
    zip1      v5.8h, v4.8h, v24.8h
    zip2      v24.8h, v4.8h, v24.8h
    mov       v4.16b, v5.16b                // v4 = g2, interleaved
    zip1      v7.8h, v6.8h, v26.8h
    zip2      v26.8h, v6.8h, v26.8h
    mov       v6.16b, v7.16b                // v6 = g1, interleaved
    zip1      v1.8h, v0.8h, v2.8h
    zip2      v2.8h, v0.8h, v2.8h
    mov       v0.16b, v1.16b                // v0 = a, interleaved

    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs2
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs2]

    ld1       {v8.2s, v9.2s}, [x12]
    mov       v8.d[1], v9.d[0]              // v8 = k[0..7] as halfwords
    mov       v10.16b, v8.16b
    mov       v22.16b, v8.16b               // v22 keeps k[] for the per-row broadcasts
    zip1      v9.8h, v8.8h, v10.8h
    zip2      v10.8h, v8.8h, v10.8h         // v10 = k[4],k[4],...,k[7],k[7]
    mov       v8.16b, v9.16b                // v8  = k[0],k[0],...,k[3],k[3]

    // Row-independent part: a + g2 * k[col].
    mul       v12.8h, v4.8h, v8.8h
    mul       v16.8h, v4.8h, v10.8h
    add       v12.8h, v0.8h, v12.8h         // output bytes 0..7
    add       v16.8h, v0.8h, v16.8h         // output bytes 8..15

    // ---- rows: add g1 * k[row], then sat_u8((x + 16) >> 5) --------------
    // Two rows are kept in flight; the instruction order is unchanged from
    // the original schedule.
    dup       v20.8h, v22.h[0]
    mul       v4.8h, v6.8h, v20.8h
    dup       v30.8h, v22.h[1]
    mul       v18.8h, v6.8h, v20.8h
    mul       v14.8h, v6.8h, v30.8h
    mul       v8.8h, v6.8h, v30.8h
    add       v24.8h, v12.8h, v4.8h
    add       v0.8h, v16.8h, v18.8h
    add       v2.8h, v12.8h, v14.8h
    sqrshrun  v28.8b, v24.8h, #5
    add       v26.8h, v16.8h, v8.8h
    sqrshrun  v29.8b, v0.8h, #5
    dup       v20.8h, v22.h[2]
    st1       {v28.8b, v29.8b}, [x1], x3    // row 0
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h, v20.8h
    mul       v18.8h, v6.8h, v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 1
    add       v24.8h, v12.8h, v4.8h
    add       v0.8h, v16.8h, v18.8h
    dup       v30.8h, v22.h[3]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h, v30.8h
    mul       v8.8h, v6.8h, v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 2
    add       v2.8h, v12.8h, v14.8h
    add       v26.8h, v16.8h, v8.8h
    dup       v20.8h, v22.h[4]
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h, v20.8h
    mul       v18.8h, v6.8h, v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 3
    add       v24.8h, v12.8h, v4.8h
    add       v0.8h, v16.8h, v18.8h
    dup       v30.8h, v22.h[5]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h, v30.8h
    mul       v8.8h, v6.8h, v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 4
    add       v2.8h, v12.8h, v14.8h
    add       v26.8h, v16.8h, v8.8h
    dup       v20.8h, v22.h[6]
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h, v20.8h
    mul       v18.8h, v6.8h, v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 5
    add       v24.8h, v12.8h, v4.8h
    add       v0.8h, v16.8h, v18.8h
    dup       v30.8h, v22.h[7]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h, v30.8h
    mul       v8.8h, v6.8h, v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 6
    add       v2.8h, v12.8h, v14.8h
    add       v26.8h, v16.8h, v8.8h
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    st1       {v28.8b, v29.8b}, [x1], x3    // row 7

end_func_plane:

    pop_v_regs
    ret
    572 
    573 
    574 
    575