Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 //******************************************************************************
     22 //* @file
     23 //*  ih264_intra_pred_luma_16x16_av8.s
     24 //*
     25 //* @brief
     26 //*  Contains function definitions for intra 16x16 Luma prediction .
     27 //*
     28 //* @author
     29 //*  Ittiam
     30 //*
     31 //* @par List of Functions:
     32 //*
     33 //*  - ih264_intra_pred_luma_16x16_mode_vert_av8()
     34 //*  - ih264_intra_pred_luma_16x16_mode_horz_av8()
     35 //*  - ih264_intra_pred_luma_16x16_mode_dc_av8()
     36 //*  - ih264_intra_pred_luma_16x16_mode_plane_av8()
     37 //*
     38 //* @remarks
     39 //*  None
     40 //*
     41 //*******************************************************************************
     42 //*/
     43 
//
// All the functions here are replicated from ih264_intra_pred_filters.c
//
     51 
     52 
     53 .text
     54 .p2align 2
     55 .include "ih264_neon_macros.s"
     56 .extern ih264_gai1_intrapred_luma_plane_coeffs
     57 
     58 
     59 
     60 ///**
     61 //*******************************************************************************
     62 //*
     63 //*ih264_intra_pred_luma_16x16_mode_vert
     64 //*
     65 //* @brief
     66 //*   Perform Intra prediction for  luma_16x16 mode:vertical
     67 //*
     68 //* @par Description:
     69 //* Perform Intra prediction for  luma_16x16 mode:Vertical ,described in sec 8.3.3.1
     70 //*
     71 //* @param[in] pu1_src
     72 //*  UWORD8 pointer to the source
     73 //*
     74 //* @param[out] pu1_dst
     75 //*  UWORD8 pointer to the destination
     76 //*
     77 //* @param[in] src_strd
     78 //*  integer source stride
     79 //*
     80 //* @param[in] dst_strd
     81 //*  integer destination stride
     82 //*
     83 //* @param[in] ui_neighboravailability
     84 //* availability of neighbouring pixels(Not used in this function)
     85 //*
     86 //* @returns
     87 //*
     88 //* @remarks
     89 //*  None
     90 //*
     91 //*******************************************************************************
     92 //void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
     93 //                                        UWORD8 *pu1_dst,
     94 //                                        WORD32 src_strd,
     95 //                                        WORD32 dst_strd,
     96 //                                        WORD32 ui_neighboravailability)
     97 
     98 //**************Variables Vs Registers*****************************************
     99 //    x0 => *pu1_src
    100 //    x1 => *pu1_dst
    101 //    x2 =>  src_strd
    102 //    x3 =>  dst_strd
    103 //   x4 =>  ui_neighboravailability
    104 
    105 
    106     .global ih264_intra_pred_luma_16x16_mode_vert_av8
    107 
    108 ih264_intra_pred_luma_16x16_mode_vert_av8:
    109 
    110     push_v_regs
    111 
    112 
    113     add       x0, x0, #17
    114     ld1       {v0.8b, v1.8b}, [x0]
    115 
    116     st1       {v0.8b, v1.8b}, [x1], x3
    117     st1       {v0.8b, v1.8b}, [x1], x3
    118     st1       {v0.8b, v1.8b}, [x1], x3
    119     st1       {v0.8b, v1.8b}, [x1], x3
    120     st1       {v0.8b, v1.8b}, [x1], x3
    121     st1       {v0.8b, v1.8b}, [x1], x3
    122     st1       {v0.8b, v1.8b}, [x1], x3
    123     st1       {v0.8b, v1.8b}, [x1], x3
    124     st1       {v0.8b, v1.8b}, [x1], x3
    125     st1       {v0.8b, v1.8b}, [x1], x3
    126     st1       {v0.8b, v1.8b}, [x1], x3
    127     st1       {v0.8b, v1.8b}, [x1], x3
    128     st1       {v0.8b, v1.8b}, [x1], x3
    129     st1       {v0.8b, v1.8b}, [x1], x3
    130     st1       {v0.8b, v1.8b}, [x1], x3
    131     st1       {v0.8b, v1.8b}, [x1], x3
    132 
    133     pop_v_regs
    134     ret
    135 
    136 
    137 
    138 
    139 
    140 ///******************************************************************************
    141 
    142 
    143 ///**
    144 //*******************************************************************************
    145 //*
    146 //*ih264_intra_pred_luma_16x16_mode_horz
    147 //*
    148 //* @brief
    149 //*  Perform Intra prediction for  luma_16x16 mode:horizontal
    150 //*
    151 //* @par Description:
    152 //*  Perform Intra prediction for  luma_16x16 mode:horizontal ,described in sec 8.3.3.2
    153 //*
    154 //* @param[in] pu1_src
    155 //*  UWORD8 pointer to the source
    156 //*
    157 //* @param[out] pu1_dst
    158 //*  UWORD8 pointer to the destination
    159 //*
    160 //* @param[in] src_strd
    161 //*  integer source stride
    162 //*
    163 //* @param[in] dst_strd
    164 //*  integer destination stride
    165 //*
    166 //* @param[in] ui_neighboravailability
    167 //* availability of neighbouring pixels(Not used in this function)
    168 //*
    169 //* @returns
    170 //*
    171 //* @remarks
    172 //*  None
    173 //*
    174 //*******************************************************************************
    175 //*/
    176 //void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
    177 //                                         UWORD8 *pu1_dst,
    178 //                                         WORD32 src_strd,
    179 //                                         WORD32 dst_strd,
    180 //                                         WORD32 ui_neighboravailability)
    181 //**************Variables Vs Registers*****************************************
    182 //    x0 => *pu1_src
    183 //    x1 => *pu1_dst
    184 //    x2 =>  src_strd
    185 //    x3 =>  dst_strd
    186 //   x4 =>  ui_neighboravailability
    187 
    188     .global ih264_intra_pred_luma_16x16_mode_horz_av8
    189 
    190 ih264_intra_pred_luma_16x16_mode_horz_av8:
    191 
    192 
    193 
    194     push_v_regs
    195 
    196     ld1       {v0.16b}, [x0]
    197 
    198 
    199 
    200     dup       v10.16b, v0.b[15]
    201     dup       v11.16b, v0.b[14]
    202     dup       v12.16b, v0.b[13]
    203     dup       v13.16b, v0.b[12]
    204     st1       {v10.16b}, [x1], x3
    205     dup       v14.16b, v0.b[11]
    206     st1       {v11.16b}, [x1], x3
    207     dup       v15.16b, v0.b[10]
    208     st1       {v12.16b}, [x1], x3
    209     dup       v16.16b, v0.b[9]
    210     st1       {v13.16b}, [x1], x3
    211     dup       v17.16b, v0.b[8]
    212     st1       {v14.16b}, [x1], x3
    213     dup       v18.16b, v0.b[7]
    214     st1       {v15.16b}, [x1], x3
    215     dup       v19.16b, v0.b[6]
    216     st1       {v16.16b}, [x1], x3
    217     dup       v20.16b, v0.b[5]
    218     st1       {v17.16b}, [x1], x3
    219     dup       v21.16b, v0.b[4]
    220     st1       {v18.16b}, [x1], x3
    221     dup       v22.16b, v0.b[3]
    222     st1       {v19.16b}, [x1], x3
    223     dup       v23.16b, v0.b[2]
    224     st1       {v20.16b}, [x1], x3
    225     dup       v24.16b, v0.b[1]
    226     st1       {v21.16b}, [x1], x3
    227     dup       v25.16b, v0.b[0]
    228     st1       {v22.16b}, [x1], x3
    229     st1       {v23.16b}, [x1], x3
    230     st1       {v24.16b}, [x1], x3
    231     st1       {v25.16b}, [x1], x3
    232 
    233     pop_v_regs
    234     ret
    235 
    236 
    237 
    238 
    239 
    240 
    241 
    242 ///******************************************************************************
    243 
    244 
    245 ///**
    246 //*******************************************************************************
    247 //*
    248 //*ih264_intra_pred_luma_16x16_mode_dc
    249 //*
    250 //* @brief
    251 //*  Perform Intra prediction for  luma_16x16 mode:DC
    252 //*
    253 //* @par Description:
    254 //*  Perform Intra prediction for  luma_16x16 mode:DC ,described in sec 8.3.3.3
    255 //*
    256 //* @param[in] pu1_src
    257 //*  UWORD8 pointer to the source
    258 //*
    259 //* @param[out] pu1_dst
    260 //*  UWORD8 pointer to the destination
    261 //*
    262 //* @param[in] src_strd
    263 //*  integer source stride
    264 //*
    265 //* @param[in] dst_strd
    266 //*  integer destination stride
    267 //*
    268 //* @param[in] ui_neighboravailability
    269 //*  availability of neighbouring pixels
    270 //*
    271 //* @returns
    272 //*
    273 //* @remarks
    274 //*  None
    275 //*
    276 //*******************************************************************************/
    277 //void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
    278 //                                       UWORD8 *pu1_dst,
    279 //                                       WORD32 src_strd,
    280 //                                       WORD32 dst_strd,
    281 //                                       WORD32 ui_neighboravailability)
    282 
    283 //**************Variables Vs Registers*****************************************
    284 //    x0 => *pu1_src
    285 //    x1 => *pu1_dst
    286 //    x2 =>  src_strd
    287 //    x3 =>  dst_strd
    288 //   x4 =>  ui_neighboravailability
    289 
    290     .global ih264_intra_pred_luma_16x16_mode_dc_av8
    291 
    292 ih264_intra_pred_luma_16x16_mode_dc_av8:
    293 
    294 
    295 
    296     push_v_regs
    297     stp       x19, x20, [sp, #-16]!
    298 
    299     sub       v0.16b, v0.16b, v0.16b
    300     sub       v1.16b, v1.16b, v1.16b
    301     mov       w10, #0
    302     mov       w11 , #3
    303     ands      x6, x4, #0x01
    304     beq       top_available             //LEFT NOT AVAILABLE
    305     ld1       {v0.16b}, [x0]
    306     add       w10, w10, #8
    307     add       w11, w11, #1
    308 top_available:
    309     ands      x6, x4, #0x04
    310     beq       none_available
    311     add       x6, x0, #17
    312     ld1       {v1.16b}, [x6]
    313     add       w10, w10, #8
    314     add       w11, w11, #1
    315     b         summation
    316 none_available:
    317     cmp       x4, #0
    318     bne       summation
    319     mov       w15, #128
    320     dup       v20.16b, w15
    321     b         store
    322 summation:
    323     uaddl     v2.8h, v0.8b, v1.8b
    324     uaddl2    v3.8h, v0.16b, v1.16b
    325     dup       v10.8h, w10
    326     neg       w11, w11
    327     dup       v20.8h, w11
    328     add       v0.8h, v2.8h, v3.8h
    329     mov       v1.d[0], v0.d[1]
    330     add       v0.4h, v0.4h, v1.4h
    331     addp      v0.4h, v0.4h , v0.4h
    332     addp      v0.4h, v0.4h , v0.4h
    333     add       v0.4h, v0.4h, v10.4h
    334     uqshl     v0.8h, v0.8h, v20.8h
    335     sqxtun    v0.8b, v0.8h
    336     dup       v20.16b, v0.b[0]
    337 
    338 store:
    339 
    340     st1       { v20.16b}, [x1], x3
    341     st1       { v20.16b}, [x1], x3
    342     st1       { v20.16b}, [x1], x3
    343     st1       { v20.16b}, [x1], x3
    344     st1       { v20.16b}, [x1], x3
    345     st1       { v20.16b}, [x1], x3
    346     st1       { v20.16b}, [x1], x3
    347     st1       { v20.16b}, [x1], x3
    348     st1       { v20.16b}, [x1], x3
    349     st1       { v20.16b}, [x1], x3
    350     st1       { v20.16b}, [x1], x3
    351     st1       { v20.16b}, [x1], x3
    352     st1       { v20.16b}, [x1], x3
    353     st1       { v20.16b}, [x1], x3
    354     st1       { v20.16b}, [x1], x3
    355     st1       { v20.16b}, [x1], x3
    356 
    357 
    358 
    359 end_func:
    360 
    361     ldp       x19, x20, [sp], #16
    362     pop_v_regs
    363     ret
    364 
    365 
    366 
    367 
    368 
    369 ///******************************************************************************
    370 
    371 
    372 ///**
    373 //*******************************************************************************
    374 //*
    375 //*ih264_intra_pred_luma_16x16_mode_plane
    376 //*
    377 //* @brief
    378 //*  Perform Intra prediction for  luma_16x16 mode:PLANE
    379 //*
    380 //* @par Description:
    381 //*  Perform Intra prediction for  luma_16x16 mode:PLANE ,described in sec 8.3.3.4
    382 //*
    383 //* @param[in] pu1_src
    384 //*  UWORD8 pointer to the source
    385 //*
    386 //* @param[out] pu1_dst
    387 //*  UWORD8 pointer to the destination
    388 //*
    389 //* @param[in] src_strd
    390 //*  integer source stride
    391 //*
    392 //* @param[in] dst_strd
    393 //*  integer destination stride
    394 //*
    395 //* @param[in] ui_neighboravailability
    396 //*  availability of neighbouring pixels
    397 //*
    398 //* @returns
    399 //*
    400 //* @remarks
    401 //*  None
    402 //*
    403 //*******************************************************************************/
    404 //void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
    405 //                                        UWORD8 *pu1_dst,
    406 //                                        WORD32 src_strd,
    407 //                                        WORD32 dst_strd,
    408 //                                        WORD32 ui_neighboravailability)
    409 
    410 //**************Variables Vs Registers*****************************************
    411 //    x0 => *pu1_src
    412 //    x1 => *pu1_dst
    413 //    x2 =>  src_strd
    414 //    x3 =>  dst_strd
    415 //   x4 =>  ui_neighboravailability
    416 
    417     .global ih264_intra_pred_luma_16x16_mode_plane_av8
    418 ih264_intra_pred_luma_16x16_mode_plane_av8:
    419 
    420     push_v_regs
    421     stp       x19, x20, [sp, #-16]!
    422     mov       x2, x1
    423     add       x1, x0, #17
    424     add       x0, x0, #15
    425     mov       x8, #9
    426     sub       x1, x1, #1
    427     mov       x10, x1                   //top_left
    428     mov       x4, #-1
    429     ld1       {v2.2s}, [x1], x8
    430 
    431     adrp      x7, :got:ih264_gai1_intrapred_luma_plane_coeffs
    432     ldr       x7, [x7, #:got_lo12:ih264_gai1_intrapred_luma_plane_coeffs]
    433 
    434     ld1       {v0.2s}, [x1]
    435     rev64     v2.8b, v2.8b
    436     ld1       {v6.2s, v7.2s}, [x7]
    437     usubl     v0.8h, v0.8b, v2.8b
    438     uxtl      v16.8h, v6.8b
    439     mul       v0.8h, v0.8h , v16.8h
    440     uxtl      v18.8h, v7.8b
    441     add       x7, x0, x4, lsl #3
    442     sub       x0, x7, x4, lsl #1
    443     sub       x20, x4, #0x0
    444     neg       x14, x20
    445     addp      v0.8h, v0.8h, v1.8h
    446     ldrb      w8, [x7], #-1
    447     sxtw      x8, w8
    448     ldrb      w9, [x0], #1
    449     sxtw      x9, w9
    450     saddlp    v0.2s, v0.4h
    451     sub       x12, x8, x9
    452     ldrb      w8, [x7], #-1
    453     sxtw      x8, w8
    454     saddlp    v0.1d, v0.2s
    455     ldrb      w9, [x0], #1
    456     sxtw      x9, w9
    457     sub       x8, x8, x9
    458     shl       v2.2s, v0.2s, #2
    459     add       x12, x12, x8, lsl #1
    460     add       v0.2s, v0.2s , v2.2s
    461     ldrb      w8, [x7], #-1
    462     sxtw      x8, w8
    463     ldrb      w9, [x0], #1
    464     sxtw      x9, w9
    465     srshr     v0.2s, v0.2s, #6          // i_b = D0[0]
    466     sub       x8, x8, x9
    467     ldrb      w5, [x7], #-1
    468     sxtw      x5, w5
    469     add       x8, x8, x8, lsl #1
    470     dup       v4.8h, v0.h[0]
    471     add       x12, x12, x8
    472     ldrb      w9, [x0], #1
    473     sxtw      x9, w9
    474     mul       v0.8h, v4.8h , v16.8h
    475     sub       x5, x5, x9
    476     mul       v2.8h, v4.8h , v18.8h
    477     add       x12, x12, x5, lsl #2
    478     ldrb      w8, [x7], #-1
    479     sxtw      x8, w8
    480     ldrb      w9, [x0], #1
    481     sxtw      x9, w9
    482     sub       x8, x8, x9
    483     ldrb      w5, [x7], #-1
    484     sxtw      x5, w5
    485     add       x8, x8, x8, lsl #2
    486     ldrb      w6, [x0], #1
    487     sxtw      x6, w6
    488     add       x12, x12, x8
    489     ldrb      w8, [x7], #-1
    490     sxtw      x8, w8
    491     ldrb      w9, [x0], #1
    492     sxtw      x9, w9
    493     sub       x5, x5, x6
    494     sub       x8, x8, x9
    495     add       x5, x5, x5, lsl #1
    496     sub       x20, x8, x8, lsl #3
    497     neg       x8, x20
    498     add       x12, x12, x5, lsl #1
    499     ldrb      w5, [x7], #-1
    500     sxtw      x5, w5
    501     ldrb      w6, [x10]                 //top_left
    502     sxtw      x6, w6
    503     add       x12, x12, x8
    504     sub       x9, x5, x6
    505     ldrb      w6, [x1, #7]
    506     sxtw      x6, w6
    507     add       x12, x12, x9, lsl #3      // i_c = x12
    508     add       x8, x5, x6
    509     add       x12, x12, x12, lsl #2
    510     lsl       x8, x8, #4                // i_a = x8
    511     add       x12, x12, #0x20
    512     lsr       x12, x12, #6
    513     shl       v28.8h, v4.8h, #3
    514     dup       v6.8h, w12
    515     dup       v30.8h, w8
    516     shl       v26.8h, v6.8h, #3
    517     sub       v30.8h, v30.8h , v28.8h
    518     sub       v30.8h, v30.8h , v26.8h
    519     add       v28.8h, v30.8h , v6.8h
    520     add       v26.8h, v28.8h , v0.8h
    521     add       v28.8h, v28.8h , v2.8h
    522     sqrshrun  v20.8b, v26.8h, #5
    523     sqrshrun  v21.8b, v28.8h, #5
    524     add       v26.8h, v26.8h , v6.8h
    525     add       v28.8h, v28.8h , v6.8h
    526     sqrshrun  v22.8b, v26.8h, #5
    527     st1       {v20.2s, v21.2s}, [x2], x3
    528     sqrshrun  v23.8b, v28.8h, #5
    529     add       v26.8h, v26.8h , v6.8h
    530     add       v28.8h, v28.8h , v6.8h
    531     sqrshrun  v20.8b, v26.8h, #5
    532     st1       {v22.2s, v23.2s}, [x2], x3
    533     sqrshrun  v21.8b, v28.8h, #5
    534     add       v26.8h, v26.8h , v6.8h
    535     add       v28.8h, v28.8h , v6.8h
    536     sqrshrun  v22.8b, v26.8h, #5
    537     st1       {v20.2s, v21.2s}, [x2], x3
    538     sqrshrun  v23.8b, v28.8h, #5
    539     add       v26.8h, v26.8h , v6.8h
    540     add       v28.8h, v28.8h , v6.8h
    541     sqrshrun  v20.8b, v26.8h, #5
    542     st1       {v22.2s, v23.2s}, [x2], x3
    543     sqrshrun  v21.8b, v28.8h, #5
    544     add       v26.8h, v26.8h , v6.8h
    545     add       v28.8h, v28.8h , v6.8h
    546     sqrshrun  v22.8b, v26.8h, #5
    547     st1       {v20.2s, v21.2s}, [x2], x3
    548     sqrshrun  v23.8b, v28.8h, #5
    549     add       v26.8h, v26.8h , v6.8h
    550     add       v28.8h, v28.8h , v6.8h
    551     sqrshrun  v20.8b, v26.8h, #5
    552     st1       {v22.2s, v23.2s}, [x2], x3
    553     sqrshrun  v21.8b, v28.8h, #5
    554     add       v26.8h, v26.8h , v6.8h
    555     add       v28.8h, v28.8h , v6.8h
    556     sqrshrun  v22.8b, v26.8h, #5
    557     st1       {v20.2s, v21.2s}, [x2], x3
    558     sqrshrun  v23.8b, v28.8h, #5
    559     add       v26.8h, v26.8h , v6.8h
    560     add       v28.8h, v28.8h , v6.8h
    561     sqrshrun  v20.8b, v26.8h, #5
    562     st1       {v22.2s, v23.2s}, [x2], x3
    563     sqrshrun  v21.8b, v28.8h, #5
    564     add       v26.8h, v26.8h , v6.8h
    565     add       v28.8h, v28.8h , v6.8h
    566     sqrshrun  v22.8b, v26.8h, #5
    567     st1       {v20.2s, v21.2s}, [x2], x3
    568     sqrshrun  v23.8b, v28.8h, #5
    569     add       v26.8h, v26.8h , v6.8h
    570     add       v28.8h, v28.8h , v6.8h
    571     sqrshrun  v20.8b, v26.8h, #5
    572     st1       {v22.2s, v23.2s}, [x2], x3
    573     sqrshrun  v21.8b, v28.8h, #5
    574     add       v26.8h, v26.8h , v6.8h
    575     add       v28.8h, v28.8h , v6.8h
    576     sqrshrun  v22.8b, v26.8h, #5
    577     st1       {v20.2s, v21.2s}, [x2], x3
    578     sqrshrun  v23.8b, v28.8h, #5
    579     add       v26.8h, v26.8h , v6.8h
    580     add       v28.8h, v28.8h , v6.8h
    581     sqrshrun  v20.8b, v26.8h, #5
    582     st1       {v22.2s, v23.2s}, [x2], x3
    583     sqrshrun  v21.8b, v28.8h, #5
    584     add       v26.8h, v26.8h , v6.8h
    585     add       v28.8h, v28.8h , v6.8h
    586     sqrshrun  v22.8b, v26.8h, #5
    587     st1       {v20.2s, v21.2s}, [x2], x3
    588     sqrshrun  v23.8b, v28.8h, #5
    589     add       v26.8h, v26.8h , v6.8h
    590     add       v28.8h, v28.8h , v6.8h
    591     sqrshrun  v20.8b, v26.8h, #5
    592     st1       {v22.2s, v23.2s}, [x2], x3
    593     sqrshrun  v21.8b, v28.8h, #5
    594     add       v26.8h, v26.8h , v6.8h
    595     add       v28.8h, v28.8h , v6.8h
    596     sqrshrun  v22.8b, v26.8h, #5
    597     st1       {v20.2s, v21.2s}, [x2], x3
    598     sqrshrun  v23.8b, v28.8h, #5
    599     st1       {v22.2s, v23.2s}, [x2], x3
    600 
    601 end_func_plane:
    602 
    603     ldp       x19, x20, [sp], #16
    604     pop_v_regs
    605     ret
    606 
    607