Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 
     21 ///**
     22 //******************************************************************************
     23 //*
     24 //* @brief :Evaluate best intra 16x16 mode (among VERT, HORZ and DC )
     25 //*                and do the prediction.
     26 //*
     27 //* @par Description
     28 //*   This function evaluates the first three 16x16 modes, computes the SAD of
     29 //*   each, and returns the buffer predicted with the best mode.
     30 //*
     31 //* @param[in] pu1_src
     32 //*  UWORD8 pointer to the source
     33 //*
     34 //* @param[in] pu1_ngbr_pels_i16
     35 //*  UWORD8 pointer to neighbouring pels
     36 //*
     37 //* @param[out] pu1_dst
     38 //*  UWORD8 pointer to the destination
     39 //*
     40 //* @param[in] src_strd
     41 //*  integer source stride
     42 //*
     43 //* @param[in] dst_strd
     44 //*  integer destination stride
     45 //*
     46 //* @param[in] u4_n_avblty
     47 //* availability of neighbouring pixels
     48 //*
     49 //* @param[out] u4_intra_mode
     50 //* Pointer to the variable in which best mode is returned
     51 //*
     52 //* @param[out] pu4_sadmin
     53 //* Pointer to the variable in which minimum sad is returned
     54 //*
     55 //* @param[in] u4_valid_intra_modes
     56 //* Says what all modes are valid
     57 //*
     58 //*
     59 //* @return      none
     60 //*
     61 //******************************************************************************
     62 //*/
     63 //
     64 //void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
     65 //                                      UWORD8 *pu1_ngbr_pels_i16,
     66 //                                      UWORD8 *pu1_dst,
     67 //                                      UWORD32 src_strd,
     68 //                                      UWORD32 dst_strd,
     69 //                                      WORD32 u4_n_avblty,
     70 //                                      UWORD32 *u4_intra_mode,
     71 //                                      WORD32 *pu4_sadmin,
     72 //                                       UWORD32 u4_valid_intra_modes)
     73 //
     74 .text
     75 .p2align 2
     76 .include "ih264_neon_macros.s"
     77 
     78 .global ih264e_evaluate_intra16x16_modes_av8
     79 
     80 ih264e_evaluate_intra16x16_modes_av8:
     81 
     82 //x0 = pu1_src,
     83 //x1 = pu1_ngbr_pels_i16,
     84 //x2 = pu1_dst,
     85 //x3 = src_strd,
     86 //x4 = dst_strd,
     87 //x5 = u4_n_avblty,
     88 //x6 = u4_intra_mode,
     89 //x7 = pu4_sadmin
     90 
     91 
     92 
     93     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     94     push_v_regs
     95     stp       x19, x20, [sp, #-16]!
     96 
     97     ldr       x16, [sp, #80]
     98     mov       x17, x4
     99     mov       x14, x6
    100     mov       x15, x7
    101 
    102 
    103     sub       v0.16b, v0.16b, v0.16b
    104     sub       v1.16b, v1.16b, v1.16b
    105     mov       w10, #0
    106     mov       w11 , #3
    107 
    108     ands      x6, x5, #0x01
    109     beq       top_available             //LEFT NOT AVAILABLE
    110     ld1       {v0.16b}, [x1]
    111     add       w10, w10, #8
    112     add       w11, w11, #1
    113 top_available:
    114     ands      x6, x5, #0x04
    115     beq       none_available
    116     add       x6, x1, #17
    117     ld1       {v1.16b}, [x6]
    118     add       w10, w10, #8
    119     add       w11, w11, #1
    120     b         summation
    121 none_available:
    122     cmp       x5, #0
    123     bne       summation
    124     mov       w6, #128
    125     dup       v30.16b, w6
    126     dup       v31.16b, w6
    127     b         sad_comp
    128 summation:
    129     uaddl     v2.8h, v0.8b, v1.8b
    130     uaddl2    v3.8h, v0.16b, v1.16b
    131     dup       v10.8h, w10
    132     neg       w11, w11
    133     dup       v20.8h, w11
    134     add       v0.8h, v2.8h, v3.8h
    135     mov       v1.d[0], v0.d[1]
    136     add       v0.4h, v0.4h, v1.4h
    137     addp      v0.4h, v0.4h , v0.4h
    138     addp      v0.4h, v0.4h , v0.4h
    139     add       v0.4h, v0.4h, v10.4h
    140     uqshl     v0.8h, v0.8h, v20.8h
    141     sqxtun    v0.8b, v0.8h
    142 
    143     dup       v30.16b, v0.b[0]
    144     dup       v31.16b, v0.b[0]
    145 
    146 
    147 sad_comp:
    148     ld1       { v0.2s, v1.2s }, [x0], x3 // source x0w 0
    149 
    150     ld1       { v2.2s, v3.2s}, [x0], x3 //row 1
    151 
    152     ld1       { v4.2s, v5.2s}, [x0], x3 //row 2
    153 
    154     ld1       { v6.2s, v7.2s}, [x0], x3 //row 3
    155 
    156     //---------------------
    157 
    158     //values for vertical prediction
    159     add       x6, x1, #17
    160     ld1       {v10.8b}, [x6], #8
    161     ld1       {v11.8b}, [x6], #8
    162     ld1       {v9.16b}, [x1]
    163 
    164 
    165 
    166     dup       v20.8b, v9.b[15]          ///HORIZONTAL VALUE ROW=0//
    167     dup       v21.8b, v9.b[15]          ///HORIZONTAL VALUE ROW=0//
    168 
    169 
    170 ///* computing SADs for all three modes*/
    171     ///vertical row 0@
    172     uabdl     v16.8h, v0.8b, v10.8b
    173     uabdl     v18.8h, v1.8b, v11.8b
    174 
    175     ///HORZ row 0@
    176     uabdl     v26.8h, v0.8b, v20.8b
    177     uabdl     v28.8h, v1.8b, v21.8b
    178 
    179     ///dc row 0@
    180     uabdl     v22.8h, v0.8b, v30.8b
    181     uabdl     v24.8h, v1.8b, v31.8b
    182 
    183 
    184 
    185 
    186 
    187     dup       v20.8b, v9.b[14]          ///HORIZONTAL VALUE ROW=1//
    188     dup       v21.8b, v9.b[14]
    189 
    190 
    191     ///vertical row 1@
    192     uabal     v16.8h, v2.8b, v10.8b
    193     uabal     v18.8h, v3.8b, v11.8b
    194 
    195     ld1       { v0.2s, v1.2s }, [x0], x3 //row 4
    196     ///HORZ row 1@
    197     uabal     v26.8h, v2.8b, v20.8b
    198     uabal     v28.8h, v3.8b, v21.8b
    199 
    200     ///dc row 1@
    201     uabal     v22.8h, v2.8b, v30.8b
    202     uabal     v24.8h, v3.8b, v31.8b
    203 
    204     dup       v20.8b, v9.b[13]          ///HORIZONTAL VALUE ROW=2//
    205     dup       v21.8b, v9.b[13]
    206 
    207     ///vertical row 2@
    208     uabal     v16.8h, v4.8b, v10.8b
    209     uabal     v18.8h, v5.8b, v11.8b
    210 
    211     ld1       { v2.2s, v3.2s}, [x0], x3 //row 5
    212     ///HORZ row 2@
    213     uabal     v26.8h, v4.8b, v20.8b
    214     uabal     v28.8h, v5.8b, v21.8b
    215 
    216     ///dc row 2@
    217     uabal     v22.8h, v4.8b, v30.8b
    218     uabal     v24.8h, v5.8b, v31.8b
    219 
    220     dup       v20.8b, v9.b[12]          ///HORIZONTAL VALUE ROW=3//
    221     dup       v21.8b, v9.b[12]
    222 
    223     ///vertical row 3@
    224     uabal     v16.8h, v6.8b, v10.8b
    225     uabal     v18.8h, v7.8b, v11.8b
    226 
    227     ld1       { v4.2s, v5.2s}, [x0], x3 //row 6
    228     ///HORZ row 3@
    229     uabal     v26.8h, v6.8b, v20.8b
    230     uabal     v28.8h, v7.8b, v21.8b
    231 
    232     ///dc row 3@
    233     uabal     v22.8h, v6.8b, v30.8b
    234     uabal     v24.8h, v7.8b, v31.8b
    235 //----------------------------------------------------------------------------------------------
    236 
    237     dup       v20.8b, v9.b[11]          ///HORIZONTAL VALUE ROW=0//
    238     dup       v21.8b, v9.b[11]
    239 
    240     ///vertical row 0@
    241     uabal     v16.8h, v0.8b, v10.8b
    242     uabal     v18.8h, v1.8b, v11.8b
    243 
    244     ld1       {  v6.2s, v7.2s}, [x0], x3 //row 7
    245     ///HORZ row 0@
    246     uabal     v26.8h, v0.8b, v20.8b
    247     uabal     v28.8h, v1.8b, v21.8b
    248 
    249     ///dc row 0@
    250     uabal     v22.8h, v0.8b, v30.8b
    251     uabal     v24.8h, v1.8b, v31.8b
    252 
    253     dup       v20.8b, v9.b[10]          ///HORIZONTAL VALUE ROW=1//
    254     dup       v21.8b, v9.b[10]
    255 
    256     ///vertical row 1@
    257     uabal     v16.8h, v2.8b, v10.8b
    258     uabal     v18.8h, v3.8b, v11.8b
    259 
    260     ld1       { v0.2s, v1.2s }, [x0], x3 //row 8
    261     ///HORZ row 1@
    262     uabal     v26.8h, v2.8b, v20.8b
    263     uabal     v28.8h, v3.8b, v21.8b
    264 
    265     ///dc row 1@
    266     uabal     v22.8h, v2.8b, v30.8b
    267     uabal     v24.8h, v3.8b, v31.8b
    268 
    269     dup       v20.8b, v9.b[9]           ///HORIZONTAL VALUE ROW=2//
    270     dup       v21.8b, v9.b[9]
    271 
    272     ///vertical row 2@
    273     uabal     v16.8h, v4.8b, v10.8b
    274     uabal     v18.8h, v5.8b, v11.8b
    275 
    276     ld1       { v2.2s, v3.2s}, [x0], x3 //row 9
    277 
    278     ///HORZ row 2@
    279     uabal     v26.8h, v4.8b, v20.8b
    280     uabal     v28.8h, v5.8b, v21.8b
    281 
    282     ///dc row 2@
    283     uabal     v22.8h, v4.8b, v30.8b
    284     uabal     v24.8h, v5.8b, v31.8b
    285 
    286     dup       v20.8b, v9.b[8]           ///HORIZONTAL VALUE ROW=3//
    287     dup       v21.8b, v9.b[8]
    288 
    289     ///vertical row 3@
    290     uabal     v16.8h, v6.8b, v10.8b
    291     uabal     v18.8h, v7.8b, v11.8b
    292 
    293     ld1       { v4.2s, v5.2s}, [x0], x3 //row 10
    294 
    295     ///HORZ row 3@
    296     uabal     v26.8h, v6.8b, v20.8b
    297     uabal     v28.8h, v7.8b, v21.8b
    298 
    299     ///dc row 3@
    300     uabal     v22.8h, v6.8b, v30.8b
    301     uabal     v24.8h, v7.8b, v31.8b
    302 
    303 
    304 //-------------------------------------------
    305 
    306     dup       v20.8b, v9.b[7]           ///HORIZONTAL VALUE ROW=0//
    307     dup       v21.8b, v9.b[7]
    308 
    309     ///vertical row 0@
    310     uabal     v16.8h, v0.8b, v10.8b
    311     uabal     v18.8h, v1.8b, v11.8b
    312 
    313     ld1       {  v6.2s, v7.2s}, [x0], x3 //row11
    314 
    315     ///HORZ row 0@
    316     uabal     v26.8h, v0.8b, v20.8b
    317     uabal     v28.8h, v1.8b, v21.8b
    318 
    319     ///dc row 0@
    320     uabal     v22.8h, v0.8b, v30.8b
    321     uabal     v24.8h, v1.8b, v31.8b
    322 
    323     dup       v20.8b, v9.b[6]           ///HORIZONTAL VALUE ROW=1//
    324     dup       v21.8b, v9.b[6]
    325 
    326     ///vertical row 1@
    327     uabal     v16.8h, v2.8b, v10.8b
    328     uabal     v18.8h, v3.8b, v11.8b
    329 
    330     ld1       { v0.2s, v1.2s }, [x0], x3 //row12
    331 
    332     ///HORZ row 1@
    333     uabal     v26.8h, v2.8b, v20.8b
    334     uabal     v28.8h, v3.8b, v21.8b
    335 
    336     ///dc row 1@
    337     uabal     v22.8h, v2.8b, v30.8b
    338     uabal     v24.8h, v3.8b, v31.8b
    339 
    340     dup       v20.8b, v9.b[5]           ///HORIZONTAL VALUE ROW=2//
    341     dup       v21.8b, v9.b[5]
    342 
    343     ///vertical row 2@
    344     uabal     v16.8h, v4.8b, v10.8b
    345     uabal     v18.8h, v5.8b, v11.8b
    346 
    347     ld1       { v2.2s, v3.2s}, [x0], x3 //row13
    348 
    349     ///HORZ row 2@
    350     uabal     v26.8h, v4.8b, v20.8b
    351     uabal     v28.8h, v5.8b, v21.8b
    352 
    353     ///dc row 2@
    354     uabal     v22.8h, v4.8b, v30.8b
    355     uabal     v24.8h, v5.8b, v31.8b
    356 
    357     dup       v20.8b, v9.b[4]           ///HORIZONTAL VALUE ROW=3//
    358     dup       v21.8b, v9.b[4]
    359 
    360     ///vertical row 3@
    361     uabal     v16.8h, v6.8b, v10.8b
    362     uabal     v18.8h, v7.8b, v11.8b
    363 
    364     ld1       { v4.2s, v5.2s}, [x0], x3 //row14
    365 
    366     ///HORZ row 3@
    367     uabal     v26.8h, v6.8b, v20.8b
    368     uabal     v28.8h, v7.8b, v21.8b
    369 
    370     ///dc row 3@
    371     uabal     v22.8h, v6.8b, v30.8b
    372     uabal     v24.8h, v7.8b, v31.8b
    373     //-----------------------------------------------------------------
    374 
    375     dup       v20.8b, v9.b[3]           ///HORIZONTAL VALUE ROW=0//
    376     dup       v21.8b, v9.b[3]
    377 
    378     ///vertical row 0@
    379     uabal     v16.8h, v0.8b, v10.8b
    380     uabal     v18.8h, v1.8b, v11.8b
    381 
    382     ld1       {  v6.2s, v7.2s}, [x0], x3 //row15
    383 
    384     ///HORZ row 0@
    385     uabal     v26.8h, v0.8b, v20.8b
    386     uabal     v28.8h, v1.8b, v21.8b
    387 
    388     ///dc row 0@
    389     uabal     v22.8h, v0.8b, v30.8b
    390     uabal     v24.8h, v1.8b, v31.8b
    391 
    392     dup       v20.8b, v9.b[2]           ///HORIZONTAL VALUE ROW=1//
    393     dup       v21.8b, v9.b[2]
    394 
    395     ///vertical row 1@
    396     uabal     v16.8h, v2.8b, v10.8b
    397     uabal     v18.8h, v3.8b, v11.8b
    398 
    399     ///HORZ row 1@
    400     uabal     v26.8h, v2.8b, v20.8b
    401     uabal     v28.8h, v3.8b, v21.8b
    402 
    403     ///dc row 1@
    404     uabal     v22.8h, v2.8b, v30.8b
    405     uabal     v24.8h, v3.8b, v31.8b
    406 
    407     dup       v20.8b, v9.b[1]           ///HORIZONTAL VALUE ROW=2//
    408     dup       v21.8b, v9.b[1]
    409 
    410     ///vertical row 2@
    411     uabal     v16.8h, v4.8b, v10.8b
    412     uabal     v18.8h, v5.8b, v11.8b
    413 
    414     ///HORZ row 2@
    415     uabal     v26.8h, v4.8b, v20.8b
    416     uabal     v28.8h, v5.8b, v21.8b
    417 
    418     ///dc row 2@
    419     uabal     v22.8h, v4.8b, v30.8b
    420     uabal     v24.8h, v5.8b, v31.8b
    421 
    422     dup       v20.8b, v9.b[0]           ///HORIZONTAL VALUE ROW=3//
    423     dup       v21.8b, v9.b[0]
    424 
    425     ///vertical row 3@
    426     uabal     v16.8h, v6.8b, v10.8b
    427     uabal     v18.8h, v7.8b, v11.8b
    428 
    429     ///HORZ row 3@
    430     uabal     v26.8h, v6.8b, v20.8b
    431     uabal     v28.8h, v7.8b, v21.8b
    432 
    433     ///dc row 3@
    434     uabal     v22.8h, v6.8b, v30.8b
    435     uabal     v24.8h, v7.8b, v31.8b
    436     //------------------------------------------------------------------------------
    437 
    438 
    439     //vert sum
    440 
    441     add       v16.8h, v16.8h , v18.8h
    442     mov       v18.d[0], v16.d[1]
    443     add       v16.4h, v16.4h , v18.4h
    444     uaddlp    v16.2s, v16.4h
    445     addp      v16.2s, v16.2s, v16.2s
    446     smov      x8, v16.s[0]              //dc
    447 
    448 
    449     //horz sum
    450 
    451     add       v26.8h, v26.8h , v28.8h
    452     mov       v28.d[0], v26.d[1]
    453     add       v26.4h, v26.4h , v28.4h
    454     uaddlp    v26.2s, v26.4h
    455     addp      v26.2s, v26.2s, v26.2s
    456     smov      x9, v26.s[0]
    457 
    458     //dc sum
    459 
    460     add       v24.8h, v22.8h , v24.8h   ///DC
    461     mov       v25.d[0], v24.d[1]
    462     add       v24.4h, v24.4h , v25.4h   ///DC
    463     uaddlp    v24.2s, v24.4h            ///DC
    464     addp      v24.2s, v24.2s, v24.2s    ///DC
    465     smov      x10, v24.s[0]             //dc
    466 
    467 
    468     //-----------------------
    469     mov       x11, #1
    470     lsl       x11, x11, #30
    471 
    472     mov       x0, x16
    473     //--------------------------------------------
    474     ands      x7, x0, #01               // vert mode valid????????????
    475     csel      x8, x11, x8, eq
    476 
    477 
    478     ands      x6, x0, #02               // horz mode valid????????????
    479     csel      x9, x11, x9, eq
    480 
    481     ands      x6, x0, #04               // dc mode valid????????????
    482     csel      x10, x11, x10, eq
    483 
    484 
    485 
    486 
    487 //--------------------------------
    488 
    489     mov       x4, x17
    490     mov       x7, x15
    491     mov       x6, x14
    492 
    493     //---------------------------
    494 
    495     //--------------------------
    496 
    497     cmp       x8, x9
    498     bgt       not_vert
    499     cmp       x8, x10
    500     bgt       do_dc
    501 
    502     ///----------------------
    503     //DO VERTICAL PREDICTION
    504     str       w8 , [x7]                 //MIN SAD
    505     mov       w8, #0
    506     str       w8 , [x6]                 // MODE
    507     add       x6, x1, #17
    508     ld1       {v30.16b}, [x6]
    509     b         do_dc_vert
    510     //-----------------------------
    511 not_vert: cmp x9, x10
    512     bgt       do_dc
    513 
    514     ///----------------------
    515     //DO HORIZONTAL
    516     str       w9 , [x7]                 //MIN SAD
    517     mov       w9, #1
    518     str       w9 , [x6]                 // MODE
    519 
    520     ld1       {v0.16b}, [x1]
    521     dup       v10.16b, v0.b[15]
    522     dup       v11.16b, v0.b[14]
    523     dup       v12.16b, v0.b[13]
    524     dup       v13.16b, v0.b[12]
    525     st1       {v10.16b}, [x2], x4
    526     dup       v14.16b, v0.b[11]
    527     st1       {v11.16b}, [x2], x4
    528     dup       v15.16b, v0.b[10]
    529     st1       {v12.16b}, [x2], x4
    530     dup       v16.16b, v0.b[9]
    531     st1       {v13.16b}, [x2], x4
    532     dup       v17.16b, v0.b[8]
    533     st1       {v14.16b}, [x2], x4
    534     dup       v18.16b, v0.b[7]
    535     st1       {v15.16b}, [x2], x4
    536     dup       v19.16b, v0.b[6]
    537     st1       {v16.16b}, [x2], x4
    538     dup       v20.16b, v0.b[5]
    539     st1       {v17.16b}, [x2], x4
    540     dup       v21.16b, v0.b[4]
    541     st1       {v18.16b}, [x2], x4
    542     dup       v22.16b, v0.b[3]
    543     st1       {v19.16b}, [x2], x4
    544     dup       v23.16b, v0.b[2]
    545     st1       {v20.16b}, [x2], x4
    546     dup       v24.16b, v0.b[1]
    547     st1       {v21.16b}, [x2], x4
    548     dup       v25.16b, v0.b[0]
    549     st1       {v22.16b}, [x2], x4
    550     st1       {v23.16b}, [x2], x4
    551     st1       {v24.16b}, [x2], x4
    552     st1       {v25.16b}, [x2], x4
    553 
    554 
    555 
    556     b         end_func
    557 
    558 
    559     ///-----------------------------
    560 
    561 do_dc: ///---------------------------------
    562     //DO DC
    563     str       w10 , [x7]                //MIN SAD
    564     mov       w10, #2
    565     str       w10 , [x6]                // MODE
    566 do_dc_vert:
    567     st1       {v30.4s}, [x2], x4        //0
    568     st1       {v30.4s}, [x2], x4        //1
    569     st1       {v30.4s}, [x2], x4        //2
    570     st1       {v30.4s}, [x2], x4        //3
    571     st1       {v30.4s}, [x2], x4        //4
    572     st1       {v30.4s}, [x2], x4        //5
    573     st1       {v30.4s}, [x2], x4        //6
    574     st1       {v30.4s}, [x2], x4        //7
    575     st1       {v30.4s}, [x2], x4        //8
    576     st1       {v30.4s}, [x2], x4        //9
    577     st1       {v30.4s}, [x2], x4        //10
    578     st1       {v30.4s}, [x2], x4        //11
    579     st1       {v30.4s}, [x2], x4        //12
    580     st1       {v30.4s}, [x2], x4        //13
    581     st1       {v30.4s}, [x2], x4        //14
    582     st1       {v30.4s}, [x2], x4        //15
    583     ///------------------
    584 end_func:
    585     // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
    586     ldp       x19, x20, [sp], #16
    587     pop_v_regs
    588     ret
    589 
    590 
    591