Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 
     21 ///**
     22 //******************************************************************************
     23 //*
     24 //* @brief :Evaluate best intra chroma mode (among VERT, HORZ and DC )
     25 //*                and do the prediction.
     26 //*
     27 //* @par Description
     28 //*   This function evaluates the first three intra chroma modes, computes the
     29 //*   corresponding SAD for each, and returns the buffer predicted with the best mode.
     30 //*
     31 //* @param[in] pu1_src
     32 //*  UWORD8 pointer to the source
     33 //*
     34 //* @param[in] pu1_ngbr_pels
     35 //*  UWORD8 pointer to neighbouring pels
     36 //*
     37 //* @param[out] pu1_dst
     38 //*  UWORD8 pointer to the destination
     39 //*
     40 //* @param[in] src_strd
     41 //*  integer source stride
     42 //*
     43 //* @param[in] dst_strd
     44 //*  integer destination stride
     45 //*
     46 //* @param[in] u4_n_avblty
     47 //* availability of neighbouring pixels
     48 //*
     49 //* @param[out] u4_intra_mode
     50 //* Pointer to the variable in which best mode is returned
     51 //*
     52 //* @param[out] pu4_sadmin
     53 //* Pointer to the variable in which minimum sad is returned
     54 //*
     55 //* @param[in] u4_valid_intra_modes
     56 //* Says what all modes are valid
     57 //*
     58 //*
     59 //* @return      none
     60 //*
     61 //******************************************************************************
     62 //*/
     63 //
     64 //void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
     65 //                                      UWORD8 *pu1_ngbr_pels_i16,
     66 //                                      UWORD8 *pu1_dst,
     67 //                                      UWORD32 src_strd,
     68 //                                      UWORD32 dst_strd,
     69 //                                      WORD32 u4_n_avblty,
     70 //                                      UWORD32 *u4_intra_mode,
     71 //                                      WORD32 *pu4_sadmin,
     72 //                                       UWORD32 u4_valid_intra_modes)
     73 //
     74 .text
     75 .p2align 2
     76 .include "ih264_neon_macros.s"
     77 
     78 .global ih264e_evaluate_intra_chroma_modes_av8
     79 
     80 ih264e_evaluate_intra_chroma_modes_av8:
     81 
     82 //x0 = pu1_src,
     83 //x1 = pu1_ngbr_pels_i16,
     84 //x2 = pu1_dst,
     85 //x3 = src_strd,
     86 //x4 = dst_strd,
     87 //x5 = u4_n_avblty,
     88 //x6 = u4_intra_mode,
     89 //x7 = pu4_sadmin
     90 
     91 
     92 
     93     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     94     push_v_regs
     95     stp       x19, x20, [sp, #-16]!
     96     //-----------------------
     97     ldr       x16, [sp, #80]
     98     mov       x17, x4
     99     mov       x18, x5
    100     mov       x14, x6
    101     mov       x15, x7
    102 
    103     mov       x19, #5
    104     ands      x6, x5, x19
    105     beq       none_available
    106     cmp       x6, #1
    107     beq       left_only_available
    108     cmp       x6, #4
    109     beq       top_only_available
    110 
    111 all_available:
    112     ld1       {v0.8b, v1.8b}, [x1]
    113     add       x6, x1, #18
    114     ld1       {v2.8b, v3.8b}, [x6]
    115     uxtl      v0.8h, v0.8b
    116     uxtl      v1.8h, v1.8b
    117     addp      v0.4s, v0.4s , v0.4s
    118     addp      v1.4s, v1.4s , v1.4s
    119     addp      v0.4s, v0.4s , v0.4s
    120     addp      v1.4s, v1.4s , v1.4s
    121     uxtl      v2.8h, v2.8b
    122     uxtl      v3.8h, v3.8b
    123     addp      v2.4s, v2.4s , v2.4s
    124     addp      v3.4s, v3.4s , v3.4s
    125     addp      v2.4s, v2.4s , v2.4s
    126     addp      v3.4s, v3.4s , v3.4s
    127     rshrn     v5.8b, v0.8h, #2
    128     dup       v21.8h, v5.h[0]
    129     rshrn     v6.8b, v3.8h, #2
    130     dup       v20.8h, v6.h[0]
    131     add       v1.8h, v1.8h, v2.8h
    132     rshrn     v1.8b, v1.8h, #3
    133     dup       v23.8h, v1.h[0]
    134     mov       v20.d[0], v23.d[0]
    135     add       v0.8h, v0.8h, v3.8h
    136     rshrn     v0.8b, v0.8h, #3
    137     dup       v23.8h, v0.h[0]
    138     mov       v31.d[0], v23.d[0]
    139     mov       v28.d[0], v20.d[0]
    140     mov       v29.d[0], v20.d[1]
    141     mov       v30.d[0], v21.d[0]
    142     b         sad_comp
    143 
    144 left_only_available:
    145     ld1       {v0.8b, v1.8b}, [x1]
    146     uxtl      v0.8h, v0.8b
    147     uxtl      v1.8h, v1.8b
    148     addp      v0.4s, v0.4s , v0.4s
    149     addp      v1.4s, v1.4s , v1.4s
    150     addp      v0.4s, v0.4s , v0.4s
    151     addp      v1.4s, v1.4s , v1.4s
    152     rshrn     v0.8b, v0.8h, #2
    153     rshrn     v1.8b, v1.8h, #2
    154 
    155     dup       v28.8h , v1.h[0]
    156     dup       v29.8h , v1.h[0]
    157     dup       v30.8h, v0.h[0]
    158     dup       v31.8h, v0.h[0]
    159     b         sad_comp
    160 
    161 top_only_available:
    162     add       x6, x1, #18
    163     ld1       {v0.8b, v1.8b}, [x6]
    164     uxtl      v0.8h, v0.8b
    165     uxtl      v1.8h, v1.8b
    166     addp      v0.4s, v0.4s , v0.4s
    167     addp      v1.4s, v1.4s , v1.4s
    168     addp      v0.4s, v0.4s , v0.4s
    169     addp      v1.4s, v1.4s , v1.4s
    170     rshrn     v0.8b, v0.8h, #2
    171     rshrn     v1.8b, v1.8h, #2
    172     dup       v28.8h , v0.h[0]
    173     dup       v30.8h, v1.h[0]
    174     mov       v29.d[0], v30.d[1]
    175     mov       v30.d[0], v28.d[0]
    176     mov       v31.d[0], v30.d[1]
    177     b         sad_comp
    178 none_available:
    179     mov       w20, #128
    180     dup       v28.16b, w20
    181     dup       v29.16b, w20
    182     dup       v30.16b, w20
    183     dup       v31.16b, w20
    184 
    185 
    186 
    187 sad_comp:
    188     add       x6, x1, #18
    189     ld1       {v10.8b, v11.8b}, [x6]    // vertical values
    190 
    191     ld1       {v27.8h}, [x1]
    192 
    193     dup       v20.8h, v27.h[7]          ///HORIZONTAL VALUE ROW=0//
    194     dup       v21.8h, v27.h[7]
    195 
    196     ld1       { v0.8b, v1.8b}, [x0], x3
    197 
    198 
    199     ///vertical row 0@
    200     uabdl     v16.8h, v0.8b, v10.8b
    201     uabdl     v18.8h, v1.8b, v11.8b
    202 
    203     ///HORZ row 0@
    204     uabdl     v26.8h, v0.8b, v20.8b
    205     uabdl     v14.8h, v1.8b, v21.8b
    206 
    207     ld1       {v2.8b, v3.8b}, [x0], x3
    208 
    209 
    210 
    211     ///dc row 0@
    212     uabdl     v22.8h, v0.8b, v28.8b
    213     uabdl     v24.8h, v1.8b, v29.8b
    214 
    215 
    216     dup       v20.8h, v27.h[6]
    217     dup       v21.8h, v27.h[6]          ///HORIZONTAL VALUE ROW=1//
    218 
    219     ///vertical row 1@
    220     uabal     v16.8h, v2.8b, v10.8b
    221     uabal     v18.8h, v3.8b, v11.8b
    222 
    223     ld1       { v4.8b, v5.8b}, [x0], x3
    224 
    225     ///HORZ row 1@
    226     uabal     v26.8h, v2.8b, v20.8b
    227     uabal     v14.8h, v3.8b, v21.8b
    228 
    229     ///dc row 1@
    230     uabal     v22.8h, v2.8b, v28.8b
    231     uabal     v24.8h, v3.8b, v29.8b
    232 
    233     dup       v20.8h, v27.h[5]
    234     dup       v21.8h, v27.h[5]          ///HORIZONTAL VALUE ROW=2//
    235 
    236     ///vertical row 2@
    237     uabal     v16.8h, v4.8b, v10.8b
    238     uabal     v18.8h, v5.8b, v11.8b
    239 
    240     ld1       { v6.8b, v7.8b}, [x0], x3
    241     ///HORZ row 2@
    242     uabal     v26.8h, v4.8b, v20.8b
    243     uabal     v14.8h, v5.8b, v21.8b
    244 
    245     ///dc row 2@
    246     uabal     v22.8h, v4.8b, v28.8b
    247     uabal     v24.8h, v5.8b, v29.8b
    248 
    249     dup       v20.8h, v27.h[4]
    250     dup       v21.8h, v27.h[4]          ///HORIZONTAL VALUE ROW=3//
    251 
    252     ///vertical row 3@
    253     uabal     v16.8h, v6.8b, v10.8b
    254     uabal     v18.8h, v7.8b, v11.8b
    255 
    256     ///HORZ row 3@
    257     uabal     v26.8h, v6.8b, v20.8b
    258     uabal     v14.8h, v7.8b, v21.8b
    259 
    260     ///dc row 3@
    261     uabal     v22.8h, v6.8b, v28.8b
    262     uabal     v24.8h, v7.8b, v29.8b
    263 
    264     //----------------------------------------------------------------------------------------------
    265     ld1       { v0.8b, v1.8b}, [x0], x3
    266 
    267 
    268     dup       v20.8h, v27.h[3]
    269     dup       v21.8h, v27.h[3]          ///HORIZONTAL VALUE ROW=0//
    270 
    271     ///vertical row 0@
    272     uabal     v16.8h, v0.8b, v10.8b
    273     uabal     v18.8h, v1.8b, v11.8b
    274 
    275     ///HORZ row 0@
    276     uabal     v26.8h, v0.8b, v20.8b
    277     uabal     v14.8h, v1.8b, v21.8b
    278 
    279     ld1       { v2.8b, v3.8b}, [x0], x3
    280 
    281     ///dc row 0@
    282     uabal     v22.8h, v0.8b, v30.8b
    283     uabal     v24.8h, v1.8b, v31.8b
    284 
    285     dup       v20.8h, v27.h[2]
    286     dup       v21.8h, v27.h[2]          ///HORIZONTAL VALUE ROW=1//
    287 
    288     ///vertical row 1@
    289     uabal     v16.8h, v2.8b, v10.8b
    290     uabal     v18.8h, v3.8b, v11.8b
    291 
    292     ///HORZ row 1@
    293     uabal     v26.8h, v2.8b, v20.8b
    294     uabal     v14.8h, v3.8b, v21.8b
    295 
    296     ld1       { v4.8b, v5.8b}, [x0], x3
    297 
    298     ///dc row 1@
    299     uabal     v22.8h, v2.8b, v30.8b
    300     uabal     v24.8h, v3.8b, v31.8b
    301 
    302     dup       v20.8h, v27.h[1]
    303     dup       v21.8h, v27.h[1]          ///HORIZONTAL VALUE ROW=2//
    304 
    305     ///vertical row 2@
    306     uabal     v16.8h, v4.8b, v10.8b
    307     uabal     v18.8h, v5.8b, v11.8b
    308 
    309     ///HORZ row 2@
    310     uabal     v26.8h, v4.8b, v20.8b
    311     uabal     v14.8h, v5.8b, v21.8b
    312 
    313     ld1       {v6.8b, v7.8b}, [x0], x3
    314 
    315     ///dc row 2@
    316     uabal     v22.8h, v4.8b, v30.8b
    317     uabal     v24.8h, v5.8b, v31.8b
    318 
    319     dup       v20.8h, v27.h[0]
    320     dup       v21.8h, v27.h[0]          ///HORIZONTAL VALUE ROW=3//
    321 
    322     ///vertical row 3@
    323     uabal     v16.8h, v6.8b, v10.8b
    324     uabal     v18.8h, v7.8b, v11.8b
    325 
    326     ///HORZ row 3@
    327     uabal     v26.8h, v6.8b, v20.8b
    328     uabal     v14.8h, v7.8b, v21.8b
    329 
    330     ///dc row 3@
    331     uabal     v22.8h, v6.8b, v30.8b
    332     uabal     v24.8h, v7.8b, v31.8b
    333 
    334 
    335 //-------------------------------------------
    336 
    337 
    338 //vert sum
    339 
    340     add       v16.8h, v16.8h , v18.8h
    341     mov       v18.d[0], v16.d[1]
    342     add       v16.4h, v16.4h , v18.4h
    343     uaddlp    v16.2s, v16.4h
    344     addp      v16.2s, v16.2s, v16.2s
    345     smov      x8, v16.s[0]
    346 
    347 
    348     //horz sum
    349 
    350     add       v26.8h, v26.8h , v14.8h
    351     mov       v14.d[0], v26.d[1]
    352     add       v26.4h, v26.4h , v14.4h
    353     uaddlp    v26.2s, v26.4h
    354     addp      v26.2s, v26.2s, v26.2s
    355     smov      x9, v26.s[0]
    356 
    357     //dc sum
    358 
    359     add       v24.8h, v22.8h , v24.8h   ///DC
    360     mov       v25.d[0], v24.d[1]
    361     add       v24.4h, v24.4h , v25.4h   ///DC
    362     uaddlp    v24.2s, v24.4h            ///DC
    363     addp      v24.2s, v24.2s, v24.2s    ///DC
    364     smov      x10, v24.s[0]             //dc
    365 
    366 
    367 
    368 
    369     mov       x11, #1
    370 //-----------------------
    371     mov       x0, x16 // u4_valid_intra_modes
    372 
    373 //--------------------------------------------
    374 
    375 
    376     lsl       x11, x11, #30
    377 
    378     ands      x7, x0, #04               // vert mode valid????????????
    379     csel      x8, x11, x8, eq
    380 
    381     ands      x6, x0, #02               // horz mode valid????????????
    382     csel      x9, x11, x9, eq
    383 
    384     ands      x6, x0, #01               // dc mode valid????????????
    385     csel      x10, x11, x10, eq
    386 
    387 
    388     //---------------------------
    389 
    390     mov       x4, x17
    391     mov       x6, x14
    392     mov       x7, x15
    393 
    394     //--------------------------
    395 
    396     cmp       x10, x9
    397     bgt       not_dc
    398     cmp       x10, x8
    399     bgt       do_vert
    400 
    401     ///----------------------
    402     //DO DC PREDICTION
    403     str       w10 , [x7]                //MIN SAD
    404 
    405     mov       w10, #0
    406     str       w10 , [x6]                // MODE
    407 
    408     b         do_dc_vert
    409     //-----------------------------
    410 
    411 not_dc:
    412     cmp       x9, x8
    413     bgt       do_vert
    414     ///----------------------
    415     //DO HORIZONTAL
    416     str       w9 , [x7]                 //MIN SAD
    417 
    418     mov       w10, #1
    419     str       w10 , [x6]                // MODE
    420     ld1       {v0.8h}, [x1]
    421 
    422     dup       v10.8h, v0.h[7]
    423     dup       v11.8h, v0.h[6]
    424     dup       v12.8h, v0.h[5]
    425     dup       v13.8h, v0.h[4]
    426     st1       {v10.8h}, [x2], x4
    427     dup       v14.8h, v0.h[3]
    428     st1       {v11.8h}, [x2], x4
    429     dup       v15.8h, v0.h[2]
    430     st1       {v12.8h}, [x2], x4
    431     dup       v16.8h, v0.h[1]
    432     st1       {v13.8h}, [x2], x4
    433     dup       v17.8h, v0.h[0]
    434     st1       {v14.8h}, [x2], x4
    435     st1       {v15.8h}, [x2], x4
    436     st1       {v16.8h}, [x2], x4
    437     st1       {v17.8h}, [x2], x4
    438 
    439     b         end_func
    440 
    441 do_vert:
    442     //DO VERTICAL PREDICTION
    443     str       w8 , [x7]                 //MIN SAD
    444     mov       w8, #2
    445     str       w8 , [x6]                 // MODE
    446     add       x6, x1, #18
    447     ld1       {v28.8b, v29.8b}, [x6]    // vertical values
    448     ld1       {v30.8b, v31.8b}, [x6]    // vertical values
    449 
    450 do_dc_vert:
    451     st1       {v28.2s, v29.2s} , [x2], x4 //0
    452     st1       {v28.2s, v29.2s} , [x2], x4 //1
    453     st1       {v28.2s, v29.2s} , [x2], x4 //2
    454     st1       {v28.2s, v29.2s} , [x2], x4 //3
    455     st1       {v30.2s, v31.2s} , [x2], x4 //4
    456     st1       {v30.2s, v31.2s} , [x2], x4 //5
    457     st1       {v30.2s, v31.2s} , [x2], x4 //6
    458     st1       {v30.2s, v31.2s} , [x2], x4 //7
    459 
    460 end_func:
    461     // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
    462     ldp       x19, x20, [sp], #16
    463     pop_v_regs
    464     ret
    465 
    466 
    467