Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 
     21 ///**
     22 //******************************************************************************
     23 //*
     24 //* @brief :Evaluate best intra chroma mode (among VERT, HORZ and DC)
     25 //*                and do the prediction.
     26 //*
     27 //* @par Description
     28 //*   This function evaluates the first three intra chroma modes (DC, HORZ, VERT), computes the
     29 //*   SAD of each, and writes the block predicted with the best mode to the destination buffer.
     30 //*
     31 //* @param[in] pu1_src
     32 //*  UWORD8 pointer to the source
     33 //*
     34 //* @param[in] pu1_ngbr_pels_i16
     35 //*  UWORD8 pointer to neighbouring pels
     36 //*
     37 //* @param[out] pu1_dst
     38 //*  UWORD8 pointer to the destination
     39 //*
     40 //* @param[in] src_strd
     41 //*  integer source stride
     42 //*
     43 //* @param[in] dst_strd
     44 //*  integer destination stride
     45 //*
     46 //* @param[in] u4_n_avblty
     47 //* availability of neighbouring pixels
     48 //*
     49 //* @param[out] u4_intra_mode
     50 //* Pointer to the variable in which the best mode is returned
     51 //*
     52 //* @param[out] pu4_sadmin
     53 //* Pointer to the variable in which the minimum SAD is returned
     54 //*
     55 //* @param[in] u4_valid_intra_modes
     56 //* Says what all modes are valid
     57 //*
     58 //*
     59 //* @return      none
     60 //*
     61 //******************************************************************************
     62 //*/
     63 //
     64 //void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
     65 //                                      UWORD8 *pu1_ngbr_pels_i16,
     66 //                                      UWORD8 *pu1_dst,
     67 //                                      UWORD32 src_strd,
     68 //                                      UWORD32 dst_strd,
     69 //                                      WORD32 u4_n_avblty,
     70 //                                      UWORD32 *u4_intra_mode,
     71 //                                      WORD32 *pu4_sadmin,
     72 //                                       UWORD32 u4_valid_intra_modes)
     73 //
     74 .text
     75 .p2align 2
     76 .include "ih264_neon_macros.s"
     77 
     78 .global ih264e_evaluate_intra_chroma_modes_av8
     79 
     80 ih264e_evaluate_intra_chroma_modes_av8:
     81 
     82 //x0 = pu1_src,
     83 //x1 = pu1_ngbr_pels_i16,
     84 //x2 = pu1_dst,
     85 //w3 = src_strd,
     86 //w4 = dst_strd,
     87 //w5 = u4_n_avblty,
     88 //x6 = u4_intra_mode,
     89 //x7 = pu4_sadmin
     90 
     91 
     92 
     93     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     94     push_v_regs
     95     sxtw      x3, w3
     96     sxtw      x4, w4
     97     stp       x19, x20, [sp, #-16]!
     98     //-----------------------
     99     ldr       w16, [sp, #80]
    100     mov       x17, x4
    101     mov       w18, w5
    102     mov       x14, x6
    103     mov       x15, x7
    104 
    105     mov       w19, #5
    106     ands      w6, w5, w19
    107     beq       none_available
    108     cmp       w6, #1
    109     beq       left_only_available
    110     cmp       w6, #4
    111     beq       top_only_available
    112 
    113 all_available:
    114     ld1       {v0.8b, v1.8b}, [x1]
    115     add       x6, x1, #18
    116     ld1       {v2.8b, v3.8b}, [x6]
    117     uxtl      v0.8h, v0.8b
    118     uxtl      v1.8h, v1.8b
    119     addp      v0.4s, v0.4s , v0.4s
    120     addp      v1.4s, v1.4s , v1.4s
    121     addp      v0.4s, v0.4s , v0.4s
    122     addp      v1.4s, v1.4s , v1.4s
    123     uxtl      v2.8h, v2.8b
    124     uxtl      v3.8h, v3.8b
    125     addp      v2.4s, v2.4s , v2.4s
    126     addp      v3.4s, v3.4s , v3.4s
    127     addp      v2.4s, v2.4s , v2.4s
    128     addp      v3.4s, v3.4s , v3.4s
    129     rshrn     v5.8b, v0.8h, #2
    130     dup       v21.8h, v5.h[0]
    131     rshrn     v6.8b, v3.8h, #2
    132     dup       v20.8h, v6.h[0]
    133     add       v1.8h, v1.8h, v2.8h
    134     rshrn     v1.8b, v1.8h, #3
    135     dup       v23.8h, v1.h[0]
    136     mov       v20.d[0], v23.d[0]
    137     add       v0.8h, v0.8h, v3.8h
    138     rshrn     v0.8b, v0.8h, #3
    139     dup       v23.8h, v0.h[0]
    140     mov       v31.d[0], v23.d[0]
    141     mov       v28.d[0], v20.d[0]
    142     mov       v29.d[0], v20.d[1]
    143     mov       v30.d[0], v21.d[0]
    144     b         sad_comp
    145 
    146 left_only_available:
    147     ld1       {v0.8b, v1.8b}, [x1]
    148     uxtl      v0.8h, v0.8b
    149     uxtl      v1.8h, v1.8b
    150     addp      v0.4s, v0.4s , v0.4s
    151     addp      v1.4s, v1.4s , v1.4s
    152     addp      v0.4s, v0.4s , v0.4s
    153     addp      v1.4s, v1.4s , v1.4s
    154     rshrn     v0.8b, v0.8h, #2
    155     rshrn     v1.8b, v1.8h, #2
    156 
    157     dup       v28.8h , v1.h[0]
    158     dup       v29.8h , v1.h[0]
    159     dup       v30.8h, v0.h[0]
    160     dup       v31.8h, v0.h[0]
    161     b         sad_comp
    162 
    163 top_only_available:
    164     add       x6, x1, #18
    165     ld1       {v0.8b, v1.8b}, [x6]
    166     uxtl      v0.8h, v0.8b
    167     uxtl      v1.8h, v1.8b
    168     addp      v0.4s, v0.4s , v0.4s
    169     addp      v1.4s, v1.4s , v1.4s
    170     addp      v0.4s, v0.4s , v0.4s
    171     addp      v1.4s, v1.4s , v1.4s
    172     rshrn     v0.8b, v0.8h, #2
    173     rshrn     v1.8b, v1.8h, #2
    174     dup       v28.8h , v0.h[0]
    175     dup       v30.8h, v1.h[0]
    176     mov       v29.d[0], v30.d[1]
    177     mov       v30.d[0], v28.d[0]
    178     mov       v31.d[0], v30.d[1]
    179     b         sad_comp
    180 none_available:
    181     mov       w20, #128
    182     dup       v28.16b, w20
    183     dup       v29.16b, w20
    184     dup       v30.16b, w20
    185     dup       v31.16b, w20
    186 
    187 
    188 
    189 sad_comp:
    190     add       x6, x1, #18
    191     ld1       {v10.8b, v11.8b}, [x6]    // vertical values
    192 
    193     ld1       {v27.8h}, [x1]
    194 
    195     dup       v20.8h, v27.h[7]          ///HORIZONTAL VALUE ROW=0//
    196     dup       v21.8h, v27.h[7]
    197 
    198     ld1       { v0.8b, v1.8b}, [x0], x3
    199 
    200 
    201     ///vertical row 0@
    202     uabdl     v16.8h, v0.8b, v10.8b
    203     uabdl     v18.8h, v1.8b, v11.8b
    204 
    205     ///HORZ row 0@
    206     uabdl     v26.8h, v0.8b, v20.8b
    207     uabdl     v14.8h, v1.8b, v21.8b
    208 
    209     ld1       {v2.8b, v3.8b}, [x0], x3
    210 
    211 
    212 
    213     ///dc row 0@
    214     uabdl     v22.8h, v0.8b, v28.8b
    215     uabdl     v24.8h, v1.8b, v29.8b
    216 
    217 
    218     dup       v20.8h, v27.h[6]
    219     dup       v21.8h, v27.h[6]          ///HORIZONTAL VALUE ROW=1//
    220 
    221     ///vertical row 1@
    222     uabal     v16.8h, v2.8b, v10.8b
    223     uabal     v18.8h, v3.8b, v11.8b
    224 
    225     ld1       { v4.8b, v5.8b}, [x0], x3
    226 
    227     ///HORZ row 1@
    228     uabal     v26.8h, v2.8b, v20.8b
    229     uabal     v14.8h, v3.8b, v21.8b
    230 
    231     ///dc row 1@
    232     uabal     v22.8h, v2.8b, v28.8b
    233     uabal     v24.8h, v3.8b, v29.8b
    234 
    235     dup       v20.8h, v27.h[5]
    236     dup       v21.8h, v27.h[5]          ///HORIZONTAL VALUE ROW=2//
    237 
    238     ///vertical row 2@
    239     uabal     v16.8h, v4.8b, v10.8b
    240     uabal     v18.8h, v5.8b, v11.8b
    241 
    242     ld1       { v6.8b, v7.8b}, [x0], x3
    243     ///HORZ row 2@
    244     uabal     v26.8h, v4.8b, v20.8b
    245     uabal     v14.8h, v5.8b, v21.8b
    246 
    247     ///dc row 2@
    248     uabal     v22.8h, v4.8b, v28.8b
    249     uabal     v24.8h, v5.8b, v29.8b
    250 
    251     dup       v20.8h, v27.h[4]
    252     dup       v21.8h, v27.h[4]          ///HORIZONTAL VALUE ROW=3//
    253 
    254     ///vertical row 3@
    255     uabal     v16.8h, v6.8b, v10.8b
    256     uabal     v18.8h, v7.8b, v11.8b
    257 
    258     ///HORZ row 3@
    259     uabal     v26.8h, v6.8b, v20.8b
    260     uabal     v14.8h, v7.8b, v21.8b
    261 
    262     ///dc row 3@
    263     uabal     v22.8h, v6.8b, v28.8b
    264     uabal     v24.8h, v7.8b, v29.8b
    265 
    266     //----------------------------------------------------------------------------------------------
    267     ld1       { v0.8b, v1.8b}, [x0], x3
    268 
    269 
    270     dup       v20.8h, v27.h[3]
    271     dup       v21.8h, v27.h[3]          ///HORIZONTAL VALUE ROW=0//
    272 
    273     ///vertical row 0@
    274     uabal     v16.8h, v0.8b, v10.8b
    275     uabal     v18.8h, v1.8b, v11.8b
    276 
    277     ///HORZ row 0@
    278     uabal     v26.8h, v0.8b, v20.8b
    279     uabal     v14.8h, v1.8b, v21.8b
    280 
    281     ld1       { v2.8b, v3.8b}, [x0], x3
    282 
    283     ///dc row 0@
    284     uabal     v22.8h, v0.8b, v30.8b
    285     uabal     v24.8h, v1.8b, v31.8b
    286 
    287     dup       v20.8h, v27.h[2]
    288     dup       v21.8h, v27.h[2]          ///HORIZONTAL VALUE ROW=1//
    289 
    290     ///vertical row 1@
    291     uabal     v16.8h, v2.8b, v10.8b
    292     uabal     v18.8h, v3.8b, v11.8b
    293 
    294     ///HORZ row 1@
    295     uabal     v26.8h, v2.8b, v20.8b
    296     uabal     v14.8h, v3.8b, v21.8b
    297 
    298     ld1       { v4.8b, v5.8b}, [x0], x3
    299 
    300     ///dc row 1@
    301     uabal     v22.8h, v2.8b, v30.8b
    302     uabal     v24.8h, v3.8b, v31.8b
    303 
    304     dup       v20.8h, v27.h[1]
    305     dup       v21.8h, v27.h[1]          ///HORIZONTAL VALUE ROW=2//
    306 
    307     ///vertical row 2@
    308     uabal     v16.8h, v4.8b, v10.8b
    309     uabal     v18.8h, v5.8b, v11.8b
    310 
    311     ///HORZ row 2@
    312     uabal     v26.8h, v4.8b, v20.8b
    313     uabal     v14.8h, v5.8b, v21.8b
    314 
    315     ld1       {v6.8b, v7.8b}, [x0], x3
    316 
    317     ///dc row 2@
    318     uabal     v22.8h, v4.8b, v30.8b
    319     uabal     v24.8h, v5.8b, v31.8b
    320 
    321     dup       v20.8h, v27.h[0]
    322     dup       v21.8h, v27.h[0]          ///HORIZONTAL VALUE ROW=3//
    323 
    324     ///vertical row 3@
    325     uabal     v16.8h, v6.8b, v10.8b
    326     uabal     v18.8h, v7.8b, v11.8b
    327 
    328     ///HORZ row 3@
    329     uabal     v26.8h, v6.8b, v20.8b
    330     uabal     v14.8h, v7.8b, v21.8b
    331 
    332     ///dc row 3@
    333     uabal     v22.8h, v6.8b, v30.8b
    334     uabal     v24.8h, v7.8b, v31.8b
    335 
    336 
    337 //-------------------------------------------
    338 
    339 
    340 //vert sum
    341 
    342     add       v16.8h, v16.8h , v18.8h
    343     mov       v18.d[0], v16.d[1]
    344     add       v16.4h, v16.4h , v18.4h
    345     uaddlp    v16.2s, v16.4h
    346     addp      v16.2s, v16.2s, v16.2s
    347     smov      x8, v16.s[0]
    348 
    349 
    350     //horz sum
    351 
    352     add       v26.8h, v26.8h , v14.8h
    353     mov       v14.d[0], v26.d[1]
    354     add       v26.4h, v26.4h , v14.4h
    355     uaddlp    v26.2s, v26.4h
    356     addp      v26.2s, v26.2s, v26.2s
    357     smov      x9, v26.s[0]
    358 
    359     //dc sum
    360 
    361     add       v24.8h, v22.8h , v24.8h   ///DC
    362     mov       v25.d[0], v24.d[1]
    363     add       v24.4h, v24.4h , v25.4h   ///DC
    364     uaddlp    v24.2s, v24.4h            ///DC
    365     addp      v24.2s, v24.2s, v24.2s    ///DC
    366     smov      x10, v24.s[0]             //dc
    367 
    368 
    369 
    370 
    371     mov       x11, #1
    372 //-----------------------
    373     mov       w0, w16 // u4_valid_intra_modes
    374 
    375 //--------------------------------------------
    376 
    377 
    378     lsl       x11, x11, #30
    379 
    380     ands      w7, w0, #04               // vert mode valid????????????
    381     csel      x8, x11, x8, eq
    382 
    383     ands      w6, w0, #02               // horz mode valid????????????
    384     csel      x9, x11, x9, eq
    385 
    386     ands      w6, w0, #01               // dc mode valid????????????
    387     csel      x10, x11, x10, eq
    388 
    389 
    390     //---------------------------
    391 
    392     mov       x4, x17
    393     mov       x6, x14
    394     mov       x7, x15
    395 
    396     //--------------------------
    397 
    398     cmp       x10, x9
    399     bgt       not_dc
    400     cmp       x10, x8
    401     bgt       do_vert
    402 
    403     ///----------------------
    404     //DO DC PREDICTION
    405     str       w10 , [x7]                //MIN SAD
    406 
    407     mov       w10, #0
    408     str       w10 , [x6]                // MODE
    409 
    410     b         do_dc_vert
    411     //-----------------------------
    412 
    413 not_dc:
    414     cmp       x9, x8
    415     bgt       do_vert
    416     ///----------------------
    417     //DO HORIZONTAL
    418     str       w9 , [x7]                 //MIN SAD
    419 
    420     mov       w10, #1
    421     str       w10 , [x6]                // MODE
    422     ld1       {v0.8h}, [x1]
    423 
    424     dup       v10.8h, v0.h[7]
    425     dup       v11.8h, v0.h[6]
    426     dup       v12.8h, v0.h[5]
    427     dup       v13.8h, v0.h[4]
    428     st1       {v10.8h}, [x2], x4
    429     dup       v14.8h, v0.h[3]
    430     st1       {v11.8h}, [x2], x4
    431     dup       v15.8h, v0.h[2]
    432     st1       {v12.8h}, [x2], x4
    433     dup       v16.8h, v0.h[1]
    434     st1       {v13.8h}, [x2], x4
    435     dup       v17.8h, v0.h[0]
    436     st1       {v14.8h}, [x2], x4
    437     st1       {v15.8h}, [x2], x4
    438     st1       {v16.8h}, [x2], x4
    439     st1       {v17.8h}, [x2], x4
    440 
    441     b         end_func
    442 
    443 do_vert:
    444     //DO VERTICAL PREDICTION
    445     str       w8 , [x7]                 //MIN SAD
    446     mov       w8, #2
    447     str       w8 , [x6]                 // MODE
    448     add       x6, x1, #18
    449     ld1       {v28.8b, v29.8b}, [x6]    // vertical values
    450     ld1       {v30.8b, v31.8b}, [x6]    // vertical values
    451 
    452 do_dc_vert:
    453     st1       {v28.2s, v29.2s} , [x2], x4 //0
    454     st1       {v28.2s, v29.2s} , [x2], x4 //1
    455     st1       {v28.2s, v29.2s} , [x2], x4 //2
    456     st1       {v28.2s, v29.2s} , [x2], x4 //3
    457     st1       {v30.2s, v31.2s} , [x2], x4 //4
    458     st1       {v30.2s, v31.2s} , [x2], x4 //5
    459     st1       {v30.2s, v31.2s} , [x2], x4 //6
    460     st1       {v30.2s, v31.2s} , [x2], x4 //7
    461 
    462 end_func:
    463     // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
    464     ldp       x19, x20, [sp], #16
    465     pop_v_regs
    466     ret
    467 
    468 
    469