Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_filters_vert.s
     22 //*
     23 //* @brief
     24 //*  contains the function definition for luma vertical intra prediction.
     25 //*  the function is coded directly in armv8 (aarch64) neon assembly,
     26 //*  using gnu assembler directives
     27 //*
     28 //*
     29 //* @author
     30 //*  akshaya mukund
     31 //*
     32 //* @par list of functions:
     33 //*
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* @brief
     44 //*    luma intraprediction filter for dc input
     45 //*
     46 //* @par description:
     47 //*
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
     60 //* @param[in] nt
     61 //*  size of tranform block
     62 //*
     63 //* @param[in] mode
     64 //*  type of filtering
     65 //*
     66 //* @returns
     67 //*
     68 //* @remarks
     69 //*  none
     70 //*
     71 //*******************************************************************************
     72 //*/
     73 
     74 //void ihevc_intra_pred_luma_ver(uword8* pu1_ref,
     75 //                               word32 src_strd,
     76 //                               uword8* pu1_dst,
     77 //                               word32 dst_strd,
     78 //                               word32 nt,
     79 //                               word32 mode)
     80 //
     81 //**************variables vs registers*****************************************
     82 //x0 => *pu1_ref
     83 //x1 => src_strd
     84 //x2 => *pu1_dst
     85 //x3 => dst_strd
     86 
     87 //stack contents from #40
     88 //    nt
     89 //    mode
     90 
     91 .text
     92 .align 4
     93 .include "ihevc_neon_macros.s"
     94 
     95 
     96 
     97 .globl ihevc_intra_pred_luma_ver_av8
     98 
     99 .type ihevc_intra_pred_luma_ver_av8, %function
    100 
    101 ihevc_intra_pred_luma_ver_av8:
    102 
    103     // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
    104 
    105     stp         x19, x20,[sp,#-16]!
    106 
    107     lsl         x5, x4, #1                  //2nt
    108 
    109     cmp         x4, #16
    110     beq         blk_16
    111     blt         blk_4_8
    112 
    113     add         x5, x5, #1                  //2nt+1
    114     add         x6, x0, x5                  //&src[2nt+1]
    115 
    116 copy_32:
    117     add         x5, x2, x3
    118     ld1         {v20.8b, v21.8b}, [x6],#16  //16 loads (col 0:15)
    119     add         x8, x5, x3
    120 
    121     add         x10, x8, x3
    122     ld1         {v22.8b, v23.8b}, [x6]      //16 loads (col 16:31)
    123     lsl         x11, x3, #2
    124 
    125     sub         x11, x11, #16
    126     st1         {v20.8b, v21.8b}, [x2],#16
    127     st1         {v20.8b, v21.8b}, [x5],#16
    128     st1         {v20.8b, v21.8b}, [x8],#16
    129     st1         {v20.8b, v21.8b}, [x10],#16
    130 
    131     st1         {v22.8b, v23.8b}, [x2], x11
    132     st1         {v22.8b, v23.8b}, [x5], x11
    133     st1         {v22.8b, v23.8b}, [x8], x11
    134     st1         {v22.8b, v23.8b}, [x10], x11
    135 
    136     subs        x4, x4, #8
    137 
    138 kernel_copy_32:
    139     st1         {v20.8b, v21.8b}, [x2],#16
    140     st1         {v20.8b, v21.8b}, [x5],#16
    141     st1         {v20.8b, v21.8b}, [x8],#16
    142     st1         {v20.8b, v21.8b}, [x10],#16
    143 
    144     st1         {v22.8b, v23.8b}, [x2], x11
    145     st1         {v22.8b, v23.8b}, [x5], x11
    146     st1         {v22.8b, v23.8b}, [x8], x11
    147     st1         {v22.8b, v23.8b}, [x10], x11
    148 
    149     subs        x4, x4, #8
    150 
    151     st1         {v20.8b, v21.8b}, [x2],#16
    152     st1         {v20.8b, v21.8b}, [x5],#16
    153     st1         {v20.8b, v21.8b}, [x8],#16
    154     st1         {v20.8b, v21.8b}, [x10],#16
    155 
    156     st1         {v22.8b, v23.8b}, [x2], x11
    157     st1         {v22.8b, v23.8b}, [x5], x11
    158     st1         {v22.8b, v23.8b}, [x8], x11
    159     st1         {v22.8b, v23.8b}, [x10], x11
    160 
    161     bne         kernel_copy_32
    162 
    163     st1         {v20.8b, v21.8b}, [x2],#16
    164     st1         {v20.8b, v21.8b}, [x5],#16
    165     st1         {v20.8b, v21.8b}, [x8],#16
    166     st1         {v20.8b, v21.8b}, [x10],#16
    167 
    168     st1         {v22.8b, v23.8b}, [x2], x11
    169     st1         {v22.8b, v23.8b}, [x5], x11
    170     st1         {v22.8b, v23.8b}, [x8], x11
    171     st1         {v22.8b, v23.8b}, [x10], x11
    172 
    173     b           end_func
    174 
    175 blk_16:
    176     add         x6, x0, x5                  //&src[2nt]
    177 
    178     ldrb        w11, [x6], #1               //src[2nt]
    179     sxtw        x11,w11
    180 
    181     dup         v22.16b,w11                 //src[2nt]
    182     ldrb        w12, [x6]                   //src[2nt+1]
    183     sxtw        x12,w12
    184 
    185     ld1         {v16.8b, v17.8b}, [x6]      //ld for repl to cols src[2nt+1+col(0:15)] (0 ignored for stores)
    186     sub         x6, x6, #17                 //subtract -9 to take it to src[2nt-1-row(15)]
    187 
    188     dup         v24.16b,w12                 //src[2nt+1]
    189     dup         v30.8h,w12
    190     lsl         x5, x3, #3                  //8*stride
    191 
    192     ld1         {v26.16b}, [x6],#16         //load src[2nt-1-row](rows 0:15)
    193     add         x5, x2, x5                  //x5 ->
    194 
    195     movi        d18, #0x00000000000000ff
    196     uhsub       v26.16b,  v26.16b ,  v22.16b //(src[2nt-1-row] - src[2nt])>>1
    197     //vsubl.u8    q0, d26, d22
    198     //vsubl.u8    q14, d27, d22
    199 
    200     //vshr.s16    q0, q0, #1
    201     //vshr.s16    q14, q14, #1
    202 
    203     mov         v19.d[0],v17.d[0]
    204     //vaddl.s8    q0, d24, d26
    205     sxtl        v0.8h, v26.8b
    206     sxtl2       v28.8h, v26.16b
    207     sqadd       v0.8h,  v0.8h ,  v30.8h
    208     sqadd       v28.8h,  v28.8h ,  v30.8h
    209 
    210     movi        d3, #0x00000000000000ff
    211     //vaddl.s8    q1, d25, d27
    212 
    213     sqxtun      v24.8b, v28.8h
    214     sqxtun2     v24.16b, v0.8h
    215     //vmovn.u16    d25, q0
    216     //vmovn.u16    d24, q1
    217 
    218     rev64       v24.16b,  v24.16b
    219     mov         v25.d[0], v24.d[1]
    220 
    221     mov         v4.d[0],v17.d[0]
    222 
    223     bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
    224     bsl         v3.8b,  v25.8b ,  v16.8b
    225 
    226     movi        d1, #0x00000000000000ff
    227     mov         v2.d[0],v17.d[0]
    228 
    229     movi        d6, #0x00000000000000ff
    230     mov         v7.d[0],v17.d[0]
    231 
    232     st1         {v18.8b, v19.8b}, [x2], x3
    233     sshr        d24, d24,#8
    234 
    235     st1         {v3.8b, v4.8b}, [x5], x3
    236     sshr        d25, d25,#8
    237 
    238 
    239     bsl         v1.8b,  v24.8b ,  v16.8b
    240     bsl         v6.8b,  v25.8b ,  v16.8b
    241 
    242     st1         {v1.8b, v2.8b}, [x2], x3
    243     sshr        d24, d24,#8
    244 
    245     st1         {v6.8b, v7.8b}, [x5], x3
    246     sshr        d25, d25,#8
    247 
    248     subs        x4, x4,#8
    249 
    250     movi        d18, #0x00000000000000ff
    251     //vmov.i64    d19, d17
    252 
    253     movi        d3, #0x00000000000000ff
    254     //vmov.i64    d11, d17
    255 
    256 
    257 loop_16:
    258 
    259 
    260     movi        d1, #0x00000000000000ff
    261 
    262     movi        d6, #0x00000000000000ff
    263 
    264     bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
    265     bsl         v3.8b,  v25.8b ,  v16.8b
    266 
    267     st1         {v18.8b, v19.8b}, [x2], x3
    268     sshr        d24, d24,#8
    269 
    270     st1         {v3.8b, v4.8b}, [x5], x3
    271     sshr        d25, d25,#8
    272 
    273     movi        d18, #0x00000000000000ff
    274 
    275     movi        d3, #0x00000000000000ff
    276 
    277     bsl         v1.8b,  v24.8b ,  v16.8b
    278     bsl         v6.8b,  v25.8b ,  v16.8b
    279 
    280     st1         {v1.8b, v2.8b}, [x2], x3
    281     sshr        d24, d24,#8
    282 
    283     st1         {v6.8b, v7.8b}, [x5], x3
    284     sshr        d25, d25,#8
    285 
    286     subs        x4, x4, #4
    287 
    288     bne         loop_16
    289 
    290     movi        d1, #0x00000000000000ff
    291 
    292     movi        d6, #0x00000000000000ff
    293 
    294     bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
    295     bsl         v3.8b,  v25.8b ,  v16.8b
    296 
    297     st1         {v18.8b, v19.8b}, [x2], x3
    298     sshr        d24, d24,#8
    299 
    300     st1         {v3.8b, v4.8b}, [x5], x3
    301     sshr        d25, d25,#8
    302 
    303     bsl         v1.8b,  v24.8b ,  v16.8b
    304     bsl         v6.8b,  v25.8b ,  v16.8b
    305 
    306     st1         {v1.8b, v2.8b}, [x2], x3
    307 
    308     st1         {v6.8b, v7.8b}, [x5], x3
    309 
    310     b           end_func
    311 
    312 
    313 blk_4_8:
    314     movi        d4, #0x00000000000000ff
    315     add         x6, x0, x5                  //&src[2nt]
    316 
    317     movi        d3, #0x00000000000000ff
    318     ldrb        w11, [x6], #1               //src[2nt]
    319     sxtw        x11,w11
    320 
    321     dup         v22.8b,w11                  //src[2nt]
    322     ldrb        w12, [x6]                   //src[2nt+1]
    323     sxtw        x12,w12
    324 
    325     ld1         {v16.8b},[x6]               //ld for repl to cols src[2nt+1+col(0:3 or 0:7)](0 ignored for st)
    326     sub         x6, x6, #9                  //subtract -9 to take it to src[2nt-1-row(15)]
    327 
    328     dup         v24.8b,w12                  //src[2nt+1]
    329     dup         v30.8h,w12
    330 
    331     ld1         {v26.8b},[x6],#8            //load src[2nt-1-row](rows 0:15)
    332 
    333     movi        d18, #0x00000000000000ff
    334     uhsub       v26.8b,  v26.8b ,  v22.8b   //(src[2nt-1-row] - src[2nt])>>1
    335     //vsubl.u8    q13, d26, d22
    336 
    337     //vshr.s16    q13, q13, #1
    338 
    339     movi        d19, #0x00000000000000ff
    340     sxtl        v26.8h, v26.8b
    341     //vaddl.s8    q0, d24, d26
    342     sqadd       v0.8h,  v26.8h ,  v30.8h
    343 
    344     sqxtun      v24.8b, v0.8h
    345     //vmovn.s16    d24, q0
    346 
    347     rev64       v24.8b,  v24.8b
    348 
    349     cmp         x4, #4
    350     beq         blk_4
    351 
    352     bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
    353 
    354     st1         {v18.8b},[x2], x3
    355     sshr        d24, d24,#8
    356 
    357     movi        d18, #0x00000000000000ff
    358 
    359     bsl         v19.8b,  v24.8b ,  v16.8b
    360 
    361     st1         {v19.8b},[x2], x3
    362     sshr        d24, d24,#8
    363 
    364     movi        d19, #0x00000000000000ff
    365 
    366     bsl         v3.8b,  v24.8b ,  v16.8b
    367 
    368     st1         {v3.8b},[x2], x3
    369     sshr        d24, d24,#8
    370 
    371     movi        d3, #0x00000000000000ff
    372 
    373     bsl         v4.8b,  v24.8b ,  v16.8b
    374 
    375     st1         {v4.8b},[x2], x3
    376     sshr        d24, d24,#8
    377 
    378     movi        d4, #0x00000000000000ff
    379 
    380     bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
    381 
    382     st1         {v18.8b},[x2], x3
    383     sshr        d24, d24,#8
    384 
    385     bsl         v19.8b,  v24.8b ,  v16.8b
    386 
    387     st1         {v19.8b},[x2], x3
    388     sshr        d24, d24,#8
    389 
    390     bsl         v3.8b,  v24.8b ,  v16.8b
    391 
    392     st1         {v3.8b},[x2], x3
    393     sshr        d24, d24,#8
    394 
    395     bsl         v4.8b,  v24.8b ,  v16.8b
    396 
    397     st1         {v4.8b},[x2], x3
    398     sshr        d24, d24,#8
    399 
    400     b           end_func
    401 
    402 
    403 blk_4:
    404     bsl         v18.8b,  v24.8b ,  v16.8b   //only select row values from q12(predpixel)
    405 
    406     st1         {v18.s}[0],[x2], x3
    407     sshr        d24, d24,#8
    408 
    409     bsl         v19.8b,  v24.8b ,  v16.8b
    410 
    411     st1         {v19.s}[0],[x2], x3
    412     sshr        d24, d24,#8
    413 
    414     bsl         v3.8b,  v24.8b ,  v16.8b
    415 
    416     st1         {v3.s}[0],[x2], x3
    417     sshr        d24, d24,#8
    418 
    419     bsl         v4.8b,  v24.8b ,  v16.8b
    420     st1         {v4.s}[0],[x2], x3
    421 
    422 
    423 end_func:
    424     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    425     ldp         x19, x20,[sp],#16
    426 
    427     ret
    428 
    429 
    430 
    431 
    432 
    433