Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_filters_dc.s
     22 //*
     23 //* @brief
     24 //*  contains function definitions for intra prediction dc filtering.
     25 //*  functions are coded in neon assembly for armv8 (aarch64) and are
     26 //*  written in gnu assembler syntax
     27 //*
     28 //*
     29 //* @author
     30 //*  akshaya mukund
     31 //*
     32 //* @par list of functions:
     33 //*
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* @brief
     44 //*    luma intraprediction filter for dc input
     45 //*
     46 //* @par description:
     47 //*
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
     60 //* @param[in] pi1_coeff
     61 //*  word8 pointer to the planar coefficients
     62 //*
     63 //* @param[in] nt
     64 //*  size of tranform block
     65 //*
     66 //* @param[in] mode
     67 //*  type of filtering
     68 //*
     69 //* @returns
     70 //*
     71 //* @remarks
     72 //*  none
     73 //*
     74 //*******************************************************************************
     75 //*/
     76 
     77 //void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
     78 //                              word32 src_strd,
     79 //                              uword8 *pu1_dst,
     80 //                              word32 dst_strd,
     81 //                              word32 nt,
     82 //                              word32 mode)
     83 //
     84 //**************variables vs registers*****************************************
     85 //x0 => *pu1_ref
     86 //x1 => src_strd
     87 //x2 => *pu1_dst
     88 //x3 => dst_strd
     89 
     90 //x4 => nt
     91 //x5 => mode (never read by this function)
     92 //all six arguments are passed in registers on aarch64; no argument is
     93 //read from the stack
     94 
     95 .text
     96 .align 4
     97 .include "ihevc_neon_macros.s"
     98 
     99 
    100 .globl ihevc_intra_pred_luma_dc_av8
    101 
    102 .type ihevc_intra_pred_luma_dc_av8, %function
    103 
    104 ihevc_intra_pred_luma_dc_av8:
    105 
    106     // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
    107 
    108     stp         x19, x20,[sp,#-16]!
    109 
    110 
    111 //********** testing
    112     //mov        x6, #128
    113     //b        prologue_cpy_32
    114 //********** testing
    115 
    116     mov         x11, #2                     //mov #2 to x11 (to be used to add to 2dc_val & 3dc_val)
    117     mov         x9, #0
    118     mov         v17.s[0], w11
    119     mov         v17.s[1], w9
    120 
    121     clz         w5,w4
    122 
    123     add         x6, x0, x4                  //&src[nt]
    124     sub         x20, x5, #32                //log2nt
    125     neg         x5, x20
    126     add         x7, x0, x4, lsl #1          //&src[2nt]
    127 
    128     add         x8, x7, #1                  //&src[2nt+1]
    129     mvn         x5, x5
    130     add         x5, x5, #1
    131     dup         v7.2s,w5
    132 
    133     ldrb        w14, [x8]
    134     sxtw        x14,w14
    135     shl         d7, d7,#32
    136 
    137     sub         x9, x7, #1                  //&src[2nt-1]
    138     sshr        d7, d7,#32
    139 
    140     mov         x7, x8                      //x7 also stores 2nt+1
    141 
    142     ldrb        w12, [x9]
    143     sxtw        x12,w12
    144     add         x14, x14, x12               //src[2nt+1] + src[2nt-1]
    145     add         x14, x14, x11               //src[2nt+1] + src[2nt-1] + 2
    146 
    147     cmp         x4, #4
    148     beq         dc_4
    149 
    150     mov         x10, x4                     //nt
    151 
    152 add_loop:
    153     ld1         {v0.8b},[x6],#8             //load from src[nt]
    154     mov         x5, #0                      //
    155     ld1         {v1.8b},[x8],#8             //load from src[2nt+1]
    156 
    157     uaddlp      v2.4h,  v0.8b
    158 
    159     mov         v6.s[0], w4
    160     mov         v6.s[1], w5                 //store nt to accumulate
    161     uaddlp      v3.4h,  v1.8b
    162 
    163     ld1         {v0.8b},[x6],#8             //load from src[nt] (extra load for 8)
    164 
    165     ld1         {v1.8b},[x8],#8             //load from src[2nt+1] (extra load for 8)
    166     add         v4.4h,  v2.4h ,  v3.4h
    167 
    168 
    169     uaddlp      v5.2s,  v4.4h
    170 
    171 
    172     uadalp      v6.1d,  v5.2s               //accumulate all inp into d6 (end for nt==8)
    173 
    174     subs        x10, x10,#8
    175     beq         epil_add_loop
    176 
    177 core_loop_add:
    178     uaddlp      v2.4h,  v0.8b
    179     subs        x10, x10,#8
    180     uaddlp      v3.4h,  v1.8b
    181 
    182 
    183 
    184     add         v4.4h,  v2.4h ,  v3.4h
    185     ld1         {v0.8b},[x6],#8             //load from src[nt] (extra load for 16)
    186 
    187     uaddlp      v5.2s,  v4.4h
    188     ld1         {v1.8b},[x8],#8             //load from src[2nt+1] (extra load for 16)
    189 
    190     uadalp      v6.1d,  v5.2s               //accumulate all inp into d6
    191     bne         core_loop_add
    192 
    193 epil_add_loop:
    194 
    195     sshl        d18, d6, d7                 //(dc_val) shr by log2nt+1
    196     cmp         x4, #32
    197 
    198     mov         v28.s[0], w14
    199     mov         v28.s[1], w5                //src[2nt+1]+2+src[2nt-1] moved to d28
    200     mov         x20,#128
    201     csel        x6, x20, x6,eq
    202 
    203     dup         v16.8b, v18.8b[0]           //dc_val
    204     shl         d25, d18,#1                 //2*dc
    205 
    206     beq         prologue_cpy_32
    207 
    208     add         d27,  d25 ,  d28            //src[2nt+1]+2+src[2nt-1]+2dc_val
    209     mov         x20,#0
    210     csel        x6, x20, x6,ne              //nt
    211 
    212     ushr        v29.4h, v27.4h,#2           //final dst[0]'s value in d15[0]
    213     csel        x10, x4, x10,ne
    214 
    215     add         d23,  d25 ,  d18            //3*dc
    216     sub         x12, x3, x3, lsl #3         //-7*strd
    217 
    218     add         d23,  d23 ,  d17            //3*dc + 2
    219     add         x12, x12, #8                //offset after one 8x8 block (-7*strd + 8)
    220 
    221     dup         v24.8h, v23.4h[0]           //3*dc + 2 (moved to all lanes)
    222     sub         x0, x3, x4                  //strd - nt
    223 
    224 prologue_col:
    225     //0th column and 0-7 rows done here
    226     //x8 and x9 (2nt+1+col 2nt-1-row)
    227 
    228     mov         x8, x7                      //&src[2nt+1]
    229 
    230     add         x0, x0, #8                  //strd - nt + 8
    231     ld1         {v0.8b},[x8],#8             //col 1::7 load (prol)
    232     sub         x9, x9, #7                  //&src[2nt-1-row]
    233 
    234     ld1         {v1.8b},[x9]                //row 7::1 (0 also) load (prol)
    235     sub         x9, x9, #8
    236 
    237     uxtl        v20.8h, v0.8b
    238 
    239     ld1         {v6.8b},[x8]                //col 8::15 load (prol extra)
    240     add         v20.8h,  v20.8h ,  v24.8h   //col 1::7 add 3dc+2 (prol)
    241 
    242     uxtl        v22.8h, v1.8b
    243     sqshrun     v2.8b, v20.8h,#2            //columns shx2 movn (prol)
    244 
    245     uxtl        v26.8h, v6.8b
    246     add         v22.8h,  v22.8h ,  v24.8h   //row 1::7 add 3dc+2 (prol)
    247 
    248     movi        d19, #0x00000000000000ff    //
    249     sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)
    250 
    251     bsl         v19.8b,  v29.8b ,  v2.8b    //first row with dst[0]
    252     add         v26.8h,  v26.8h ,  v24.8h   //col 8::15 add 3dc+2 (prol extra)
    253 
    254     rev64       v3.8b,  v3.8b
    255 
    256     st1         {v19.8b},[x2], x3           //store row 0 (prol)
    257     sshr        d3, d3,#8                   //row 0 shift (prol) (first value to be ignored)
    258 
    259     movi        d20, #0x00000000000000ff    //byte mask row 1 (prol)
    260 
    261 loop_again_col_row:
    262 
    263     bsl         v20.8b,  v3.8b ,  v16.8b    //row 1    (prol)
    264 
    265     movi        d21, #0x00000000000000ff    //byte mask row 2 (prol)
    266     sshr        d3, d3,#8                   //row 1 shift (prol)
    267 
    268     st1         {v20.8b},[x2], x3           //store row 1 (prol)
    269     sqshrun     v4.8b, v26.8h,#2            //columns shx2 movn (prol extra)
    270 
    271 
    272     bsl         v21.8b,  v3.8b ,  v16.8b    //row 2 (prol)
    273 
    274     movi        d20, #0x00000000000000ff    //byte mask row 3 (prol)
    275     sshr        d3, d3,#8                   //row 2 shift (prol)
    276 
    277     st1         {v21.8b},[x2], x3           //store row 2 (prol)
    278 
    279 
    280     bsl         v20.8b,  v3.8b ,  v16.8b    //row 3    (prol)
    281 
    282     movi        d21, #0x00000000000000ff    //byte mask row 4 (prol)
    283     sshr        d3, d3,#8                   //row 3 shift (prol)
    284 
    285     st1         {v20.8b},[x2], x3           //store row 3 (prol)
    286 
    287 
    288     bsl         v21.8b,  v3.8b ,  v16.8b    //row 4 (prol)
    289 
    290     movi        d20, #0x00000000000000ff    //byte mask row 5 (prol)
    291     sshr        d3, d3,#8                   //row 4 shift (prol)
    292 
    293     st1         {v21.8b},[x2], x3           //store row 4 (prol)
    294 
    295 
    296     bsl         v20.8b,  v3.8b ,  v16.8b    //row 5 (prol)
    297 
    298     movi        d21, #0x00000000000000ff    //byte mask row 6 (prol)
    299     sshr        d3, d3,#8                   //row 5 shift (prol)
    300 
    301     st1         {v20.8b},[x2], x3           //store row 5 (prol)
    302 
    303     ld1         {v1.8b},[x9]                //row 8::15 load (prol extra)
    304 
    305     bsl         v21.8b,  v3.8b ,  v16.8b    //row 6 (prol)
    306 
    307     uxtl        v22.8h, v1.8b
    308 
    309     movi        d20, #0x00000000000000ff    //byte mask row 7 (prol)
    310     sshr        d3, d3,#8                   //row 6 shift (prol)
    311 
    312     st1         {v21.8b},[x2], x3           //store row 6 (prol)
    313 
    314     bsl         v20.8b,  v3.8b ,  v16.8b    //row 7 (prol)
    315     add         v22.8h,  v22.8h ,  v24.8h   //row 8::15 add 3dc+2 (prol extra)
    316 
    317     sshr        d3, d3,#8                   //row 7 shift (prol)
    318     st1         {v20.8b},[x2], x12          //store row 7 (prol)
    319 
    320     subs        x10, x10, #8                //counter for cols
    321 
    322     beq         end_func
    323     blt         copy_16
    324 
    325 
    326     movi        d20, #0x00000000000000ff    //byte mask row 9 (prol)
    327     sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)
    328 
    329     rev64       v3.8b,  v3.8b
    330 
    331     st1         {v4.8b},[x2], x3            //store 2nd col (for 16x16)
    332 
    333     st1         {v16.8b},[x2], x3
    334     st1         {v16.8b},[x2], x3
    335     st1         {v16.8b},[x2], x3
    336     st1         {v16.8b},[x2], x3
    337     st1         {v16.8b},[x2], x3
    338     st1         {v16.8b},[x2], x3
    339     st1         {v16.8b},[x2], x0           //go to next row for 16
    340 
    341 
    342     bsl         v20.8b,  v3.8b ,  v16.8b    //row 9    (prol)
    343     subs        x10, x10, #8
    344 
    345     st1         {v20.8b},[x2], x3           //store row 9 (prol)
    346     sshr        d3, d3,#8                   //row 9 shift (prol)
    347 
    348     movi        d20, #0x00000000000000ff    //byte mask row 9 (prol)
    349 
    350     b           loop_again_col_row
    351 
    352 
    353 copy_16:
    354     st1         {v16.8b},[x2], x3
    355     st1         {v16.8b},[x2], x3
    356     st1         {v16.8b},[x2], x3
    357     st1         {v16.8b},[x2], x3
    358     st1         {v16.8b},[x2], x3
    359     st1         {v16.8b},[x2], x3
    360     st1         {v16.8b},[x2], x3
    361     st1         {v16.8b},[x2]
    362 
    363     b           end_func
    364 
    365 prologue_cpy_32:
    366     mov         x9, #128
    367     //sub        x7, x3, #-24
    368     add         x5, x2, x3
    369     add         x8, x5, x3
    370     add         x10, x8, x3
    371     dup         v20.16b, v16.8b[0]
    372     lsl         x6, x3, #2
    373     add         x6, x6, #-16
    374 
    375     st1         {v20.16b}, [x2],#16
    376     st1         {v20.16b}, [x5],#16
    377     st1         {v20.16b}, [x8],#16
    378     st1         {v20.16b}, [x10],#16
    379 
    380     st1         {v20.16b}, [x2], x6
    381     st1         {v20.16b}, [x5], x6
    382     st1         {v20.16b}, [x8], x6
    383     st1         {v20.16b}, [x10], x6
    384 
    385     sub         x9, x9, #32                 //32x32 prol/epil counter dec
    386 
    387 kernel_copy:
    388     st1         {v20.16b}, [x2],#16
    389     st1         {v20.16b}, [x5],#16
    390     st1         {v20.16b}, [x8],#16
    391     st1         {v20.16b}, [x10],#16
    392 
    393     st1         {v20.16b}, [x2], x6
    394     st1         {v20.16b}, [x5], x6
    395     st1         {v20.16b}, [x8], x6
    396     st1         {v20.16b}, [x10], x6
    397 
    398     subs        x9, x9, #32
    399 
    400     st1         {v20.16b}, [x2],#16
    401     st1         {v20.16b}, [x5],#16
    402     st1         {v20.16b}, [x8],#16
    403     st1         {v20.16b}, [x10],#16
    404 
    405     st1         {v20.16b}, [x2], x6
    406     st1         {v20.16b}, [x5], x6
    407     st1         {v20.16b}, [x8], x6
    408     st1         {v20.16b}, [x10], x6
    409 
    410     bne         kernel_copy
    411 
    412 epilogue_copy:
    413     st1         {v20.16b}, [x2],#16
    414     st1         {v20.16b}, [x5],#16
    415     st1         {v20.16b}, [x8],#16
    416     st1         {v20.16b}, [x10],#16
    417 
    418     st1         {v20.16b}, [x2]
    419     st1         {v20.16b}, [x5]
    420     st1         {v20.16b}, [x8]
    421     st1         {v20.16b}, [x10]
    422 
    423     b           end_func
    424 
    425 
    426 dc_4:
    427     ld1         {v0.8b},[x6],#8             //load from src[nt]
    428     ld1         {v1.8b},[x8],#8             //load from src[2nt+1]
    429 
    430     uaddlp      v2.4h,  v0.8b
    431     mov         x5, #0                      //
    432     mov         v6.s[0], w4
    433     mov         v6.s[1], w5                 //store nt to accumulate
    434     uaddlp      v3.4h,  v1.8b
    435 
    436     add         v4.4h,  v2.4h ,  v3.4h
    437 
    438 
    439     uaddlp      v5.2s,  v4.4h
    440     movi        d30, #0x00000000ffffffff
    441 
    442     and         v5.8b,  v5.8b ,  v30.8b
    443 
    444     mov         v28.s[0], w14
    445     mov         v28.s[1], w5                //src[2nt+1]+2+src[2nt-1] moved to d28
    446     add         d6,  d6 ,  d5               //accumulate all inp into d6 (end for nt==8)
    447 
    448     sshl        d18, d6, d7                 //(dc_val) shr by log2nt+1
    449     mov         x8, x7                      //&src[2nt+1]
    450 
    451     shl         d25, d18,#1                 //2*dc
    452     sub         x9, x9, #3                  //&src[2nt-1-row]
    453 
    454     dup         v16.8b, v18.8b[0]           //dc_val
    455     add         d27,  d25 ,  d28            //src[2nt+1]+2+src[2nt-1]+2dc_val
    456 
    457     ushr        v29.4h, v27.4h,#2           //final dst[0]'s value in d15[0]
    458     sub         x12, x3, x3, lsl #2         //-3*strd
    459     add         d23,  d25 ,  d18            //3*dc
    460 
    461     add         d23,  d23 ,  d17            //3*dc + 2
    462     add         x12, x12, #4                //offset after one 4x4 block (-3*strd + 4)
    463 
    464     dup         v24.8h, v23.4h[0]           //3*dc + 2 (moved to all lanes)
    465     sub         x0, x3, x4                  //strd - nt
    466 
    467 
    468     ld1         {v0.8b},[x8]                //col 1::3 load (prol)
    469     ld1         {v1.8b},[x9]                //row 3::1 (0 also) load (prol)
    470 
    471     uxtl        v20.8h, v0.8b
    472 
    473     uxtl        v22.8h, v1.8b
    474     add         v20.8h,  v20.8h ,  v24.8h   //col 1::7 add 3dc+2 (prol)
    475 
    476     add         v22.8h,  v22.8h ,  v24.8h   //row 1::7 add 3dc+2 (prol)
    477 
    478     movi        d19, #0x00000000000000ff    //
    479     sqshrun     v2.8b, v20.8h,#2            //columns shx2 movn (prol)
    480 
    481     movi        d20, #0x00000000000000ff    //byte mask row 1 (prol)
    482     sqshrun     v3.8b, v22.8h,#2            //rows shx2 movn (prol)
    483 
    484 
    485     bsl         v19.8b,  v29.8b ,  v2.8b    //first row with dst[0]
    486 
    487     rev64       v3.8b,  v3.8b
    488 
    489     st1         {v19.s}[0],[x2], x3         //store row 0 (prol)
    490     sshr        d3, d3,#40                  //row 0 shift (prol) (first value to be ignored)
    491 
    492     movi        d21, #0x00000000000000ff    //byte mask row 2 (prol)
    493 
    494     bsl         v20.8b,  v3.8b ,  v16.8b    //row 1    (prol)
    495     sshr        d3, d3,#8                   //row 1 shift (prol)
    496 
    497     st1         {v20.s}[0],[x2], x3         //store row 1 (prol)
    498 
    499     bsl         v21.8b,  v3.8b ,  v16.8b    //row 2 (prol)
    500 
    501     movi        d20, #0x00000000000000ff    //byte mask row 3 (prol)
    502 
    503     sshr        d3, d3,#8                   //row 2 shift (prol)
    504     st1         {v21.s}[0],[x2], x3         //store row 2 (prol)
    505 
    506     bsl         v20.8b,  v3.8b ,  v16.8b    //row 3    (prol)
    507     st1         {v20.s}[0],[x2]             //store row 3 (prol)
    508 
    509 epilogue_end:
    510 end_func:
    511     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    512     ldp         x19, x20,[sp],#16
    513 
    514     ret
    515 
    516 
    517 
    518 
    519 
    520