Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_filters_planar.s
     22 //*
     23 //* @brief
     24 //*  contains function definitions for planar intra prediction.
     25 //* functions are coded using neon  assembly and can be compiled using
     26 
     27 //* rvct
     28 //*
     29 //* @author
     30 //*  akshaya mukund
     31 //*
     32 //* @par list of functions:
     33 //*
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* @brief
     44 //*    chroma intraprediction filter for planar input
     45 //*
     46 //* @par description:
     47 //*
     48 //* @param[in] pu1_ref
     49 //*  uword8 pointer to the source
     50 //*
     51 //* @param[out] pu1_dst
     52 //*  uword8 pointer to the destination
     53 //*
     54 //* @param[in] src_strd
     55 //*  integer source stride
     56 //*
     57 //* @param[in] dst_strd
     58 //*  integer destination stride
     59 //*
     60 //* @param[in] pi1_coeff
     61 //*  word8 pointer to the planar coefficients
     62 //*
     63 //* @param[in] nt
     64 //*  size of tranform block
     65 //*
     66 //* @param[in] mode
     67 //*  type of filtering
     68 //*
     69 //* @returns
     70 //*
     71 //* @remarks
     72 //*  none
     73 //*
     74 //*******************************************************************************
     75 //*/
     76 
     77 //void ihevc_intra_pred_chroma_planar_av8(uword8* pu1_ref,
     78 //                                        word32 src_strd,
     79 //                                        uword8* pu1_dst,
     80 //                                        word32 dst_strd,
     81 //                                        word32 nt,
     82 //                                        word32 mode,
     83 //                                        word32 pi1_coeff)
     84 //**************variables vs registers*****************************************
     85 //x0 => *pu1_ref
     86 //x1 => src_strd
     87 //x2 => *pu1_dst
     88 //x3 => dst_strd
     89 
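     90 //x4 => nt (read as w4 by the code)
     91 //x5 => mode (per the prototype order; overwritten below without being read)
     92 //x6 => pi1_coeff (per the prototype order; overwritten below without being read)
     93 //no argument is read from the stack in this aarch64 version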
     94 
     95 .text                                       // emit into the code section
     96 .align 4                                    // NOTE(review): GNU as treats this as 2^4 = 16-byte alignment on AArch64 - confirm for the toolchain in use
     97 .include "ihevc_neon_macros.s"              // shared macro file; no macro is invoked in the code visible in this file
     98 
     99 
    100 .globl ihevc_intra_pred_chroma_planar_av8   // export the entry point
    101 .extern gau1_ihevc_planar_factor            // coefficient table defined elsewhere; its address is loaded through the GOT
    102 
    103 
    104 .type ihevc_intra_pred_chroma_planar_av8, %function // mark the symbol as a function
    105 
    106 ihevc_intra_pred_chroma_planar_av8:
        // Planar intra prediction for an nt x nt block whose samples are byte
        // pairs (presumably interleaved Cb/Cr - confirm against the caller).
        // One output row is therefore 2*nt bytes.
        //
        // Registers as used by the code below:
        //   x0 = pu1_ref   reference array (indexed in byte pairs, hence the lsl #1)
        //   x2 = pu1_dst   destination pointer
        //   x3 = dst_strd  added to x2 after every row store
        //   w4 = nt        block size
        //   x1, x5, x6     overwritten before being read
        //
        // Every output byte is computed as
        //   ( (row+1)    * src[nt-1]
        //   + (nt-1-row) * src[2nt+1+col]
        //   + (col+1)    * src[3nt+1]
        //   + (nt-1-col) * src[2nt-1-row]
        //   + nt ) >> (32 - clz(nt))
        // with src[] indices counted in byte pairs. The nt == 4 path uses a
        // rounding narrow shift by 3 instead of the explicit add and shift.
        //
        // Paths: nt == 4 goes to tf_sz_4 (one 8-byte row per iteration).
        // Any other nt goes to tf_sz_8_16, which writes 16 bytes per row,
        // four rows per iteration; when nt == 16 a second pass over the same
        // rows writes the next 16 bytes of each row.
    107 
    108     // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
    109 
    110     stp         d10,d11,[sp,#-16]!          //save d10/d11: v10 and v11 are written below
    111     stp         d12,d13,[sp,#-16]!          //save d12/d13: v12 and v13 are written below
    112     stp         d8,d14,[sp,#-16]!           // Storing d14 using { sub sp,sp,#8; str d14,[sp] } is giving bus error.
    113                                             // d8 is used as dummy register and stored along with d14 using stp. d8 is not used in the function.
        // (an 8-byte push would leave sp off the 16-byte alignment required
        // for sp-based memory accesses; pushing a pair keeps sp aligned)
    114     stp         x19, x20,[sp,#-16]!         //save x19/x20; only x20 is written below
    115 
    116     adrp        x11, :got:gau1_ihevc_planar_factor //page address of the GOT slot for the coefficient table
    117     ldr         x11, [x11, #:got_lo12:gau1_ihevc_planar_factor] //x11 = address of gau1_ihevc_planar_factor
    118 
    119     clz         w5,w4                       //w5 = number of leading zero bits in nt
    120     sub         x20, x5, #32                //x20 = clz(nt) - 32
    121     neg         x5, x20                     //x5 = 32 - clz(nt): the right-shift amount
    122     dup         v14.8h,w5                   //shift amount in every 16-bit lane
    123     neg         v14.8h, v14.8h              //negated, so the sshl instructions below shift right
    124     dup         v2.8b,w4                    //nt in every byte lane (used to form nt-(col+1))
    125     dup         v16.8h,w4                   //nt in every 16-bit lane (added before the shift)
    126 
    127     sub         x6, x4, #1                  //nt-1
    128     add         x6, x0,x6,lsl #1            //x6 = pu1_ref + 2*(nt-1)
    129     ldr         w7,  [x6]                   //32-bit load; only the low 16 bits (one byte pair) are used
    130     sxtw        x7,w7                       //leaves the low 16 bits consumed by the dup unchanged
    131     dup         v0.4h,w7                    //src[nt-1] pair replicated to all four 16-bit lanes
    132 
    133     add         x6, x4, x4,lsl #1           //3nt
    134     add         x6, x6, #1                  //3nt + 1
    135     lsl         x6,x6,#1                    //2*(3nt + 1)
    136 
    137     add         x6, x6, x0                  //x6 = pu1_ref + 2*(3nt+1)
    138     ldr         w7,  [x6]                   //32-bit load; only the low 16 bits (one byte pair) are used
    139     sxtw        x7,w7                       //leaves the low 16 bits consumed by the dup unchanged
    140     dup         v1.4h,w7                    //src[3nt+1] pair replicated to all four 16-bit lanes
    141 
    142 
    143     add         x6, x4, x4                  //2nt
    144     add         x14, x6, #1                 //2nt+1
    145     lsl         x14,x14,#1                  //2*(2nt+1)
    146     sub         x6, x6, #1                  //2nt-1
    147     lsl         x6,x6,#1                    //2*(2nt-1)
    148     add         x6, x6, x0                  //&src[2nt-1]
    149     add         x14, x14, x0                //&src[2nt+1]
    150 
    151     mov         x8, #1                      //row+1 (row is first 0)
    152     sub         x9, x4, x8                  //nt-1-row (row is first 0)
    153 
    154     dup         v5.8b,w8                    //row + 1
    155     dup         v6.8b,w9                    //nt - 1 - row
    156     mov         v7.8b, v5.8b                //v7 = 1 per lane: step used to increment row+1 and decrement nt-1-row
    157 
    158     add         x12, x11, #1                //coefficient read pointer, starting at table byte 1
    159     mov         x1, x4                      //row counter, counts down from nt
    160     mov         x5, x2                      //copy of dst; x5 is not read again below
    161     mov         x10, #8                     //post-increment used by the coefficient load on the nt == 4 path
    162     mov         x0, x14                     //copy of &src[2nt+1]; x0 is not read again below
    163 
    164     cmp         x4, #4
    165     beq         tf_sz_4                     //nt == 4 has its own path
    166 
    167 
    168 
    169     mov         x10,x6                      //remember &src[2nt-1] so the second pass (nt == 16) can restore x6
    170 tf_sz_8_16:
    171     ld1         {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col]
    172     ld1         {v17.8b},[x12],#8           //load 8 coefficients (col+1), pointer advances by 8
    173     mov         v25.8b, v17.8b
    174     zip1        v29.8b, v17.8b, v25.8b      //coefficients 0-3, each duplicated for the two bytes of a pair
    175     zip2        v25.8b, v17.8b, v25.8b      //coefficients 4-7, each duplicated likewise
    176     mov         v17.d[0], v29.d[0]          //v17 = duplicated coefficients 0-3 (bytes 0-7 of the row)
    177     sub         v30.8b,  v2.8b ,  v17.8b    //[nt-1-col]
    178     sub         v31.8b,  v2.8b ,  v25.8b    //[nt-1-col] for bytes 8-15 of the row
    179 
    180 
    181 
    182 
    183 loop_sz_8_16:
        // One iteration produces four rows of 16 bytes. The work for the four
        // rows is interleaved; accumulators per row (bytes 0-7 / bytes 8-15):
        //   row r   : v12 / v28      row r+1 : v26 / v24
        //   row r+2 : v22 / v20      row r+3 : v12 / v28 (reused)
        // v4 holds the left-column pair for rows r and r+2, v3 for r+1 and r+3.
    184 
    185     ldr         w7,  [x6], #-2              //src[2nt-1-row] (dec to take into account row)
    186     sxtw        x7,w7
    187     umull       v12.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
    188     ldr         w11,  [x6], #-2             //src[2nt-1-row] (dec to take into account row)
    189     sxtw        x11,w11
    190     umlal       v12.8h, v6.8b, v10.8b       //(nt-1-row)    *    src[2nt+1+col]
    191     dup         v4.4h,w7                    //src[2nt-1-row]
    192     umlal       v12.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
    193     dup         v3.4h,w11                   //src[2nt-1-row]
    194     umlal       v12.8h, v30.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
    195 
    196 
    197 
    198     umull       v28.8h, v5.8b, v0.8b        //row r, bytes 8-15: (row+1) * src[nt-1]
    199     ldr         w7,  [x6], #-2              //src[2nt-1-row] (dec to take into account row)
    200     sxtw        x7,w7
    201     umlal       v28.8h, v6.8b, v11.8b       //(nt-1-row) * src[2nt+1+col]
    202     add         v18.8b,  v5.8b ,  v7.8b     //row++ [(row+1)++]
    203 
    204 
    205     umlal       v28.8h, v31.8b, v4.8b       //(nt-1-col) * src[2nt-1-row]
    206     sub         v19.8b,  v6.8b ,  v7.8b     //[nt-1-row]--
    207     umlal       v28.8h, v25.8b, v1.8b       //(col+1) * src[3nt+1]
    208     dup         v4.4h,w7                    //src[2nt-1-row]
    209 
    210     umull       v26.8h, v18.8b, v0.8b       //(row+1)    *    src[nt-1]
    211     add         v12.8h,  v12.8h ,  v16.8h   //add (nt)
    212     umlal       v26.8h, v19.8b, v10.8b      //(nt-1-row)    *    src[2nt+1+col]
    213     sshl        v12.8h, v12.8h, v14.8h      //shr
    214     umlal       v26.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
    215     add         v28.8h,  v28.8h ,  v16.8h   //add (nt)
    216     umlal       v26.8h, v30.8b, v3.8b       //(nt-1-col)    *    src[2nt-1-row]
    217     sshl        v28.8h, v28.8h, v14.8h      //shr
    218 
    219 
    220 
    221 
    222 
    223     umull       v24.8h, v18.8b, v0.8b       //row r+1, bytes 8-15: (row+1) * src[nt-1]
    224     add         v5.8b,  v18.8b ,  v7.8b     //row++ [(row+1)++]
    225     umlal       v24.8h, v19.8b, v11.8b      //(nt-1-row) * src[2nt+1+col]
    226     sub         v6.8b,  v19.8b ,  v7.8b     //[nt-1-row]--
    227     umlal       v24.8h, v25.8b, v1.8b       //(col+1) * src[3nt+1]
    228     xtn         v12.8b,  v12.8h             //row r, bytes 0-7 narrowed to 8 bits
    229     umlal       v24.8h, v31.8b, v3.8b       //(nt-1-col) * src[2nt-1-row]
    230     xtn         v13.8b,  v28.8h             //row r, bytes 8-15 narrowed to 8 bits
    231 
    232 
    233 
    234 
    235     add         v26.8h,  v26.8h ,  v16.8h   //add (nt)
    236     umull       v22.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
    237     sshl        v26.8h, v26.8h, v14.8h      //shr
    238     umlal       v22.8h, v6.8b, v10.8b       //(nt-1-row)    *    src[2nt+1+col]
    239     st1         {v12.2s, v13.2s}, [x2], x3  //store 16 bytes of row r, then dst += dst_strd
    240     umlal       v22.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
    241     add         v24.8h,  v24.8h ,  v16.8h   //add (nt)
    242     umlal       v22.8h, v30.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
    243     sshl        v24.8h, v24.8h, v14.8h      //shr
    244 
    245     umull       v20.8h, v5.8b, v0.8b        //row r+2, bytes 8-15: (row+1) * src[nt-1]
    246     add         v18.8b,  v5.8b ,  v7.8b     //row++ [(row+1)++]
    247     umlal       v20.8h, v6.8b, v11.8b       //(nt-1-row) * src[2nt+1+col]
    248     sub         v19.8b,  v6.8b ,  v7.8b     //[nt-1-row]--
    249     umlal       v20.8h, v31.8b, v4.8b       //(nt-1-col) * src[2nt-1-row]
    250 
    251     ldr         w11,  [x6], #-2             //src[2nt-1-row] (dec to take into account row)
    252     sxtw        x11,w11
    253     umlal       v20.8h, v25.8b, v1.8b       //(col+1) * src[3nt+1]
    254     dup         v3.4h,w11                   //src[2nt-1-row]
    255     add         v22.8h,  v22.8h ,  v16.8h   //add (nt)
    256 
    257     umull       v12.8h, v18.8b, v0.8b       //(row+1)    *    src[nt-1]
    258     xtn         v26.8b,  v26.8h             //row r+1, bytes 0-7 narrowed to 8 bits
    259     umlal       v12.8h, v19.8b, v10.8b      //(nt-1-row)    *    src[2nt+1+col]
    260     xtn         v27.8b,  v24.8h             //row r+1, bytes 8-15 narrowed to 8 bits
    261 
    262     umlal       v12.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
    263     sshl        v22.8h, v22.8h, v14.8h      //shr
    264 
    265     umlal       v12.8h, v30.8b, v3.8b       //(nt-1-col)    *    src[2nt-1-row]
    266     add         v20.8h,  v20.8h ,  v16.8h   //add (nt)
    267 
    268     umull       v28.8h, v18.8b, v0.8b       //row r+3, bytes 8-15: (row+1) * src[nt-1]
    269     st1         {v26.2s, v27.2s}, [x2], x3  //store 16 bytes of row r+1, then dst += dst_strd
    270 
    271     umlal       v28.8h, v19.8b, v11.8b      //(nt-1-row) * src[2nt+1+col]
    272     add         v5.8b,  v18.8b ,  v7.8b     //row++ [(row+1)++]
    273 
    274     sub         v6.8b,  v19.8b ,  v7.8b     //[nt-1-row]--
    275     umlal       v28.8h, v25.8b, v1.8b       //(col+1) * src[3nt+1]
    276 
    277     umlal       v28.8h, v31.8b, v3.8b       //(nt-1-col) * src[2nt-1-row]
    278     sshl        v20.8h, v20.8h, v14.8h      //shr
    279 
    280 
    281     add         v12.8h,  v12.8h ,  v16.8h   //add (nt)
    282     xtn         v22.8b,  v22.8h             //row r+2, bytes 0-7 narrowed to 8 bits
    283 
    284 
    285     add         v28.8h,  v28.8h ,  v16.8h   //add (nt)
    286     xtn         v23.8b,  v20.8h             //row r+2, bytes 8-15 narrowed to 8 bits
    287 
    288 
    289     sshl        v12.8h, v12.8h, v14.8h      //shr
    290     st1         {v22.2s, v23.2s}, [x2], x3  //store 16 bytes of row r+2, then dst += dst_strd
    291     sshl        v28.8h, v28.8h, v14.8h      //shr
    292 
    293 
    294 
    295 
    296 
    297     xtn         v20.8b,  v12.8h             //row r+3, bytes 0-7 narrowed to 8 bits
    298     xtn         v21.8b,  v28.8h             //row r+3, bytes 8-15 narrowed to 8 bits
    299 
    300     st1         {v20.2s, v21.2s}, [x2], x3  //store 16 bytes of row r+3, then dst += dst_strd
    301 
    302 
    303     subs        x1, x1, #4                  //four rows done
    304 
    305     bne         loop_sz_8_16
    306 
    307 
    308 
    309 
    310     cmp         x4,#16                      //flags from this compare are also consumed by the beq further down
    311 
    312     bne         end_loop                    //done unless nt == 16 and the second pass is still pending
    313 
    314 
        // Second pass for nt == 16: same 16 rows, next 16 bytes of each row.
    315     sub         x4, x4,#16                  //x4 = 0 so the compare above fails after the second pass
    316     dup         v5.8b,w8                    //row + 1
    317     dup         v6.8b,w9                    //nt - 1 - row
    318     mov         v7.8b, v5.8b                //v7 = 1 per lane: step used to increment row+1 and decrement nt-1-row
    319 
    320     mov         x6,x10                      //restore &src[2nt-1]
    321     mov         x1,#16                      //row counter for the second pass
    322     sub         x2,x2,x3,lsl #4             //rewind dst by 16 rows
    323     add         x2,x2,#16                   //advance dst by 16 bytes within the row
    324 
    325     ld1         {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col]
    326     ld1         {v17.8b},[x12],#8           //next 8 coefficients (col+1)
    327     mov         v25.8b, v17.8b
    328     zip1        v29.8b, v17.8b, v25.8b      //first four coefficients, each duplicated for the two bytes of a pair
    329     zip2        v25.8b, v17.8b, v25.8b      //last four coefficients, each duplicated likewise
    330     mov         v17.d[0], v29.d[0]
    331     sub         v30.8b,  v2.8b ,  v17.8b    //[nt-1-col]
    332     sub         v31.8b,  v2.8b ,  v25.8b    //[nt-1-col] for bytes 8-15 of this pass
    333 
    334     beq         loop_sz_8_16                //always taken here: none of the instructions since the cmp writes the flags
    335 
    336 
    337 
    338 tf_sz_4:
    339     ld1         {v10.8b},[x14]              //load src[2nt+1+col]
    340     ld1         {v17.8b},[x12], x10         //load 8 coeffs [col+1]
    341     mov         v25.8b, v17.8b
    342     zip1        v29.8b, v17.8b, v25.8b      //coefficients 0-3, each duplicated for the two bytes of a pair
    343     zip2        v25.8b, v17.8b, v25.8b      //result is overwritten by the sub inside the loop
    344     mov         v17.d[0], v29.d[0]          //v17 = duplicated coefficients 0-3
    345 loop_sz_4:
        // One iteration produces one 8-byte row.
    346     //mov        x10, #4                @reduce inc to #4 for 4x4
    347     ldr         w7,  [x6], #-2              //src[2nt-1-row] (dec to take into account row)
    348     sxtw        x7,w7
    349     dup         v4.4h,w7                    //src[2nt-1-row]
    350 
    351     sub         v25.8b,  v2.8b ,  v17.8b    //[nt-1-col]
    352 
    353     umull       v12.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
    354     umlal       v12.8h, v6.8b, v10.8b       //(nt-1-row)    *    src[2nt+1+col]
    355     umlal       v12.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
    356     umlal       v12.8h, v25.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
    357 //    vadd.i16    q6, q6, q8            @add (nt)
    358 //    vshl.s16     q6, q6, q7            @shr
    359 //    vmovn.i16     d12, q6
    360     rshrn       v12.8b, v12.8h,#3           //rounding shift right by 3 with narrowing: (sum + 4) >> 3
    361 
    362     st1         {v12.2s},[x2], x3           //store the 8-byte row, then dst += dst_strd
    363 
    364     add         v5.8b,  v5.8b ,  v7.8b      //row++ [(row+1)++]
    365     sub         v6.8b,  v6.8b ,  v7.8b      //[nt-1-row]--
    366     subs        x1, x1, #1                  //one row done
    367 
    368     bne         loop_sz_4
    369 
    370 end_loop:
    371     // ldmfd sp!,{x4-x12,x15}                   //reload the registers from sp
    372     ldp         x19, x20,[sp],#16           //restore in reverse order of the saves at entry
    373     ldp         d8,d14,[sp],#16             // Loading d14 using { ldr d14,[sp]; add sp,sp,#8 } is giving bus error.
    374                                             // d8 is used as dummy register and loaded along with d14 using ldp. d8 is not used in the function.
    375     ldp         d12,d13,[sp],#16
    376     ldp         d10,d11,[sp],#16
    377     ret
    378 
    379 
    380 
    381 
    382 
    383 
    384 
    385