Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* //file
     21 //*  ihevc_inter_pred_chroma_vert_neon.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions for inter prediction  interpolation.
     25 //* functions are coded in neon assembly and can be compiled using
     26 
     27 //* rvct
     28 //*
     29 //* //author
     30 //*  yogeswaran rs
     31 //*
     32 //* //par list of functions:
     33 //*  - ihevc_inter_pred_chroma_vert_av8()
     34 //*
     35 //* //remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 ///**
     42 //*******************************************************************************
     43 //*
     44 //* //brief
     45 //*   chroma interprediction filter for vertical input
     46 //*
     47 //* //par description:
     48 //*    applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
     49 //*    the elements pointed by 'pu1_src' and  writes to the location pointed by
     50 //*    'pu1_dst'  the output is down shifted by 6 and clipped to 8 bits
     51 //*    assumptions : the function is optimized assuming that width is a
     52 //*    multiple of 2, 4 or 8 and that height is a multiple of 2;
     53 //*    widths of 4 and 8 are optimized further
     54 //*
     55 //* //param[in] pu1_src
     56 //*  uword8 pointer to the source
     57 //*
     58 //* //param[out] pu1_dst
     59 //*  uword8 pointer to the destination
     60 //*
     61 //* //param[in] src_strd
     62 //*  integer source stride
     63 //*
     64 //* //param[in] dst_strd
     65 //*  integer destination stride
     66 //*
     67 //* //param[in] pi1_coeff
     68 //*  word8 pointer to the filter coefficients
     69 //*
     70 //* //param[in] ht
     71 //*  integer height of the array
     72 //*
     73 //* //param[in] wd
     74 //*  integer width of the array
     75 //*
     76 //* //returns
     77 //*
     78 //* //remarks
     79 //*  none
     80 //*
     81 //*******************************************************************************
     82 //*/
     83 //void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
     84 //                                   uword8 *pu1_dst,
     85 //                                   word32 src_strd,
     86 //                                   word32 dst_strd,
     87 //                                   word8 *pi1_coeff,
     88 //                                   word32 ht,
     89 //                                   word32 wd)
     90 //**************variables vs registers*****************************************
     91 //x0 => *pu1_src
     92 //x1 => *pu1_dst
     93 //x2 =>  src_strd
     94 //x3 =>  dst_strd
     95 .text                                   // emit the following code into the .text section
     96 .align 4                                // alignment request; with GNU as on AArch64 the operand is a power of two (2^4 = 16 bytes)
     97 
     98 .include "ihevc_neon_macros.s"          // pulls in the project's shared macro file (its contents are not visible here)
     99 
    .globl      ihevc_inter_pred_chroma_vert_av8
    .type       ihevc_inter_pred_chroma_vert_av8, %function

//------------------------------------------------------------------------------
// void ihevc_inter_pred_chroma_vert_av8(uword8 *pu1_src,     // x0
//                                       uword8 *pu1_dst,     // x1
//                                       word32  src_strd,    // w2
//                                       word32  dst_strd,    // w3
//                                       word8  *pi1_coeff,   // x4
//                                       word32  ht,          // w5
//                                       word32  wd)          // w6
//
// ABI   : AAPCS64, leaf function.
//
// Purpose: 4-tap vertical filter over 8-bit samples. With s[k] the source byte
//          k rows below the output position and c0..c3 = pi1_coeff[0..3]:
//
//              sum = |c1|*s[0] + |c2|*s[1] - |c0|*s[-1] - |c3|*s[2]
//              dst = saturate_u8((sum + 32) >> 6)
//
//          The absolute value of every coefficient is taken and the signs
//          (-,+,+,-) are hard wired, so the code presumably expects c0,c3 <= 0
//          and c1,c2 >= 0 - confirm against the coefficient tables.
//          Each row is treated as 2*wd bytes wide.
//
// Paths  : (wd & 3) != 0            -> outer_loop_wd_2 : 4 bytes x 2 rows / pass
//          (wd & 3) == 0, ht&7 == 0 -> core_loop_ht_8  : 8 bytes x 4 rows / pass,
//                                      software pipelined (prolog/kernel_8/epilog)
//          (wd & 3) == 0, otherwise -> inner_loop_ht_2 : 8 bytes x 2 rows / pass
//          ht <= 0 or wd <= 0       -> returns without touching pu1_dst
//
// Notes  : The word32 arguments are sign-extended on entry because AAPCS64
//          leaves bits [63:32] of a 32-bit argument register unspecified.
//          8 bytes are read from pi1_coeff although only the first 4 are used.
//
// Clobbers: x0-x12, x14, v0-v7, v16-v18, v24, v26, v28, v30, NZCV.
//           x19/x20 are saved and restored (x20 is used as a scratch register).
//------------------------------------------------------------------------------
ihevc_inter_pred_chroma_vert_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is scratch below; x19 only completes the pair

    sxtw        x2,w2                       //src_strd : sign-extend the word32 argument
    sxtw        x3,w3                       //dst_strd : sign-extend the word32 argument
    mov         x12,x4                      //x12 = pi1_coeff
    sxtw        x4,w5                       //x4  = ht
    sxtw        x6,w6                       //x6  = wd

    cmp         x4,#0                       //checks ht <= 0
    sub         x0,x0,x2                    //pu1_src - src_strd (row of the first tap)
    ld1         {v0.8b},[x12]               //loads pi1_coeff (lanes 0..3 are used)

    ble         end_loops                   //nothing to do when ht <= 0

    cmp         x6,#0                       //checks wd <= 0
    ble         end_loops                   //nothing to do when wd <= 0

    tst         x6,#3                       //checks (wd & 3); tst clears V, so gt <=> result != 0
    abs         v3.8b, v0.8b                //vabs_s8(coeff)
    lsl         x10,x6,#1                   //x10 = 2*wd = bytes per row
    dup         v0.8b, v3.b[0]              //coeffabs_0
    dup         v1.8b, v3.b[1]              //coeffabs_1
    dup         v2.8b, v3.b[2]              //coeffabs_2
    dup         v3.8b, v3.b[3]              //coeffabs_3 (last: overwrites the source vector)

    bgt         outer_loop_wd_2             //wd is not a multiple of 4

    tst         x4,#7                       //checks ht for mul of 8
    beq         core_loop_ht_8              //when height is multiple of 8

    lsl         x7,x3,#1                    //2*dst_strd
    sub         x9,x7,x10                   //2*dst_strd - 2wd
    lsl         x12,x2,#1                   //2*src_strd
    sub         x8,x12,x10                  //2*src_strd - 2wd
    mov         x5,x10                      //x5 = bytes left in the current row pair

inner_loop_ht_2:                            //wd multiple of 4, ht not a multiple of 8: 8 bytes x 2 rows

    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v17.8b},[x6],x2            //row 0
    subs        x5,x5,#8                    //2wd - 8; flags consumed by the bgt after the stores
    ld1         {v5.8b},[x0],#8             //row -1, advance src by 8 bytes
    umull       v6.8h, v17.8b, v1.8b        //out0  = row0 * coeffabs_1
    ld1         {v4.8b},[x6],x2             //row 1
    umlsl       v6.8h, v5.8b, v0.8b         //out0 -= row-1 * coeffabs_0
    ld1         {v16.8b},[x6],x2            //row 2
    umlal       v6.8h, v4.8b, v2.8b         //out0 += row1 * coeffabs_2
    umull       v4.8h, v4.8b, v1.8b         //out1  = row1 * coeffabs_1
    umlsl       v6.8h, v16.8b, v3.8b        //out0 -= row2 * coeffabs_3
    umlsl       v4.8h, v17.8b, v0.8b        //out1 -= row0 * coeffabs_0
    ld1         {v18.8b},[x6]               //row 3
    umlal       v4.8h, v16.8b, v2.8b        //out1 += row2 * coeffabs_2
    sqrshrun    v6.8b, v6.8h,#6             //round, shift right by 6, saturate to u8
    umlsl       v4.8h, v18.8b, v3.8b        //out1 -= row3 * coeffabs_3
    add         x6,x1,x3                    //pu1_dst + dst_strd
    sqrshrun    v4.8b, v4.8h,#6             //round, shift right by 6, saturate to u8
    st1         {v6.8b},[x1],#8             //stores output row 0, advance dst by 8 bytes

    st1         {v4.8b},[x6]                //stores output row 1

    bgt         inner_loop_ht_2             //more bytes left in this row pair

    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pu1_dst += (2*dst_strd - 2wd)
    mov         x5,x10                      //reload the per-row byte counter
    add         x0,x0,x8                    //pu1_src += (2*src_strd - 2wd)

    bgt         inner_loop_ht_2             //next pair of rows

    b           end_loops                   //jumps to end

outer_loop_wd_2:                            //wd not a multiple of 4: 4 bytes x 2 rows
    lsl         x5,x3,#1                    //2*dst_strd
    mov         x12,x10                     //x12 = bytes left in the current row pair
    sub         x9,x5,x10                   //2*dst_strd - 2wd
    lsl         x7,x2,#1                    //2*src_strd
    sub         x8,x7,x10                   //2*src_strd - 2wd

inner_loop_wd_2:
    // Two output rows share one vector: lane s[0] feeds output row 0 and
    // lane s[1] feeds output row 1.

    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v6.s}[0],[x0]              //v6.s[0] = row -1
    subs        x12,x12,#4                  //2wd - 4; flags consumed by the bgt after the stores
    add         x0,x0,#4                    //pu1_src + 4
    ld1         {v6.s}[1],[x6],x2           //v6.s[1] = row 0
    dup         v7.2s, v6.s[1]              //v7.s[0] = row 0
    ld1         {v7.s}[1],[x6],x2           //v7.s[1] = row 1
    umull       v4.8h, v7.8b, v1.8b         //acc  = {row0,row1} * coeffabs_1
    dup         v7.2s, v7.s[1]              //v7.s[0] = row 1
    ld1         {v7.s}[1],[x6],x2           //v7.s[1] = row 2
    umlsl       v4.8h, v6.8b, v0.8b         //acc -= {row-1,row0} * coeffabs_0
    umlal       v4.8h, v7.8b, v2.8b         //acc += {row1,row2} * coeffabs_2
    dup         v7.2s, v7.s[1]              //v7.s[0] = row 2
    ld1         {v7.s}[1],[x6]              //v7.s[1] = row 3
    add         x6,x1,x3                    //pu1_dst + dst_strd
    umlsl       v4.8h, v7.8b, v3.8b         //acc -= {row2,row3} * coeffabs_3
    sqrshrun    v4.8b, v4.8h,#6             //round, shift right by 6, saturate to u8
    st1         {v4.s}[0],[x1]              //stores output row 0 (4 bytes)
    add         x1,x1,#4                    //pu1_dst += 4
    st1         {v4.s}[1],[x6]              //stores output row 1 (4 bytes)

    bgt         inner_loop_wd_2             //more bytes left in this row pair

    //inner loop ends
    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pu1_dst += 2*dst_strd - 2*wd
    mov         x12,x10                     //reload the per-row byte counter
    add         x0,x0,x8                    //pu1_src += 2*src_strd - 2*wd

    bgt         inner_loop_wd_2             //next pair of rows

    b           end_loops                   //jumps to end

core_loop_ht_8:                             //wd multiple of 4 and ht multiple of 8: 8 bytes x 4 rows

    lsl         x12,x3,#2                   //4*dst_strd
    sub         x8,x12,x10                  //4*dst_strd - 2wd
    lsl         x12,x2,#2                   //4*src_strd
    sub         x9,x12,x10                  //4*src_strd - 2wd

    bic         x5,x10,#7                   //x5 = bytes left in the current 4-row strip
    lsr         x14, x10, #3                //8-byte groups per row
    mul         x12, x4 , x14               //ht * groups per row
    sub         x12, x12,#4                 //loop counter, stepped by 4 per 8x4 tile; the last tile is finished in the epilog

prolog:
    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v5.8b},[x6],x2             //row 0
    subs        x5,x5,#8                    //2wd - 8; "le" below means this strip is finished
    ld1         {v4.8b},[x0],#8             //row -1, advance src by 8 bytes
    ld1         {v6.8b},[x6],x2             //row 1
    umull       v30.8h, v5.8b, v1.8b        //out0  = row0 * coeffabs_1
    ld1         {v7.8b},[x6],x2             //row 2
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row-1 * coeffabs_0
    add         x7,x1,x3                    //pu1_dst + dst_strd (output row 1)
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row1 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row2 * coeffabs_3
    ld1         {v16.8b},[x6],x2            //row 3

    umull       v28.8h, v6.8b, v1.8b        //out1  = row1 * coeffabs_1
    add         x20,x0,x9                   //pu1_src += 4*src_strd - 2*wd ...
    csel        x0, x20, x0,le              //... only when the strip is finished
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row0 * coeffabs_0
    bic         x20,x10,#7                  //reload value for the strip byte counter ...
    csel        x5, x20, x5,le              //... only when the strip is finished
    umlal       v28.8h, v7.8b, v2.8b        //out1 += row2 * coeffabs_2
    ld1         {v17.8b},[x6],x2            //row 4
    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row3 * coeffabs_3
    sqrshrun    v30.8b, v30.8h,#6           //round, shift right by 6, saturate to u8

    ld1         {v18.8b},[x6],x2            //row 5
    umull       v26.8h, v7.8b, v1.8b        //out2  = row2 * coeffabs_1
    add         x6,x0,x2                    //pu1_src + src_strd for the next tile
    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row1 * coeffabs_0
    st1         {v30.8b},[x1],#8            //stores output row 0, advance dst by 8 bytes
    umlal       v26.8h, v16.8b, v2.8b       //out2 += row3 * coeffabs_2
    ld1         {v4.8b},[x0],#8             //next tile: row -1
    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row4 * coeffabs_3
    sqrshrun    v28.8b, v28.8h,#6           //round, shift right by 6, saturate to u8

    add         x20,x1,x8                   //pu1_dst += 4*dst_strd - 2*wd ...
    csel        x1, x20, x1,le              //... only when the strip is finished
    umull       v24.8h, v16.8b, v1.8b       //out3  = row3 * coeffabs_1
    ld1         {v5.8b},[x6],x2             //next tile: row 0
    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row2 * coeffabs_0
    subs        x12,x12,#4                  //one tile done; flags consumed by ble epilog
    ld1         {v6.8b},[x6],x2             //next tile: row 1
    umlal       v24.8h, v17.8b, v2.8b       //out3 += row4 * coeffabs_2
    ld1         {v7.8b},[x6],x2             //next tile: row 2
    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row5 * coeffabs_3

    st1         {v28.8b},[x7],x3            //stores output row 1
    sqrshrun    v26.8b, v26.8h,#6           //round, shift right by 6, saturate to u8
    sub         x20,x2,x2,lsl #3            //-7*src_strd
    neg         x11, x20                    //x11 = 7*src_strd  (prefetch offset)
    add         x14,x2,x2,lsl #1            //3*src_strd
    add         x14,x14,x11                 //x14 = 10*src_strd (prefetch offset limit)
    ble         epilog                      //only one tile left: finish it in the epilog

kernel_8:
    // Steady state: computes the tile whose rows -1..2 are already in
    // v4..v7 while storing rows 2 and 3 of the previous tile (v26, v24).

    umull       v30.8h, v5.8b, v1.8b        //out0  = row0 * coeffabs_1
    subs        x5,x5,#8                    //2wd - 8; "le" below means this strip is finished
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row-1 * coeffabs_0
    add         x20,x0,x9                   //pu1_src += 4*src_strd - 2*wd ...
    csel        x0, x20, x0,le              //... only when the strip is finished
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row1 * coeffabs_2
    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,le              //restart the prefetch offset on a new strip
    //rsble        x11,x2,x2,lsl #3
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row2 * coeffabs_3
    st1         {v26.8b},[x7],x3            //stores previous tile, output row 2
    sqrshrun    v24.8b, v24.8h,#6           //previous tile, output row 3

    ld1         {v16.8b},[x6],x2            //row 3

    umull       v28.8h, v6.8b, v1.8b        //out1  = row1 * coeffabs_1
    bic         x20,x10,#7                  //reload value for the strip byte counter ...
    csel        x5, x20, x5,le              //... only when the strip is finished
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row0 * coeffabs_0
    st1         {v24.8b},[x7],x3            //stores previous tile, output row 3

    umlal       v28.8h, v7.8b, v2.8b        //out1 += row2 * coeffabs_2

    ld1         {v17.8b},[x6],x2            //row 4
    sqrshrun    v30.8b, v30.8h,#6           //round, shift right by 6, saturate to u8

    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row3 * coeffabs_3
    ld1         {v18.8b},[x6],x2            //row 5
    add         x7,x1,x3                    //pu1_dst + dst_strd (output row 1)
    umull       v26.8h, v7.8b, v1.8b        //out2  = row2 * coeffabs_1
    add         x6,x0,x2                    //pu1_src + src_strd for the next tile

    add         x20,x0, x11
    prfm        PLDL1KEEP,[x20]             //prefetch hint for a source row further down


    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row1 * coeffabs_0
    ld1         {v4.8b},[x0],#8             //next tile: row -1

    umlal       v26.8h, v16.8b, v2.8b       //out2 += row3 * coeffabs_2
    st1         {v30.8b},[x1],#8            //stores output row 0, advance dst by 8 bytes

    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row4 * coeffabs_3
    ld1         {v5.8b},[x6],x2             //next tile: row 0

    add         x11,x11,x2                  //advance the prefetch offset by one row
    sqrshrun    v28.8b, v28.8h,#6           //round, shift right by 6, saturate to u8

    umull       v24.8h, v16.8b, v1.8b       //out3  = row3 * coeffabs_1
    ld1         {v6.8b},[x6],x2             //next tile: row 1
    add         x20,x1,x8                   //pu1_dst += 4*dst_strd - 2*wd ...
    csel        x1, x20, x1,le              //... only when the strip is finished (flags still from subs x5)

    cmp         x11,x14                     //prefetch offset past 10*src_strd ?
    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,gt              //wrap the prefetch offset back to 7*src_strd
    //rsbgt        x11,x2,x2,lsl #3

    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row2 * coeffabs_0
    subs        x12,x12,#4                  //one tile done; flags consumed by bgt kernel_8

    umlal       v24.8h, v17.8b, v2.8b       //out3 += row4 * coeffabs_2
    ld1         {v7.8b},[x6],x2             //next tile: row 2

    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row5 * coeffabs_3
    st1         {v28.8b},[x7],x3            //stores output row 1
    sqrshrun    v26.8b, v26.8h,#6           //round, shift right by 6, saturate to u8

    bgt         kernel_8                    //jumps to kernel_8

epilog:
    // Drains the pipeline: stores rows 2 and 3 of the previous tile and
    // computes and stores the final tile without loading a new one.

    umull       v30.8h, v5.8b, v1.8b        //out0  = row0 * coeffabs_1
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row-1 * coeffabs_0
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row1 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row2 * coeffabs_3
    st1         {v26.8b},[x7],x3            //stores previous tile, output row 2
    sqrshrun    v24.8b, v24.8h,#6           //previous tile, output row 3

    ld1         {v16.8b},[x6],x2            //row 3
    umull       v28.8h, v6.8b, v1.8b        //out1  = row1 * coeffabs_1
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row0 * coeffabs_0
    umlal       v28.8h, v7.8b, v2.8b        //out1 += row2 * coeffabs_2
    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row3 * coeffabs_3
    st1         {v24.8b},[x7],x3            //stores previous tile, output row 3
    sqrshrun    v30.8b, v30.8h,#6           //round, shift right by 6, saturate to u8

    ld1         {v17.8b},[x6],x2            //row 4
    umull       v26.8h, v7.8b, v1.8b        //out2  = row2 * coeffabs_1
    add         x7,x1,x3                    //pu1_dst + dst_strd (output row 1)
    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row1 * coeffabs_0
    st1         {v30.8b},[x1],#8            //stores output row 0

    sqrshrun    v28.8b, v28.8h,#6           //round, shift right by 6, saturate to u8
    umlal       v26.8h, v16.8b, v2.8b       //out2 += row3 * coeffabs_2
    ld1         {v18.8b},[x6],x2            //row 5
    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row4 * coeffabs_3

    umull       v24.8h, v16.8b, v1.8b       //out3  = row3 * coeffabs_1
    sqrshrun    v26.8b, v26.8h,#6           //round, shift right by 6, saturate to u8
    st1         {v28.8b},[x7],x3            //stores output row 1
    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row2 * coeffabs_0
    umlal       v24.8h, v17.8b, v2.8b       //out3 += row4 * coeffabs_2
    st1         {v26.8b},[x7],x3            //stores output row 2
    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row5 * coeffabs_3

    sqrshrun    v24.8b, v24.8h,#6           //round, shift right by 6, saturate to u8
    st1         {v24.8b},[x7],x3            //stores output row 3
end_loops:
    ldp         x19, x20,[sp],#16           //restore the callee-saved pair pushed on entry

    ret
    403 
    404 
    405 
    406