Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* //file
     21 //*  ihevc_inter_pred_chroma_vert_w16out_neon.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions for inter prediction  interpolation.
     25 //* functions are coded using neon  intrinsics and can be compiled using
     26 
     27 //* rvct
     28 //*
     29 //* //author
     30 //*  yogeswaran rs/ pathiban
     31 //*
     32 //* //par list of functions:
     33 //*
     34 //*
     35 //* //remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 ///**
     42 //*******************************************************************************
     43 //*
     44 //* //brief
     45 //*   inter-prediction chroma filter that stores the vertical filter result as 16-bit output
     46 //*
     47 //* //par description:
     48 //*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
     49 //*    the elements pointed to by 'pu1_src' and writes to the location pointed to
     50 //*    by 'pi2_dst'. no downshifting or clipping is done and the output is used
     51 //*    as an input for weighted prediction. assumptions: the function is optimized
     52 //*    considering the fact that width is a multiple of 2, 4 or 8, and that
     53 //*    height is a multiple of 2. widths 4 and 8 are optimized further
     54 //*
     55 //* //param[in] pu1_src
     56 //*  uword8 pointer to the source
     57 //*
     58 //* //param[out] pi2_dst
     59 //*  word16 pointer to the destination
     60 //*
     61 //* //param[in] src_strd
     62 //*  integer source stride
     63 //*
     64 //* //param[in] dst_strd
     65 //*  integer destination stride
     66 //*
     67 //* //param[in] pi1_coeff
     68 //*  word8 pointer to the filter coefficients
     69 //*
     70 //* //param[in] ht
     71 //*  integer height of the array
     72 //*
     73 //* //param[in] wd
     74 //*  integer width of the array
     75 //*
     76 //* //returns
     77 //*
     78 //* //remarks
     79 //*  none
     80 //*
     81 //*****************************************************************************
     82 //*/
     83 //void ihevc_inter_pred_chroma_vert_w16out(uword8 *pu1_src,
     84 //                                            word16 *pi2_dst,
     85 //                                            word32 src_strd,
     86 //                                            word32 dst_strd,
     87 //                                            word8 *pi1_coeff,
     88 //                                            word32 ht,
     89 //                                            word32 wd)
     90 //**************variables vs registers*****************************************
     91 //x0 => *pu1_src
     92 //x1 => *pi2_dst
     93 //x2 =>  src_strd
     94 //x3 =>  dst_strd
        //x4 => *pi1_coeff
        //x5 =>  ht
        //x6 =>  wd
     95 
     96 .text
     97 .align 4
     98 
     99 .include "ihevc_neon_macros.s"
    100 
    101 .globl ihevc_inter_pred_chroma_vert_w16out_av8
    102 
    103 .type ihevc_inter_pred_chroma_vert_w16out_av8, %function
    104 
    105 ihevc_inter_pred_chroma_vert_w16out_av8:
        // Vertical 4-tap chroma interpolation producing unclipped 16-bit results.
        //
        // Arguments on entry (per the C prototype above):
        //   x0 = pu1_src, x1 = pi2_dst, w2 = src_strd, w3 = dst_strd,
        //   x4 = pi1_coeff, w5 = ht, w6 = wd
        //
        // The four word32 arguments are sign-extended to 64 bits before use:
        // AAPCS64 leaves the upper 32 bits of a register that carries a 32-bit
        // argument unspecified, and the values are used below in 64-bit address
        // and counter arithmetic.
        //
        // Each output row is  c1*r1 - c0*r0 + c2*r2 - c3*r3  where cN = |coeff[N]|
        // and r0..r3 are four consecutive source rows, r0 being one row above the
        // output row. The fixed subtract/add pattern presumes taps 0 and 3 are
        // <= 0 and taps 1 and 2 are >= 0 -- TODO confirm against the coefficient
        // table used by the callers.
        //
        // Three code paths:
        //   outer_loop_wd_2 : (wd & 3) != 0              4 source bytes x 2 rows per pass
        //   inner_loop_ht_2 : (wd & 3) == 0, (ht & 7) != 0   8 source bytes x 2 rows per pass
        //   core_loop_ht_8  : (wd & 3) == 0, (ht & 7) == 0   8 source bytes x 4 rows per pass,
        //                     split into prolog / kernel_8 / epilog so that loads for
        //                     the next pass overlap the stores of the previous one
        //
        // x20 is used as a scratch register, hence the save/restore of x19/x20.
    106 
    107     // (ARMv7 original: stmfd sp!,{x4-x12,x14} -- stack stores the values of the arguments)
    108 
    109     stp         x19, x20,[sp,#-16]!
    110 
            sxtw        x2,w2                       //src_strd: sign-extend 32-bit argument
            sxtw        x3,w3                       //dst_strd: sign-extend 32-bit argument
    111     mov         x15,x4 // pi1_coeff
    112     sxtw        x16,w5 // ht, sign-extended from the 32-bit argument
    113     sxtw        x17,w6 // wd, sign-extended from the 32-bit argument
    114 
    115 
    116     mov         x4,x16                      //x4 = ht
    117     mov         x12,x15                     //x12 = pi1_coeff
    118     cmp         x4,#0                       //sets flags for the ht <= 0 early exit below
    119     mov         x6,x17                      //x6 = wd
    120     sub         x0,x0,x2                    //pu1_src - src_strd: start one row above the output row
    121     ld1         {v0.8b},[x12]               //loads 8 bytes from pi1_coeff (only lanes 0-3 are used)
    122 
    123     ble         end_loops                   //ht <= 0: nothing to do
    124 
    125     tst         x6,#3                       //checks (wd & 3); flags consumed by the bgt below
    126     abs         v3.8b, v0.8b                //vabs_s8(coeff)
    127     lsl         x10,x6,#1                   //x10 = 2*wd = source bytes per row
    128     dup         v0.8b, v3.b[0]              //coeffabs_0
    129     dup         v1.8b, v3.b[1]              //coeffabs_1
    130     dup         v2.8b, v3.b[2]              //coeffabs_2
    131     dup         v3.8b, v3.b[3]              //coeffabs_3
    132 
    133     bgt         outer_loop_wd_2             //taken when (wd & 3) != 0 (tst leaves N = V = 0, so gt == ne)
    134 
    135     tst         x4,#7                       //checks ht for mul of 8
    136     beq         core_loop_ht_8              //when height is multiple of 8
    137 
    138     lsl         x7,x3,#2                    //4*dst_strd = bytes spanned by two 16-bit output rows
    139     sub         x9,x7,x10,lsl #1            //4*dst_strd - 4*wd: dst step to the next row pair
    140     lsl         x12,x2,#1                   //2*src_strd
    141     sub         x8,x12,x10                  //2*src_strd - 2*wd: src step to the next row pair
    142     lsl         x3, x3, #1                  //dst_strd in bytes (2 bytes per output sample)
    143     mov         x5,x10                      //column counter = 2*wd source bytes
    144 
    145 inner_loop_ht_2:                            //called when wd is multiple of 4 and ht is 4,2
    146 
    147     add         x6,x0,x2                    //pu1_src + src_strd
    148     ld1         {v17.8b},[x6],x2            //row 1
    149     subs        x5,x5,#8                    //8 source bytes consumed; flags used by the first bgt below
    150     ld1         {v5.8b},[x0],#8             //row 0, advance to the next 8 columns
    151     umull       v6.8h, v17.8b, v1.8b        //out0  = row1 * coeffabs_1
    152     ld1         {v4.8b},[x6],x2             //row 2
    153     umlsl       v6.8h, v5.8b, v0.8b         //out0 -= row0 * coeffabs_0
    154     ld1         {v16.8b},[x6],x2            //row 3
    155     umlal       v6.8h, v4.8b, v2.8b         //out0 += row2 * coeffabs_2
    156     umull       v4.8h, v4.8b, v1.8b         //out1  = row2 * coeffabs_1 (v4 is reused as the accumulator)
    157     ld1         {v18.8b},[x6]               //row 4
    158     umlsl       v6.8h, v16.8b, v3.8b        //out0 -= row3 * coeffabs_3
    159     umlsl       v4.8h, v17.8b, v0.8b        //out1 -= row1 * coeffabs_0
    160     umlal       v4.8h, v16.8b, v2.8b        //out1 += row3 * coeffabs_2
    161     umlsl       v4.8h, v18.8b, v3.8b        //out1 -= row4 * coeffabs_3
    162     add         x6,x1,x3                    //address of the second output row
    163     st1         { v6.8h},[x1],#16           //stores 8 results of the first output row
    164 
    165     st1         { v4.8h},[x6]               //stores 8 results of the second output row
    166 
    167     bgt         inner_loop_ht_2             //more columns left in this row pair
    168 
    169     subs        x4,x4,#2                    //ht - 2
    170     add         x1,x1,x9                    //dst += 4*dst_strd - 4*wd bytes: next row pair
    171     mov         x5,x10                      //reload column counter with 2*wd
    172     add         x0,x0,x8                    //src += 2*src_strd - 2*wd bytes: next row pair
    173 
    174     bgt         inner_loop_ht_2             //loop again while rows remain
    175 
    176     b           end_loops                   //jumps to end
    177 
    178 outer_loop_wd_2:                            //called when (wd & 3) != 0
    179     lsl         x5,x3,#2                    //4*dst_strd = bytes spanned by two 16-bit output rows
    180     mov         x12,x10                     //column counter = 2*wd source bytes
    181     sub         x9,x5,x10,lsl #1            //4*dst_strd - 4*wd: dst step to the next row pair
    182     lsl         x7,x2,#1                    //2*src_strd
    183     sub         x8,x7,x10                   //2*src_strd - 2*wd: src step to the next row pair
    184 
    185 inner_loop_wd_2:
    186 
    187     add         x6,x0,x2                    //pu1_src + src_strd
    188     ld1         {v6.s}[0],[x0]              //v6 lane 0 = 4 bytes of row 0
    189     subs        x12,x12,#4                  //4 source bytes consumed; flags used by the first bgt below
    190     add         x0,x0,#4                    //pu1_src + 4
    191     ld1         {v6.s}[1],[x6],x2           //v6 lane 1 = row 1          -> v6 = {row0,row1}
    192     dup         v7.2s, v6.s[1]
    193     ld1         {v7.s}[1],[x6],x2           //v7 = {row1,row2}
    194     umull       v4.8h, v7.8b, v1.8b         //{out0,out1}  = {row1,row2} * coeffabs_1
    195     dup         v7.2s, v7.s[1]
    196     ld1         {v7.s}[1],[x6],x2           //v7 = {row2,row3}
    197     umlsl       v4.8h, v6.8b, v0.8b         //{out0,out1} -= {row0,row1} * coeffabs_0
    198     umlal       v4.8h, v7.8b, v2.8b         //{out0,out1} += {row2,row3} * coeffabs_2
    199     dup         v7.2s, v7.s[1]
    200     ld1         {v7.s}[1],[x6]              //v7 = {row3,row4}
    201     add         x6,x1,x3,lsl #1             //address of the second output row (dst_strd * 2 bytes)
    202     umlsl       v4.8h, v7.8b, v3.8b         //{out0,out1} -= {row3,row4} * coeffabs_3
    203     st1         {v4.d}[0],[x1]              //stores 4 results of the first output row
    204     add         x1,x1,#8                    //dst advances by 4 samples (8 bytes)
    205     st1         {v4.d}[1],[x6]              //stores 4 results of the second output row
    206 
    207     bgt         inner_loop_wd_2             //more columns left in this row pair
    208 
    209     //inner loop ends
    210     subs        x4,x4,#2                    //ht - 2
    211     add         x1,x1,x9                    //dst += 4*dst_strd - 4*wd bytes: next row pair
    212     mov         x12,x10                     //reload column counter with 2*wd
    213     add         x0,x0,x8                    //src += 2*src_strd - 2*wd bytes: next row pair
    214 
    215     bgt         inner_loop_wd_2             //loop again while rows remain
    216 
    217     b           end_loops                   //jumps to end
    218 
    219 core_loop_ht_8:                             //reached when (wd & 3) == 0 and (ht & 7) == 0
    220 
    221     lsl         x12,x3,#3                   //8*dst_strd = bytes spanned by four 16-bit output rows
    222     sub         x8,x12,x10,lsl #1           //8*dst_strd - 4*wd: dst step to the next group of 4 rows
    223     lsl         x12,x2,#2                   //4*src_strd
    224     sub         x9,x12,x10                  //4*src_strd - 2*wd: src step to the next group of 4 rows
    225 
    226     bic         x5,x10,#7                   //column counter = 2*wd rounded down to a multiple of 8
    227     lsr         x14, x10, #3                //number of 8-byte column blocks per row
    228     mul         x12, x4 , x14               //ht * blocks per row; decremented by 4 per (4 rows x 1 block) pass
    229     sub         x12, x12,#4                 //one pass (4 rows of one block) is left for the epilog
    230     lsl         x3, x3, #1                  //dst_strd in bytes (2 bytes per output sample)
    231 
    232 prolog:
    233     add         x6,x0,x2                    //pu1_src + src_strd
    234     ld1         {v5.8b},[x6],x2             //row 1
    235     subs        x5,x5,#8                    //8 source bytes consumed; 'le' below means end of this block row
    236     ld1         {v4.8b},[x0],#8             //row 0, advance to the next 8 columns
    237     ld1         {v6.8b},[x6],x2             //row 2
    238     umull       v30.8h, v5.8b, v1.8b        //out0  = row1 * coeffabs_1
    239     ld1         {v7.8b},[x6],x2             //row 3
    240     umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row0 * coeffabs_0
    241     add         x7,x1,x3                    //x7 = address of output row 1
    242     umlal       v30.8h, v6.8b, v2.8b        //out0 += row2 * coeffabs_2
    243     umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row3 * coeffabs_3
    244     ld1         {v16.8b},[x6],x2            //row 4
    245 
    246     umull       v28.8h, v6.8b, v1.8b        //out1  = row2 * coeffabs_1
    247     add         x20,x0,x9                   //candidate src for the next group of 4 rows
    248     csel        x0, x20, x0,le              //take it only when the block row is finished
    249     umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row1 * coeffabs_0
    250     bic         x20,x10,#7                  //fresh column counter
    251     csel        x5, x20, x5,le              //reload it only when the block row is finished
    252     umlal       v28.8h, v7.8b, v2.8b        //out1 += row3 * coeffabs_2
    253     ld1         {v17.8b},[x6],x2            //row 5
    254     umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row4 * coeffabs_3
    255 
    256     ld1         {v18.8b},[x6],x2            //row 6
    257     umull       v26.8h, v7.8b, v1.8b        //out2  = row3 * coeffabs_1
    258     add         x6,x0,x2                    //row pointer for the next pass
    259     umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row2 * coeffabs_0
    260     st1         { v30.16b},[x1],#16         //stores output row 0 (8 x 16-bit)
    261     umlal       v26.8h, v16.8b, v2.8b       //out2 += row4 * coeffabs_2
    262     ld1         {v4.8b},[x0],#8             //next pass: row 0
    263     umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row5 * coeffabs_3
    264 
    265     add         x20,x1,x8                   //candidate dst for the next group of 4 rows
    266     csel        x1, x20, x1,le              //take it only when the block row is finished
    267     umull       v24.8h, v16.8b, v1.8b       //out3  = row4 * coeffabs_1
    268     ld1         {v5.8b},[x6],x2             //next pass: row 1
    269     umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row3 * coeffabs_0
    270     subs        x12,x12,#4                  //one pass done; flags used by the ble below
    271     ld1         {v6.8b},[x6],x2             //next pass: row 2
    272     umlal       v24.8h, v17.8b, v2.8b       //out3 += row5 * coeffabs_2
    273     ld1         {v7.8b},[x6],x2             //next pass: row 3
    274     umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row6 * coeffabs_3
    275     sub         x20,x2,x2,lsl #3            //x20 = -7*src_strd
    276     neg         x11, x20                    //x11 = 7*src_strd: initial prefetch offset
    277     add         x14,x2,x2,lsl #1            //3*src_strd
    278     add         x14,x14,x11                 //x14 = 10*src_strd: prefetch offset limit
    279     st1         { v28.16b},[x7],x3          //stores output row 1; x7 -> output row 2
    280 
    281     ble         epilog                      //only one pass left: finish in the epilog
    282 
    283 kernel_8:
    284 
    285     umull       v30.8h, v5.8b, v1.8b        //out0  = row1 * coeffabs_1
    286     subs        x5,x5,#8                    //8 source bytes consumed; 'le' below means end of this block row
    287     umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row0 * coeffabs_0
    288     add         x20,x0,x9                   //candidate src for the next group of 4 rows
    289     csel        x0, x20, x0,le              //take it only when the block row is finished
    290     umlal       v30.8h, v6.8b, v2.8b        //out0 += row2 * coeffabs_2
    291 
    292     lsl         x20,x2,#3
    293     sub         x20,x20,x2                  //x20 = 7*src_strd
    294     csel        x11,x20,x11,le              //reset the prefetch offset at the end of a block row
    295     //rsble        x11,x2,x2,lsl #3         (ARMv7 original of the three lines above)
    296     umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row3 * coeffabs_3
    297     st1         { v26.16b},[x7],x3          //stores output row 2 of the previous pass
    298 
    299     ld1         {v16.8b},[x6],x2            //row 4
    300 
    301     umull       v28.8h, v6.8b, v1.8b        //out1  = row2 * coeffabs_1
    302     bic         x20,x10,#7                  //fresh column counter
    303     csel        x5, x20, x5,le              //reload it only when the block row is finished
    304     umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row1 * coeffabs_0
    305     st1         { v24.16b},[x7],x3          //stores output row 3 of the previous pass
    306 
    307     umlal       v28.8h, v7.8b, v2.8b        //out1 += row3 * coeffabs_2
    308     ld1         {v17.8b},[x6],x2            //row 5
    309 
    310     umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row4 * coeffabs_3
    311     ld1         {v18.8b},[x6],x2            //row 6
    312     add         x7,x1,x3                    //x7 = address of output row 1 of this pass
    313     umull       v26.8h, v7.8b, v1.8b        //out2  = row3 * coeffabs_1
    314     add         x6,x0,x2                    //row pointer for the next pass
    315     add         x20,x0, x11
    316     prfm        PLDL1KEEP,[x20]             //prefetch source at the current prefetch offset
    317 
    318     umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row2 * coeffabs_0
    319     ld1         {v4.8b},[x0],#8             //next pass: row 0
    320 
    321     add         x11,x11,x2                  //prefetch offset moves down one row
    322     umlal       v26.8h, v16.8b, v2.8b       //out2 += row4 * coeffabs_2
    323     st1         { v30.16b},[x1],#16         //stores output row 0 (8 x 16-bit)
    324 
    325     umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row5 * coeffabs_3
    326     ld1         {v5.8b},[x6],x2             //next pass: row 1
    327 
    328     umull       v24.8h, v16.8b, v1.8b       //out3  = row4 * coeffabs_1
    329     ld1         {v6.8b},[x6],x2             //next pass: row 2
    330     add         x20,x1,x8                   //candidate dst for the next group of 4 rows
    331     csel        x1, x20, x1,le              //still using the flags of the subs on x5 above
    332 
    333     cmp         x11,x14                     //prefetch offset past the 10*src_strd limit?
    334 
    335     lsl         x20,x2,#3
    336     sub         x20,x20,x2                  //x20 = 7*src_strd
    337     csel        x11,x20,x11,gt              //if so, wrap it back to 7*src_strd
    338     //rsbgt        x11,x2,x2,lsl #3         (ARMv7 original of the three lines above)
    339 
    340     umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row3 * coeffabs_0
    341     subs        x12,x12,#4                  //one pass done; flags used by the bgt below
    342 
    343 
    344     umlal       v24.8h, v17.8b, v2.8b       //out3 += row5 * coeffabs_2
    345     ld1         {v7.8b},[x6],x2             //next pass: row 3
    346 
    347     umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row6 * coeffabs_3
    348     st1         { v28.16b},[x7],x3          //stores output row 1; x7 -> output row 2
    349 
    350     bgt         kernel_8                    //jumps to kernel_8
    351 
    352 epilog:
    353 
    354     umull       v30.8h, v5.8b, v1.8b        //out0  = row1 * coeffabs_1
    355     umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row0 * coeffabs_0
    356     umlal       v30.8h, v6.8b, v2.8b        //out0 += row2 * coeffabs_2
    357     umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row3 * coeffabs_3
    358     st1         { v26.16b},[x7],x3          //stores output row 2 of the previous pass
    359 
    360     ld1         {v16.8b},[x6],x2            //row 4
    361     umull       v28.8h, v6.8b, v1.8b        //out1  = row2 * coeffabs_1
    362     umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row1 * coeffabs_0
    363     umlal       v28.8h, v7.8b, v2.8b        //out1 += row3 * coeffabs_2
    364     umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row4 * coeffabs_3
    365     st1         { v24.16b},[x7],x3          //stores output row 3 of the previous pass
    366 
    367     ld1         {v17.8b},[x6],x2            //row 5
    368     umull       v26.8h, v7.8b, v1.8b        //out2  = row3 * coeffabs_1
    369     add         x7,x1,x3                    //x7 = address of output row 1 of the final pass
    370     umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row2 * coeffabs_0
    371     st1         { v30.16b},[x1],#16         //stores output row 0 (8 x 16-bit)
    372     umlal       v26.8h, v16.8b, v2.8b       //out2 += row4 * coeffabs_2
    373     ld1         {v18.8b},[x6],x2            //row 6
    374     umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row5 * coeffabs_3
    375 
    376     umull       v24.8h, v16.8b, v1.8b       //out3  = row4 * coeffabs_1
    377     st1         { v28.16b},[x7],x3          //stores output row 1
    378     umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row3 * coeffabs_0
    379     umlal       v24.8h, v17.8b, v2.8b       //out3 += row5 * coeffabs_2
    380     st1         { v26.16b},[x7],x3          //stores output row 2
    381     umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row6 * coeffabs_3
    382 
    383     st1         { v24.16b},[x7],x3          //stores output row 3
    384 
    385 end_loops:
    386     // (ARMv7 original: ldmfd sp!,{x4-x12,x15} -- reload the registers from sp)
    387     ldp         x19, x20,[sp],#16           //restore the callee-saved pair pushed on entry
    388 
    389     ret
    390 
    391 
    392 
    393