Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* //file
     21 //*  ihevc_inter_pred_chroma_vert_neon_w16inp_w16out_neon.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions for inter prediction  interpolation.
     25 //*  functions are coded in aarch64 (armv8) neon assembly, using gnu
     26 
     27 //*  assembler syntax
     28 //*
     29 //* //author
     30 //*  yogeswaran rs / parthiban
     31 //*
     32 //* //par list of functions:
     33 //*  - ihevc_inter_pred_chroma_vert_w16inp_w16out_av8()
     34 //*
     35 //* //remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 ///**
     42 //*******************************************************************************
     43 //*
     44 //* //brief
     45 //*    chroma interprediction filter for 16bit vertical input and output.
     46 //*
     47 //* //par description:
     48 //*    applies a 4-tap vertical filter with coefficients pointed to by
     49 //*    'pi1_coeff' to the elements pointed to by 'pi2_src' and writes to the
     50 //*    location pointed to by 'pi2_dst'. input is 16 bits. the filter output is
     51 //*    downshifted by 6 and stored as a 16 bit number (no offset is subtracted
     52 //*    in this implementation). the output is used as an input to weighted
     53 //*    prediction. assumptions: width and height are multiples of 2.
     54 //*
     55 //* //param[in] pi2_src
     56 //*  word16 pointer to the source
     57 //*
     58 //* //param[out] pi2_dst
     59 //*  word16 pointer to the destination
     60 //*
     61 //* //param[in] src_strd
     62 //*  integer source stride
     63 //*
     64 //* //param[in] dst_strd
     65 //*  integer destination stride
     66 //*
     67 //* //param[in] pi1_coeff
     68 //*  word8 pointer to the filter coefficients
     69 //*
     70 //* //param[in] ht
     71 //*  integer height of the array
     72 //*
     73 //* //param[in] wd
     74 //*  integer width of the array
     75 //*
     76 //* //returns
     77 //*
     78 //* //remarks
     79 //*  none
     80 //*
     81 //*******************************************************************************
     82 //*/
     83 //void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
     84 //                                                 word16 *pi2_dst,
     85 //                                                 word32 src_strd,
     86 //                                                 word32 dst_strd,
     87 //                                                 word8 *pi1_coeff,
     88 //                                                 word32 ht,
     89 //                                                 word32 wd)
     90 //**************variables vs registers*****************************************
     91 //x0 => *pi2_src
     92 //x1 => *pi2_dst
     93 //x2 =>  src_strd
     94 //x3 =>  dst_strd
     95 .text
     96 .align 4
     97 
     98 .include "ihevc_neon_macros.s"
     99 
    100 .globl ihevc_inter_pred_chroma_vert_w16inp_w16out_av8
    101 
    102 .type ihevc_inter_pred_chroma_vert_w16inp_w16out_av8, %function
    103 
    104 ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:
    105 
    106     // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments
    107 
    108     stp         x19, x20,[sp,#-16]!
    109 
    110     mov         x15,x4 // pi1_coeff
    111     mov         x16,x5 // ht
    112     mov         x17,x6 // wd
    113 
    114     mov         x4, x15                     //loads pi1_coeff
    115     mov         x6, x17                     //wd
    116     lsl         x2,x2,#1                    //src_strd = 2* src_strd
    117     mov         x5,x16                      //loads ht
    118     ld1         {v0.8b},[x4]                //loads pi1_coeff
    119     sub         x4,x0,x2                    //pu1_src - src_strd
    120     sxtl        v0.8h, v0.8b                //long the value
    121 
    122     tst         x6,#3                       //checks wd  == 2
    123     dup         v16.4h, v0.h[0]             //coeff_0
    124     dup         v17.4h, v0.h[1]             //coeff_1
    125     dup         v18.4h, v0.h[2]             //coeff_2
    126     dup         v19.4h, v0.h[3]             //coeff_3
    127 
    128     bgt         core_loop_ht_2              //jumps to loop handles wd 2
    129 
    130     tst         x5,#3                       //checks ht == mul of 4
    131     beq         core_loop_ht_4              //jumps to loop handles ht mul of 4
    132 
    133 core_loop_ht_2:
    134     lsl         x7,x2,#1                    //2*src_strd
    135     lsl         x3,x3,#1                    //2*dst_strd
    136     lsl         x9,x6,#2                    //4*wd
    137     sub         x6,x3,x6,lsl #1             //2*dst_strd - 2*wd
    138     sub         x8,x7,x9                    //2*src_strd - 4*wd
    139     mov         x12,x9                      //4wd
    140 
    141 inner_loop_ht_2:
    142     add         x0,x4,x2                    //increments pi2_src
    143     ld1         {v0.4h},[x4],#8             //loads pu1_src
    144     smull       v0.4s, v0.4h, v16.4h        //vmull_s16(src_tmp1, coeff_0)
    145     subs        x12,x12,#8                  //2wd + 8
    146     ld1         {v2.4h},[x0],x2             //loads pi2_src
    147     smull       v7.4s, v2.4h, v16.4h        //vmull_s16(src_tmp2, coeff_0)
    148     ld1         {v3.4h},[x0],x2             //loads pi2_src
    149     smlal       v0.4s, v2.4h, v17.4h
    150     ld1         {v6.4h},[x0],x2
    151     smlal       v7.4s, v3.4h, v17.4h
    152     ld1         {v2.4h},[x0]
    153     add         x7,x1,x3                    //pu1_dst + dst_strd
    154     smlal       v0.4s, v3.4h, v18.4h
    155     smlal       v7.4s, v6.4h, v18.4h
    156     smlal       v0.4s, v6.4h, v19.4h
    157     smlal       v7.4s, v2.4h, v19.4h
    158     sqshrn      v0.4h, v0.4s,#6             //right shift
    159     sqshrn      v30.4h, v7.4s,#6            //right shift
    160     st1         {v0.2s},[x1],#8             //stores the loaded value
    161     st1         {v30.2s},[x7]               //stores the loaded value
    162     bgt         inner_loop_ht_2             //inner loop -again
    163 
    164     //inner loop ends
    165     subs        x5,x5,#2                    //increments ht
    166     add         x1,x1,x6,lsl #1             //pu1_dst += 2*dst_strd - 2*wd
    167     mov         x12,x9                      //4wd
    168     add         x4,x4,x8                    //pi1_src_tmp1 += 2*src_strd - 4*wd
    169     bgt         inner_loop_ht_2             //loop again
    170 
    171     b           end_loops                   //jumps to end
    172 
    173 core_loop_ht_4:
    174     lsl         x7,x2,#2                    //2*src_strd
    175     lsl         x10,x3,#2                   //2*dst_strd
    176     lsr         x11, x6, #1                 //divide by 2
    177     sub         x14,x10,x6,lsl #1           //2*dst_strd - 2*wd
    178     sub         x8,x7,x6,lsl #2             //2*src_strd - 4*wd
    179 
    180     mul         x12, x5 , x11               //multiply height by width
    181     sub         x12, x12,#4                 //subtract by one for epilog
    182     lsl         x11, x6, #1                 //2*wd
    183     lsl         x3,x3,#1                    //2*dst_strd
    184 
    185 prolog:
    186     add         x0,x4,x2                    //increments pi2_src
    187     ld1         {v0.4h},[x4],#8             //loads pu1_src
    188     ld1         {v1.4h},[x0],x2             //loads pi2_src
    189     subs        x11,x11,#4
    190     ld1         {v2.4h},[x0],x2             //loads pi2_src
    191     smull       v30.4s, v0.4h, v16.4h       //vmull_s16(src_tmp1, coeff_0)
    192     ld1         {v3.4h},[x0],x2
    193     smlal       v30.4s, v1.4h, v17.4h
    194     smlal       v30.4s, v2.4h, v18.4h
    195     add         x9,x1,x3                    //pu1_dst + dst_strd
    196     smlal       v30.4s, v3.4h, v19.4h
    197 
    198     ld1         {v4.4h},[x0],x2
    199     smull       v28.4s, v1.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    200     add         x20,x4,x8
    201     csel        x4, x20, x4,le
    202     lsl         x20,x6,#1
    203     csel        x11, x20, x11,le
    204     smlal       v28.4s, v2.4h, v17.4h
    205     smlal       v28.4s, v3.4h, v18.4h
    206     ld1         {v5.4h},[x0],x2
    207     smlal       v28.4s, v4.4h, v19.4h
    208 
    209     sqshrn      v30.4h, v30.4s,#6           //right shift
    210 
    211     ld1         {v6.4h},[x0],x2
    212     smull       v26.4s, v2.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    213     smlal       v26.4s, v3.4h, v17.4h
    214     smlal       v26.4s, v4.4h, v18.4h
    215     add         x0,x4,x2
    216     ld1         {v0.4h},[x4],#8             //loads pu1_src
    217     smlal       v26.4s, v5.4h, v19.4h
    218 
    219     sqshrn      v28.4h, v28.4s,#6           //right shift
    220 
    221     ld1         {v1.4h},[x0],x2             //loads pi2_src
    222     smull       v24.4s, v3.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    223     st1         {v30.2s},[x1],#8            //stores the loaded value
    224     smlal       v24.4s, v4.4h, v17.4h
    225     ld1         {v2.4h},[x0],x2             //loads pi2_src
    226     smlal       v24.4s, v5.4h, v18.4h
    227     ld1         {v3.4h},[x0],x2
    228     smlal       v24.4s, v6.4h, v19.4h
    229     add         x20,x1,x14,lsl #1
    230     csel        x1, x20, x1,le
    231 
    232     sqshrn      v26.4h, v26.4s,#6           //right shift
    233     subs        x12,x12,#4
    234 
    235     beq         epilog                      //jumps to epilog
    236 
    237 kernel_4:
    238     smull       v30.4s, v0.4h, v16.4h       //vmull_s16(src_tmp1, coeff_0)
    239     subs        x11,x11,#4
    240     smlal       v30.4s, v1.4h, v17.4h
    241     st1         {v28.2s},[x9],x3            //stores the loaded value
    242     smlal       v30.4s, v2.4h, v18.4h
    243     smlal       v30.4s, v3.4h, v19.4h
    244 
    245     sqshrn      v24.4h, v24.4s,#6           //right shift
    246 
    247     ld1         {v4.4h},[x0],x2
    248     smull       v28.4s, v1.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    249     smlal       v28.4s, v2.4h, v17.4h
    250     smlal       v28.4s, v3.4h, v18.4h
    251     smlal       v28.4s, v4.4h, v19.4h
    252     st1         {v26.2s},[x9],x3            //stores the loaded value
    253     add         x20,x4,x8
    254     csel        x4, x20, x4,le
    255     lsl         x20,x6,#1
    256     csel        x11, x20, x11,le
    257 
    258     sqshrn      v30.4h, v30.4s,#6           //right shift
    259 
    260     ld1         {v5.4h},[x0],x2
    261     smull       v26.4s, v2.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    262     ld1         {v6.4h},[x0],x2
    263     smlal       v26.4s, v3.4h, v17.4h
    264     st1         {v24.2s},[x9]               //stores the loaded value
    265     add         x0,x4,x2
    266     smlal       v26.4s, v4.4h, v18.4h
    267     ld1         {v0.4h},[x4],#8             //loads pu1_src
    268     smlal       v26.4s, v5.4h, v19.4h
    269 
    270     sqshrn      v28.4h, v28.4s,#6           //right shift
    271 
    272     ld1         {v1.4h},[x0],x2             //loads pi2_src
    273     smull       v24.4s, v3.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    274     ld1         {v2.4h},[x0],x2             //loads pi2_src
    275     smlal       v24.4s, v4.4h, v17.4h
    276     add         x9,x1,x3                    //pu1_dst + dst_strd
    277     ld1         {v3.4h},[x0],x2
    278     smlal       v24.4s, v5.4h, v18.4h
    279 
    280     st1         {v30.2s},[x1],#8            //stores the loaded value
    281     smlal       v24.4s, v6.4h, v19.4h
    282 
    283     sqshrn      v26.4h, v26.4s,#6           //right shift
    284     add         x20,x1,x14,lsl #1
    285     csel        x1, x20, x1,le
    286 
    287     subs        x12,x12,#4
    288 
    289     bgt         kernel_4                    //jumps to kernel_4
    290 
    291 epilog:
    292     smull       v30.4s, v0.4h, v16.4h       //vmull_s16(src_tmp1, coeff_0)
    293     st1         {v28.2s},[x9],x3            //stores the loaded value
    294     smlal       v30.4s, v1.4h, v17.4h
    295     smlal       v30.4s, v2.4h, v18.4h
    296     smlal       v30.4s, v3.4h, v19.4h
    297 
    298     sqshrn      v24.4h, v24.4s,#6           //right shift
    299 
    300     smull       v28.4s, v1.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    301     ld1         {v4.4h},[x0],x2
    302     smlal       v28.4s, v2.4h, v17.4h
    303     st1         {v26.2s},[x9],x3            //stores the loaded value
    304     smlal       v28.4s, v3.4h, v18.4h
    305     smlal       v28.4s, v4.4h, v19.4h
    306 
    307     sqshrn      v30.4h, v30.4s,#6           //right shift
    308 
    309     smull       v26.4s, v2.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    310     ld1         {v5.4h},[x0],x2
    311     smlal       v26.4s, v3.4h, v17.4h
    312     smlal       v26.4s, v4.4h, v18.4h
    313     smlal       v26.4s, v5.4h, v19.4h
    314 
    315     sqshrn      v28.4h, v28.4s,#6           //right shift
    316 
    317     st1         {v24.2s},[x9]               //stores the loaded value
    318     smull       v24.4s, v3.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    319     smlal       v24.4s, v4.4h, v17.4h
    320     add         x9,x1,x3                    //pu1_dst + dst_strd
    321     ld1         {v6.4h},[x0],x2
    322     smlal       v24.4s, v5.4h, v18.4h
    323     smlal       v24.4s, v6.4h, v19.4h
    324     st1         {v30.2s},[x1],#8            //stores the loaded value
    325 
    326     sqshrn      v26.4h, v26.4s,#6           //right shift
    327 
    328     st1         {v28.2s},[x9],x3            //stores the loaded value
    329 
    330     sqshrn      v24.4h, v24.4s,#6           //right shift
    331     st1         {v26.2s},[x9],x3            //stores the loaded value
    332 
    333     st1         {v24.2s},[x9]               //stores the loaded value
    334 
    335 end_loops:
    336     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    337     ldp         x19, x20,[sp],#16
    338 
    339     ret
    340 
    341 
    342 
    343 
    344