Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* //file
     21 //*  ihevc_inter_pred_chroma_vert_neon_w16inp_neon.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions for inter prediction  interpolation.
     25 //* functions are coded using neon  intrinsics and can be compiled using
     26 
     27 //* rvct
     28 //*
     29 //* //author
     30 //*  yogeswaran rs / parthiban
     31 //*
     32 //* //par list of functions:
     33 //*
     34 //*
     35 //* //remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 ///**
     42 //*******************************************************************************
     43 //*
     44 //* //brief
     45 //*       chroma interprediction filter for 16bit vertical input.
     46 //*
     47 //* //par description:
//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//*    the elements pointed to by 'pi2_src' and writes to the location pointed
//*    to by 'pu1_dst'. input is 16 bits; the filter output is downshifted by 12
//*    and clipped to lie between 0 and 255.
//*    assumptions : the function is optimized considering the fact that width
//*    and height are multiples of 2.
     53 //*
     54 //* //param[in] pi2_src
     55 //*  word16 pointer to the source
     56 //*
     57 //* //param[out] pu1_dst
     58 //*  uword8 pointer to the destination
     59 //*
     60 //* //param[in] src_strd
     61 //*  integer source stride
     62 //*
     63 //* //param[in] dst_strd
     64 //*  integer destination stride
     65 //*
     66 //* //param[in] pi1_coeff
     67 //*  word8 pointer to the filter coefficients
     68 //*
     69 //* //param[in] ht
     70 //*  integer height of the array
     71 //*
     72 //* //param[in] wd
     73 //*  integer width of the array
     74 //*
     75 //* //returns
     76 //*
     77 //* //remarks
     78 //*  none
     79 //*
     80 //*******************************************************************************
     81 //*/
     82 //void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
     83 //                                          uword8 *pu1_dst,
     84 //                                          word32 src_strd,
     85 //                                          word32 dst_strd,
     86 //                                          word8 *pi1_coeff,
     87 //                                          word32 ht,
     88 //                                          word32 wd)
     89 //**************variables vs registers*****************************************
//x0 => *pi2_src
//x1 => *pu1_dst
//x2 =>  src_strd
//x3 =>  dst_strd
//x4 => *pi1_coeff
//x5 =>  ht
//x6 =>  wd
     94 
     95 .text
     96 .align 4
     97 
     98 .include "ihevc_neon_macros.s"
     99 
    100 .globl ihevc_inter_pred_chroma_vert_w16inp_av8
    101 
    102 .type ihevc_inter_pred_chroma_vert_w16inp_av8, %function
    103 
    104 ihevc_inter_pred_chroma_vert_w16inp_av8:
    105 
    106     // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments
    107 
    108     stp         x19, x20,[sp,#-16]!
    109 
    110     mov         x15,x4 // pi1_coeff
    111     mov         x16,x5 // ht
    112     mov         x17,x6 // wd
    113 
    114     mov         x4, x15                     //loads pi1_coeff
    115     mov         x6, x17                     //wd
    116     lsl         x2,x2,#1                    //src_strd = 2* src_strd
    117     mov         x5,x16                      //loads ht
    118     ld1         {v0.8b},[x4]                //loads pi1_coeff
    119     sub         x4,x0,x2                    //pu1_src - src_strd
    120     sxtl        v0.8h, v0.8b                //long the value
    121 
    122     tst         x6,#3                       //checks wd  == 2
    123     dup         v16.4h, v0.h[0]             //coeff_0
    124     dup         v17.4h, v0.h[1]             //coeff_1
    125     dup         v18.4h, v0.h[2]             //coeff_2
    126     dup         v19.4h, v0.h[3]             //coeff_3
    127 
    128     bgt         core_loop_ht_2              //jumps to loop handles wd 2
    129 
    130     tst         x5,#3                       //checks ht == mul of 4
    131     beq         core_loop_ht_4              //jumps to loop handles ht mul of 4
    132 
    133 core_loop_ht_2:
    134     lsl         x7,x2,#1                    //2*src_strd
    135     lsl         x12,x3,#1                   //2*dst_strd
    136     lsl         x9,x6,#2                    //4*wd
    137     sub         x6,x12,x6,lsl #1            //2*dst_strd - 2*wd
    138     sub         x8,x7,x9                    //2*src_strd - 4*wd
    139     mov         x12,x9                      //4wd
    140 
    141 inner_loop_ht_2:
    142     add         x0,x4,x2                    //increments pi2_src
    143     ld1         {v0.4h},[x4],#8             //loads pu1_src
    144     smull       v0.4s, v0.4h, v16.4h        //vmull_s16(src_tmp1, coeff_0)
    145     subs        x12,x12,#8                  //2wd + 8
    146     ld1         {v2.4h},[x0],x2             //loads pi2_src
    147     smull       v7.4s, v2.4h, v16.4h        //vmull_s16(src_tmp2, coeff_0)
    148     ld1         {v3.4h},[x0],x2             //loads pi2_src
    149     smlal       v0.4s, v2.4h, v17.4h
    150     ld1         {v6.4h},[x0],x2
    151     smlal       v7.4s, v3.4h, v17.4h
    152     ld1         {v2.4h},[x0]
    153     add         x7,x1,x3                    //pu1_dst + dst_strd
    154     smlal       v0.4s, v3.4h, v18.4h
    155     smlal       v7.4s, v6.4h, v18.4h
    156     smlal       v0.4s, v6.4h, v19.4h
    157     smlal       v7.4s, v2.4h, v19.4h
    158     sqshrn      v0.4h, v0.4s,#6             //right shift
    159     sqshrn      v30.4h, v7.4s,#6            //right shift
    160     sqrshrun    v0.8b, v0.8h,#6             //rounding shift
    161     sqrshrun    v30.8b, v30.8h,#6           //rounding shift
    162     st1         {v0.s}[0],[x1],#4           //stores the loaded value
    163     st1         {v30.s}[0],[x7]             //stores the loaded value
    164     bgt         inner_loop_ht_2             //inner loop -again
    165 
    166     //inner loop ends
    167     subs        x5,x5,#2                    //increments ht
    168     add         x1,x1,x6                    //pu1_dst += 2*dst_strd - 2*wd
    169     mov         x12,x9                      //4wd
    170     add         x4,x4,x8                    //pi1_src_tmp1 += 2*src_strd - 4*wd
    171     bgt         inner_loop_ht_2             //loop again
    172 
    173     b           end_loops                   //jumps to end
    174 
    175 core_loop_ht_4:
    176     lsl         x7,x2,#2                    //2*src_strd
    177     lsl         x12,x3,#2                   //2*dst_strd
    178     lsr         x11, x6, #1                 //divide by 2
    179     sub         x14,x12,x6,lsl #1           //2*dst_strd - 2*wd
    180     sub         x8,x7,x6,lsl #2             //2*src_strd - 4*wd
    181 
    182     mul         x12, x5 , x11               //multiply height by width
    183     sub         x12, x12,#4                 //subtract by one for epilog
    184     lsl         x11, x6, #1                 //2*wd
    185 
    186 prolog:
    187     add         x0,x4,x2                    //increments pi2_src
    188     ld1         {v0.4h},[x4],#8             //loads pu1_src
    189     ld1         {v1.4h},[x0],x2             //loads pi2_src
    190     subs        x11,x11,#4
    191     ld1         {v2.4h},[x0],x2             //loads pi2_src
    192     smull       v30.4s, v0.4h, v16.4h       //vmull_s16(src_tmp1, coeff_0)
    193     ld1         {v3.4h},[x0],x2
    194     smlal       v30.4s, v1.4h, v17.4h
    195     smlal       v30.4s, v2.4h, v18.4h
    196     add         x9,x1,x3                    //pu1_dst + dst_strd
    197     smlal       v30.4s, v3.4h, v19.4h
    198 
    199     ld1         {v4.4h},[x0],x2
    200     smull       v28.4s, v1.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    201     add         x20,x4,x8
    202     csel        x4, x20, x4,le
    203     smlal       v28.4s, v2.4h, v17.4h
    204     ld1         {v5.4h},[x0],x2
    205     smlal       v28.4s, v3.4h, v18.4h
    206     ld1         {v6.4h},[x0],x2
    207     smlal       v28.4s, v4.4h, v19.4h
    208     lsl         x20,x6,#1
    209     csel        x11, x20, x11,le
    210 
    211     sqshrn      v30.4h, v30.4s,#6           //right shift
    212 
    213     smull       v26.4s, v2.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    214     add         x0,x4,x2
    215     smlal       v26.4s, v3.4h, v17.4h
    216     smlal       v26.4s, v4.4h, v18.4h
    217     ld1         {v0.4h},[x4],#8             //loads pu1_src
    218     smlal       v26.4s, v5.4h, v19.4h
    219 
    220     sqrshrun    v30.8b, v30.8h,#6           //rounding shift
    221     sqshrn      v28.4h, v28.4s,#6           //right shift
    222 
    223     ld1         {v1.4h},[x0],x2             //loads pi2_src
    224     smull       v24.4s, v3.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    225     st1         {v30.s}[0],[x1],#4          //stores the loaded value
    226     smlal       v24.4s, v4.4h, v17.4h
    227     ld1         {v2.4h},[x0],x2             //loads pi2_src
    228     smlal       v24.4s, v5.4h, v18.4h
    229     ld1         {v3.4h},[x0],x2
    230     smlal       v24.4s, v6.4h, v19.4h
    231     add         x20,x1,x14
    232     csel        x1, x20, x1,le
    233 
    234     sqshrn      v26.4h, v26.4s,#6           //right shift
    235     subs        x12,x12,#4
    236     sqrshrun    v28.8b, v28.8h,#6           //rounding shift
    237 
    238     beq         epilog                      //jumps to epilog
    239 
    240 kernel_4:
    241     smull       v30.4s, v0.4h, v16.4h       //vmull_s16(src_tmp1, coeff_0)
    242     subs        x11,x11,#4
    243     smlal       v30.4s, v1.4h, v17.4h
    244     st1         {v28.s}[0],[x9],x3          //stores the loaded value
    245     smlal       v30.4s, v2.4h, v18.4h
    246     smlal       v30.4s, v3.4h, v19.4h
    247 
    248     sqshrn      v24.4h, v24.4s,#6           //right shift
    249     sqrshrun    v26.8b, v26.8h,#6           //rounding shift
    250 
    251     ld1         {v4.4h},[x0],x2
    252     smull       v28.4s, v1.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    253     smlal       v28.4s, v2.4h, v17.4h
    254     smlal       v28.4s, v3.4h, v18.4h
    255     smlal       v28.4s, v4.4h, v19.4h
    256     st1         {v26.s}[0],[x9],x3          //stores the loaded value
    257     add         x20,x4,x8
    258     csel        x4, x20, x4,le
    259     lsl         x20,x6,#1
    260     csel        x11, x20, x11,le
    261 
    262     sqshrn      v30.4h, v30.4s,#6           //right shift
    263     sqrshrun    v24.8b, v24.8h,#6           //rounding shift
    264 
    265     ld1         {v5.4h},[x0],x2
    266     smull       v26.4s, v2.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    267     ld1         {v6.4h},[x0],x2
    268     smlal       v26.4s, v3.4h, v17.4h
    269     st1         {v24.s}[0],[x9]             //stores the loaded value
    270     add         x0,x4,x2
    271     smlal       v26.4s, v4.4h, v18.4h
    272     ld1         {v0.4h},[x4],#8             //loads pu1_src
    273     smlal       v26.4s, v5.4h, v19.4h
    274 
    275     sqshrn      v28.4h, v28.4s,#6           //right shift
    276     sqrshrun    v30.8b, v30.8h,#6           //rounding shift
    277 
    278     ld1         {v1.4h},[x0],x2             //loads pi2_src
    279     smull       v24.4s, v3.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    280     add         x9,x1,x3                    //pu1_dst + dst_strd
    281     ld1         {v2.4h},[x0],x2             //loads pi2_src
    282     smlal       v24.4s, v4.4h, v17.4h
    283     ld1         {v3.4h},[x0],x2
    284     smlal       v24.4s, v5.4h, v18.4h
    285 
    286     st1         {v30.s}[0],[x1],#4          //stores the loaded value
    287     smlal       v24.4s, v6.4h, v19.4h
    288 
    289     sqshrn      v26.4h, v26.4s,#6           //right shift
    290     sqrshrun    v28.8b, v28.8h,#6           //rounding shift
    291     add         x20,x1,x14
    292     csel        x1, x20, x1,le
    293 
    294     subs        x12,x12,#4
    295 
    296     bgt         kernel_4                    //jumps to kernel_4
    297 
    298 epilog:
    299     smull       v30.4s, v0.4h, v16.4h       //vmull_s16(src_tmp1, coeff_0)
    300     st1         {v28.s}[0],[x9],x3          //stores the loaded value
    301     smlal       v30.4s, v1.4h, v17.4h
    302     smlal       v30.4s, v2.4h, v18.4h
    303     smlal       v30.4s, v3.4h, v19.4h
    304 
    305     sqshrn      v24.4h, v24.4s,#6           //right shift
    306     sqrshrun    v26.8b, v26.8h,#6           //rounding shift
    307 
    308     smull       v28.4s, v1.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    309     ld1         {v4.4h},[x0],x2
    310     smlal       v28.4s, v2.4h, v17.4h
    311     st1         {v26.s}[0],[x9],x3          //stores the loaded value
    312     smlal       v28.4s, v3.4h, v18.4h
    313     smlal       v28.4s, v4.4h, v19.4h
    314 
    315     sqshrn      v30.4h, v30.4s,#6           //right shift
    316     sqrshrun    v24.8b, v24.8h,#6           //rounding shift
    317 
    318     smull       v26.4s, v2.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    319     ld1         {v5.4h},[x0],x2
    320     smlal       v26.4s, v3.4h, v17.4h
    321     smlal       v26.4s, v4.4h, v18.4h
    322     smlal       v26.4s, v5.4h, v19.4h
    323 
    324     sqshrn      v28.4h, v28.4s,#6           //right shift
    325     sqrshrun    v30.8b, v30.8h,#6           //rounding shift
    326 
    327     st1         {v24.s}[0],[x9]             //stores the loaded value
    328     smull       v24.4s, v3.4h, v16.4h       //vmull_s16(src_tmp2, coeff_0)
    329     smlal       v24.4s, v4.4h, v17.4h
    330     add         x9,x1,x3                    //pu1_dst + dst_strd
    331     ld1         {v6.4h},[x0],x2
    332     smlal       v24.4s, v5.4h, v18.4h
    333     smlal       v24.4s, v6.4h, v19.4h
    334     st1         {v30.s}[0],[x1],#4          //stores the loaded value
    335 
    336     sqrshrun    v28.8b, v28.8h,#6           //rounding shift
    337     sqshrn      v26.4h, v26.4s,#6           //right shift
    338 
    339     st1         {v28.s}[0],[x9],x3          //stores the loaded value
    340     sqrshrun    v26.8b, v26.8h,#6           //rounding shift
    341 
    342     sqshrn      v24.4h, v24.4s,#6           //right shift
    343     st1         {v26.s}[0],[x9],x3          //stores the loaded value
    344     sqrshrun    v24.8b, v24.8h,#6           //rounding shift
    345 
    346     st1         {v24.s}[0],[x9]             //stores the loaded value
    347 
    348 end_loops:
    349     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    350     ldp         x19, x20,[sp],#16
    351 
    352     ret
    353 
    354 
    355 
    356 
    357