Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* //file
     21 //*  ihevc_inter_pred_chroma_copy_w16out_neon.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions for inter prediction interpolation.
     25 //*  functions are coded in AArch64 neon assembly (GNU assembler syntax);
     26 //*  the trailing comments on the instructions name the equivalent neon
     27 //*  intrinsics
     28 //*
     29 //* //author
     30 //*  yogeswaran rs
     31 //*
     32 //* //par list of functions:
     33 //*  - ihevc_inter_pred_chroma_copy_w16out_av8()
     34 //*
     35 //* //remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************
     39 //*/
     40 ///**
     41 //*******************************************************************************
     42 //*
     43 //* //brief
     44 //*   chroma inter prediction copy with 16-bit output
     45 //*
     46 //* //par description:
     47 //*    widens each of the 2 * 'wd' bytes in each of the 'ht' rows at 'pu1_src'
     48 //*    to 16 bits, shifts it left by 6 and stores the result at 'pi2_dst'
     49 //*
     50 //* //param[in] pu1_src
     51 //*  uword8 pointer to the source
     52 //*
     53 //* //param[out] pi2_dst
     54 //*  word16 pointer to the destination
     55 //*
     56 //* //param[in] src_strd
     57 //*  integer source stride
     58 //*
     59 //* //param[in] dst_strd
     60 //*  integer destination stride, in 16-bit elements
     61 //*
     62 //* //param[in] pi1_coeff
     63 //*  word8 pointer to the filter coefficients (not read by this function)
     64 //*
     65 //* //param[in] ht
     66 //*  integer height of the array
     67 //*
     68 //* //param[in] wd
     69 //*  integer width of the array; each row holds 2 * wd source bytes
     70 //*
     71 //* //returns
     72 //*
     73 //* //remarks
     74 //*  none
     75 //*
     76 //*******************************************************************************
     77 //*/
     78 
     79 //void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
     80 //                                            word16 *pi2_dst,
     81 //                                            word32 src_strd,
     82 //                                            word32 dst_strd,
     83 //                                            word8 *pi1_coeff,
     84 //                                            word32 ht,
     85 //                                            word32 wd)
     86 //**************variables vs registers*****************************************
     87 //x0 => *pu1_src
     88 //x1 => *pi2_dst
     89 //x2 =>  src_strd
     90 //x3 =>  dst_strd
     91 //x4 => *pi1_coeff
     92 //x5 =>  ht
     93 //x6 =>  wd
     94 
     95 .text                                       //code goes into the text section
     96 .align 4                                    //NOTE(review): power-of-two form on ARM GAS targets, i.e. 16 bytes - confirm for the toolchain in use
     97 
     98 .include "ihevc_neon_macros.s"              //macro file shared with other sources; defined outside this file
     99 
    100 .globl ihevc_inter_pred_chroma_copy_w16out_av8          //export the entry point
    101 
    102 .type ihevc_inter_pred_chroma_copy_w16out_av8, %function //mark the symbol as a function
    103 
    104 ihevc_inter_pred_chroma_copy_w16out_av8:
        //-----------------------------------------------------------------------
        // Widening copy: every source byte is zero-extended to 16 bits (uxtl),
        // shifted left by 6 (shl) and stored to pi2_dst. A row holds 2 * wd
        // source bytes and produces 2 * wd 16-bit outputs.
        //
        // In:    x0 = pu1_src, x1 = pi2_dst, w2 = src_strd (bytes),
        //        w3 = dst_strd (16-bit elements; doubled for byte addressing),
        //        x4 = pi1_coeff (copied to x15, never read), w5 = ht, w6 = wd
        // Out:   none; returns at once when ht <= 0 or wd <= 0
        // Uses:  x0-x12, x15-x17, x20, v0-v7, v16, v18, v20, v22, v24, v26, flags
        //        (x19/x20 are saved on entry and restored before every ret)
        //
        // The four word32 arguments are sign-extended before use because AAPCS64
        // leaves bits 63:32 of a 32-bit argument register unspecified.
        //
        // Paths:
        //   - (ht & 6) != 6 and 2 * wd a multiple of 8: 8-column path
        //     (core_loop_wd_8), 4-row tiles with the loads for the next tile
        //     issued before the stores of the current one.
        //   - otherwise: 4-column path (loop_ht_6), 4 rows per pass plus a
        //     2-row tail when ht & 3 is non-zero.
        //
        // NOTE(review): the loads fetch more than is stored - the 4-column loops
        // read 8 source bytes per 4 outputs, the 2-row tail also reads a third
        // row, and the 8-column path preloads the tile after the last one.
        // Presumably callers provide padded source buffers; confirm.
        // NOTE(review): on the 8-column path the ht & 3 leftover rows are only
        // processed when ht < 4 (x9 == 0); for a height such as 10 the last two
        // rows would not be written. Presumably such heights are never
        // requested; confirm against the callers.
        //-----------------------------------------------------------------------
    105
    106     // legacy 32-bit ARM prologue kept for reference: stmfd sp!, {x4-x12, x14}
    107
    108     stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below; x19 is not referenced again
    109
                sxtw        x2,w2                       //src_strd is a word32: sign-extend before pointer arithmetic
                sxtw        x3,w3                       //dst_strd is a word32: sign-extend before pointer arithmetic
    110     mov         x15,x4 // pi1_coeff
    111     sxtw        x16,w5 // ht (word32, sign-extended)
    112     sxtw        x17,w6 // wd (word32, sign-extended)
    113
    114
    115     mov         x12,x17                     //loads wd
    116     lsl         x12,x12,#1                  //x12 = 2*wd = source bytes per row
    117     mov         x7,x16                      //loads ht
    118     cmp         x7,#0                       //ht condition (ht <= 0)
    119     ble         end_loops                   //nothing to copy
                cmp         x12,#0                      //wd condition (wd <= 0)
                ble         end_loops                   //nothing to copy; also keeps every column loop finite
    120     and         x8,x7,#3                    //x8 = ht & 3 (rows left after groups of 4)
    121     sub         x9,x7,x8                    //x9 = ht rounded down to a multiple of 4
    122     and         x11,x7,#6                   //(ht & 6) == 6 forces the 4-column path
    123     cmp         x11,#6
    124     beq         loop_ht_6
    125     tst         x12,#7                      //is 2*wd a multiple of 8?
    126     beq         core_loop_wd_8
    127
    128 loop_ht_6:
    129     sub         x11,x12,#4                  //x11 = 2*wd - 4, used to rewind the pointers after a row group
    130     lsl         x6, x3,#1                   //x6 = dst_strd in bytes
    131     adds        x6, x6,#0                   //value unchanged; the flags are overwritten by the cmp below
    132     cmp         x9,#0
    133     beq         outer_loop_wd_4_ht_2        //ht < 4: only the 2-row tail runs
    134
    135 outer_loop_wd_4:
    136     subs        x4,x12,#0                   //x4 = 2*wd, columns left in this 4-row group
    137     ble         end_inner_loop_wd_4         //not taken: 2*wd > 0 is guaranteed by the entry check
    138
    139 inner_loop_wd_4:
    140     ld1         {v0.8b},[x0]                //vld1_u8(pu1_src_tmp)
    141     add         x5,x0,x2                    //pu1_src +src_strd
    142     uxtl        v0.8h, v0.8b                //vmovl_u8(vld1_u8(pu1_src_tmp)
    143     add         x10,x1,x6                   //x10 = dst row 1
    144     subs        x4,x4,#4                    //wd - 4
    145     shl         v0.2d, v0.2d,#6             //vshlq_n_s64(temp, 6)
    146     ld1         {v22.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
    147     add         x0,x0,#4                    //pu1_src += 4
    148     st1         {v0.1d},[x1]                //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    149     add         x1,x1,#8                    //pi2_dst += 4 elements
    150     uxtl        v22.8h, v22.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
    151     ld1         {v24.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
    152     shl         v22.2d, v22.2d,#6           //vshlq_n_s64(temp, 6)
    153     uxtl        v24.8h, v24.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
    154     st1         {v22.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    155     shl         v24.2d, v24.2d,#6           //vshlq_n_s64(temp, 6)
    156     ld1         {v26.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
    157     st1         {v24.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    158     uxtl        v26.8h, v26.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
    159     shl         v26.2d, v26.2d,#6           //vshlq_n_s64(temp, 6)
    160     st1         {v26.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    161     bgt         inner_loop_wd_4             //flags still come from the subs on x4 above
    162
    163 end_inner_loop_wd_4:
    164     subs        x9,x9,#4                    //ht - 4
    165     sub         x0,x5,x11                   //pu1_src = start of the next 4-row group
    166     sub         x1,x10,x11,lsl #1           //pi2_dst = start of the next 4-row group
    167     bgt         outer_loop_wd_4
    168     cmp         x8,#0
    169     bgt         outer_loop_wd_4_ht_2        //leftover rows: run the 2-row tail
    170
    171
    172 end_loops:
    173     // legacy 32-bit ARM epilogue kept for reference: ldmfd sp!,{x4-x12,x15}
    174     ldp         x19, x20,[sp],#16
    175
    176     ret
    177
    178
    179 outer_loop_wd_4_ht_2:
    180     subs        x4,x12,#0                   //x4 = 2*wd, columns left in the 2-row tail
    181     ble         end_inner_loop_wd_4         //not taken: 2*wd > 0 is guaranteed by the entry check
    182
    183 inner_loop_wd_4_ht_2:
    184     ld1         {v0.8b},[x0]                //vld1_u8(pu1_src_tmp)
    185     add         x5,x0,x2                    //pu1_src +src_strd
    186     uxtl        v0.8h, v0.8b                //vmovl_u8(vld1_u8(pu1_src_tmp)
    187     add         x10,x1,x6                   //x10 = dst row 1
    188     subs        x4,x4,#4                    //wd - 4
    189     shl         v0.2d, v0.2d,#6             //vshlq_n_s64(temp, 6)
    190     ld1         {v22.8b},[x5],x2            //vld1_u8(pu1_src_tmp)
    191     add         x0,x0,#4                    //pu1_src += 4
    192     st1         {v0.1d},[x1]                //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    193     add         x1,x1,#8                    //pi2_dst += 4 elements
    194     uxtl        v22.8h, v22.8b              //vmovl_u8(vld1_u8(pu1_src_tmp)
    195     ld1         {v24.8b},[x5],x2            //third row is loaded but never stored in this loop
    196     shl         v22.2d, v22.2d,#6           //vshlq_n_s64(temp, 6)
    197     uxtl        v24.8h, v24.8b              //result unused in this loop
    198     st1         {v22.1d},[x10],x6           //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    199     bgt         inner_loop_wd_4_ht_2        //flags still come from the subs on x4 above
    200     b           end_loops
    201
    202
    203 core_loop_wd_8:
    204     //sub            x11,x12,#8
    205     lsl         x5, x3,#1                   //x5 = dst_strd in bytes
    206     adds        x5, x5,#0                   //value unchanged; the flags are overwritten by the cmp below
    207     sub         x20,x12,x3, lsl #2          //x20 = 2*wd - 4*dst_strd
    208     neg         x11, x20                    //x11 = 4*dst_strd - 2*wd (16-bit elements)
    209     sub         x20,x12,x2,lsl #2           //x20 = 2*wd - 4*src_strd
    210     neg         x8, x20                     //x8 = 4*src_strd - 2*wd (bytes)
    211     lsr         x4, x12, #3                 //8-column tiles per row
    212     mov         x7,x9
    213     mul         x7, x7, x4                  //x7 = 4 * number of 4-row tiles
    214     sub         x4,x12,#0                   //x4 = 2*wd, columns left in the current 4-row group
    215     sub         x7,x7,#4                    //one tile (counted as 4) is completed by the epilog
    216     cmp         x9,#0
    217     beq         core_loop_wd_8_ht_2         //ht < 4: only the 2-row loop runs
    218
    219 prolog:
    220     add         x6,x0,x2                    //pu1_src_tmp += src_strd
    221     add         x10,x1,x5                   //x10 = dst row 1
    222     ld1         {v1.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    223     ld1         {v3.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    224     ld1         {v5.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    225     ld1         {v7.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    226     uxtl        v16.8h, v1.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
    227     uxtl        v18.8h, v3.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    228     uxtl        v20.8h, v5.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    229     uxtl        v22.8h, v7.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    230     subs        x4,x4,#8                    //wd decrements by 8
    231     shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
    232     shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
    233     shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
    234     shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)
    235     add         x20,x0,x8
    236     csel        x0, x20, x0,le              //row finished (x4 <= 0): pu1_src moves to the next 4-row group
    237     add         x6,x0,x2                    //pu1_src_tmp += src_strd
    238     ld1         {v1.8b},[x0],#8             //preload of the next tile
    239     ld1         {v3.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    240     ld1         {v5.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    241     ld1         {v7.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    242
    243     st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
    244     add         x20,x1,x11,lsl #1
    245     csel        x1, x20, x1,le              //row finished: pi2_dst moves to the next 4-row group
    246     sub         x20,x12,#0                  //x20 = 2*wd
    247     csel        x4, x20, x4,le              //row finished: reload the column counter
    248
    249     subs        x7,x7,#4                    //one tile consumed
    250
    251     blt         epilog_end                  //jumps to epilog_end
    252     beq         epilog                      //jumps to epilog
    253
    254
    255
    256 outer_loop_wd_8:
    257
    258     st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    259     uxtl        v16.8h, v1.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
    260
    261     st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    262     uxtl        v18.8h, v3.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    263
    264     st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    265     uxtl        v20.8h, v5.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    266
    267     uxtl        v22.8h, v7.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    268
    269     subs        x4,x4,#8                    //wd decrements by 8
    270     add         x20,x0,x8
    271     csel        x0, x20, x0,le              //row finished: pu1_src moves to the next 4-row group
    272
    273     add         x6,x0,x2                    //pu1_src_tmp += src_strd
    274
    275     ld1         {v1.8b},[x0],#8             //preload of the next tile
    276     shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
    277
    278     ld1         {v3.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    279     shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
    280
    281     ld1         {v5.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    282     shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
    283
    284     ld1         {v7.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    285     add         x10,x1,x5                   //x10 = dst row 1 of the current tile
    286
    287     shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)
    288
    289     st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
    290
    291     add         x20,x1,x11,lsl #1
    292     csel        x1, x20, x1,le              //row finished: pi2_dst moves to the next 4-row group
    293     sub         x20,x12,#0                  //x20 = 2*wd
    294     csel        x4, x20, x4,le              //row finished: reload the column counter
    295
    296     subs        x7,x7,#4                    //one tile consumed
    297     bgt         outer_loop_wd_8
    298
    299 epilog:
    300     st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    301     uxtl        v16.8h, v1.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
    302
    303     st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    304     uxtl        v18.8h, v3.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    305
    306     st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    307     uxtl        v20.8h, v5.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    308
    309     uxtl        v22.8h, v7.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    310     //add          x6,x0,x2                //pu1_src_tmp += src_strd
    311
    312     shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
    313     shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
    314     shl         v4.8h, v20.8h,#6            //vshlq_n_s16(tmp, 6)
    315     add         x10,x1,x5                   //x10 = dst row 1 of the last tile
    316     shl         v6.8h, v22.8h,#6            //vshlq_n_s16(tmp, 6)
    317
    318     st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
    319 epilog_end:
    320     st1         {v2.8h},[x10],x5            //rows 1-3 of the last tile
    321     st1         {v4.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    322     st1         {v6.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    323     b           end_loops
    324
    325 core_loop_wd_8_ht_2:
    326     add         x6,x0,x2                    //pu1_src_tmp += src_strd
    327     add         x10,x1,x5                   //x10 = dst row 1
    328     ld1         {v1.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    329     ld1         {v3.8b},[x6],x2             //vld1_u8(pu1_src_tmp)
    330     uxtl        v16.8h, v1.8b               //vmovl_u8(vld1_u8(pu1_src_tmp))
    331     uxtl        v18.8h, v3.8b               //vmovl_u8(vld1_u8(pu1_src_tmp)
    332     subs        x12,x12,#8                  //wd decrements by 8
    333     shl         v0.8h, v16.8h,#6            //vshlq_n_s16(tmp, 6)
    334     shl         v2.8h, v18.8h,#6            //vshlq_n_s16(tmp, 6)
    335     st1         {v0.8h},[x1],#16            //vst1q_s16(pi2_dst_tmp, tmp)
    336     st1         {v2.8h},[x10],x5            //vst1q_s16(pi2_dst_tmp, tmp)
    337     bgt         core_loop_wd_8_ht_2         //8 columns x 2 rows per pass
    338
    339     // legacy 32-bit ARM epilogue kept for reference: ldmfd sp!,{x4-x12,x15}
    340     ldp         x19, x20,[sp],#16
    341
    342     ret
    343 
    344 
    345 
    346 
    347 
    348 
    349