Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_chroma_horz_neon.s
     22 //*
     23 //* @brief
     24 //*  contains function definition for intra prediction  interpolation filters
     25 //*
     26 //*
     27 //* @author
     28 //*  parthiban v
     29 //*
     30 //* @par list of functions:
     31 //*  - ihevc_intra_pred_chroma_horz_av8()
     32 //*
     33 //* @remarks
     34 //*  none
     35 //*
     36 //*******************************************************************************
     37 //*/
     38 //
     39 ///**
     40 //*******************************************************************************
     41 //*
     42 //* @brief
     43 //*     horizontal intra prediction filter for chroma.
     44 //*
     45 //* @par description:
     46 //*      horizontal intraprediction(mode 10) with reference samples location
     47 //*      pointed by 'pu1_ref' to the tu block  location pointed by 'pu1_dst'  refer
     48 //*      to section 8.4.4.2.6 in the standard (special case)
     49 //*
     50 //* @param[in] pu1_ref
     51 //*  uword8 pointer to the reference samples
     52 //*
     53 //* @param[out] pu1_dst
     54 //*  uword8 pointer to the destination
     55 //*
     56 //* @param[in] src_strd
     57 //*  integer source stride
     58 //*
     59 //* @param[in] dst_strd
     60 //*  integer destination stride
     61 //*
     62 //* @param[in] nt
     63 //*  integer transform block size
     64 //*
     65 //* @param[in] mode
     66 //*  integer intraprediction mode
     67 //*
     68 //* @returns
     69 //*
     70 //* @remarks
     71 //*  none
     72 //*
     73 //*******************************************************************************
     74 //*/
     75 //void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
     76 //                                  word32 src_strd,
     77 //                                  uword8 *pu1_dst,
     78 //                                  word32 dst_strd,
     79 //                                  word32 nt,
     80 //                                  word32 mode)
     81 //**************variables vs registers*****************************************
     82 //x0 => *pu1_ref
     83 //x1 =>  src_strd
     84 //x2 => *pu1_dst
     85 //x3 =>  dst_strd
     86 
     87 .text
     88 .align 4
     89 .include "ihevc_neon_macros.s"
     90 
     91 
     92 .globl ihevc_intra_pred_chroma_horz_av8
     93 
     94 .type ihevc_intra_pred_chroma_horz_av8, %function
     95 
     96 ihevc_intra_pred_chroma_horz_av8:
     97 
     98     // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
     99 
    100     stp         x19, x20,[sp,#-16]!
    101 
    102     lsl         x6,x4,#2                    //four_nt
    103 
    104     add         x12,x0,x6                   //*pu1_ref[four_nt]
    105     cmp         x4,#4                       //if nt == 4
    106     beq         core_loop_4
    107 
    108     cmp         x4,#8                       //if nt == 8
    109     beq         core_loop_8
    110 
    111     //cmp            x4,#16                            @if nt == 16
    112     //beq            core_loop_16
    113 
    114     sub         x12,x12,#16                 //move to 16th value pointer
    115     add         x9,x2,#16
    116 
    117 core_loop_16:
    118     ld1         { v0.8h},[x12]              //load 16 values. d1[7] will have the 1st value.
    119     sub         x12,x12,#16
    120     ld1         { v18.8h},[x12]             //load 16 values. d1[7] will have the 1st value.
    121 
    122     dup         v2.8h, v0.4h[7]             //duplicate the i value.
    123 
    124     dup         v4.8h, v0.4h[6]             //duplicate the ii value.
    125     dup         v6.8h, v0.4h[5]             //duplicate the iii value.
    126     st1         { v2.8h},[x2],x3            //store in 1st row 0-16 columns
    127     st1         { v2.8h},[x9],x3            //store in 1st row 16-32 columns
    128 
    129     dup         v1.8h, v0.4h[4]
    130     st1         { v4.8h},[x2],x3
    131     st1         { v4.8h},[x9],x3
    132 
    133     dup         v2.8h, v0.4h[3]
    134     st1         { v6.8h},[x2],x3
    135     st1         { v6.8h},[x9],x3
    136 
    137     dup         v4.8h, v0.4h[2]
    138     st1         { v1.8h},[x2],x3
    139     st1         { v1.8h},[x9],x3
    140 
    141     dup         v6.8h, v0.4h[1]
    142     st1         { v2.8h},[x2],x3
    143     st1         { v2.8h},[x9],x3
    144 
    145     dup         v1.8h, v0.4h[0]
    146     st1         { v4.8h},[x2],x3
    147     st1         { v4.8h},[x9],x3
    148 
    149     dup         v2.8h, v18.4h[7]
    150     st1         { v6.8h},[x2],x3
    151     st1         { v6.8h},[x9],x3
    152 
    153     dup         v4.8h, v18.4h[6]
    154     st1         { v1.8h},[x2],x3
    155     st1         { v1.8h},[x9],x3
    156 
    157     dup         v6.8h, v18.4h[5]
    158     st1         { v2.8h},[x2],x3
    159     st1         { v2.8h},[x9],x3
    160 
    161     dup         v1.8h, v18.4h[4]
    162     st1         { v4.8h},[x2],x3
    163     st1         { v4.8h},[x9],x3
    164 
    165     dup         v2.8h, v18.4h[3]
    166     st1         { v6.8h},[x2],x3
    167     st1         { v6.8h},[x9],x3
    168 
    169     dup         v4.8h, v18.4h[2]
    170     st1         { v1.8h},[x2],x3
    171     st1         { v1.8h},[x9],x3
    172 
    173     dup         v6.8h, v18.4h[1]
    174     st1         { v2.8h},[x2],x3
    175     st1         { v2.8h},[x9],x3
    176     sub         x12,x12,#16                 //move to 16th value pointer
    177 
    178     dup         v1.8h, v18.4h[0]
    179     st1         { v4.8h},[x2],x3
    180     st1         { v4.8h},[x9],x3
    181 
    182     subs        x4,x4,#16                   //decrement the loop count by 16
    183     st1         { v6.8h},[x2],x3
    184     st1         { v6.8h},[x9],x3
    185 
    186     st1         { v1.8h},[x2],x3
    187     st1         { v1.8h},[x9],x3
    188     bgt         core_loop_16
    189     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    190     ldp         x19, x20,[sp],#16
    191 
    192     ret
    193     b           endloop
    194 
    195 core_loop_8:
    196     ldrb        w14,[x12],#1                //pu1_ref[two_nt]
    197     sxtw        x14,w14
    198     //vld1.8        {q15},[x12]                        @pu1_ref[two_nt + 1 + col]
    199 
    200     dup         v28.8b,w14
    201     sub         x12,x12,#17
    202     ld1         { v0.16b},[x12]
    203 
    204     sub         x12,x12,#16
    205 //    ld1 { v30.16b},[x12]
    206     dup         v18.8h, v0.4h[7]
    207     //vmovl.u8    q13,d26
    208 
    209     dup         v2.8h, v0.4h[6]
    210     //vsubl.u8    q12,d30,d28
    211 
    212     dup         v4.8h, v0.4h[5]
    213     //vshr.s16    q12,q12,#1
    214 
    215     dup         v6.8h, v0.4h[4]
    216     //vqadd.s16    q11,q13,q12
    217 
    218     dup         v1.8h, v0.4h[3]
    219     //vqmovun.s16 d22,q11
    220 
    221     st1         { v18.8h},[x2],x3
    222 
    223     dup         v18.8h, v0.4h[2]
    224     //vsubl.u8    q12,d31,d28
    225 
    226     dup         v19.8h, v0.4h[1]
    227     //vshr.s16    q12,q12,#1
    228 
    229     dup         v20.8h, v0.4h[0]
    230     //vqadd.s16    q11,q13,q12
    231 
    232     dup         v16.8h, v0.4h[3]
    233     //vqmovun.s16 d22,q11
    234 
    235     st1         { v2.8h},[x2],x3
    236     //sub            x2,x2,#8
    237 
    238     st1         { v4.8h},[x2],x3
    239 
    240     st1         { v6.8h},[x2],x3
    241     st1         { v1.8h},[x2],x3
    242     st1         { v18.8h},[x2],x3
    243 
    244     //vdup.8        q1,d0[2]
    245     st1         { v19.8h},[x2],x3
    246 
    247     //vdup.8        q2,d0[1]
    248     st1         { v20.8h},[x2],x3
    249 
    250     //vdup.8        q3,d0[0]
    251     //vst1.8        {q7},[x2],x3
    252 
    253     //vdup.8        q4,d0[3]
    254     //vst1.8        {q8},[x2],x3
    255 
    256     //vdup.8        q5,d0[2]
    257     //vst1.8        {q1},[x2],x3
    258 
    259     //vdup.8        q6,d0[1]
    260     //vst1.8        {q2},[x2],x3
    261 
    262     //vdup.8        q7,d0[0]
    263     //vst1.8        {q3},[x2],x3
    264 
    265     //vst1.8        {q4},[x2],x3
    266     //vst1.8        {q5},[x2],x3
    267     //vst1.8        {q6},[x2],x3
    268     //vst1.8        {q7},[x2],x3
    269 
    270     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    271     ldp         x19, x20,[sp],#16
    272 
    273     ret
    274     b           endloop
    275 
    276 
    277 core_loop_4:
    278     ldrb        w14,[x12]                   //pu1_ref[two_nt]
    279     sxtw        x14,w14
    280     add         x12,x12,#1                  //pu1_ref[two_nt + 1]
    281     //vld1.8        {d30},[x12]                        @pu1_ref[two_nt + 1 + col]
    282 
    283     sub         x12,x12,#9
    284     ld1         {v0.8b},[x12]
    285     sub         x12,x12,#8
    286     ld1         {v30.8b},[x12]
    287     dup         v26.4h, v0.4h[3]
    288     dup         v28.8b,w14
    289 
    290     dup         v3.4h, v0.4h[2]
    291     uxtl        v26.8h, v26.8b
    292 
    293     dup         v4.4h, v0.4h[1]
    294     usubl       v24.8h, v30.8b, v28.8b
    295 
    296     dup         v5.4h, v0.4h[0]
    297     sshr        v24.8h, v24.8h,#1
    298 
    299     dup         v6.4h, v0.4h[3]
    300     sqadd       v22.8h,  v26.8h ,  v24.8h
    301 
    302     dup         v7.4h, v0.4h[2]
    303     sqxtun      v22.8b, v22.8h
    304 
    305     st1         {v6.8b},[x2],x3
    306     st1         {v3.8b},[x2],x3
    307 
    308     dup         v1.4h, v0.4h[1]
    309     st1         {v4.8b},[x2],x3
    310     st1         {v5.8b},[x2],x3
    311 
    312     dup         v17.4h, v0.4h[0]
    313     //vst1.8        {d6},[x2],x3
    314     //vst1.8        {d7},[x2],x3
    315 
    316     //vst1.8        {d8},[x2],x3
    317     //vst1.8        {d9},[x2],x3
    318     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    319     ldp         x19, x20,[sp],#16
    320 
    321     ret
    322     b           endloop
    323 
    324 
    325 //core_loop_4
    326     ldrb        w14,[x12]                   //pu1_ref[two_nt]
    327     sxtw        x14,w14
    328     add         x12,x12,#1                  //pu1_ref[two_nt + 1]
    329     ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]
    330 
    331     sub         x12,x12,#5
    332     ld1         {v0.8b},[x12]
    333     dup         v28.8b,w14
    334     dup         v26.8b, v0.8b[3]
    335     uxtl        v26.8h, v26.8b
    336 
    337     dup         v3.8b, v0.8b[2]
    338     usubl       v24.8h, v30.8b, v28.8b
    339 
    340     dup         v4.8b, v0.8b[1]
    341     sshr        v24.8h, v24.8h,#1
    342 
    343     dup         v5.8b, v0.8b[0]
    344     sqadd       v22.8h,  v26.8h ,  v24.8h
    345 
    346     sqxtun      v22.8b, v22.8h
    347 
    348     st1         {v22.s}[0],[x2],x3
    349     st1         {v3.s}[0],[x2],x3
    350     st1         {v4.s}[0],[x2],x3
    351     st1         {v5.s}[0],[x2],x3
    352 
    353     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    354     ldp         x19, x20,[sp],#16
    355 
    356     ret
    357 
    358 endloop:
    359 
    360 
    361 
    362