Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* @file
     21 //*  ihevc_intra_pred_luma_horz_neon.s
     22 //*
     23 //* @brief
     24 //*  contains function definition for intra prediction  interpolation filters
     25 //*
     26 //*
     27 //* @author
     28 //*  parthiban v
     29 //*
     30 //* @par list of functions:
     31 //*  - ihevc_intra_pred_luma_horz()
     32 //*
     33 //* @remarks
     34 //*  none
     35 //*
     36 //*******************************************************************************
     37 //*/
     38 //
     39 ///**
     40 //*******************************************************************************
     41 //*
     42 //* @brief
     43 //*     intra prediction interpolation filter for horizontal luma variable.
     44 //*
     45 //* @par description:
      46 //*      horizontal intraprediction(mode 10) with reference samples location
     47 //*      pointed by 'pu1_ref' to the tu block  location pointed by 'pu1_dst'  refer
     48 //*      to section 8.4.4.2.6 in the standard (special case)
     49 //*
      50 //* @param[in] pu1_ref
      51 //*  uword8 pointer to the reference samples
     52 //*
     53 //* @param[out] pu1_dst
     54 //*  uword8 pointer to the destination
     55 //*
     56 //* @param[in] src_strd
     57 //*  integer source stride
     58 //*
     59 //* @param[in] dst_strd
     60 //*  integer destination stride
     61 //*
     62 //* @param[in] nt
     63 //*  integer transform block size
     64 //*
     65 //* @param[in] mode
     66 //*  integer intraprediction mode
     67 //*
     68 //* @returns
     69 //*
     70 //* @remarks
     71 //*  none
     72 //*
     73 //*******************************************************************************
     74 //*/
     75 //void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
     76 //                                word32 src_strd,
     77 //                                uword8 *pu1_dst,
     78 //                                word32 dst_strd,
     79 //                                word32 nt,
     80 //                                word32 mode)
     81 //**************variables vs registers*****************************************
     82 //x0 => *pu1_ref
     83 //x1 =>  src_strd
     84 //x2 => *pu1_dst
     85 //x3 =>  dst_strd
     86 
     87 .text
     88 .align 4
     89 .include "ihevc_neon_macros.s"
     90 
     91 
     92 
     93 .globl ihevc_intra_pred_luma_horz_av8
     94 
     95 .type ihevc_intra_pred_luma_horz_av8, %function
     96 
     97 ihevc_intra_pred_luma_horz_av8:
     98 
     99     // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
    100 
    101     stp         x19, x20,[sp,#-16]!
    102 
    103     //ldr          x5,[sp,#44]                        @loads mode
    104 
    105     lsl         x6,x4,#1                    //two_nt
    106 
    107     add         x12,x0,x6                   //*pu1_ref[two_nt]
    108     cmp         x4,#4                       //if nt == 4
    109     beq         core_loop_4
    110 
    111     cmp         x4,#8                       //if nt == 8
    112     beq         core_loop_8
    113 
    114     cmp         x4,#16                      //if nt == 16
    115     beq         core_loop_16
    116     sub         x12,x12,#16                 //move to 16th value pointer
    117     add         x9,x2,#16
    118 
    119 core_loop_32:
    120     ld1         { v0.16b},[x12]             //load 16 values. d1[7] will have the 1st value.
    121 
    122     dup         v2.16b, v0.16b[15]          //duplicate the i value.
    123 
    124     dup         v4.16b, v0.16b[14]          //duplicate the ii value.
    125     dup         v6.16b, v0.16b[13]          //duplicate the iii value.
    126     st1         { v2.16b},[x2],x3           //store in 1st row 0-16 columns
    127     st1         { v2.16b},[x9],x3           //store in 1st row 16-32 columns
    128 
    129     dup         v1.16b, v0.16b[12]
    130     st1         { v4.16b},[x2],x3
    131     st1         { v4.16b},[x9],x3
    132 
    133     dup         v2.16b, v0.16b[11]
    134     st1         { v6.16b},[x2],x3
    135     st1         { v6.16b},[x9],x3
    136 
    137     dup         v4.16b, v0.16b[10]
    138     st1         { v1.16b},[x2],x3
    139     st1         { v1.16b},[x9],x3
    140 
    141     dup         v6.16b, v0.16b[9]
    142     st1         { v2.16b},[x2],x3
    143     st1         { v2.16b},[x9],x3
    144 
    145     dup         v1.16b, v0.16b[8]
    146     st1         { v4.16b},[x2],x3
    147     st1         { v4.16b},[x9],x3
    148 
    149     dup         v2.16b, v0.8b[7]
    150     st1         { v6.16b},[x2],x3
    151     st1         { v6.16b},[x9],x3
    152 
    153     dup         v4.16b, v0.8b[6]
    154     st1         { v1.16b},[x2],x3
    155     st1         { v1.16b},[x9],x3
    156 
    157     dup         v6.16b, v0.8b[5]
    158     st1         { v2.16b},[x2],x3
    159     st1         { v2.16b},[x9],x3
    160 
    161     dup         v1.16b, v0.8b[4]
    162     st1         { v4.16b},[x2],x3
    163     st1         { v4.16b},[x9],x3
    164 
    165     dup         v2.16b, v0.8b[3]
    166     st1         { v6.16b},[x2],x3
    167     st1         { v6.16b},[x9],x3
    168 
    169     dup         v4.16b, v0.8b[2]
    170     st1         { v1.16b},[x2],x3
    171     st1         { v1.16b},[x9],x3
    172 
    173     dup         v6.16b, v0.8b[1]
    174     st1         { v2.16b},[x2],x3
    175     st1         { v2.16b},[x9],x3
    176     sub         x12,x12,#16                 //move to 16th value pointer
    177 
    178     dup         v1.16b, v0.8b[0]
    179     st1         { v4.16b},[x2],x3
    180     st1         { v4.16b},[x9],x3
    181 
    182     subs        x4,x4,#16                   //decrement the loop count by 16
    183     st1         { v6.16b},[x2],x3
    184     st1         { v6.16b},[x9],x3
    185 
    186     st1         { v1.16b},[x2],x3
    187     st1         { v1.16b},[x9],x3
    188     bgt         core_loop_32
    189     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    190     ldp         x19, x20,[sp],#16
    191 
    192     ret
    193     b           end_func
    194 
    195 core_loop_16:
    196     ldrb        w14,[x12],#1                //pu1_ref[two_nt]
    197     sxtw        x14,w14
    198     ld1         { v30.8b},[x12],#8          //pu1_ref[two_nt + 1 + col]
    199     ld1         { v31.8b},[x12]             //pu1_ref[two_nt + 1 + col]
    200     sub         x12,x12,#8
    201 
    202     dup         v28.8b,w14
    203     sub         x12,x12,#17
    204     ld1         { v0.16b},[x12]
    205     dup         v26.8b, v0.16b[15]
    206     uxtl        v26.8h, v26.8b
    207 
    208     dup         v2.16b, v0.16b[14]
    209     usubl       v24.8h, v30.8b, v28.8b
    210 
    211     dup         v4.16b, v0.16b[13]
    212     sshr        v24.8h, v24.8h,#1
    213 
    214     dup         v6.16b, v0.16b[12]
    215     sqadd       v22.8h,  v26.8h ,  v24.8h
    216 
    217     dup         v1.16b, v0.16b[11]
    218     sqxtun      v22.8b, v22.8h
    219 
    220     st1         {v22.8b},[x2],#8
    221 
    222     dup         v18.16b, v0.16b[10]
    223     usubl       v24.8h, v31.8b, v28.8b
    224 
    225     dup         v19.16b, v0.16b[9]
    226     sshr        v24.8h, v24.8h,#1
    227 
    228     dup         v20.16b, v0.16b[8]
    229     sqadd       v22.8h,  v26.8h ,  v24.8h
    230 
    231     dup         v16.16b, v0.8b[7]
    232     sqxtun      v22.8b, v22.8h
    233 
    234     st1         {v22.8b},[x2],x3
    235     sub         x2,x2,#8
    236 
    237     st1         { v2.16b},[x2],x3
    238 
    239     st1         { v4.16b},[x2],x3
    240     st1         { v6.16b},[x2],x3
    241     st1         { v1.16b},[x2],x3
    242 
    243     dup         v2.16b, v0.8b[6]
    244     st1         { v18.16b},[x2],x3
    245 
    246     dup         v4.16b, v0.8b[5]
    247     st1         { v19.16b},[x2],x3
    248 
    249     dup         v6.16b, v0.8b[4]
    250     st1         { v20.16b},[x2],x3
    251 
    252     dup         v1.16b, v0.8b[3]
    253     st1         { v16.16b},[x2],x3
    254 
    255     dup         v18.16b, v0.8b[2]
    256     st1         { v2.16b},[x2],x3
    257 
    258     dup         v19.16b, v0.8b[1]
    259     st1         { v4.16b},[x2],x3
    260 
    261     dup         v20.16b, v0.8b[0]
    262     st1         { v6.16b},[x2],x3
    263 
    264     st1         { v1.16b},[x2],x3
    265     st1         { v18.16b},[x2],x3
    266     st1         { v19.16b},[x2],x3
    267     st1         { v20.16b},[x2],x3
    268 
    269     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    270     ldp         x19, x20,[sp],#16
    271 
    272     ret
    273     b           end_func
    274 
    275 
    276 core_loop_8:
    277     ldrb        w14,[x12]                   //pu1_ref[two_nt]
    278     sxtw        x14,w14
    279     add         x12,x12,#1                  //pu1_ref[two_nt + 1]
    280     ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]
    281 
    282     sub         x12,x12,#9
    283     ld1         {v0.8b},[x12]
    284     dup         v26.8b, v0.8b[7]
    285     dup         v28.8b,w14
    286 
    287     dup         v3.8b, v0.8b[6]
    288     uxtl        v26.8h, v26.8b
    289 
    290     dup         v4.8b, v0.8b[5]
    291     usubl       v24.8h, v30.8b, v28.8b
    292 
    293     dup         v5.8b, v0.8b[4]
    294     sshr        v24.8h, v24.8h,#1
    295 
    296     dup         v6.8b, v0.8b[3]
    297     sqadd       v22.8h,  v26.8h ,  v24.8h
    298 
    299     dup         v7.8b, v0.8b[2]
    300     sqxtun      v22.8b, v22.8h
    301 
    302     st1         {v22.8b},[x2],x3
    303     st1         {v3.8b},[x2],x3
    304 
    305     dup         v1.8b, v0.8b[1]
    306     st1         {v4.8b},[x2],x3
    307     st1         {v5.8b},[x2],x3
    308 
    309     dup         v17.8b, v0.8b[0]
    310     st1         {v6.8b},[x2],x3
    311     st1         {v7.8b},[x2],x3
    312 
    313     st1         {v1.8b},[x2],x3
    314     st1         {v17.8b},[x2],x3
    315     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    316     ldp         x19, x20,[sp],#16
    317 
    318     ret
    319     b           end_func
    320 
    321 
    322 core_loop_4:
    323     ldrb        w14,[x12]                   //pu1_ref[two_nt]
    324     sxtw        x14,w14
    325     add         x12,x12,#1                  //pu1_ref[two_nt + 1]
    326     ld1         {v30.8b},[x12]              //pu1_ref[two_nt + 1 + col]
    327 
    328     sub         x12,x12,#5
    329     ld1         {v0.8b},[x12]
    330     dup         v28.8b,w14
    331     dup         v26.8b, v0.8b[3]
    332     uxtl        v26.8h, v26.8b
    333 
    334     dup         v3.8b, v0.8b[2]
    335     usubl       v24.8h, v30.8b, v28.8b
    336 
    337     dup         v4.8b, v0.8b[1]
    338     sshr        v24.8h, v24.8h,#1
    339 
    340     dup         v5.8b, v0.8b[0]
    341     sqadd       v22.8h,  v26.8h ,  v24.8h
    342 
    343     sqxtun      v22.8b, v22.8h
    344 
    345     st1         {v22.s}[0],[x2],x3
    346     st1         {v3.s}[0],[x2],x3
    347     st1         {v4.s}[0],[x2],x3
    348     st1         {v5.s}[0],[x2],x3
    349 
    350     // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    351     ldp         x19, x20,[sp],#16
    352 
    353     ret
    354 end_func:
    355 
    356 
    357 
    358