Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
///**
///*******************************************************************************
//* @file
//*  ihevcd_itrans_recon_dc_luma.s
//*
//* @brief
//*  Contains the function definition for itrans and recon in the DC-only case
//*
//* @author
//*  ittiam
//*
//* @par List of functions:
//*  - ihevcd_itrans_recon_dc_luma_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************/
     36 
     37 .text
     38 .include "ihevc_neon_macros.s"
     39 
     40 
     41 
     42 .globl ihevcd_itrans_recon_dc_luma_av8
     43 
     44 .type ihevcd_itrans_recon_dc_luma_av8, %function
     45 
     46 ihevcd_itrans_recon_dc_luma_av8:
     47 
     48 //void ihevcd_itrans_recon_dc_luma(uword8 *pu1_pred,
     49 //                            uword8 *pu1_dst,
     50 //                            word32 pred_strd,
     51 //                            word32 dst_strd,
     52 //                            word32 log2_trans_size,
     53 //                            word16 i2_coeff_value)
     54 
     55 //x0:pu1_pred
     56 //x1:pu1_dest
     57 //x2:pred_strd
     58 //x3:dst_strd
     59 
     60 
     61 
     62 
     63     stp         x19, x20,[sp,#-16]!
     64     sxth        x5,w5
     65 
     66     mov         x10,#1
     67     lsl         x4,x10,x4                   //    trans_size = (1 << log2_trans_size)//
     68     mov         x6,#64                      // 1 << (shift1 - 1)//
     69     mov         x7,#2048                    // 1<<(shift2-1)
     70 
     71     add         x8,x6,x5,lsl #6
     72     asr         x20, x8, #7
     73     mov         x19, #32767
     74     cmp         x20,x19
     75     blt         lbl37
     76     mov         x8,#32767
     77     b           lbl37_1
     78 lbl37:
     79     mov         x19,#-32768
     80     cmp         x20,x19
     81     csel        x8, x19, x20, lt
     82 lbl37_1:
     83 
     84     add         x5,x7,x8,lsl #6
     85     asr         x20, x5, #12
     86     mov         x19,#32767
     87     cmp         x20,x19
     88     blt         lbl39
     89     mov         x6,#32767
     90     b           lbl39_1
     91 lbl39:
     92     mov         x19,#-32768
     93     cmp         x20,x19
     94     csel        x6, x19, x20, lt
     95 lbl39_1:
     96 
     97     mov         x9,x4
     98     mov         x8,x4
     99 
    100     // x6 has the dc_value
    101     // x4 has the trans_size value
    102     // x8 has the row value
    103     // x9 has the col value
    104     dup         v0.8h,w6
    105     cmp         x4,#4
    106     beq         row_loop_4
    107 
    108 
    109 row_loop:
    110     mov         x9,x4
    111 
    112 
    113 col_loop:
    114 
    115     mov         x7,x0
    116     ld1         {v2.8b},[x7],x2
    117     ld1         {v3.8b},[x7],x2
    118     ld1         {v4.8b},[x7],x2
    119     ld1         {v5.8b},[x7],x2
    120 
    121     ld1         {v6.8b},[x7],x2
    122     ld1         {v7.8b},[x7],x2
    123     ld1         {v1.8b},[x7],x2
    124     ld1         {v17.8b},[x7]
    125 
    126     add         x0,x0,#8
    127 
    128 
    129     uaddw       v30.8h,  v0.8h ,  v2.8b
    130     uaddw       v28.8h,  v0.8h ,  v3.8b
    131     uaddw       v26.8h,  v0.8h ,  v4.8b
    132     uaddw       v24.8h,  v0.8h ,  v5.8b
    133     uaddw       v22.8h,  v0.8h ,  v6.8b
    134     uaddw       v20.8h,  v0.8h ,  v7.8b
    135     uaddw       v18.8h,  v0.8h ,  v1.8b
    136     uaddw       v16.8h,  v0.8h ,  v17.8b
    137 
    138     mov         x11,x1
    139     sqxtun      v2.8b, v30.8h
    140     sqxtun      v3.8b, v28.8h
    141     sqxtun      v4.8b, v26.8h
    142     sqxtun      v5.8b, v24.8h
    143     sqxtun      v6.8b, v22.8h
    144     sqxtun      v7.8b, v20.8h
    145     sqxtun      v1.8b, v18.8h
    146     sqxtun      v17.8b, v16.8h
    147 
    148 
    149     st1         {v2.2s},[x11],x3
    150     st1         {v3.2s},[x11],x3
    151     st1         {v4.2s},[x11],x3
    152     st1         {v5.2s},[x11],x3
    153     st1         {v6.2s},[x11],x3
    154     st1         {v7.2s},[x11],x3
    155     st1         {v1.2s},[x11],x3
    156     st1         {v17.2s},[x11]
    157 
    158     add         x1,x1,#8
    159 
    160     subs        x9,x9,#8
    161     bgt         col_loop
    162 
    163     subs        x8,x8,#8
    164 
    165     add         x0,x0,x2,lsl #3
    166     add         x1,x1,x3,lsl #3
    167     sub         x0,x0,x4
    168     sub         x1,x1,x4
    169     bgt         row_loop
    170     b           end_loops
    171 
    172 
    173 row_loop_4:
    174     mov         x9,x10
    175 
    176 
    177 col_loop_4:
    178 
    179 
    180     ld1         {v2.8b},[x0],x2
    181     ld1         {v3.8b},[x0],x2
    182     ld1         {v4.8b},[x0],x2
    183     ld1         {v5.8b},[x0]
    184 
    185 
    186 
    187 
    188     uaddw       v30.8h,  v0.8h ,  v2.8b
    189     uaddw       v28.8h,  v0.8h ,  v3.8b
    190     uaddw       v26.8h,  v0.8h ,  v4.8b
    191     uaddw       v24.8h,  v0.8h ,  v5.8b
    192 
    193 
    194 
    195     sqxtun      v2.8b, v30.8h
    196     sqxtun      v3.8b, v28.8h
    197     sqxtun      v4.8b, v26.8h
    198     sqxtun      v5.8b, v24.8h
    199 
    200 
    201 
    202     st1         {v2.s}[0],[x1],x3
    203     st1         {v3.s}[0],[x1],x3
    204     st1         {v4.s}[0],[x1],x3
    205     st1         {v5.s}[0],[x1]
    206 
    207 end_loops:
    208     ldp         x19, x20,[sp],#16
    209 
    210     ret
    211 
    212 
    213 
    214 
    215 
    216 
    217 
    218 
    219