Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 ///*******************************************************************************
     20 //* //file
     21 //*  ihevcd_itrans_recon_dc_chroma.s
     22 //*
     23 //* //brief
     24 //*  contains function definitions itrans and recon for dc only case
     25 //*
     26 //* //author
     27 //*  ittiam
     28 //*
     29 //* //par list of functions:
     30 //*
     31 //*
     32 //* //remarks
     33 //*  none
     34 //*
     35 //*******************************************************************************/
     36 
     37 
     38 .text
     39 .include "ihevc_neon_macros.s"
     40 
     41 
     42 .globl ihevcd_itrans_recon_dc_chroma_av8
     43 
     44 .type ihevcd_itrans_recon_dc_chroma_av8, %function
     45 
     46 ihevcd_itrans_recon_dc_chroma_av8:
     47 
     48 //void ihevcd_itrans_recon_dc_chroma(uword8 *pu1_pred,
     49 //                            uword8 *pu1_dst,
     50 //                            word32 pred_strd,
     51 //                            word32 dst_strd,
     52 //                            word32 log2_trans_size,
     53 //                            word16 i2_coeff_value)
     54 
     55 //x0:pu1_pred
     56 //x1:pu1_dest
     57 //x2:pred_strd
     58 //x3:dst_strd
     59 
     60 
     61 
     62     push_v_regs
     63     stp         x19, x20,[sp,#-16]!
     64 
     65     sxth        x5, w5 // since the argument is of word16, sign extend to x register
     66 
     67     mov         x10,#1
     68     lsl         x4,x10,x4                   //    trans_size = (1 << log2_trans_size)//
     69     mov         x6,#64                      // 1 << (shift1 - 1)//
     70     mov         x7,#2048                    // 1<<(shift2-1)
     71 
     72     add         x8,x6,x5,lsl #6
     73     asr         x20, x8, #7
     74     mov         x19,#32767
     75     cmp         x20,x19
     76     blt         lbl36
     77     mov         x8,#32767
     78     b           lbl36_1
     79 lbl36:
     80     mov         x19,#-32768
     81     cmp         x20,x19
     82     csel        x8, x19, x20, lt
     83 lbl36_1:
     84 
     85     add         x5,x7,x8,lsl #6
     86     asr         x20, x5, #12
     87     mov         x19,#32767
     88     cmp         x20,x19
     89     blt         lbl38
     90     mov         x6,#32767
     91     b           lbl38_1
     92 lbl38:
     93     mov         x19,#-32768
     94     cmp         x20,x19
     95     csel        x6, x19, x20, lt
     96 lbl38_1:
     97 
     98     mov         x9,x4
     99     mov         x8,x4
    100 
    101     // x6 has the dc_value
    102     // x4 has the trans_size value
    103     // x8 has the row value
    104     // x9 has the col value
    105     dup         v0.8h,w6
    106     cmp         x4,#4
    107     beq         row_loop_4chroma
    108 
    109 
    110 row_loop_chroma:
    111     mov         x9,x4
    112 
    113 
    114 col_loop_chroma:
    115 
    116     mov         x7,x0
    117     ld2         {v2.8b, v3.8b},[x7],x2
    118     ld2         {v4.8b, v5.8b},[x7],x2
    119     ld2         {v6.8b, v7.8b},[x7],x2
    120     ld2         {v8.8b, v9.8b},[x7],x2
    121 
    122     ld2         {v10.8b, v11.8b},[x7],x2
    123     ld2         {v12.8b, v13.8b},[x7],x2
    124     ld2         {v14.8b, v15.8b},[x7],x2
    125     ld2         {v16.8b, v17.8b},[x7]
    126 
    127     add         x0,x0,#16
    128 
    129 
    130     uaddw       v30.8h,  v0.8h ,  v2.8b
    131     uaddw       v28.8h,  v0.8h ,  v4.8b
    132     uaddw       v26.8h,  v0.8h ,  v6.8b
    133     uaddw       v24.8h,  v0.8h ,  v8.8b
    134     uaddw       v22.8h,  v0.8h ,  v10.8b
    135     uaddw       v20.8h,  v0.8h ,  v12.8b
    136     uaddw       v18.8h,  v0.8h ,  v14.8b
    137 
    138 
    139     mov         x11,x1
    140     sqxtun      v2.8b, v30.8h
    141     sqxtun      v4.8b, v28.8h
    142     sqxtun      v6.8b, v26.8h
    143     sqxtun      v8.8b, v24.8h
    144 
    145     uaddw       v30.8h,  v0.8h ,  v16.8b
    146 
    147     sqxtun      v10.8b, v22.8h
    148     sqxtun      v12.8b, v20.8h
    149     sqxtun      v14.8b, v18.8h
    150     sqxtun      v16.8b, v30.8h
    151 
    152     st2         {v2.8b, v3.8b},[x11],x3
    153     st2         {v4.8b, v5.8b},[x11],x3
    154     st2         {v6.8b, v7.8b},[x11],x3
    155     st2         {v8.8b, v9.8b},[x11],x3
    156 
    157     st2         {v10.8b, v11.8b},[x11],x3
    158     st2         {v12.8b, v13.8b},[x11],x3
    159     st2         {v14.8b, v15.8b},[x11],x3
    160     st2         {v16.8b, v17.8b},[x11]
    161 
    162     add         x1,x1,#16
    163 
    164     subs        x9,x9,#8
    165     bgt         col_loop_chroma
    166 
    167     subs        x8,x8,#8
    168 
    169     add         x0,x0,x2,lsl #3
    170     add         x1,x1,x3,lsl #3
    171     sub         x0,x0,x4,lsl #1
    172     sub         x1,x1,x4,lsl #1
    173     bgt         row_loop_chroma
    174     b           end_loops_chroma
    175 
    176 
    177 row_loop_4chroma:
    178     mov         x9,x10
    179 
    180 
    181 col_loop_4chroma:
    182 
    183 
    184     ld2         {v2.8b, v3.8b},[x0],x2
    185     ld2         {v4.8b, v5.8b},[x0],x2
    186     ld2         {v6.8b, v7.8b},[x0],x2
    187     ld2         {v8.8b, v9.8b},[x0]
    188 
    189 
    190 
    191 
    192     uaddw       v30.8h,  v0.8h ,  v2.8b
    193     uaddw       v28.8h,  v0.8h ,  v4.8b
    194     uaddw       v26.8h,  v0.8h ,  v6.8b
    195     uaddw       v24.8h,  v0.8h ,  v8.8b
    196 
    197 
    198 
    199     sqxtun      v31.8b, v30.8h
    200     sqxtun      v29.8b, v28.8h
    201     sqxtun      v27.8b, v26.8h
    202     sqxtun      v25.8b, v24.8h
    203 
    204 
    205     zip1        v2.8b, v31.8b, v3.8b
    206     zip1        v4.8b, v29.8b, v5.8b
    207     zip1        v6.8b, v27.8b, v7.8b
    208     zip1        v8.8b, v25.8b, v9.8b
    209 
    210     st1         {v2.2s},[x1],x3
    211     st1         {v4.2s},[x1],x3
    212     st1         {v6.2s},[x1],x3
    213     st1         {v8.2s},[x1]
    214 
    215 end_loops_chroma:
    216     ldp         x19, x20,[sp],#16
    217     pop_v_regs
    218     ret
    219 
    220 
    221