Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 ///*******************************************************************************
     20 //* @file
     21 //*  ihevc_deblk_chroma_vert.s
     22 //*
     23 //* @brief
     24 //*  contains the function definition for chroma deblocking across a
     25 //*  vertical edge. the function is hand-written armv8 (aarch64) neon
     26 //*  assembly in gnu assembler syntax.
     27 //*
     28 //*
     29 //* @author
     30 //*  anand s
     31 //*
     32 //* @par list of functions:
     33 //*  - ihevc_deblk_chroma_vert_av8()
     34 //*
     35 //* @remarks
     36 //*  none
     37 //*
     38 //*******************************************************************************/
     39 //void ihevc_deblk_chroma_vert(UWORD8 *pu1_src,
     40 //                             WORD32 src_strd,
     41 //                             WORD32 quant_param_p,
     42 //                             WORD32 quant_param_q,
     43 //                             WORD32 qp_offset_u,
     44 //                             WORD32 qp_offset_v,
     45 //                             WORD32 tc_offset_div2,
     46 //                             WORD32 filter_flag_p,
     47 //                             WORD32 filter_flag_q)
     48 
     49 .text                                       // emit the code below into the text section
     50 .align 4                                    // alignment request; on AArch64 GNU as the operand is a power of two (2^4 = 16 bytes)
     51 .include "ihevc_neon_macros.s"              // push_v_regs / pop_v_regs used below are not defined in this file; presumably they come from here
     52 
     53 
     54 
     55 .extern gai4_ihevc_qp_table                 // defined elsewhere; read below as 32-bit entries indexed by the chroma QP index
     56 .extern gai4_ihevc_tc_table                 // defined elsewhere; read below as 32-bit entries indexed by the clamped tc index
     57 .globl ihevc_deblk_chroma_vert_av8          // export the entry point
     58 
     59 .type ihevc_deblk_chroma_vert_av8, %function // mark the symbol as a function for the ELF symbol table
     60 
     61 ihevc_deblk_chroma_vert_av8:
        //----------------------------------------------------------------------
        // Chroma deblocking across a vertical edge, 4 rows per call.
        //
        // Per row, the 8 bytes at pu1_src-4 .. pu1_src+3 are read.  Bytes at
        // even offsets are filtered with the tc derived from qp_offset_u and
        // bytes at odd offsets with the tc derived from qp_offset_v (i.e. the
        // row holds two interleaved planes).  With p1,p0 | q0,q1 denoting the
        // samples of one plane on either side of pu1_src:
        //     delta = clip3(-tc, tc, (((q0 - p0) << 2) + p1 - q1 + 4) >> 3)
        //     p0'   = sat_u8(p0 + delta)      q0' = sat_u8(q0 - delta)
        //
        // In  (AAPCS64, C prototype in the file header):
        //     x0   = pu1_src          w1 = src_strd
        //     w2   = quant_param_p    w3 = quant_param_q
        //     w4   = qp_offset_u      w5 = qp_offset_v
        //     w6   = tc_offset_div2   w7 = filter_flag_p
        //     [sp] = filter_flag_q (32-bit)
        // Out: no return value.  If filter_flag_p != 0 the 2 bytes at
        //      pu1_src-2 of each of the 4 rows are rewritten (p0'); if
        //      filter_flag_q != 0 the 2 bytes at pu1_src of each row (q0').
        // Writes: x0-x8, x12, x15, v0-v6, v16-v18, v29-v31, NZCV.
        //      x19/x20 are saved and restored (x20 is used as scratch).
        //
        // AAPCS64 leaves the upper 32 bits of a register holding a 32-bit
        // argument unspecified, so every WORD32 register argument is
        // sign-extended here before its x-register is used as a stride,
        // table index or 64-bit compare operand.
        //----------------------------------------------------------------------
     62     sxtw        x1,w1                       // x1  = src_strd (used as 64-bit post-index below)
     63     sxtw        x2,w2                       // x2  = quant_param_p
     64     sxtw        x3,w3                       // x3  = quant_param_q
     65     sxtw        x12,w7                      // x12 = filter_flag_p (w7 is read before x7 is reused)
     66     sxtw        x7,w4                       // x7  = qp_offset_u
     67     sxtw        x15,w5                      // x15 = qp_offset_v (temporary)
     68     sxtw        x5,w6                       // x5  = tc_offset_div2
     69     mov         x6,x15                      // x6  = qp_offset_v
     70     ldr         w4, [sp]                    // w4  = filter_flag_q; must be read before sp is moved below
     71 
     72     push_v_regs                             // macro not defined in this file; presumably saves v8-v15 - confirm
     73     stp         x19, x20,[sp,#-16]!         // x20 is used as scratch for the csel sequences
     74 
     75     sub         x8,x0,#4                    // x8 = pu1_src - 4 : start of the 8-byte window of row 0
     76     add         x2,x2,x3                    // x2 = quant_param_p + quant_param_q
     77     ld1         {v5.8b},[x8],x1             // row 0
     78     add         x2,x2,#1                    // x2 = qp_p + qp_q + 1 (rounding term for the >>1 below)
     79     ld1         {v17.8b},[x8],x1            // row 1
     80     ld1         {v16.8b},[x8],x1            // row 2
     81     ld1         {v4.8b},[x8]                // row 3
     82 
        // 4x8 byte transpose, stage 1 (bytes): pair rows 0/1 and rows 2/3
     83     trn1        v29.8b, v5.8b, v17.8b
     84     trn2        v17.8b, v5.8b, v17.8b
     85     mov         v5.d[0], v29.d[0]
     86     adds        x3,x7,x2,asr #1             // x3 = qp_offset_u + ((qp_p + qp_q + 1) >> 1); N flag consumed by bmi below
     87     trn1        v29.8b, v16.8b, v4.8b
     88     trn2        v4.8b, v16.8b, v4.8b
     89     mov         v16.d[0], v29.d[0]
     90     adrp        x7, :got:gai4_ihevc_qp_table    // adrp/ldr leave the flags from adds intact
     91     ldr         x7, [x7, #:got_lo12:gai4_ihevc_qp_table]
     92 
     93 
        // x3: index < 0 -> unchanged; index > 57 -> index - 6; otherwise qp_table[index]
     94     bmi         l1.2944
     95     cmp         x3,#0x39                    // 0x39 = 57
     96     bgt         lbl78
     97     ldr         w3, [x7,x3,lsl #2]          // 32-bit table entry
     98     sxtw        x3,w3
     99 lbl78:
    100     sub         x20,x3,#6
    101     csel        x3, x20, x3,gt              // flags still from the cmp above: gt only when the lookup was skipped
    102 l1.2944:
        // transpose stage 2 (halfwords)
    103     trn1        v29.4h, v5.4h, v16.4h
    104     trn2        v16.4h, v5.4h, v16.4h
    105     mov         v5.d[0], v29.d[0]
    106     adds        x2,x6,x2,asr #1             // x2 = qp_offset_v + ((qp_p + qp_q + 1) >> 1)
    107     trn1        v29.4h, v17.4h, v4.4h
    108     trn2        v4.4h, v17.4h, v4.4h
    109     mov         v17.d[0], v29.d[0]
        // same index -> QP mapping as above, for the qp_offset_v value in x2
    110     bmi         l1.2964
    111     cmp         x2,#0x39                    // 0x39 = 57
    112     bgt         lbl86
    113     ldr         w2, [x7,x2,lsl #2]
    114     sxtw        x2,w2
    115 lbl86:
    116     sub         x20,x2,#6
    117     csel        x2, x20, x2,gt
    118 l1.2964:
        // transpose stage 3 (words).  Afterwards each register holds two
        // columns of 4 rows: v5 = offsets -4,-3 (p1)   v16 = offsets -2,-1 (p0)
        //                    v17 = offsets  0, 1 (q0)   v4  = offsets  2, 3 (q1)
    119     trn1        v29.2s, v5.2s, v17.2s
    120     trn2        v17.2s, v5.2s, v17.2s
    121     mov         v5.d[0], v29.d[0]
    122     add         x3,x3,x5,lsl #1             // x3 = qp (u) + 2 * tc_offset_div2
    123     trn1        v29.2s, v16.2s, v4.2s
    124     trn2        v4.2s, v16.2s, v4.2s
    125     mov         v16.d[0], v29.d[0]
        // x3 = clip3(x3 + 2, 0, 53) : tc table index for the qp_offset_u path
    126     add         x6,x3,#2
    127     uxtl        v18.8h, v17.8b              // v18 = q0 widened to 16 bit
    128     cmp         x6,#0x35                    // 0x35 = 53
    129     mov         x20,#0x35
    130     csel        x3, x20, x3,gt              // above 53 -> 53
    131     bgt         l1.2996
    132     adds        x6,x3,#2                    // sets N for the two csel below
    133     add         x20,x3,#2
    134     csel        x3, x20, x3,pl              // non-negative -> x3 + 2
    135     mov         x20,#0
    136     csel        x3, x20, x3,mi              // negative -> 0
    137 l1.2996:
    138     usubl       v0.8h, v17.8b, v16.8b       // v0 = q0 - p0
    139     adrp        x6, :got:gai4_ihevc_tc_table
    140     ldr         x6, [x6, #:got_lo12:gai4_ihevc_tc_table]
    141     shl         v0.8h, v0.8h,#2             // v0 = (q0 - p0) << 2
    142     add         x2,x2,x5,lsl #1             // x2 = qp (v) + 2 * tc_offset_div2
    143     add         x5,x2,#2
    144     uaddw       v0.8h,  v0.8h ,  v5.8b      // v0 += p1
    145     cmp         x5,#0x35                    // 0x35 = 53
    146     ldr         w3, [x6,x3,lsl #2]          // w3 = tc for the qp_offset_u path (ldr does not touch flags)
    147     sxtw        x3,w3
    148     usubw       v4.8h,  v0.8h ,  v4.8b      // v4 = ((q0 - p0) << 2) + p1 - q1
        // x2 = clip3(x2 + 2, 0, 53) : tc table index for the qp_offset_v path
    149     mov         x20,#0x35
    150     csel        x2, x20, x2,gt
    151     bgt         l1.3036
    152     adds        x5,x2,#2
    153     add         x20,x2,#2
    154     csel        x2, x20, x2,pl
    155     mov         x20,#0
    156     csel        x2, x20, x2,mi
    157 l1.3036:
    158 
    159 
    160     srshr       v6.8h, v4.8h,#3             // v6 = rounded (v4 + 4) >> 3 : unclipped delta
    161     dup         v2.4h,w3                    // +tc (u path)
    162     ldr         w2, [x6,x2,lsl #2]          // w2 = tc for the qp_offset_v path
    163     sxtw        x2,w2
    164     sub         x20,x3,#0
    165     neg         x3, x20                     // x3 = -tc (u path)
    166     cmp         x12,#0                      // filter_flag_p == 0 ?  consumed by beq below; nothing in between sets flags
    167     dup         v3.4h,w2                    // +tc (v path)
    168     sub         x20,x2,#0
    169     neg         x2, x20                     // x2 = -tc (v path)
    170     dup         v30.4h,w3
    171     dup         v31.4h,w2
    172 
    173     mov         v30.d[1],v31.d[0]           // v30 = { -tc_u x4, -tc_v x4 }
    174     mov         v2.d[1],v3.d[0]             // v2  = { +tc_u x4, +tc_v x4 }
    175 
    176     smin        v4.8h,  v6.8h ,  v2.8h      // delta = min(delta, +tc)
    177     smax        v2.8h,  v30.8h ,  v4.8h     // delta = max(delta, -tc)
    178 
    179     uxtl        v6.8h, v16.8b               // v6 = p0 widened to 16 bit
    180 
    181     add         v0.8h,  v6.8h ,  v2.8h      // p0 + delta
    182     sub         v2.8h,  v18.8h ,  v2.8h     // q0 - delta
    183     sqxtun      v0.8b, v0.8h                // saturate to unsigned 8 bit : p0'
    184     sub         x2,x0,#2                    // x2 = pu1_src - 2 : store address for p0' (x2 no longer needed as -tc)
    185     sqxtun      v1.8b, v2.8h                // q0'
        // transpose back so each halfword lane is one row's pair of bytes:
        //   v0.h = { p0' row0, p0' row2, q0' row0, q0' row2 }
        //   v1.h = { p0' row1, p0' row3, q0' row1, q0' row3 }
    186     trn1        v29.2s, v0.2s, v1.2s
    187     trn2        v1.2s, v0.2s, v1.2s
    188     mov         v0.d[0], v29.d[0]
    189     trn1        v29.8b, v0.8b, v1.8b
    190     trn2        v1.8b, v0.8b, v1.8b
    191     mov         v0.d[0], v29.d[0]
    192     beq         l1.3204                     // filter_flag_p == 0 : leave the p side untouched
    193 
    194     st1         {v0.h}[0],[x2],x1           // p0' rows 0..3, 2 bytes each at pu1_src - 2
    195     st1         {v1.h}[0],[x2],x1
    196     st1         {v0.h}[1],[x2],x1
    197     st1         {v1.h}[1],[x2]
    198 l1.3204:
    199     cmp         x4,#0                       // filter_flag_q (w4 was zero-extended by the ldr)
    200     beq         l1.3228                     // == 0 : leave the q side untouched
    201     st1         {v0.h}[2],[x0],x1           // q0' rows 0..3, 2 bytes each at pu1_src (x0 is advanced)
    202     st1         {v1.h}[2],[x0],x1
    203     st1         {v0.h}[3],[x0],x1
    204     st1         {v1.h}[3],[x0]
    205 l1.3228:
    206     ldp         x19, x20,[sp],#16
    207     pop_v_regs                              // counterpart of push_v_regs above
    208     ret
    209 
    210 
    211 
    212