Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 
     21 //******************************************************************************
     22 //*
     23 //* @brief
     24 //*  This file contains definitions of routines for spatial filter
     25 //*
     26 //* @author
     27 //*  Ittiam
     28 //*
     29 //* @par List of Functions:
     30 //*  - ideint_cac_8x8_av8()
     31 //*
     32 //* @remarks
     33 //*  None
     34 //*
     35 //*******************************************************************************
     36 
     37 
     38 //******************************************************************************
     39 //*
     40 //*  @brief Calculates Combing Artifact
     41 //*
     42 //*  @par   Description
     43 //*   This function calculates the combing artifact check (CAC) for the two given fields
     44 //*
     45 //* @param[in] pu1_top
     46 //*  UWORD8 pointer to top field
     47 //*
     48 //* @param[in] pu1_bot
     49 //*  UWORD8 pointer to bottom field
     50 //*
     51 //* @param[in] top_strd
     52 //*  Top field stride
     53 //*
     54 //* @param[in] bot_strd
     55 //*  Bottom field stride
     56 //*
     57 //* @returns
     58 //*     1 if the combing artifact check fires for either 4-column half of the block, 0 otherwise (in x0)
     59 //*
     60 //* @remarks
     61 //*
     62 //******************************************************************************
     63 
     64     .global ideint_cac_8x8_av8
     65 
     66 ideint_cac_8x8_av8:
     67 
     68     // Load first row of top
     69     ld1     {v28.8b},       [x0],       x2
     70 
     71     // Load first row of bottom
     72     ld1     {v29.8b},       [x1],       x3
     73     mov     v28.d[1],       v29.d[0]
     74 
     75     // Load second row of top
     76     ld1     {v30.8b},       [x0],       x2
     77 
     78     // Load second row of bottom
     79     ld1     {v31.8b},       [x1],       x3
     80     mov     v30.d[1],       v31.d[0]
     81 
     82 
     83     // Calculate row based adj and alt values
     84     // Get row sums
     85     uaddlp  v0.8h,          v28.16b
     86 
     87     uaddlp  v2.8h,          v30.16b
     88 
     89     uaddlp  v0.4s,          v0.8h
     90 
     91     uaddlp  v2.4s,          v2.8h
     92 
     93     // Both v0 and v2 have four 32 bit sums corresponding to first 4 rows
     94     // Pack v0 and v2 into a single register (sum does not exceed 16bits)
     95 
     96     shl     v16.4s,         v2.4s,      #16
     97     orr     v16.16b,        v0.16b,     v16.16b
     98     // v16 now contains 8 sums
     99 
    100     // Load third row of top
    101     ld1     {v24.8b},       [x0],       x2
    102 
    103     // Load third row of bottom
    104     ld1     {v25.8b},       [x1],       x3
    105     mov     v24.d[1],       v25.d[0]
    106 
    107     // Load fourth row of top
    108     ld1     {v26.8b},       [x0],       x2
    109 
    110     // Load fourth row of bottom
    111     ld1     {v27.8b},       [x1],       x3
    112     mov     v26.d[1],       v27.d[0]
    113 
    114     // Get row sums
    115     uaddlp  v4.8h,          v24.16b
    116 
    117     uaddlp  v6.8h,          v26.16b
    118 
    119     uaddlp  v4.4s,          v4.8h
    120 
    121     uaddlp  v6.4s,          v6.8h
    122     // Both v4 and v6 have four 32 bit sums corresponding to last 4 rows
    123     // Pack v4 and v6 into a single register (sum does not exceed 16bits)
    124 
    125     shl     v18.4s,         v6.4s,      #16
    126     orr     v18.16b,        v4.16b,     v18.16b
    127     // v18 now contains 8 sums
    128 
    129     // Compute absolute diff between top and bottom row sums
    130     mov     v17.d[0],       v16.d[1]
    131     uabd    v16.4h,         v16.4h,     v17.4h
    132 
    133     mov     v19.d[0],       v18.d[1]
    134     uabd    v17.4h,         v18.4h,     v19.4h
    135 
    136     mov     v16.d[1],       v17.d[0]
    137 
    138     // RSUM_CSUM_THRESH
    139     movi    v18.8h,         #20
    140 
    141     // Eliminate values smaller than RSUM_CSUM_THRESH
    142     cmhs    v20.8h,         v16.8h,     v18.8h
    143     and     v20.16b,        v16.16b,    v20.16b
    144 
    145     // v20 now contains 8 absolute diff of sums above the threshold
    146 
    147     // Compute adj
    148     mov     v21.d[0],       v20.d[1]
    149     add     v20.4h,         v20.4h,     v21.4h
    150 
    151     // v20 has four adj values for two sub-blocks
    152 
    153     // Compute alt
    154     uabd    v0.4s,      v0.4s,      v2.4s
    155     uabd    v4.4s,      v4.4s,      v6.4s
    156 
    157     add     v0.4s,      v0.4s,      v4.4s
    158 
    159     mov     v1.d[0],    v0.d[1]
    160     add     v21.4s,     v0.4s,      v1.4s
    161     // d21 has two values for two sub-blocks
    162 
    163 
    164     // Calculate column based adj and alt values
    165 
    166     urhadd  v0.16b,     v28.16b,    v30.16b
    167     urhadd  v2.16b,     v24.16b,    v26.16b
    168     urhadd  v0.16b,     v0.16b,     v2.16b
    169 
    170     mov     v1.d[0],    v0.d[1]
    171     uabd    v0.8b,      v0.8b,      v1.8b
    172 
    173     // RSUM_CSUM_THRESH >> 2
    174     movi    v22.16b,        #5
    175 
    176     // Eliminate values smaller than RSUM_CSUM_THRESH >> 2
    177     cmhs    v1.16b,      v0.16b,        v22.16b
    178     and     v0.16b,      v0.16b,        v1.16b
    179     // d0 now contains 8 absolute diff of sums above the threshold
    180 
    181 
    182     uaddlp  v0.4h,      v0.8b
    183     shl     v0.4h,      v0.4h,#2
    184 
    185     // Add row based adj
    186     add     v20.4h,     v0.4h,      v20.4h
    187 
    188     uaddlp  v20.2s,     v20.4h
    189     // d20 now contains 2 adj values
    190 
    191 
    192     urhadd  v0.8b,      v28.8b,     v29.8b
    193     urhadd  v2.8b,      v24.8b,     v25.8b
    194     urhadd  v0.8b,      v0.8b,      v2.8b
    195 
    196     urhadd  v1.8b,      v30.8b,     v31.8b
    197     urhadd  v3.8b,      v26.8b,     v27.8b
    198     urhadd  v1.8b,      v1.8b,      v3.8b
    199 
    200     uabd    v0.8b,      v0.8b,      v1.8b
    201     uaddlp  v0.4h,      v0.8b
    202 
    203     shl     v0.4h,      v0.4h,      #2
    204     uaddlp  v0.2s,      v0.4h
    205     add     v21.2s,     v0.2s,      v21.2s
    206 
    207 
    208     // d21 now contains 2 alt values
    209 
    210     // SAD_BIAS_MULT_SHIFT
    211     ushr    v0.2s,      v21.2s,     #3
    212     add     v21.2s,     v21.2s,     v0.2s
    213 
    214     // SAD_BIAS_ADDITIVE >> 1
    215     movi    v0.2s,      #4
    216     add     v21.2s,     v21.2s,     v0.2s
    217 
    218     cmhi    v0.2s,      v20.2s,     v21.2s
    219     uaddlp  v0.1d,      v0.2s
    220 
    221     smov    x0,         v0.2s[0]
    222     cmp     x0,         #0
    223     mov     x4,         #1
    224     csel    x0,         x4,         x0,         ne
    225     ret
    226