Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///*****************************************************************************/
     21 ///*                                                                           */
     22 ///*  File Name         : ih264_deblk_chroma_av8.s                              */
     23 ///*                                                                           */
     24 ///*  Description       : Contains function definitions for deblocking chroma  */
     25 ///*                      edge. Functions are coded in NEON assembly and can   */
     26 ///*                      be compiled using ARM RVDS.                          */
     27 ///*                                                                           */
     28 ///*  List of Functions : ih264_deblk_chroma_vert_bs4_av8()              */
     29 ///*                      ih264_deblk_chroma_vert_bslt4_av8()            */
     30 ///*                      ih264_deblk_chroma_horz_bs4_av8()              */
     31 ///*                      ih264_deblk_chroma_horz_bslt4_av8()            */
     32 ///*  Issues / Problems : None                                                 */
     33 ///*                                                                           */
     34 ///*  Revision History  :                                                      */
     35 ///*                                                                           */
     36 ///*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
     37 ///*         28 11 2013   Ittiam          Draft                                */
     38 ///*****************************************************************************/
     39 
     40 
     41 .text
     42 .p2align 2
     43 .include "ih264_neon_macros.s"
     44 
     45 ///**
     46 //*******************************************************************************
     47 //*
     48 //* @brief
     49 //*     Performs filtering of a chroma block horizontal edge when the
     50 //*     boundary strength is set to 4 in high profile
     51 //*
     52 //* @par Description:
     53 //*       This operation is described in  Sec. 8.7.2.4 under the title
     54 //*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
     55 //*
     56 //* @param[in] x0 - pu1_src
     57 //*  Pointer to the src sample q0
     58 //*
     59 //* @param[in] w1 - src_strd
     60 //*  Source stride
     61 //*
     62 //* @param[in] w2 - alpha_cb
     63 //*  Alpha Value for the boundary in U
     64 //*
     65 //* @param[in] w3 - beta_cb
     66 //*  Beta Value for the boundary in U
     67 //*
     68 //* @param[in] w4 - alpha_cr
     69 //*    Alpha Value for the boundary in V
     70 //*
     71 //* @param[in] w5 - beta_cr
     72 //*    Beta Value for the boundary in V
     73 //*
     74 //* @returns
     75 //*  None
     76 //*
     77 //* @remarks
     78 //*  None
     79 //*
     80 //*******************************************************************************
     81 //*/
     82 
     83     .global ih264_deblk_chroma_horz_bs4_av8
     84 // Chroma horizontal-edge filter, bS = 4: reads rows p1, p0, q0, q1 (16 bytes each) and rewrites rows p0 and q0 in place.
     85 ih264_deblk_chroma_horz_bs4_av8:
     86 // Inputs as used below: x0 = q0 row, w1 = stride, w2/w3 = alpha/beta for U, w4/w5 = alpha/beta for V.
     87     // STMFD sp!,{x4-x6,x14}            // commented-out 32-bit ARM style prologue, kept for reference only
     88     push_v_regs                         //macro from ih264_neon_macros.s (not shown here); presumably saves callee-saved NEON registers - TODO confirm
     89     stp       x19, x20, [sp, #-16]!     //save x19/x20 (neither register is referenced again in this function)
     90     sxtw      x1, w1                    //sign-extend the 32-bit stride so it can be used in 64-bit addressing
     91     mov       x6, x5                    //x6 = beta_cr
     92     mov       x5, x4                    //x5 = alpha_cr; x4 is reused below as a pointer backup
     93     sub       x0, x0, x1, lsl #1        //x0 = src - 2*stride, i.e. pointing to the p1 row of chroma
     94     ld2       {v6.8b, v7.8b}, [x0], x1  //de-interleave 16 bytes: v6 = p1u , v7 = p1v
     95     mov       x4, x0                    //Keeping a backup of the pointer to the p0 row (used for the stores)
     96     ld2       {v4.8b, v5.8b}, [x0], x1  //v4 = p0u , v5 = p0v
     97     dup       v20.8b, w2                //low half of v20 = alpha_cb in every byte
     98     dup       v21.8b, w5                //v21 = alpha_cr in every byte
     99     mov       v20.d[1], v21.d[0]        //v20 = {alpha_cb x8, alpha_cr x8}
    100     ld2       {v0.8b, v1.8b}, [x0], x1  //v0 = q0u , v1 = q0v
    101     uaddl     v8.8h, v6.8b, v0.8b       //v8 = p1u + q0u (widened to 16 bit)
    102     uaddl     v10.8h, v7.8b, v1.8b      //v10 = p1v + q0v (widened to 16 bit)
    103     movi      v31.8b, #2                //constant 2 for the multiply-accumulates
    104     ld2       {v2.8b, v3.8b}, [x0]      //v2 = q1u , v3 = q1v
    105     mov       v0.d[1], v1.d[0]          //v0 = {q0u, q0v}
    106     mov       v2.d[1], v3.d[0]          //v2 = {q1u, q1v}
    107     mov       v4.d[1], v5.d[0]          //v4 = {p0u, p0v}
    108     mov       v6.d[1], v7.d[0]          //v6 = {p1u, p1v}
    109     uabd      v26.16b, v6.16b , v4.16b  //v26 = ABS(p1 - p0)
    110     umlal     v8.8h, v2.8b, v31.8b      //v8 = 2*q1u + q0u + p1u
    111     umlal     v10.8h, v3.8b, v31.8b     //v10 = 2*q1v + q0v + p1v
    112     uabd      v22.16b, v4.16b , v0.16b  //v22 = ABS(p0 - q0)
    113     uabd      v24.16b, v2.16b , v0.16b  //v24 = ABS(q1 - q0)
    114     uaddl     v14.8h, v4.8b, v2.8b      //v14 = p0u + q1u
    115     uaddl     v28.8h, v5.8b, v3.8b      //v28 = p0v + q1v
    116     dup       v16.8b, w3                //low half of v16 = beta_cb in every byte
    117     dup       v17.8b, w6                //v17 = beta_cr in every byte
    118     mov       v16.d[1], v17.d[0]        //v16 = {beta_cb x8, beta_cr x8}
    119     umlal     v14.8h, v6.8b, v31.8b     //v14 = 2*p1u + p0u + q1u
    120     umlal     v28.8h, v7.8b, v31.8b     //v28 = 2*p1v + p0v + q1v
    121     cmhs      v18.16b, v22.16b, v20.16b //v18 = ( ABS(p0 - q0) >= Alpha ), all-ones bytes where true
    122     cmhs      v24.16b, v24.16b, v16.16b //v24 = ( ABS(q1 - q0) >= Beta )
    123     cmhs      v26.16b, v26.16b, v16.16b //v26 = ( ABS(p1 - p0) >= Beta )
    124     rshrn     v8.8b, v8.8h, #2          //v8 low = (2*q1u + q0u + p1u + 2) >> 2
    125     rshrn     v9.8b, v10.8h, #2         //v9 = (2*q1v + q0v + p1v + 2) >> 2
    126     mov       v8.d[1], v9.d[0]          //v8 = filtered q0 as {U, V}
    127     orr       v18.16b, v18.16b , v24.16b //v18 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
    128     rshrn     v10.8b, v14.8h, #2        //v10 low = (2*p1u + p0u + q1u + 2) >> 2
    129     rshrn     v11.8b, v28.8h, #2        //v11 = (2*p1v + p0v + q1v + 2) >> 2
    130     mov       v10.d[1], v11.d[0]        //v10 = filtered p0 as {U, V}
    131     orr       v18.16b, v18.16b , v26.16b //v18 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
    132     bit       v10.16b, v4.16b , v18.16b //where any threshold test tripped, put the unfiltered p0 back
    133     bit       v8.16b, v0.16b , v18.16b  //likewise keep the unfiltered q0 in those bytes
    134     mov       v11.d[0], v10.d[1]        //v10 = p0 U half, v11 = p0 V half for the interleaving store
    135     mov       v9.d[0], v8.d[1]          //v8 = q0 U half, v9 = q0 V half
    136     st2       {v10.8b, v11.8b}, [x4], x1 //re-interleave U/V and write the p0 row, then step to the q0 row
    137     st2       {v8.8b, v9.8b}, [x4]      //write the q0 row
    138     // LDMFD sp!,{x4-x6,pc}                // commented-out 32-bit ARM style epilogue, kept for reference only
    139     ldp       x19, x20, [sp], #16       //restore x19/x20 saved on entry
    140     pop_v_regs                          //macro counterpart of push_v_regs (definition not shown here)
    141     ret
    142 
    143 
    144 
    145 ///**
    146 //*******************************************************************************
    147 //*
    148 //* @brief
    149 //*     Performs filtering of a chroma block vertical edge when the
    150 //*     boundary strength is set to 4 in high profile
    151 //*
    152 //* @par Description:
    153 //*       This operation is described in  Sec. 8.7.2.4 under the title
    154 //*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
    155 //*
    156 //* @param[in] x0 - pu1_src
    157 //*  Pointer to the src sample q0
    158 //*
    159 //* @param[in] w1 - src_strd
    160 //*  Source stride
    161 //*
    162 //* @param[in] w2 - alpha_cb
    163 //*  Alpha Value for the boundary in U
    164 //*
    165 //* @param[in] w3 - beta_cb
    166 //*  Beta Value for the boundary in U
    167 //*
    168 //* @param[in] w4 - alpha_cr
    169 //*    Alpha Value for the boundary in V
    170 //*
    171 //* @param[in] w5 - beta_cr
    172 //*    Beta Value for the boundary in V
    173 //*
    174 //* @returns
    175 //*  None
    176 //*
    177 //* @remarks
    178 //*  None
    179 //*
    180 //*******************************************************************************
    181 //*/
    182 
    183     .global ih264_deblk_chroma_vert_bs4_av8
    184 // Chroma vertical-edge filter, bS = 4: for 8 rows, loads the 8 bytes holding p1,p0,q0,q1 (2 bytes each) and stores them back with p0/q0 filtered.
    185 ih264_deblk_chroma_vert_bs4_av8:
    186 // Inputs as used below: x0 = q0 of row 0, w1 = stride, w2/w3 = alpha/beta for U, w4/w5 = alpha/beta for V.
    187     // STMFD sp!,{x4,x5,x12,x14}        // commented-out 32-bit ARM style prologue, kept for reference only
    188     push_v_regs                         //macro from ih264_neon_macros.s (not shown here); presumably saves callee-saved NEON registers - TODO confirm
    189     stp       x19, x20, [sp, #-16]!     //save x19/x20 (neither register is referenced again in this function)
    190     sxtw      x1, w1                    //sign-extend the 32-bit stride so it can be used in 64-bit addressing
    191
    192     sub       x0, x0, #4                //point x0 to p1u of row0.
    193     mov       x12, x0                   //keep a back up of x0 for buffer write
    194
    195     add       w2, w2, w4, lsl #8        //w2 = (alpha_cr,alpha_cb)
    196     add       w3, w3, w5, lsl #8        //w3 = (beta_cr,beta_cb)
    197 // Rows 0-3: each ld4 puts the row's four halfwords (p1, p0, q0, q1) into one lane of v0, v1, v2, v3 respectively.
    198     ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
    199     ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
    200     ld4       {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
    201     ld4       {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1
    202 // Rows 4-7: same layout into v4 (p1), v5 (p0), v6 (q0), v7 (q1).
    203     ld4       {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1
    204     ld4       {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
    205     ld4       {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
    206     ld4       {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1
    207 // Rotate registers so each even/odd pair holds one pixel position: (v0,v1)=p1, (v2,v3)=p0, (v4,v5)=q0, (v6,v7)=q1; odd register = rows 4-7.
    208     mov       v10.16b, v2.16b           //v10 = q0 rows 0-3 (temporary)
    209     mov       v2.16b, v1.16b            //v2 = p0 rows 0-3
    210     mov       v1.16b, v4.16b            //v1 = p1 rows 4-7
    211     mov       v4.16b, v10.16b           //v4 = q0 rows 0-3
    212     mov       v10.16b, v6.16b           //v10 = q0 rows 4-7 (temporary)
    213     mov       v6.16b, v3.16b            //v6 = q1 rows 0-3
    214     mov       v3.16b, v5.16b            //v3 = p0 rows 4-7
    215     mov       v5.16b, v10.16b           //v5 = q0 rows 4-7
    216
    217     dup       v22.8h, w2                //v22 = alpha: every halfword = (alpha_cb low byte, alpha_cr high byte)
    218     dup       v24.8h, w3                //v24 = beta, same byte pairing
    219     movi      v31.8b, #2                //constant 2 for the multiply-accumulates
    220 // Join the row halves so each of v0, v2, v4, v6 carries all 8 rows (16 bytes).
    221     mov       v0.d[1], v1.d[0]          //v0 = p1, rows 0-7
    222     mov       v2.d[1], v3.d[0]          //v2 = p0, rows 0-7
    223     mov       v4.d[1], v5.d[0]          //v4 = q0, rows 0-7
    224     mov       v6.d[1], v7.d[0]          //v6 = q1, rows 0-7
    225
    226     uabd      v8.16b, v2.16b , v4.16b   //|p0-q0|
    227     uabd      v10.16b, v6.16b , v4.16b  //|q1-q0|
    228     uabd      v12.16b, v0.16b , v2.16b  //|p1-p0|
    229     uaddl     v14.8h, v2.8b, v6.8b      //(p0 + q1), rows 0-3
    230     uaddl     v16.8h, v3.8b, v7.8b      //(p0 + q1), rows 4-7
    231     cmhi      v8.16b, v22.16b , v8.16b  //|p0-q0| < alpha ?
    232     cmhi      v10.16b, v24.16b , v10.16b //|q1-q0| < beta ?
    233     cmhi      v12.16b, v24.16b , v12.16b //|p1-p0| < beta ?
    234     umlal     v14.8h, v0.8b, v31.8b     //2*p1 + (p0 + q1), rows 0-3
    235     umlal     v16.8h, v1.8b, v31.8b     //2*p1 + (p0 + q1), rows 4-7
    236     uaddl     v18.8h, v0.8b, v4.8b      //(p1 + q0), rows 0-3
    237     uaddl     v20.8h, v1.8b, v5.8b      //(p1 + q0), rows 4-7
    238     and       v8.16b, v8.16b , v10.16b  //|p0-q0| < alpha && |q1-q0| < beta
    239     umlal     v18.8h, v6.8b, v31.8b     //2*q1 + (p1 + q0), rows 0-3
    240     umlal     v20.8h, v7.8b, v31.8b     //2*q1 + (p1 + q0), rows 4-7
    241
    242     rshrn     v14.8b, v14.8h, #2        //(2*p1 + (p0 + q1) + 2) >> 2, rows 0-3
    243     rshrn     v15.8b, v16.8h, #2        //(2*p1 + (p0 + q1) + 2) >> 2, rows 4-7
    244     mov       v14.d[1], v15.d[0]        //v14 = filtered p0, rows 0-7
    245     and       v8.16b, v8.16b , v12.16b  //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    246     rshrn     v18.8b, v18.8h, #2        //(2*q1 + (p1 + q0) + 2) >> 2, rows 0-3
    247     rshrn     v19.8b, v20.8h, #2        //(2*q1 + (p1 + q0) + 2) >> 2, rows 4-7
    248     mov       v18.d[1], v19.d[0]        //v18 = filtered q0, rows 0-7
    249     bit       v2.16b, v14.16b , v8.16b  //take the filtered p0 only where all three tests passed
    250     bit       v4.16b, v18.16b , v8.16b  //take the filtered q0 only where all three tests passed
    251 // Split each 16-byte vector back into its rows 0-3 / rows 4-7 halves.
    252     mov       v1.d[0], v0.d[1]          //v1 = p1 rows 4-7
    253     mov       v3.d[0], v2.d[1]          //v3 = p0 rows 4-7
    254     mov       v5.d[0], v4.d[1]          //v5 = q0 rows 4-7
    255     mov       v7.d[0], v6.d[1]          //v7 = q1 rows 4-7
    256 // Rotate back to the st4 layout: v0..v3 = p1,p0,q0,q1 for rows 0-3 and v4..v7 = the same for rows 4-7.
    257     mov       v10.16b, v1.16b           //v10 = p1 rows 4-7 (temporary)
    258     mov       v1.16b, v2.16b            //v1 = p0 (low half = rows 0-3)
    259     mov       v2.16b, v4.16b            //v2 = q0 (low half = rows 0-3)
    260     mov       v4.16b, v10.16b           //v4 = p1 rows 4-7
    261     mov       v10.16b, v3.16b           //v10 = p0 rows 4-7 (temporary)
    262     mov       v3.16b, v6.16b            //v3 = q1 (low half = rows 0-3)
    263     mov       v6.16b, v5.16b            //v6 = q0 rows 4-7
    264     mov       v5.16b, v10.16b           //v5 = p0 rows 4-7
    265 // Write rows 0-3 back through the saved start pointer; p1 and q1 are stored with the values that were loaded.
    266     st4       {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1
    267     st4       {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
    268     st4       {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
    269     st4       {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1
    270 // Write rows 4-7.
    271     st4       {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1
    272     st4       {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
    273     st4       {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
    274     st4       {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1
    275
    276     // LDMFD sp!,{x4,x5,x12,pc}         // commented-out 32-bit ARM style epilogue, kept for reference only
    277     ldp       x19, x20, [sp], #16       //restore x19/x20 saved on entry
    278     pop_v_regs                          //macro counterpart of push_v_regs (definition not shown here)
    279     ret
    280 
    281 
    282 
    283 ///**
    284 //*******************************************************************************
    285 //*
    286 //* @brief
    287 //*     Performs filtering of a chroma block horizontal edge for cases where the
    288 //*     boundary strength is less than 4 in high profile
    289 //*
    290 //* @par Description:
    291 //*       This operation is described in  Sec. 8.7.2.3 under the title
    292 //*       "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
    293 //*
    294 //* @param[in] x0 - pu1_src
    295 //*  Pointer to the src sample q0
    296 //*
    297 //* @param[in] w1 - src_strd
    298 //*  Source stride
    299 //*
    300 //* @param[in] w2 - alpha_cb
    301 //*  Alpha Value for the boundary in U
    302 //*
    303 //* @param[in] w3 - beta_cb
    304 //*  Beta Value for the boundary in U
    305 //*
    306 //* @param[in] w4 - alpha_cr
    307 //*    Alpha Value for the boundary in V
    308 //*
    309 //* @param[in] w5 - beta_cr
    310 //*    Beta Value for the boundary in V
    311 //*
    312 //* @param[in] w6 - u4_bs
    313 //*    Packed Boundary strength array
    314 //*
    315 //* @param[in] x7 - pu1_cliptab_cb
    316 //*    tc0_table for U
    317 //*
    318 //* @param[in] sp(0) - pu1_cliptab_cr
    319 //*    tc0_table for V
    320 //*
    321 //* @returns
    322 //*  None
    323 //*
    324 //* @remarks
    325 //*  None
    326 //*
    327 //*******************************************************************************
    328 //*/
    329 
    330     .global ih264_deblk_chroma_horz_bslt4_av8
    331 // Chroma horizontal-edge filter, bS < 4: reads rows p1, p0, q0, q1 (16 bytes each) and rewrites rows p0 and q0 with a clipped delta.
    332 ih264_deblk_chroma_horz_bslt4_av8:
    333 // Inputs as used below: x0 = q0 row, w1 = stride, w2/w3 = alpha/beta U, w4/w5 = alpha/beta V, w6 = packed bS, x7 = clip table U, [stack] = clip table V.
    334     // STMFD sp!,{x4-x9,x14}        // commented-out 32-bit ARM style prologue, kept for reference only
    335     push_v_regs                         //macro from ih264_neon_macros.s (not shown here); presumably saves callee-saved NEON registers - TODO confirm
    336     stp       x19, x20, [sp, #-16]!     //save x19/x20 (neither register is referenced again in this function)
    337     sxtw      x1, w1                    //sign-extend the 32-bit stride so it can be used in 64-bit addressing
    338     ldr       x8, [sp, #80]             //x8 = stack argument (pu1_cliptab_cr per the header); #80 = 16 bytes pushed above + presumably 64 from push_v_regs - TODO confirm
    339     sub       x0, x0, x1, lsl #1        //x0 = uc_edgePixelU pointing to p1 of chroma U
    340     rev       w6, w6                    //byte-reverse bS so its most significant byte lands in vector byte lane 0
    341     mov       v12.s[0], w6              //v12.s[0] = ui_Bs (the other lanes of v12 are left as they were)
    342     ld1       {v16.s}[0], [x7]          //first 4 bytes of cliptab_cb into v16.s[0]
    343     ld1       {v17.s}[0], [x8]          //first 4 bytes of cliptab_cr into v17.s[0]
    344     ld2       {v6.8b, v7.8b}, [x0], x1  //p1 row de-interleaved: v6 = even bytes (U), v7 = odd bytes (V)
    345     tbl       v14.8b, {v16.16b}, v12.8b //Retrieving cliptab values for U (only result bytes 0-3 are used later)
    346     tbl       v28.8b, {v17.16b}, v12.8b //Retrieving cliptab values for V (only result bytes 0-3 are used later)
    347     uxtl      v12.8h, v12.8b            //v12 = uc_Bs in each 16 bit scalar
    348     mov       x6, x0                    //Keeping a backup of the pointer to the p0 row (w6/bS is already consumed)
    349     ld2       {v4.8b, v5.8b}, [x0], x1  //p0 row: v4 = U, v5 = V
    350     movi      v30.8b, #1                //constant 1, added to the table value below
    351     dup       v20.8b, w2                //low half of v20 = alpha_cb in every byte
    352     dup       v21.8b, w4                //v21 = alpha_cr in every byte
    353     mov       v20.d[1], v21.d[0]        //v20 = {alpha_cb x8, alpha_cr x8}
    354     ld2       {v0.8b, v1.8b}, [x0], x1  //q0 row: v0 = U, v1 = V
    355     uxtl      v14.8h, v14.8b            //widen the U table values to 16 bit
    356     uxtl      v28.8h, v28.8b            //widen the V table values to 16 bit
    357     mov       v15.d[0], v28.d[0]        //low half of v14 has cliptab values for U, v15 for V
    358     mov       v14.d[1], v28.d[0]        //v14 = {U values, V values}, four halfwords each
    359     ld2       {v2.8b, v3.8b}, [x0]      //q1 row: v2 = U, v3 = V
    360     usubl     v10.8h, v1.8b, v5.8b      //v10 = (q0 - p0) for V
    361     usubl     v8.8h, v0.8b, v4.8b       //v8 = (q0 - p0) for U
    362     mov       v6.d[1], v7.d[0]          //v6 = {p1u, p1v}
    363     mov       v4.d[1], v5.d[0]          //v4 = {p0u, p0v}
    364     uabd      v26.16b, v6.16b , v4.16b  //v26 = ABS(p1 - p0)
    365     shl       v10.8h, v10.8h, #2        //v10 = (q0 - p0)<<2 for V
    366     mov       v0.d[1], v1.d[0]          //v0 = {q0u, q0v}
    367     uabd      v22.16b, v4.16b , v0.16b  //v22 = ABS(p0 - q0)
    368     shl       v8.8h, v8.8h, #2          //v8 = (q0 - p0)<<2 for U
    369     mov       v14.d[1], v15.d[0]        //same value v14.d[1] already holds from the move above
    370     sli       v14.8h, v14.8h, #8        //copy each halfword's low byte into its high byte: one table value now covers 2 adjacent bytes
    371     mov       v15.d[0], v14.d[1]        //v15 = V half of the replicated table values
    372     mov       v2.d[1], v3.d[0]          //v2 = {q1u, q1v}
    373     uabd      v24.16b, v2.16b , v0.16b  //v24 = ABS(q1 - q0)
    374     cmhs      v18.16b, v22.16b, v20.16b //v18 = ( ABS(p0 - q0) >= Alpha )
    375     usubl     v20.8h, v6.8b, v2.8b      //v20 = (p1 - q1) for U
    376     usubl     v6.8h, v7.8b, v3.8b       //v6 = (p1 - q1) for V
    377     dup       v16.8b, w3                //low half of v16 = beta_cb in every byte
    378     dup       v17.8b, w5                //v17 = beta_cr in every byte
    379     mov       v16.d[1], v17.d[0]        //v16 = {beta_cb x8, beta_cr x8}
    380     add       v8.8h, v8.8h , v20.8h     //v8 = [ (q0 - p0)<<2 ] + (p1 - q1) for U
    381     add       v10.8h, v10.8h , v6.8h    //v10 = [ (q0 - p0)<<2 ] + (p1 - q1) for V
    382     cmhs      v24.16b, v24.16b, v16.16b //v24 = ( ABS(q1 - q0) >= Beta )
    383     cmgt      v12.4h, v12.4h, #0        //v12 = per-halfword mask, all ones where bS > 0
    384     sqrshrn   v8.8b, v8.8h, #3          //round, shift right by 3 and saturate to signed 8 bit (U)
    385     sqrshrn   v9.8b, v10.8h, #3         //v8/v9 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
    386     mov       v8.d[1], v9.d[0]          //v8 = i_macro as {U, V}
    387     add       v14.8b, v14.8b , v30.8b   //low half of v14 = C = C0+1 for U
    388     cmhs      v26.16b, v26.16b, v16.16b //v26 = ( ABS(p1 - p0) >= Beta )
    389     orr       v18.16b, v18.16b , v24.16b //v18 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
    390     abs       v6.16b, v8.16b            //v6 = ABS (i_macro)
    391     add       v15.8b, v15.8b , v30.8b   //v15 = C = C0+1 for V
    392     mov       v14.d[1], v15.d[0]        //v14 = C as {U, V}
    393     mov       v13.8b, v12.8b            //copy the bS mask ...
    394     mov       v12.d[1], v13.d[0]        //... into the upper half so it covers both U and V
    395     orr       v18.16b, v18.16b , v26.16b //v18 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
    396     umin      v14.16b, v6.16b , v14.16b //v14 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
    397     bic       v12.16b, v12.16b , v18.16b //final condition: bS > 0 and none of the threshold tests tripped
    398     cmge      v8.16b, v8.16b, #0        //v8 = mask, all ones where i_macro >= 0
    399     and       v14.16b, v14.16b , v12.16b //Making delta zero in places where values should not be filtered
    400     uqadd     v16.16b, v4.16b , v14.16b //v16 = p0 + delta (unsigned saturating)
    401     uqsub     v4.16b, v4.16b , v14.16b  //v4 = p0 - delta (unsigned saturating)
    402     uqadd     v18.16b, v0.16b , v14.16b //v18 = q0 + delta (unsigned saturating)
    403     uqsub     v0.16b, v0.16b , v14.16b  //v0 = q0 - delta (unsigned saturating)
    404     bif       v16.16b, v4.16b , v8.16b  //v16 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
    405     bif       v0.16b, v18.16b , v8.16b  //v0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
    406     mov       v17.d[0], v16.d[1]        //v16 = p0 U half, v17 = p0 V half for the interleaving store
    407     mov       v1.d[0], v0.d[1]          //v0 = q0 U half, v1 = q0 V half
    408     st2       {v16.8b, v17.8b}, [x6], x1 //re-interleave U/V and write the p0 row, then step to the q0 row
    409     st2       {v0.8b, v1.8b}, [x6]      //write the q0 row
    410
    411     ldp       x19, x20, [sp], #16       //restore x19/x20 saved on entry
    412     pop_v_regs                          //macro counterpart of push_v_regs (definition not shown here)
    413     ret
    414 
    415 
    416 
    417 
    418 ///**
    419 //*******************************************************************************
    420 //*
    421 //* @brief
    422 //*     Performs filtering of a chroma block vertical edge for cases where the
    423 //*     boundary strength is less than 4 in high profile
    424 //*
    425 //* @par Description:
    426 //*       This operation is described in  Sec. 8.7.2.3 under the title
    427 //*       "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
    428 //*
    429 //* @param[in] x0 - pu1_src
    430 //*  Pointer to the src sample q0
    431 //*
    432 //* @param[in] w1 - src_strd
    433 //*  Source stride
    434 //*
    435 //* @param[in] w2 - alpha_cb
    436 //*  Alpha Value for the boundary in U
    437 //*
    438 //* @param[in] w3 - beta_cb
    439 //*  Beta Value for the boundary in U
    440 //*
    441 //* @param[in] w4 - alpha_cr
    442 //*    Alpha Value for the boundary in V
    443 //*
    444 //* @param[in] w5 - beta_cr
    445 //*    Beta Value for the boundary in V
    446 //*
    447 //* @param[in] w6 - u4_bs
    448 //*    Packed Boundary strength array
    449 //*
    450 //* @param[in] x7 - pu1_cliptab_cb
    451 //*    tc0_table for U
    452 //*
    453 //* @param[in] sp(0) - pu1_cliptab_cr
    454 //*    tc0_table for V
    455 //*
    456 //* @returns
    457 //*  None
    458 //*
    459 //* @remarks
    460 //*  None
    461 //*
    462 //*******************************************************************************
    463 //*/
    464 
    465     .global ih264_deblk_chroma_vert_bslt4_av8
    466 // Chroma vertical-edge filter, bS < 4: for 8 rows, loads the 8 bytes holding p1,p0,q0,q1 (2 bytes each) and stores them back with p0/q0 adjusted by a clipped delta.
    467 ih264_deblk_chroma_vert_bslt4_av8:
    468 // Inputs as used below: x0 = q0 of row 0, w1 = stride, w2/w3 = alpha/beta U, w4/w5 = alpha/beta V, w6 = packed bS, x7 = clip table U, [stack] = clip table V.
    469     // STMFD sp!,{x4-x7,x10-x12,x14}    // commented-out 32-bit ARM style prologue, kept for reference only
    470     push_v_regs                         //macro from ih264_neon_macros.s (not shown here); presumably saves callee-saved NEON registers - TODO confirm
    471     stp       x19, x20, [sp, #-16]!     //save x19/x20 (neither register is referenced again in this function)
    472     sxtw      x1, w1                    //sign-extend the 32-bit stride so it can be used in 64-bit addressing
    473     mov       x10, x7                   //x10 = pu1_cliptab_cb
    474     ldr       x11, [sp, #80]            //x11 = stack argument (pu1_cliptab_cr per the header); #80 = 16 bytes pushed above + presumably 64 from push_v_regs - TODO confirm
    475     sub       x0, x0, #4                //point x0 to p1u of row0.
    476     add       w2, w2, w4, lsl #8        //w2 = (alpha_cr,alpha_cb)
    477     add       w3, w3, w5, lsl #8        //w3 = (beta_cr,beta_cb)
    478     mov       x12, x0                   //keep a back up of x0 for buffer write
    479     ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1 //rows 0-3: lane n of v0,v1,v2,v3 = p1,p0,q0,q1 of row n
    480     ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
    481     ld4       {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
    482     ld4       {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1
    483 // Rows 4-7: same layout into v4 (p1), v5 (p0), v6 (q0), v7 (q1).
    484     ld4       {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1
    485     ld4       {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
    486     ld4       {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
    487     ld4       {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1
    488 // Rotate registers so each even/odd pair holds one pixel position: (v0,v1)=p1, (v2,v3)=p0, (v4,v5)=q0, (v6,v7)=q1; odd register = rows 4-7.
    489     mov       v10.16b, v2.16b           //v10 = q0 rows 0-3 (temporary)
    490     mov       v2.16b, v1.16b            //v2 = p0 rows 0-3
    491     mov       v1.16b, v4.16b            //v1 = p1 rows 4-7
    492     mov       v4.16b, v10.16b           //v4 = q0 rows 0-3
    493     mov       v10.16b, v6.16b           //v10 = q0 rows 4-7 (temporary)
    494     mov       v6.16b, v3.16b            //v6 = q1 rows 0-3
    495     mov       v3.16b, v5.16b            //v3 = p0 rows 4-7
    496     mov       v5.16b, v10.16b           //v5 = q0 rows 4-7
    497     dup       v22.8h, w2                //v22 = alpha: every halfword = (alpha_cb low byte, alpha_cr high byte)
    498     mov       v2.d[1], v3.d[0]          //v2 = p0, rows 0-7
    499     mov       v4.d[1], v5.d[0]          //v4 = q0, rows 0-7
    500     uabd      v8.16b, v2.16b , v4.16b   //|p0-q0|
    501     dup       v24.8h, w3                //v24 = beta, same byte pairing as alpha
    502     mov       v25.d[0], v24.d[1]        //NOTE(review): looks redundant - v25.s[0] is reloaded below and the tbl indices are expected to be < 4; confirm before removing
    503     mov       v6.d[1], v7.d[0]          //v6 = q1, rows 0-7
    504     mov       v0.d[1], v1.d[0]          //v0 = p1, rows 0-7
    505     uabd      v10.16b, v6.16b , v4.16b  //|q1-q0|
    506     uabd      v12.16b, v0.16b , v2.16b  //|p1-p0|
    507     cmhi      v8.16b, v22.16b , v8.16b  //|p0-q0| < alpha ?
    508     usubl     v14.8h, v0.8b, v6.8b      //(p1 - q1), rows 0-3
    509     cmhi      v10.16b, v24.16b , v10.16b //|q1-q0| < beta ?
    510     usubl     v16.8h, v1.8b, v7.8b      //(p1 - q1), rows 4-7
    511     cmhi      v12.16b, v24.16b , v12.16b //|p1-p0| < beta ?
    512     usubl     v18.8h, v4.8b, v2.8b      //(q0 - p0), rows 0-3
    513     and       v8.16b, v8.16b , v10.16b  //|p0-q0| < alpha && |q1-q0| < beta
    514     usubl     v20.8h, v5.8b, v3.8b      //(q0 - p0), rows 4-7
    515     movi      v28.8h, #4                //constant 4 for the multiply-accumulates
    516     ld1       {v24.s}[0], [x10]         //Load ClipTable for U (4 bytes into v24.s[0]; beta in v24 is no longer needed)
    517     ld1       {v25.s}[0], [x11]         //Load ClipTable for V (4 bytes into v25.s[0])
    518     rev       w6, w6                    //Blocking strengths, byte-reversed so the most significant byte lands in lane 0
    519     and       v8.16b, v8.16b , v12.16b  //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    520     mov       v10.s[0], w6              //v10.s[0] = the four bS bytes
    521     mla       v14.8h, v18.8h , v28.8h   //4*(q0 - p0) + (p1 - q1), rows 0-3
    522     mla       v16.8h, v20.8h , v28.8h   //4*(q0 - p0) + (p1 - q1), rows 4-7
    523     uxtl      v10.8h, v10.8b            //widen each bS byte to 16 bit
    524     sli       v10.4h, v10.4h, #8        //duplicate each bS into both bytes of its halfword (upper 64 bits of v10 become zero)
    525     tbl       v12.8b, {v24.16b}, v10.8b //tC0 for U
    526     tbl       v13.8b, {v25.16b}, v10.8b //tC0 for V
    527     zip1      v31.8b, v12.8b, v13.8b    //interleave U/V tC0, first 8 bytes
    528     zip2      v13.8b, v12.8b, v13.8b    //interleave U/V tC0, last 8 bytes
    529     mov       v12.8b, v31.8b            //v12 low half = first 8 interleaved bytes
    530     mov       v12.d[1], v13.d[0]        //v12 = 16 bytes of tC0 interleaved as U,V pairs
    531     uxtl      v10.4s, v10.4h            //widen the duplicated bS halfwords to 32 bit
    532     sli       v10.4s, v10.4s, #16       //each bS now fills all 4 bytes of its word
    533     movi      v24.16b, #1               //constant 1 in every byte
    534     add       v12.16b, v12.16b , v24.16b //tC0 + 1
    535     cmhs      v10.16b, v10.16b , v24.16b //v10 = mask, all ones where bS >= 1
    536     and       v8.16b, v8.16b , v10.16b  //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
    537     // v0, v2, v4, v6 (inputs p1, p0, q0, q1),
    538     // v8 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    539     // v12 (tC)
    540     srshr     v14.8h, v14.8h, #3        //rounding shift right by 3, rows 0-3
    541     srshr     v16.8h, v16.8h, #3        //(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
    542     cmgt      v18.8h, v14.8h , #0       //mask where delta > 0, rows 0-3
    543     cmgt      v20.8h, v16.8h , #0       //mask where delta > 0, rows 4-7
    544     xtn       v18.8b, v18.8h            //narrow the masks to bytes
    545     xtn       v19.8b, v20.8h            //v18/v19 = sign(delta)
    546     mov       v18.d[1], v19.d[0]        //v18 = (delta > 0) mask for all 16 bytes
    547     abs       v14.8h, v14.8h            //|delta|, rows 0-3
    548     abs       v16.8h, v16.8h            //|delta|, rows 4-7
    549     xtn       v14.8b, v14.8h            //narrow |delta| to bytes (plain truncation, no saturation)
    550     xtn       v15.8b, v16.8h
    551     mov       v14.d[1], v15.d[0]        //v14 = |delta| for all 16 bytes
    552     umin      v14.16b, v14.16b , v12.16b //v14 = min(|delta|, tC0 + 1)
    553     uqadd     v20.16b, v2.16b , v14.16b //p0+|delta|
    554     uqadd     v22.16b, v4.16b , v14.16b //q0+|delta|
    555     uqsub     v24.16b, v2.16b , v14.16b //p0-|delta|
    556     uqsub     v26.16b, v4.16b , v14.16b //q0-|delta|
    557     bit       v24.16b, v20.16b , v18.16b //p0 + delta
    558     bit       v22.16b, v26.16b , v18.16b //q0 - delta
    559     bit       v2.16b, v24.16b , v8.16b  //take the adjusted p0 only where the final condition holds
    560     bit       v4.16b, v22.16b , v8.16b  //take the adjusted q0 only where the final condition holds
    561     mov       v1.d[0], v0.d[1]          //v1 = p1 rows 4-7
    562     mov       v3.d[0], v2.d[1]          //v3 = p0 rows 4-7
    563     mov       v5.d[0], v4.d[1]          //v5 = q0 rows 4-7
    564     mov       v7.d[0], v6.d[1]          //v7 = q1 rows 4-7
    565     mov       v10.16b, v1.16b           //rotate back to the st4 layout: v10 = p1 rows 4-7 (temporary)
    566     mov       v1.16b, v2.16b            //v1 = p0 (low half = rows 0-3)
    567     mov       v2.16b, v4.16b            //v2 = q0 (low half = rows 0-3)
    568     mov       v4.16b, v10.16b           //v4 = p1 rows 4-7
    569     mov       v10.16b, v3.16b           //v10 = p0 rows 4-7 (temporary)
    570     mov       v3.16b, v6.16b            //v3 = q1 (low half = rows 0-3)
    571     mov       v6.16b, v5.16b            //v6 = q0 rows 4-7
    572     mov       v5.16b, v10.16b           //v5 = p0 rows 4-7
    573     st4       {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1 //rows 0-3 written back through the saved start pointer
    574     st4       {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
    575     st4       {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
    576     st4       {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1
    577 // Write rows 4-7.
    578     st4       {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1
    579     st4       {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
    580     st4       {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
    581     st4       {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1
    582
    583     ldp       x19, x20, [sp], #16       //restore x19/x20 saved on entry
    584     pop_v_regs                          //macro counterpart of push_v_regs (definition not shown here)
    585     ret
    586 
    587 
    588