Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///*******************************************************************************
     19 //* @file
     20 //*  ihevc_deblk_luma_vert.s
     21 //*
     22 //* @brief
     23 //*  contains the function definition for luma deblocking
     24 //*  (ihevc_deblk_luma_horz_av8). the function is coded in armv8 (aarch64)
     25 //*  neon assembly and uses gnu assembler syntax
     26 //*
     27 //*
     28 //* @author
     29 //*  anand s
     30 //*
     31 //* @par list of functions:
     32 //*  - ihevc_deblk_luma_horz_av8()
     33 //*
     34 //* @remarks
     35 //*  none
     36 //*
     37 //*******************************************************************************/
     38 
     39 .text
     40 .align 4
     41 
     42 
     43 .extern gai4_ihevc_tc_table
     44 .extern gai4_ihevc_beta_table
     45 .globl ihevc_deblk_luma_horz_av8
     46 
     47 .type ihevc_deblk_luma_horz_av8, %function
     48 
        //------------------------------------------------------------------------------
        // ihevc_deblk_luma_horz_av8
        //
        // Luma deblocking of one edge segment that is 4 pixels wide. Rows are
        // addressed as row[k] = x0 + k * x1; rows -4..-1 are the p side and rows
        // 0..3 the q side. Four bytes are read / written per row.
        //
        // In:   x0      source address (row 0)
        //       w1      source stride
        //       w2      bs
        //       w3, w4  quant_param_p, quant_param_q
        //       w5, w6  beta_offset_div2, tc_offset_div2
        //       w7      filter flag for the p side
        //       [sp]    filter flag for the q side (32-bit value on the stack)
        // Out:  nothing in registers; rows -3..-1 and/or 0..2 are rewritten in place.
        // Uses: gai4_ihevc_beta_table, gai4_ihevc_tc_table (32-bit entries).
        // Saved and restored: d8-d15, x19-x22.
        // Clobbers: x2-x12, x14, v0-v7, v16-v31, flags.
        //
        // Row registers: v23..v28 hold row[-3]..row[2]; v22 (row[-4]) and v29
        // (row[3]) are only loaded while the strong-filter conditions are tested.
        //
        // NOTE(review): w1-w4 are sign-extended on entry on the assumption that the
        // C prototype declares them as 32-bit ints, as the existing handling of
        // w5/w6 and the 32-bit stack load suggest - confirm against the header.
        //------------------------------------------------------------------------------
     49 ihevc_deblk_luma_horz_av8:
     50     // stmfd sp!, {x3-x12,x14}
     51     sxtw        x5,w5                       // beta_offset_div2 as a 64-bit signed value
     52     sxtw        x6,w6                       // tc_offset_div2 as a 64-bit signed value
            sxtw        x1,w1                       // AAPCS64 leaves bits 63:32 of a 32-bit argument unspecified, so
            sxtw        x2,w2                       // every int argument that is used below as a 64-bit operand is
            sxtw        x3,w3                       // extended first: the stride feeds address arithmetic, bs and the
            sxtw        x4,w4                       // quant params feed the table-index arithmetic
     53     stp         d8,d9,[sp,#-16]!            // d9 is pushed as a pair with d8: a lone 8-byte push ({ sub sp,sp,#8; str d9,[sp] })
     54                                             // leaves sp off its 16-byte alignment and gave a bus error. d8 is only a filler; it is not used in the function.
     55     stp         d10,d11,[sp,#-16]!
     56     stp         d12,d13,[sp,#-16]!
     57     stp         d14,d15,[sp,#-16]!
     58     stp         x19, x20,[sp,#-16]!
     59     stp         x21, x22,[sp,#-16]!
     60     // six pairs pushed = 96 bytes, so the first stack argument now sits at [sp,#96]
     61     mov         x21,x7                      // keep filter flag p; x7 is reused for the beta index
     62     ldr         w22,[sp,#96]                // filter flag q
     63 
     64     add         x3,x3,x4
     65     add         x3,x3,#1
     66     asr         x3,x3,#1                    // x3 = qp_luma
     67     add         x7,x3,x5,lsl #1             // x7 = qp_luma + (beta_offset_div2 << 1)
     68     add         x3,x3,x6,lsl #1             // x3 = qp_luma + (tc_offset_div2 << 1)
     69     cmp         x7,#0x33
     70     mov         x20,#0x33
     71     csel        x7, x20, x7,gt              // clamp to 51 from above (mov leaves the flags intact)
     72     bgt         l1.1532
     73     cmp         x7,#0x0
     74     mov         x20,#0x0
     75     csel        x7, x20, x7,lt              // x7 has the beta_index value
     76 l1.1532:
     77     //     bic      x2,x2,#1
     78     asr         x2,x2,#1                    // x2 = bs >> 1
     79 
     80     add         x3,x3,x2,lsl #1             // x3 += 2 * (bs >> 1)
     81     cmp         x3,#0x35
     82     mov         x20,#0x35
     83     csel        x3, x20, x3,gt              // clamp to 53 from above
     84     bgt         l1.1564
     85     cmp         x3,#0x0
     86     mov         x20,#0x0
     87     csel        x3, x20, x3,lt              // x3 has the tc_index value
     88 
     89     //    qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
     90     //    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
     91     //    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
     92 
     93 l1.1564:
     94     adrp        x2, :got:gai4_ihevc_beta_table
     95     ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
     96 
     97     adrp        x4, :got:gai4_ihevc_tc_table
     98     ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
     99 
    100     ldr         w5, [x2,x7,lsl #2]          // beta
    101     ldr         w6, [x4,x3,lsl #2]          // tc
    102 
    103 
    104 
    105     cmp         x6,#0
    106     beq         l1.2404                     // tc == 0 : nothing is written, restore and return
    107     movi        v0.4h, #0x2                 // constant 2, used as the by-element multiplier below
    108     lsl         x7,x6,#1                    // x7 = 2 * tc
    109     add         x14,x1,x1,lsl #1            // x14 = 3 * stride
    110     neg         x19,x14
    111     ldr         w8, [x0,x19]                // -3 value
    112     dup         v1.8b,w7                    // v1 = 2 * tc in every byte
    113     lsl         x19,x1,#1
    114     neg         x19,x19
    115     ldr         w10, [x0,x19]               //-2 value
    116     dup         v23.2s,w8                   // -3 value
    117     neg         x19,x1
    118     ldr         w11, [x0,x19]               //-1 value
    119     dup         v24.2s,w10                  // -2 value
    120     and         x8,x8,#0xff                 // from here on the scalar copies keep only the column 0 pixel
    121     ldr         w12, [x0,#0]                // 0 value
    122     dup         v25.2s,w11                  // -1 value
    123     and         x10,x10,#0xff
    124     ldr         w9, [x0,x1]                 // 1 value
    125     dup         v26.2s,w12                  // 0 value
    126     and         x11,x11,#0xff
    127     lsl         x19,x1,#1
    128     ldr         w2, [x0,x19]                // 2 value
    129     dup         v27.2s,w9                   // 1value
    130     and         x12,x12,#0xff
    131     dup         v28.2s,w2                   // 2 value
    132     and         x9,x9,#0xff
    133     and         x2,x2,#0xff
    134 
    135     add         x12,x12,x2
    136     subs        x9,x12,x9,lsl #1            // dq0 value is stored in x9
    137     csneg       x9,x9,x9,pl                 // absolute value: negate when the result was negative
    138     //dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
    139 
    140     add         x8,x8,x11
    141     subs        x8,x8,x10,lsl #1
    142     csneg       x8,x8,x8,pl                 // dp0 value is stored in x8
    143     //  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
    144 
    145 
    146 
    147     add         x3,x1,x1,lsl #1             // x3 = 3 * stride
    148     add         x14,x0,#3                   // x14 = address of column 3 in row 0
    149 
    150 
    151     neg         x19,x3
    152     ldrb        w2,[x14,x19]                // -3 value
    153     lsl         x19,x1,#1
    154     neg         x19,x19
    155     ldrb        w10,[x14,x19]               // -2 value
    156     neg         x19,x1
    157     ldrb        w11,[x14,x19]               // -1 value
    158     ldrb        w12,[x14,#0]                // 0 value
    159     ldrb        w3,[x14,x1]                 // 1 value
    160     lsl         x19,x1,#1
    161     ldrb        w4,[x14,x19]                // 2 value
    162 
    163 
    164     add         x12,x12,x4
    165     subs        x12,x12,x3,lsl #1           // dq3value is stored in x12
    166     csneg       x12,x12,x12,pl
    167     //    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
    168 
    169 
    170     add         x2,x2,x11
    171     subs        x11,x2,x10,lsl #1
    172     csneg       x11,x11,x11,pl              // dp3 value is stored in x11
    173     //    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )@
    174 
    175 
    176 
    177     add         x3,x8,x9                    // x3 has the d0 value
    178     add         x4,x11,x12                  // x4 has the d3 value
    179 
    180 
    181     //    d0 = dp0 + dq0@
    182     //    d3 = dp3 + dq3@
    183 
    184     add         x14,x8,x11                  // x14 has the value dp
    185     add         x12,x12,x9                  // x12 has the value  dq
    186     //    dp = dp0 + dp3@
    187     //   dq = dq0 + dq3@
    188 
    189     add         x11, x3, x4                 // x11 has the value d
    190 
    191     //   d = d0 + d3@
    192 
    193 
    194     cmp         x11,x5
    195     bge         l1.2404                     // d >= beta : no filtering, restore and return
    196 
    197     //    if(d < beta)
    198 
    199 
    200     // registers which cannot be altered : x3,x4 x5,x6,x12,x14,x0,x1,x11
    201 
    202     // registers for use: x2,x7,x8,x9,x10,
    203     // The strong-filter results are computed speculatively, interleaved with the scalar tests that decide whether they are used.
    204     asr         x10,x5,#2                   // beta >> 2
    205     uqadd       v30.8b,  v26.8b ,  v1.8b    // row[0] + 2*tc (saturating)
    206     cmp         x10,x3,lsl #1
    207     uqsub       v31.8b,  v26.8b ,  v1.8b    // row[0] - 2*tc (saturating)
    208     ble         l1.1840                     // (beta >> 2) <= 2*d0 : not the strong filter
    209     add         x10,x1,x1,lsl #1            // x10 = 3 * stride
    210     uaddl       v6.8h,  v25.8b ,  v26.8b    // row[-1] + row[0]
    211     neg         x19,x1
    212     ldr         w2, [x0,x19,lsl #2]         // has the -4 value
    213     neg         x19, x1
    214     ldrb        w7,[x0,x19]                 // has the -1 value
    215     dup         v22.2s,w2                   // -4 value
    216     uaddw       v7.8h,  v6.8h ,  v27.8b     // row[-1] + row[0] + row[1]
    217     ldrb        w3,[x0,#0]                  // x3 has the 0 value
    218     uqadd       v16.8b,  v27.8b ,  v1.8b    // row[1] + 2*tc
    219     and         x2,x2,#0xff
    220     mul         v12.8h, v7.8h, v0.h[0]      // 2 * (row[-1] + row[0] + row[1])
    221     ldr         w8, [x0,x10]                // has the 3 value
    222     uaddl       v10.8h,  v24.8b ,  v28.8b   // row[-2] + row[2]
    223     subs        x2,x2,x7
    224     uqsub       v17.8b,  v27.8b ,  v1.8b    // row[1] - 2*tc
    225     dup         v29.2s,w8                   // 3 value
    226     and         x8,x8,#0xff
    227     add         v12.8h,  v12.8h ,  v10.8h
    228     csneg       x2,x2,x2,pl                 // x2 = abs(row[-4] - row[-1]) for column 0
    229     rshrn       v20.8b, v12.8h,#3           // rounded >> 3 : unclipped new row[0]
    230     subs        x8,x8,x3
    231     csneg       x8,x8,x8,pl                 // x8 = abs(row[3] - row[0]) for column 0
    232     umin        v18.8b,  v20.8b ,  v30.8b
    233     add         x8,x8,x2
    234 
    235     cmp         x8,x5,asr #3
    236     bge         l1.1840                     // sum of the two differences >= (beta >> 3) : not the strong filter
    237     uaddw       v14.8h,  v7.8h ,  v28.8b    // row[-1] + row[0] + row[1] + row[2]
    238     subs        x7,x3,x7
    239     umax        v4.8b,  v18.8b ,  v31.8b    // v4 = new row[0], clipped to row[0] +/- 2*tc
    240     csneg       x7,x7,x7,pl                 // x7 = abs(row[0] - row[-1]) for column 0
    241     uqadd       v30.8b,  v28.8b ,  v1.8b    // row[2] + 2*tc
    242     mov         x10,#5
    243     rshrn       v21.8b, v14.8h,#2           // rounded >> 2 : unclipped new row[1]
    244     mul         x10, x10, x6
    245     uqsub       v31.8b,  v28.8b ,  v1.8b    // row[2] - 2*tc
    246     add         x10, x10,#1                 // x10 = 5*tc + 1
    247     cmp         x7,x10,asr #1
    248     umin        v18.8b,  v21.8b ,  v16.8b
    249     bge         l1.1840                     // abs(row[0] - row[-1]) >= ((5*tc + 1) >> 1) : not the strong filter
    250 
    251 
    252     //        if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
    253     //            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
    254     // the same three tests are now repeated with d3 and the column 3 pixels
    255     umax        v5.8b,  v18.8b ,  v17.8b    // v5 = new row[1], clipped to row[1] +/- 2*tc
    256     asr         x10,x5,#2
    257     uaddl       v16.8h,  v29.8b ,  v28.8b   // row[3] + row[2]
    258     cmp         x10,x4,lsl #1
    259     ble         l1.1840                     // (beta >> 2) <= 2*d3 : not the strong filter
    260 
    261     add         x10,x1,x1,lsl #1            // x10 = 3 * stride
    262     mul         v16.8h, v16.8h, v0.h[0]     // 2 * (row[3] + row[2])
    263     add         x4,x0,#3                    // x4 = address of column 3 in row 0 (d3 is no longer needed)
    264 
    265 
    266     lsl         x19,x1,#2
    267     neg         x19,x19
    268     ldrb        w2,[x4,x19]                 // row -4, column 3
    269     add         v16.8h,  v16.8h ,  v14.8h
    270     neg         x19,x1
    271     ldrb        w7,[x4,x19]                 // row -1, column 3
    272     rshrn       v19.8b, v16.8h,#3           // rounded >> 3 : unclipped new row[2]
    273     ldrb        w3,[x4,#0]                  // row 0, column 3
    274     ldrb        w8,[x4,x10]                 // row 3, column 3
    275     //   ubfx   x7,x2,#24,#8           @ has the -1 value
    276     //  and    x2,#0xff               @ has the -4 value
    277     //  ubfx   x8,x3,#24,#8           @ has the 3 value
    278     //  and    x3,#0xff               @ x4 has the 0 value
    279 
    280 
    281 
    282     subs        x8,x8,x3
    283     umin        v18.8b,  v19.8b ,  v30.8b
    284     csneg       x8,x8,x8,pl
    285     uaddl       v6.8h,  v25.8b ,  v24.8b    // row[-1] + row[-2]
    286     subs        x2,x2,x7
    287     umax        v3.8b,  v18.8b ,  v31.8b    // v3 = new row[2], clipped to row[2] +/- 2*tc
    288     csneg       x2,x2,x2,pl
    289     uaddw       v7.8h,  v6.8h ,  v26.8b     // row[-1] + row[-2] + row[0]
    290     add         x8,x8,x2
    291     uqadd       v30.8b,  v25.8b ,  v1.8b    // row[-1] + 2*tc
    292     cmp         x8,x5,asr #3
    293     uqsub       v31.8b,  v25.8b ,  v1.8b    // row[-1] - 2*tc
    294     bge         l1.1840
    295     mul         v12.8h, v7.8h, v0.h[0]      // 2 * (row[-1] + row[-2] + row[0])
    296     subs        x7,x3,x7
    297     uqadd       v16.8b,  v24.8b ,  v1.8b    // row[-2] + 2*tc
    298     csneg       x7,x7,x7,pl
    299     uaddl       v10.8h,  v23.8b ,  v27.8b   // row[-3] + row[1]
    300     mov         x10,#5
    301     uqsub       v17.8b,  v24.8b ,  v1.8b    // row[-2] - 2*tc
    302     mul         x10, x10, x6
    303     add         v12.8h,  v12.8h ,  v10.8h
    304     add         x10, x10,#1
    305     rshrn       v20.8b, v12.8h,#3           // rounded >> 3 : unclipped new row[-1]
    306     cmp         x7,x10,asr #1
    307     uaddw       v14.8h,  v7.8h ,  v23.8b    // row[-1] + row[-2] + row[0] + row[-3]
    308     bge         l1.1840
    309     umin        v18.8b,  v20.8b ,  v30.8b
    310     mov         x2,#2                       // de = 2 : strong filter
    311     uqadd       v30.8b,  v23.8b ,  v1.8b    // row[-3] + 2*tc
    312     mov         w4,w21                      // x4 = filter flag p
    313     umax        v2.8b,  v18.8b ,  v31.8b    // v2 = new row[-1], clipped to row[-1] +/- 2*tc
    314     mov         w5,w22                      // x5 = filter flag q (beta is not needed on this path)
    315     rshrn       v21.8b, v14.8h,#2           // rounded >> 2 : unclipped new row[-2]
    316     b           end_dep_deq_decision_horz
    317     // x2 has the value of de
    318     // x6 has the value of tc
    319     // x5 has the value of beta
    320     // x14 has the value of dp
    321     // x12 has the value of dq
    322     // x0 has the value of source address
    323     // x1 has the src stride
    324 
    325 l1.1840:
    326     mov         x2,#1                       // de = 1 : weak filter
    327 
    328     mov         x11,x5                      // x11 = beta, x5 is reused for the flag
    329     mov         w4,w21                      // x4 = filter flag p
    330     mov         w5,w22                      // x5 = filter flag q
    331 
    332     cmp         x6,#1
    333     mov         x20,#0
    334     csel        x9, x20, x9,eq              // tc == 1 : dep = 0
    335     mov         x20,#0
    336     csel        x10, x20, x10,eq            // tc == 1 : deq = 0
    337     beq         end_dep_deq_decision_horz
    338 
    339     and         x7,x4,x5
    340     cmp         x7,#1
    341     beq         both_flags_set_horz
    342     cmp         x4,#0
    343     beq         set_flag_dep_zero_horz
    344 
    345     // flag p is non-zero here and (flag p & flag q) != 1 : only dep is evaluated
    346     add         x8,x11,x11,asr #1
    347     mov         x10,#0                      // deq = 0
    348     asr         x8,x8,#3                    // x8 = (beta + (beta >> 1)) >> 3
    349     cmp         x8,x14
    350     mov         x20,#1
    351     csel        x9, x20, x9,gt
    352     mov         x20,#0
    353     csel        x9, x20, x9,le              // dep = (x8 > dp) ? 1 : 0
    354     b           end_dep_deq_decision_horz
    355 set_flag_dep_zero_horz:
    356     // flag p is zero : only deq is evaluated
    357     add         x8,x11,x11,asr #1
    358     mov         x9,#0                       // dep = 0
    359     asr         x8,x8,#3
    360     cmp         x8,x12
    361     mov         x20,#1
    362     csel        x10, x20, x10,gt
    363     mov         x20,#0
    364     csel        x10, x20, x10,le            // deq = (x8 > dq) ? 1 : 0
    365     b           end_dep_deq_decision_horz
    366 
    367 both_flags_set_horz:
    368     add         x8,x11,x11,asr #1
    369     asr         x8,x8,#3
    370     cmp         x8,x14
    371     mov         x20,#1
    372     csel        x9, x20, x9,gt
    373     mov         x20,#0
    374     csel        x9, x20, x9,le              // dep = (x8 > dp) ? 1 : 0
    375     cmp         x8,x12
    376     mov         x20,#1
    377     csel        x10, x20, x10,gt
    378     mov         x20,#0
    379     csel        x10, x20, x10,le            // deq = (x8 > dq) ? 1 : 0
    380 end_dep_deq_decision_horz:
    381 
    382     //x0=source address
    383     //x1=stride
    384     // x2 =de
    385     // x4=flag p
    386     //x5= flag q
    387     //x6 =tc
    388     // x9 =dep
    389     // x10=deq
    390 
    391 
    392 
    393     //    add        x14,x1,x1,lsl #1
    394     //    lsl        x7,x6,#1
    395     //    vdup.8    d1,x7
    396     //    vmov.i16  d0,#0x2
    397     umin        v18.8b,  v21.8b ,  v16.8b   // strong path: new row[-2] limited from above (v18 is reloaded on the weak path)
    398     cmp         x2,#1
    399     uqsub       v31.8b,  v23.8b ,  v1.8b    // row[-3] - 2*tc
    400     beq         l1.2408                     // de == 1 : weak filter
    401     uaddl       v7.8h,  v23.8b ,  v22.8b    // row[-3] + row[-4]
    402     cmp         x5,#1
    403 
    404     bne         strong_filtering_p          // flag q != 1 : leave the q side untouched
    405 
    406 strong_filtering_q:
    407     mov         x12,x0
    408     st1         {v4.s}[0],[x12],x1          // row[0]
    409     st1         {v5.s}[0],[x12],x1          // row[1]
    410     st1         {v3.s}[0],[x12]             // row[2]
    411     cmp         x4,#1
    412     bne         l1.2404                     // flag p != 1 : done
    413 strong_filtering_p:
    414     umax        v5.8b,  v18.8b ,  v17.8b    // v5 = new row[-2], clipped to row[-2] +/- 2*tc
    415     mov         x12,x0
    416     mul         v7.8h, v7.8h, v0.h[0]       // 2 * (row[-3] + row[-4])
    417     sub         x20,x1,#0
    418     neg         x11, x20                    // x11 = -stride
    419     add         v16.8h,  v7.8h ,  v14.8h
    420     add         x12,x12,x11                 // x12 = address of row[-1]
    421     rshrn       v19.8b, v16.8h,#3           // rounded >> 3 : unclipped new row[-3]
    422     st1         {v2.s}[0],[x12],x11         // row[-1]
    423     umin        v18.8b,  v19.8b ,  v30.8b
    424     st1         {v5.s}[0],[x12],x11         // row[-2]
    425     umax        v3.8b,  v18.8b ,  v31.8b    // v3 = new row[-3], clipped to row[-3] +/- 2*tc
    426     st1         {v3.s}[0],[x12]             // row[-3]
    427 
    428 l1.2404:
    429     // ldmfd sp!, {x3-x12,pc}
    430     ldp         x21, x22,[sp],#16
    431     ldp         x19, x20,[sp],#16
    432     ldp         d14,d15,[sp],#16
    433     ldp         d12,d13,[sp],#16
    434     ldp         d10,d11,[sp],#16
    435     ldp         d8,d9,[sp],#16              // d9 is popped as a pair with d8 for the same reason it was pushed as a pair:
    436                                             // { ldr d9,[sp]; add sp,sp,#8 } gave a bus error. d8 is only a filler; it is not used in the function.
    437     ret
    438 
    439     // x4=flag p
    440     //x5= flag q
    441     //x6 =tc
    442     // x9 =dep
    443     // x10=deq
    444 
    445 
    446     //        d22             -4 value
    447 
    448     //d23        @ -3 value
    449 
    450     //    vdup.32    d24,x11            @ -2 value
    451 
    452     //    vdup.32    d25, x11        @-1 value
    453 
    454     //    vdup.32    d26,x11            @ 0 value
    455 
    456     //    vdup.32    d27,x11            @ 1value
    457 
    458     //    vdup.32    d28,x11            @ 2 value
    459 
    460     //    vdup.32    d29,x11            @ 3 value
    461 
    462 l1.2408:
    463     // weak filter
    464     movi        v0.4h, #0x9
    465 
    466     usubl       v10.8h,  v26.8b ,  v25.8b   // row[0] - row[-1]
    467 
    468     mul         v10.8h, v10.8h, v0.h[0]     // 9 * (row[0] - row[-1])
    469 
    470     movi        v0.4h, #0x3
    471 
    472     usubl       v12.8h,  v27.8b ,  v24.8b   // row[1] - row[-2]
    473     mul         v12.8h, v12.8h, v0.h[0]     // 3 * (row[1] - row[-2])
    474 
    475 
    476     dup         v30.8b,w6                   // duplicating the +tc value
    477 
    478     sub         x20,x6,#0
    479     neg         x12, x20                    // x12 = -tc
    480     dup         v31.8b,w12                  // duplicating the -tc value
    481 
    482 
    483 
    484     sub         v10.8h,  v10.8h ,  v12.8h
    485 
    486 
    487 
    488     srshr       v10.8h, v10.8h,#4           // rounding shift supplies the "+ 8"
    489     //   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
    490 
    491     abs         v7.8h, v10.8h
    492     xtn         v9.8b,  v7.8h
    493     // storing the absolute values of delta in d9
    494 
    495     sqxtn       v10.8b,  v10.8h
    496     // storing the delta values, saturated to signed bytes, in d10
    497 
    498 
    499     smin        v11.8b,  v10.8b ,  v30.8b
    500     smax        v7.8b,  v31.8b ,  v11.8b    // d7 has the value  delta = clip3(delta, -tc, tc)//
    501 
    502 
    503     uxtl        v6.8h, v25.8b
    504 
    505     saddw       v4.8h,  v6.8h ,  v7.8b      // row[-1] + delta
    506 
    507     sqxtun      v12.8b, v4.8h               // d12 = temp_p0, saturated to unsigned bytes
    508     uxtl        v6.8h, v26.8b
    509     ssubw       v4.8h,  v6.8h ,  v7.8b      // row[0] - delta
    510     sqxtun      v13.8b, v4.8h               // d13 = temp_q0, saturated to unsigned bytes
    511 
    512 
    513     mov         x11,#0xa
    514     mul         x12, x11, x6
    515     dup         v2.8b,w12                   // d2 has the 10*tc value
    516     mov         v18.8b, v24.8b              // default for row[-2] : unchanged
    517     dup         v0.8b,w6
    518     sshr        v0.8b,v0.8b,#1              // v0 = tc >> 1
    519     neg         v1.8b, v0.8b                // v1 = -(tc >> 1)
    520 
    521     cmp         x4,#1
    522     bne         l1.2724                     // flag p != 1 : leave the p side untouched
    523     cmp         x9,#1
    524     bne         l1.2700                     // dep != 1 : row[-2] stays as it is
    525 
    526     // d12 and d13 have the value temp_p0 and temp_q0
    527     uaddl       v14.8h,  v23.8b ,  v25.8b   // row[-3] + row[-1]
    528     rshrn       v14.8b, v14.8h,#1           // rounded >> 1
    529     usubl       v14.8h,  v14.8b ,  v24.8b   // ... - row[-2]
    530     saddw       v14.8h,  v14.8h ,  v7.8b    // ... + delta
    531     sqshrn      v14.8b, v14.8h,#1           // ... >> 1, saturated to signed bytes
    532     smin        v15.8b,  v14.8b ,  v0.8b
    533     smax        v14.8b,  v1.8b ,  v15.8b    // clipped to +/- (tc >> 1)
    534 
    535     // d14 has the delta p value
    536     uxtl        v16.8h, v24.8b
    537     saddw       v16.8h,  v16.8h ,  v14.8b
    538     sqxtun      v14.8b, v16.8h
    539 
    540     //  d14 =tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)@
    541     cmhs        v18.8b,v9.8b,v2.8b          // mask : abs(delta) >= 10*tc
    542     bsl         v18.8b,v24.8b,v14.8b        // masked lanes keep the original row[-2], the rest take tmp_p1
    543 
    544 l1.2700:
    545     mov         x12,x0
    546     sub         x20,x1,#0
    547     neg         x11, x20                    // x11 = -stride
    548     add         x12,x12,x11                 // x12 = address of row[-1]
    549     cmhs        v19.8b,v9.8b,v2.8b          // mask : abs(delta) >= 10*tc
    550     bsl         v19.8b,v25.8b,v12.8b        // masked lanes keep the original row[-1], the rest take temp_p0
    551     st1         {v19.s}[0],[x12],x11        // row[-1]
    552     st1         {v18.s}[0],[x12]            // row[-2]
    553 l1.2724:
    554     cmp         x5,#1
    555     bne         l1.2404                     // flag q != 1 : done
    556     cmp         x10,#1
    557     mov         v18.8b, v27.8b              // default for row[1] : unchanged (vector mov leaves the flags intact)
    558     bne         l1.2852                     // deq != 1 : row[1] stays as it is
    559 
    560     uaddl       v14.8h,  v26.8b ,  v28.8b   // row[0] + row[2]
    561     rshrn       v14.8b, v14.8h,#1           // rounded >> 1
    562     usubl       v14.8h,  v14.8b ,  v27.8b   // ... - row[1]
    563     ssubw       v14.8h,  v14.8h ,  v7.8b    // ... - delta
    564     sqshrn      v14.8b, v14.8h,#1           // ... >> 1, saturated to signed bytes
    565     smin        v15.8b,  v14.8b ,  v0.8b
    566     smax        v14.8b,  v1.8b ,  v15.8b    // clipped to +/- (tc >> 1)
    567 // d14 has the delta q value
    568     uxtl        v16.8h, v27.8b
    569     saddw       v16.8h,  v16.8h ,  v14.8b
    570     sqxtun      v14.8b, v16.8h              // row[1] + delta q, saturated to unsigned bytes
    571     cmhs        v18.8b,v9.8b,v2.8b          // mask : abs(delta) >= 10*tc
    572     bsl         v18.8b,v27.8b,v14.8b        // masked lanes keep the original row[1]
    573 l1.2852:
    574     mov         x12,x0
    575     cmhs        v19.8b,v9.8b,v2.8b          // mask : abs(delta) >= 10*tc
    576     bsl         v19.8b,v26.8b,v13.8b        // masked lanes keep the original row[0], the rest take temp_q0
    577     st1         {v19.s}[0],[x12],x1         // row[0]
    578     st1         {v18.s}[0],[x12]            // row[1]
    579     // ldmfd sp!, {x3-x12,x15}
    580     ldp         x21, x22,[sp],#16
    581     ldp         x19, x20,[sp],#16
    582     ldp         d14,d15,[sp],#16
    583     ldp         d12,d13,[sp],#16
    584     ldp         d10,d11,[sp],#16
    585     ldp         d8,d9,[sp],#16              // d9 is popped as a pair with d8 for the same reason it was pushed as a pair:
    586                                             // { ldr d9,[sp]; add sp,sp,#8 } gave a bus error. d8 is only a filler; it is not used in the function.
    587     ret
        .size ihevc_deblk_luma_horz_av8, .-ihevc_deblk_luma_horz_av8
    588 
    589 
    590