Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @/*******************************************************************************
     20 @* @file
     21 @*  ihevc_deblk_luma_vert.s
     22 @*
     23 @* @brief
     24 @*  contains the function definition for the luma deblocking filter that
     25 @* operates across a vertical edge. the function is coded in arm neon
     26 
     27 @* assembly (gnu assembler syntax)
     28 @*
     29 @* @author
     30 @*  anand s
     31 @*
     32 @* @par list of functions:
     33 @*  - ihevc_deblk_luma_vert_a9q()
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************/
     39 
     40 .text
     41 .align 4
        @ NOTE(review): GNU as for ARM treats the .align operand as a power of
        @ two (i.e. 16-byte alignment here) - confirm for the toolchain in use.
     42 
     43 
     44 
     45 
     46 
        @ lookup tables defined outside this file; the function below reads them
        @ as arrays of 32-bit words indexed by tc_index / beta_index
     47 .extern gai4_ihevc_tc_table
     48 .extern gai4_ihevc_beta_table
     49 
     50 .globl ihevc_deblk_luma_vert_a9q
     51 
        @ position-independent table addressing: each word below holds
        @ (table address - label - 8). the function loads the word and executes
        @ "add rX,rX,pc" at that label (ulbl1 / ulbl2) to obtain the table
        @ address. the 8 matches the amount by which pc reads ahead of the
        @ current instruction in arm state.
     52 gai4_ihevc_tc_table_addr:
     53 .long gai4_ihevc_tc_table   - ulbl1 - 8
     54 
     55 gai4_ihevc_beta_table_addr:
     56 .long gai4_ihevc_beta_table   - ulbl2 - 8
     57 
     58 .type ihevc_deblk_luma_vert_a9q, %function
     59 
     60 ihevc_deblk_luma_vert_a9q:
        @ luma deblocking across a vertical edge; one call processes 4 rows.
        @ samples pu1_src[-4..-1] of a row are the p side, pu1_src[0..3] the q side.
        @
        @ arguments as they are consumed below (names follow the pseudo-code
        @ comments in this file):
        @   r0         pu1_src           address of sample 0 in the first row
        @   r1         src_strd          byte distance between rows
        @   r2         bs                only (bs >> 1) * 2 enters the tc index
        @   r3         quant_param       one of quant_param_p / quant_param_q
        @   [sp,#0]    quant_param       the other one (the two are only summed,
        @                                so their order cannot be told from here)
        @   [sp,#4]    beta_offset_div2
        @   [sp,#8]    tc_offset_div2
        @   [sp,#12]   filter_flag_p     non-zero: p side may be written
        @   [sp,#16]   filter_flag_q     non-zero: q side may be written
        @ the stack offsets above are relative to sp at entry; the push below
        @ saves 11 words (0x2c bytes), so inside the function the same slots are
        @ at [sp,#0x2c] .. [sp,#0x3c].
        @ neon registers used: d0-d7 and d16-d31 (d8-d15 are not touched).
     61 
     62     push        {r3-r12,lr}
     63     ldr         r4,[sp,#0x2c]               @ r4 = stack quant_param
     64     ldr         r5,[sp,#0x30]               @ r5 = beta_offset_div2
     65 
     66     add         r3,r3,r4
     67     add         r3,r3,#1
     68     ldr         r6, [sp,#0x34]              @ r6 = tc_offset_div2
     69     asr         r3,r3,#1                    @ r3 = qp_luma
     70     add         r7,r3,r5,lsl #1             @ r7 = qp_luma + (beta_offset_div2 << 1)
     71     add         r3,r3,r6,lsl #1             @ r3 = qp_luma + (tc_offset_div2 << 1)
     72     cmp         r7,#0x33
     73     movgt       r7,#0x33                    @ upper clamp of beta index: 51
     74     bgt         l1.56
     75     cmp         r7,#0x0
     76     movlt       r7,#0x0                     @ r7 has the beta_index value
     77 l1.56:
     78 
     79 @     bic      r2,r2,#1
     80     asr         r2,r2,#1                    @ r2 = bs >> 1
     81 
     82     add         r3,r3,r2,lsl #1             @ r3 += 2 * (bs >> 1)
     83     cmp         r3,#0x35
     84     movgt       r3,#0x35                    @ upper clamp of tc index: 53
     85     bgt         l1.88
     86     cmp         r3,#0x0
     87     movlt       r3,#0x0                     @ r3 has the tc_index value
     88 
     89 @    qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
     90 @    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
     91 @    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
     92 
     93 l1.88:
        @ on entry: r7 = beta_index, r3 = tc_index.
        @ this part (1) fetches beta and tc and returns when tc == 0, (2) loads
        @ 4 rows x 8 samples around the edge and transposes them into per-column
        @ vectors, (3) computes d on rows 0 and 3 and returns when d >= beta,
        @ (4) evaluates the strong-filter conditions on rows 0 and 3 while the
        @ strong-filter results are computed speculatively in neon registers.
        @ scalar and neon instructions are interleaved; the relative order of the
        @ neon instructions matters because several registers are reused.
     94     ldr         r2,gai4_ihevc_beta_table_addr @ r2 = beta table - ulbl2 - 8
     95 ulbl2:
     96     add         r2,r2,pc                    @ r2 = address of gai4_ihevc_beta_table
     97     vmov.i8     d18,#0x2                    @ d18 = 2 in every byte
     98     ldr         r4,gai4_ihevc_tc_table_addr @ r4 = tc table - ulbl1 - 8
     99 ulbl1:
    100     add         r4,r4,pc                    @ r4 = address of gai4_ihevc_tc_table
    101 
    102     ldr         r5,[r2,r7,lsl #2]           @ beta
    103     vmov.i16    q8,#0x2                     @ q8 = 2 in every 16-bit lane
    104     ldr         r6,[r4,r3,lsl #2]           @ tc
    105     lsl         r8,r6,#1                    @ r8 = 2 * tc
    106     cmp         r6,#0                       @ flags consumed by the beq below
    107     vdup.8      d19,r8                      @ d19 = 2 * tc in every byte
    108     sub         r7,r0,#4                    @ r7 = &pu1_src[-4], row pointer for the loads
    109     vmov.i8     d23,#0x3                    @ d23 = 3 in every byte
    110     beq         l1.964                      @ tc == 0: return without filtering
    111 
    112 
    113     vld1.8      {d24},[r7],r1               @ d24 = row 0, pu1_src[-4..3]
    114     ldrb        r8,[r0,#-3]                 @ -3 value
    115     vld1.8      {d1},[r7],r1                @ d1 = row 1
    116     ldrb        r10,[r0,#-2]                @-2 value
    117     vld1.8      {d2},[r7],r1                @ d2 = row 2
    118     ldrb        r11,[r0,#-1]                @-1 value
    119     vld1.8      {d0},[r7]                   @ d0 = row 3
    120     ldrb        r12,[r0,#0]                 @ 0 value
    121     ldrb        r9,[r0,#1]                  @ 1 value
    122     vtrn.8      d24,d1
    123     ldrb        r2,[r0,#2]                  @ 2 value
    124     vtrn.8      d2,d0
    125     add         r12,r12,r2
    126     subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
    127     rsbmi       r9,r9,#0                    @ absolute value
    128 @dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
    129     vtrn.16     d24,d2
    130     add         r8,r8,r11
    131     vtrn.16     d1,d0
    132     subs        r8,r8,r10,lsl #1
    133     rsbmi       r8,r8,#0                    @ dp0 value is stored in r8
    134 @  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
        @ after the two vtrn.8 / vtrn.16 pairs each 32-bit half holds one column
        @ (rows 0..3):
        @   d24 = [ col -4 | col 0 ]    d1 = [ col -3 | col 1 ]
        @   d2  = [ col -2 | col 2 ]    d0 = [ col -1 | col 3 ]
    135 
    136 
    137 
    138     add         r14,r1,r1,lsl #1            @ r14 = 3 * src_strd
    139     add         r14,r0,r14                  @ r14 = address of sample 0 in row 3
    140 
        @ the vdup.32 sequence below puts one column into each d register (the
        @ 4 row values are duplicated into both halves). d2 is read (for d7 and
        @ d3) before it is overwritten with column -1.
    141     vdup.32     d4,d24[1]                   @ d4 = column 0
    142     ldrb        r2,[r14,#-3]                @ -3 value
    143     vdup.32     d7,d2[1]                    @ d7 = column 2
    144     ldrb        r10,[r14,#-2]               @ -2 value
    145     vdup.32     d3,d2[0]                    @ d3 = column -2
    146     ldrb        r11,[r14,#-1]               @ -1 value
    147     vdup.32     d5,d1[1]                    @ d5 = column 1
    148     ldrb        r12,[r14,#0]                @ 0 value
    149     vdup.32     d6,d1[0]                    @ d6 = column -3
    150     ldrb        r3,[r14,#1]                 @ 1 value
    151     vdup.32     d2,d0[0]                    @ d2 = column -1
    152     ldrb        r4,[r14,#2]                 @ 2 value
    153 
    154 
    155     add         r12,r12,r4
    156     subs        r12,r12,r3,lsl #1           @ dq3 value is stored in r12
    157     rsbmi       r12,r12,#0
    158 @    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
    159 
    160 
    161     add         r2,r2,r11
    162     subs        r11,r2,r10,lsl #1
    163     rsbmi       r11,r11,#0                  @ dp3 value is stored in r11
    164 @    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )@
    165 
    166 
    167 
    168     add         r3,r8,r9                    @ r3 has the d0 value
    169     add         r4,r11,r12                  @ r4 has the d3 value
    170 
    171 
    172 @    d0 = dp0 + dq0@
    173 @    d3 = dp3 + dq3@
    174 
    175     add         r14,r8,r11                  @ r14 has the value dp
    176     add         r12,r12,r9                  @ r12 has the value  dq
    177 @    dp = dp0 + dp3@
    178 @   dq = dq0 + dq3@
    179 
    180     add         r11, r3, r4                 @ r11 has the value d
    181 
    182 @   d = d0 + d3@
    183 
    184 
    185     cmp         r11,r5
    186     vdup.32     d22,d0[1]                   @ d22 = column 3
    187     bge         l1.964                      @ d >= beta: return without filtering
    188 
    189 @    if(d < beta)
    190 
    191 
    192     @ registers which cannot be altered : r3,r4 r5,r6,r12,r14 (dp),r0,r1,r11
    193 
    194     @ registers for use: r2,r7,r8,r9,r10,
        @ strong-filter test on row 0 (uses r3 = d0). every failing test branches
        @ to l1.336 (normal filter); the neon results computed so far are then
        @ simply discarded, the column registers d2-d7 stay intact.
    195     vqsub.u8    d30,d7,d19                  @ d30 = col 2 - 2*tc (saturating)
    196     asr         r10,r5,#2                   @ r10 = beta >> 2
    197     vqadd.u8    d31,d7,d19                  @ d31 = col 2 + 2*tc (saturating)
    198     cmp         r10,r3,lsl #1               @ (beta >> 2) vs 2 * d0
    199     vaddl.u8    q0,d5,d4                    @ q0 = col 1 + col 0
    200     ble         l1.336
    201 
    202     ldrb        r2,[r0,#-4]
    203     vaddw.u8    q0,q0,d2                    @ q0 = col -1 + col 0 + col 1
    204     ldrb        r7,[r0,#-1]
    205     vmull.u8    q10,d7,d23                  @ q10 = 3 * col 2
    206     ldrb        r3,[r0,#0]                  @ r3 (d0) is no longer needed
    207     vmlal.u8    q10,d22,d18                 @ q10 += 2 * col 3
    208     ldrb        r8,[r0,#3]
    209 @   ubfx   r7,r2,#24,#8           @ has the -1 value
    210 @  and    r2,#0xff               @ has the -4 value
    211 @  ubfx   r8,r3,#24,#8           @ has the 3 value
    212 @  and    r3,#0xff               @ r4 has the 0 value
    213 
    214     vadd.i16    q10,q10,q0
    215     subs        r8,r8,r3
    216     vrshrn.i16  d22,q10,#3                  @ rounded >> 3: new col 2 before clipping
    217     rsbmi       r8,r8,#0                    @ r8 = abs(pu1_src[3] - pu1_src[0])
    218     subs        r2,r2,r7
    219     vmin.u8     d21,d22,d31
    220     rsbmi       r2,r2,#0                    @ r2 = abs(pu1_src[-4] - pu1_src[-1])
    221     vmax.u8     d22,d21,d30                 @ d22 = new col 2 clipped to col 2 +/- 2*tc
    222     add         r8,r8,r2
    223     vaddl.u8    q10,d7,d3                   @ q10 = col 2 + col -2
    224     cmp         r8,r5,asr #3                @ sum of the two abs values vs beta >> 3
    225     vmla.i16    q10,q0,q8                   @ q10 += 2 * (col -1 + col 0 + col 1)
    226     bge         l1.336
    227     vaddw.u8    q0,q0,d7                    @ q0 = col -1 + col 0 + col 1 + col 2
    228     subs        r7,r3,r7
    229     vrshrn.i16  d20,q10,#3                  @ d20 = new col 0 before clipping
    230     rsbmi       r7,r7,#0                    @ r7 = abs(pu1_src[0] - pu1_src[-1])
    231     vrshrn.i16  d0,q0,#2                    @ d0 = new col 1 before clipping
    232     mov         r10,#5
    233     vqadd.u8    d30,d5,d19                  @ d30 = col 1 + 2*tc
    234     mul         r10,r10,r6
    235     vqsub.u8    d31,d5,d19                  @ d31 = col 1 - 2*tc
    236     add         r10,#1                      @ r10 = 5 * tc + 1
    237     cmp         r7,r10,asr #1
    238     bge         l1.336
    239 
    240 
        @ the three tests above are the row-0 form of the condition quoted
        @ below (with d0); the same condition is now evaluated on row 3 with d3.
    241 @        if( (2 * d3 < (beta >> 2)) && ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
    242 @            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
    243 
    244 
    245     asr         r10,r5,#2
    246     vqsub.u8    d25,d4,d19                  @ d25 = col 0 - 2*tc
    247     cmp         r10,r4,lsl #1               @ (beta >> 2) vs 2 * d3
    248     vqadd.u8    d21,d4,d19                  @ d21 = col 0 + 2*tc
    249     ble         l1.336
    250     vmin.u8     d26,d20,d21
    251     add         r4,r1,r1,lsl #1
    252     add         r4,r4,r0                    @ r4 = address of sample 0 in row 3
    253     vmax.u8     d20,d26,d25                 @ d20 = new col 0, clipped
    254     ldrb        r2,[r4,#-4]
    255     vmin.u8     d19,d0,d30                  @ d19 reused as a temporary here
    256     ldrb        r7,[r4,#-1]
    257     vmax.u8     d21,d19,d31                 @ d21 = new col 1, clipped
    258     ldrb        r3,[r4,#0]
    259     lsl         r10,r6,#1                   @ r10 = 2 * tc
    260     ldrb        r8,[r4,#3]
    261 @   ubfx   r7,r2,#24,#8           @ has the -1 value
    262 @  and    r2,#0xff               @ has the -4 value
    263 @  ubfx   r8,r3,#24,#8           @ has the 3 value
    264 @  and    r3,#0xff               @ r4 has the 0 value
    265     vaddl.u8    q0,d2,d3                    @ q0 = col -1 + col -2
    266     vdup.8      d19,r10                     @ d19 = 2 * tc again
    267     subs        r8,r8,r3
    268     vaddw.u8    q0,q0,d4                    @ q0 = col -2 + col -1 + col 0
    269     rsbmi       r8,r8,#0
    270     vqadd.u8    d30,d2,d19                  @ d30 = col -1 + 2*tc
    271     subs        r2,r2,r7
    272     vqsub.u8    d31,d2,d19                  @ d31 = col -1 - 2*tc
    273     rsbmi       r2,r2,#0
    274     vaddl.u8    q13,d5,d6                   @ q13 = col 1 + col -3
    275     add         r8,r8,r2
    276     vmla.i16    q13,q0,q8                   @ q13 += 2 * (col -2 + col -1 + col 0)
    277     cmp         r8,r5,asr #3
    278     bge         l1.336
    279     vrshrn.i16  d26,q13,#3                  @ d26 = new col -1 before clipping
    280     subs        r7,r3,r7
    281     vqadd.u8    d27,d3,d19                  @ d27 = col -2 + 2*tc (q13 was consumed above)
    282     rsbmi       r7,r7,#0
    283     vqsub.u8    d28,d3,d19                  @ d28 = col -2 - 2*tc
    284     mov         r10,#5
    285     vmin.u8     d16,d26,d30                 @ d16 is a temporary; q8 no longer holds 2
    286     mul         r10,r10,r6
    287     add         r10,#1
    288     cmp         r7,r10,asr #1
    289     vmax.u8     d26,d16,d31                 @ d26 = new col -1, clipped
    290     bge         l1.336
    291     vqadd.u8    d30,d6,d19                  @ d30 = col -3 + 2*tc
    292 
    293     mov         r2,#2                       @ de = 2: both rows passed, use the strong filter
    294     ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
    295     vqsub.u8    d31,d6,d19                  @ d31 = col -3 - 2*tc
    296     ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
    297     b           end_dep_deq_decision
    298 @ r2 has the value of de
    299 @ r6 has the value of tc
    300 @ r5 has the value of beta
    301 @ r14 has the value of dp
    302 @ r12 has the value of dq
    303 @ r0 has the value of source address
    304 @ r1 has the src stride
    305 
        @ normal-filter set-up: de = 1, then dep (r9) and deq (r10) are derived.
        @ both are compared against (beta + (beta >> 1)) >> 3; dep is set when
        @ that value is greater than dp, deq when it is greater than dq.
        @ only the side(s) whose filter flag is set get a computed value.
    306 l1.336:
    307     mov         r2,#1                       @ de = 1
        @ l1.424 is not a branch target anywhere in this listing
    308 l1.424:
    309     mov         r11,r5                      @ r11 = beta (r5 is reused for filter_flag_q)
    310     ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
    311     ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
    312 
    313     cmp         r6,#1                       @ tc == 1 (tc == 0 already returned earlier)
    314     moveq       r9,#0                       @ dep = 0
    315     moveq       r10,#0                      @ deq = 0
    316     beq         end_dep_deq_decision
    317 
    318     and         r7,r4,r5
    319 
    320     cmp         r7,#1                       @ taken only when (flag_p & flag_q) == 1
    321     beq         both_flags_set
    322     cmp         r4,#0
    323     beq         set_flag_dep_zero           @ flag_p == 0: only deq is evaluated
    324 
    325 
        @ flag_p is non-zero and the both-flags case did not apply: only dep
    326     add         r8,r11,r11,asr #1           @ r8 = beta + (beta >> 1)
    327     mov         r10,#0                      @ deq = 0
    328     asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    329     cmp         r8,r14                      @ compare with dp
    330     movgt       r9,#1
    331     movle       r9,#0
    332     b           end_dep_deq_decision
    333 set_flag_dep_zero:
    334 
    335     add         r8,r11,r11,asr #1
    336     mov         r9,#0                       @ dep = 0
    337     asr         r8,#3
    338     cmp         r8,r12                      @ compare with dq
    339     movgt       r10,#1
    340     movle       r10,#0
    341     b           end_dep_deq_decision
    342 
    343 both_flags_set:
    344     add         r8,r11,r11,asr #1
    345     asr         r8,#3
    346     cmp         r8,r14                      @ dep from dp
    347     movgt       r9,#1
    348     movle       r9,#0
    349     cmp         r8,r12                      @ deq from dq
    350     movgt       r10,#1
    351     movle       r10,#0
    352 end_dep_deq_decision:
        @ dispatch on de: de != 2 goes to the normal filter at l1.968, de == 2
        @ stores the strong-filter results prepared earlier. r9 / r10 are only
        @ meaningful on the de == 1 path.
    353 
    354 @r0=source address
    355 @r1=stride
    356 @ r2 =de
    357 @ r4=flag p
    358 @r5= flag q
    359 @r6 =tc
    360 @ r9 =dep
    361 @ r10=deq
    362 @   b   l1.964
    363 
    364 
    365     cmp         r2,#2
    366 @ r2 has the value of de
    367     bne         l1.968
    368 
    369     cmp         r5,#0
    370     beq         l1.780                      @ flag q clear: skip the q-side stores
    371 @ r5 has the flag of q
    372 
        @ q side, strong filter: d22 = new col 2, d20 = new col 0, d21 = new col 1
    373     add         r3,r0,#2                    @ r3 = &pu1_src[2], row 0
    374     vst1.8      {d22[0]},[r3],r1            @ new col 2, row 0
    375 
    376     vst1.8      {d22[1]},[r3],r1            @ row 1
    377 
    378     vst1.8      {d22[2]},[r3],r1            @ row 2
    379 
    380     vst1.8      {d22[3]},[r3]               @ row 3
    381     add         r3,r0,r1                    @ r3 = &pu1_src[0], row 1
    382     vtrn.8      d20,d21                     @ interleave into (col 0, col 1) byte pairs per row
    383 
    384     vst1.16     {d20[0]},[r0]               @ row 0: pu1_src[0], pu1_src[1]
    385     vst1.16     {d21[0]},[r3],r1            @ row 1
    386     vst1.16     {d20[1]},[r3],r1            @ row 2
    387     vst1.16     {d21[1]},[r3]               @ row 3
    388 
    389 
    390 l1.780:
    391     cmp         r4,#0
    392     beq         l1.964                      @ flag p clear: done
    393     @ r4 has the flag p
    394 
    395 
        @ p side, strong filter. on entry q0 = col -2 + col -1 + col 0,
        @ d26 = new col -1 (clipped), d27/d28 = col -2 +/- 2*tc,
        @ d30/d31 = col -3 +/- 2*tc, d18 = 2, d23 = 3.
    396     vdup.32     d7,d24[0]                   @ d7 = column -4
    397     sub         r3,r0,#1                    @ r3 = &pu1_src[-1], row 0
    398     vaddw.u8    q8,q0,d6                    @ q8 = col -3 + col -2 + col -1 + col 0
    399     add         r7,r3,r1                    @ r7 = &pu1_src[-1], row 1
    400     vrshrn.i16  d2,q8,#2                    @ d2 = new col -2 before clipping
    401     vst1.8      {d26[0]},[r3]               @ new col -1, row 0
    402     sub         r0,r0,#3                    @ r0 = &pu1_src[-3], row 0
    403     vmin.u8     d16,d2,d27
    404     vst1.8      {d26[1]},[r7],r1            @ row 1
    405     vmull.u8    q1,d6,d23                   @ q1 = 3 * col -3 (d2 was consumed by the vmin above)
    406     vmlal.u8    q1,d7,d18                   @ q1 += 2 * col -4
    407     vst1.8      {d26[2]},[r7],r1            @ row 2
    408     vmax.u8     d5,d16,d28                  @ d5 = new col -2, clipped
    409     vst1.8      {d26[3]},[r7]               @ row 3
    410     vadd.i16    q0,q1,q0
    411     vrshrn.i16  d0,q0,#3                    @ d0 = new col -3 before clipping
    412 
    413 
    414     vmin.u8     d1,d0,d30
    415     vmax.u8     d0,d1,d31                   @ d0 = new col -3, clipped
    416 
    417     vtrn.8      d0,d5                       @ interleave into (col -3, col -2) byte pairs per row
    418     vst1.16     {d0[0]},[r0],r1             @ row 0: pu1_src[-3], pu1_src[-2]
    419     vst1.16     {d5[0]},[r0],r1             @ row 1
    420     vst1.16     {d0[1]},[r0],r1             @ row 2
    421     vst1.16     {d5[1]},[r0]                @ row 3
        @ common return point for all early exits
    422 l1.964:
    423     pop         {r3-r12,pc}
    424 l1.968:
        @ normal filter (de != 2). on entry: d2 = col -1, d3 = col -2,
        @ d6 = col -3, d4 = col 0, d5 = col 1, d7 = col 2, r6 = tc,
        @ r4 = flag p, r5 = flag q, r9 = dep, r10 = deq.
        @ this part computes delta, tmp_p0 (d22) and tmp_q0 (d23), and then
        @ handles the p side; the q side follows at l1.1272.
    425 
    426 
    427     vmov.i16    q0,#0x9                     @ q0 = 9 in every 16-bit lane
    428     rsb         r11,r6,#0                   @ r11 = -tc
    429     cmp         r4,#0
    430     @ checks for the flag p (the flags are consumed by the beq l1.1272
        @ further down; no instruction in between updates them)
    431     vmov.i16    q8,#0x3                     @ q8 = 3 in every 16-bit lane
    432     vmov.i8     d24,#0x1                    @ d24 = 1 in every byte
    433 
    434 
    435     vdup.8      d30,r11                     @ d30 = -tc
    436     and         r11,r6,#0xff
    437     vdup.8      d31,r11                     @ d31 = tc
    438 
    439     vsubl.u8    q9,d4,d2                    @ q9 = col 0 - col -1
    440     vmul.i16    q9,q9,q0                    @ q9 *= 9
    441     vsubl.u8    q0,d5,d3                    @ q0 = col 1 - col -2
    442 
    443 
    444 
    445     vmul.i16    q8,q0,q8                    @ q8 = 3 * (col 1 - col -2)
    446     vsub.i16    q8,q9,q8
    447     vrshr.s16   q8,q8,#4                    @ rounding shift supplies the +8
    448 @   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
    449 
    450     vabs.s16    q0,q8
    451     vmovn.i16   d0,q0
    452     @ storing the absolute values of delta in d0
    453 
    454     vqmovn.s16  d16,q8
    455     @ storing the clipped values of delta in d16
    456 
    457     vmov.i8     d1,#0xa
    458     vdup.8      d21,r11
    459     vmul.i8     d1,d1,d21
    460     @ d1 stores the value (10 * tc)
    461 
    462 @if(abs(delta) < 10 * tc)
        @ the comparison itself is done later with vcge.u8 (d0 >= d1) and is
        @ used as a per-lane mask that keeps the original sample.
    463 
    464     vmin.s8     d18,d16,d31
    465     vmax.s8     d20,d18,d30                 @ d20 = delta clipped to [-tc, tc]
    466 
    467 @ delta = clip3(delta, -tc, tc)@
    468     vmovl.s8    q8,d20
    469     vmovl.u8    q9,d2
    470     vadd.i16    q9,q9,q8
    471 
    472     vqmovun.s16 d22,q9                      @ d22 = tmp_p0
    473     vmovl.u8    q9,d4
    474     vsub.i16    q8,q9,q8
    475     vqmovun.s16 d23,q8                      @ d23 = tmp_q0
    476 @ tmp_p0 = clip_u8(pu1_src[-1] + delta)@
    477 @  tmp_q0 = clip_u8(pu1_src[0] - delta)@
    478     beq         l1.1272                     @ flag p == 0: skip the p side
    479 
    480 
    481 
    482     cmp         r9,#1
    483     bne         l1.1212
    484 @ checks for the flag dep
    485 
        @ dep == 1: col -2 is modified as well:
        @   delta_p = clip3(((((col -3 + col -1 + 1) >> 1) - col -2 + delta) >> 1),
        @                   -(tc >> 1), tc >> 1)
        @   new col -2 = clip_u8(col -2 + delta_p)
    486     asr         r3,r6,#1                    @ r3 = tc >> 1
    487 
    488 
    489     vaddl.u8    q8,d6,d2                    @ q8 = col -3 + col -1
    490     vaddw.u8    q8,q8,d24                   @ + 1
    491     vdup.8      d18,r3                      @ d18 = tc >> 1
    492     rsb         r3,r3,#0
    493     vdup.8      d19,r3                      @ d19 = -(tc >> 1)
    494     vshr.u16    q8,q8,#1
    495     vmovn.i16   d16,q8
    496 
    497     vsubl.u8    q8,d16,d3                   @ - col -2
    498     vaddw.s8    q8,q8,d20                   @ + delta
    499     vshr.s16    q8,q8,#1
    500     vqmovn.s16  d16,q8
    501 
    502     vmin.s8     d17,d16,d18
    503     vmax.s8     d16,d19,d17                 @ d16 = delta_p
    504 
    505 
    506 
    507 
    508     vmovl.u8    q9,d3
    509     vmovl.s8    q8,d16
    510     vadd.i16    q8,q9,q8
    511 
    512     vqmovun.s16 d16,q8                      @ d16 = filtered col -2
    513     vmov        d30,d3                      @ d30 = original col -2
    514     vcge.u8     d3,d0,d1                    @ mask: abs(delta) >= 10 * tc
    515 
    516 
    517     vbsl        d3,d30,d16                  @ d3 = original where the mask is set, else filtered
    518 l1.1212:
        @ p-side stores. d3 is either the result from above or the untouched
        @ col -2; d6 (col -3) is written back unchanged.
    519     vdup.8      d16,r11                     @ value is overwritten by the vcge below before any use
    520     sub         r12,r0,#3                   @ r12 = &pu1_src[-3], row 0
    521     sub         r3,r0,#1                    @ r3 = &pu1_src[-1], row 0
    522 @     vmul.i8  d16,d16,d1
    523     vtrn.8      d6,d3                       @ interleave into (col -3, col -2) byte pairs per row
    524     vst1.16     {d6[0]},[r12],r1            @ row 0
    525     vcge.u8     d16,d0,d1                   @ mask: abs(delta) >= 10 * tc
    526     vst1.16     {d3[0]},[r12],r1            @ row 1
    527     vbsl        d16,d2,d22                  @ d16 = original col -1 where the mask is set, else tmp_p0
    528     vst1.8      {d16[0]},[r3],r1            @ col -1, row 0
    529     vst1.8      {d16[1]},[r3],r1            @ col -1, row 1
    530     vst1.16     {d6[1]},[r12],r1            @ row 2
    531     vst1.8      {d16[2]},[r3],r1            @ col -1, row 2
    532     vst1.16     {d3[1]},[r12]               @ row 3
    533     vst1.8      {d16[3]},[r3]               @ col -1, row 3
    534 l1.1272:
        @ normal filter, q side. uses d4 = col 0, d5 = col 1, d7 = col 2,
        @ d20 = clipped delta, d23 = tmp_q0, d0 = abs(delta), d1 = 10 * tc,
        @ d24 = 1, r6 = tc.
    535     @   ldr      r3,[sp,#0x38]
    536     cmp         r5,#0
    537     beq         l1.964                      @ flag q == 0: return
    538     @ checks for the flag q
    539     cmp         r10,#1
    540     bne         l1.1412
    541     @ checks for the flag deq
        @ deq == 1: col 1 is modified as well:
        @   delta_q = clip3(((((col 2 + col 0 + 1) >> 1) - col 1 - delta) >> 1),
        @                   -(tc >> 1), tc >> 1)
        @   new col 1 = clip_u8(col 1 + delta_q)
    542     vmov        d2,d7                       @ d2 = col 2
    543     asr         r3,r6,#1                    @ r3 = tc >> 1
    544 
    545     vdup.8      d6,r3                       @ d6 = tc >> 1
    546     rsb         r3,r3,#0
    547     vdup.8      d16,r3                      @ d16 = -(tc >> 1)
    548     vaddl.u8    q1,d2,d4                    @ q1 = col 2 + col 0
    549     vaddw.u8    q1,q1,d24                   @ + 1
    550     vshr.u16    q1,q1,#1
    551     vmovn.i16   d2,q1
    552 
    553     vsubl.u8    q1,d2,d5                    @ - col 1
    554     vsubw.s8    q1,q1,d20                   @ - delta
    555     vshr.s16    q1,q1,#1
    556     vqmovn.s16  d3,q1
    557 
    558     vmin.s8     d2,d3,d6
    559     vmax.s8     d3,d16,d2                   @ d3 = delta_q
    560     @  vdup.8   d6,r2
    561     @   vmul.i8  d6,d6,d1
    562 
    563 
    564 
    565     vmovl.u8    q8,d5
    566     vmovl.s8    q1,d3
    567     vadd.i16    q1,q8,q1
    568     vqmovun.s16 d3,q1                       @ d3 = filtered col 1
    569     vmov        d30,d5                      @ d30 = original col 1
    570     vcge.u8     d5,d0,d1                    @ mask: abs(delta) >= 10 * tc
    571 
    572 
    573     vbsl        d5,d30,d3                   @ d5 = original where the mask is set, else filtered
    574 l1.1412:
        @ q-side stores. d7 (col 2) is written back unchanged; d5 is either the
        @ result from above or the untouched col 1.
    575     @  vdup.8   d2,r2
    576     add         r3,r0,#2                    @ r3 = &pu1_src[2], row 0
    577     add         r11,r3,r1                   @ r11 = &pu1_src[2], row 1
    578     @   vmul.i8  d1,d2,d1
    579     vst1.8      {d7[0]},[r3]                @ col 2, row 0
    580     vst1.8      {d7[1]},[r11],r1            @ row 1
    581     vst1.8      {d7[2]},[r11],r1            @ row 2
    582     vcge.u8     d0,d0,d1                    @ mask: abs(delta) >= 10 * tc
    583     vst1.8      {d7[3]},[r11]               @ row 3
    584     vbsl        d0,d4,d23                   @ d0 = original col 0 where the mask is set, else tmp_q0
    585     vtrn.8      d0,d5                       @ interleave into (col 0, col 1) byte pairs per row
    586     vst1.16     {d0[0]},[r0],r1             @ row 0: pu1_src[0], pu1_src[1]
    587     vst1.16     {d5[0]},[r0],r1             @ row 1
    588     vst1.16     {d0[1]},[r0],r1             @ row 2
    589     vst1.16     {d5[1]},[r0]                @ row 3
    590     pop         {r3-r12,pc}
    591 
    592 
    593 
    594