Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/*******************************************************************************
     19 @* @file
     20 @*  ihevc_deblk_luma_horz.s
     21 @*
     22 @* @brief
     23 @*  contains the function definition for the hevc luma deblocking filter
     24 @* applied across a horizontal edge. the function is coded in arm neon
     25 
     26 @* assembly using gnu assembler syntax
     27 @*
     28 @* @author
     29 @*  anand s
     30 @*
     31 @* @par list of functions:
     32 @*  - ihevc_deblk_luma_horz_a9q()
     33 @*
     34 @* @remarks
     35 @*  none
     36 @*
     37 @*******************************************************************************/
     38 
     39 .equ    qp_q_offset,                108     @ quant_param_q
        @ the offsets in this group are relative to sp after the prologue of
        @ ihevc_deblk_luma_horz_a9q, which pushes r3-r12,lr (11 words = 44 bytes)
        @ and d8-d15 (64 bytes): 44 + 64 = 108 is where the first stack argument
        @ is found, and each following argument is one word further up
     40 .equ    beta_offset_div2_offset,    112     @ beta_offset_div2
     41 .equ    tc_offset_div2_offset,      116     @ tc_offset_div2
     42 .equ    filter_p_offset,            120     @ filter_flag_p
     43 .equ    filter_q_offset,            124     @ filter_flag_q
     44 
     45 .text
     46 .align 4
     47 
     48 
     49 
     50 
     51 
     52 .extern gai4_ihevc_tc_table                 @ tc lookup table, defined outside this file
     53 .extern gai4_ihevc_beta_table               @ beta lookup table, defined outside this file
     54 .globl ihevc_deblk_luma_horz_a9q
     55 
        @ position independent references to the two lookup tables.
        @ each word holds (table address - (ulblN + 8)). the instruction at ulblN
        @ is "add rX,rX,pc"; in arm state pc reads as the address of that
        @ instruction plus 8, so the add yields the absolute table address.
     56 gai4_ihevc_tc_table_addr:
     57 .long gai4_ihevc_tc_table  - ulbl1 - 8
     58 
     59 gai4_ihevc_beta_table_addr:
     60 .long gai4_ihevc_beta_table  - ulbl2 - 8
     61 
     62 .type ihevc_deblk_luma_horz_a9q, %function
        @-----------------------------------------------------------------------
        @ ihevc_deblk_luma_horz_a9q
        @
        @ luma deblocking of one 4 pixel wide segment of a horizontal edge.
        @ rows above the edge are read at r0 - k*stride (k = 1..4) and rows
        @ below it at r0 + k*stride (k = 0..3). in the comments below these
        @ rows are called p0..p3 (above) and q0..q3 (below); the older
        @ comments call the same rows the "-1 .. -4 value" and "0 .. 3 value".
        @
        @ inputs, as used by the code:
        @   r0          address of the first pixel of row q0 (pu1_src)
        @   r1          source stride in bytes (src_strd)
        @   r2          bs, only (bs >> 1) contributes to the tc index
        @   r3          quant_param_p
        @   stack + 0   quant_param_q            (offsets at function entry)
        @   stack + 4   beta_offset_div2
        @   stack + 8   tc_offset_div2
        @   stack + 12  filter_flag_p
        @   stack + 16  filter_flag_q
        @
        @ flow:
        @   1. derive beta and tc from the lookup tables, return if tc == 0
        @   2. measure activity on columns 0 and 3 (dp0,dq0,dp3,dq3) and
        @      return if d = dp0 + dq0 + dp3 + dq3 >= beta
        @   3. de = 2 (strong filter) when every strong filter test passes on
        @      both columns, otherwise de = 1 (normal filter) with the side
        @      flags dep / deq
        @   4. a side is filtered and stored only when its filter flag is 1
        @
        @ r3-r12 and d8-d15 are saved on entry and restored on exit.
        @ r2, r14, d0-d7 and d16-d31 are overwritten. r0 and r1 are not modified.
        @-----------------------------------------------------------------------
     63 
     64 ihevc_deblk_luma_horz_a9q:
     65     stmfd       sp!, {r3-r12,lr}            @ save r3-r12 and lr (44 bytes)
     66     vpush       {d8  -  d15}                @ save d8-d15 (64 bytes)
     67 
     68     ldr         r4,[sp,#qp_q_offset]        @ r4 = quant_param_q
     69     ldr         r5,[sp,#beta_offset_div2_offset]    @ r5 = beta_offset_div2
     70 
     71     add         r3,r3,r4
     72     add         r3,r3,#1
     73     ldr         r6, [sp,#tc_offset_div2_offset]     @ r6 = tc_offset_div2
     74     asr         r3,r3,#1                    @ r3 = qp_luma
     75     add         r7,r3,r5,lsl #1             @ r7 = qp_luma + (beta_offset_div2 << 1)
     76     add         r3,r3,r6,lsl #1             @ r3 = qp_luma + (tc_offset_div2 << 1)
     77     cmp         r7,#0x33
     78     movgt       r7,#0x33                    @ upper clip of the beta index at 51
     79     bgt         l1.1532
     80     cmp         r7,#0x0
     81     movlt       r7,#0x0                     @ r7 has the beta_index value
     82 l1.1532:
     83     @     bic      r2,r2,#1
     84     asr         r2,r2,#1                    @ r2 = bs >> 1
     85 
     86     add         r3,r3,r2,lsl #1             @ r3 += 2 * (bs >> 1)
     87     cmp         r3,#0x35
     88     movgt       r3,#0x35                    @ upper clip of the tc index at 53
     89     bgt         l1.1564
     90     cmp         r3,#0x0
     91     movlt       r3,#0x0                     @ r3 has the tc_index value
     92 
     93     @    qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
     94     @    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
     95     @    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
     96 
     97 l1.1564:
     98     ldr         r2,gai4_ihevc_beta_table_addr   @ offset of the beta table from (ulbl2 + 8)
     99 ulbl2:
    100     add         r2,r2,pc                    @ r2 = address of gai4_ihevc_beta_table
    101     ldr         r4,gai4_ihevc_tc_table_addr @ offset of the tc table from (ulbl1 + 8)
    102 ulbl1:
    103     add         r4,r4,pc                    @ r4 = address of gai4_ihevc_tc_table
    104 
    105     ldr         r5,[r2,r7,lsl #2]           @ beta
    106     ldr         r6,[r4,r3,lsl #2]           @ tc
    107 
    108 
    109 
    110     cmp         r6,#0
    111     beq         l1.2404                     @ tc == 0 : nothing to filter
    112     vmov.i16    d0,#0x2                     @ d0 = 2 in every 16 bit lane (multiplier)
    113     lsl         r7,r6,#1                    @ r7 = 2 * tc
    114     add         r14,r1,r1,lsl #1            @ r14 = 3 * stride
    115     ldr         r8,[r0,-r14]                @ -3 value
    116     vdup.8      d1,r7                       @ d1 = 2 * tc in every byte
    117     ldr         r10,[r0,-r1,lsl #1]         @-2 value
    118     vdup.32     d23,r8                      @ -3 value
    119     ldr         r11,[r0,-r1]                @-1 value
    120     vdup.32     d24,r10                     @ -2 value
    121     and         r8,#0xff                    @ from here r8-r12,r2 keep only the column 0 pixel
    122     ldr         r12,[r0,#0]                 @ 0 value
    123     vdup.32     d25, r11                    @-1 value
    124     and         r10,#0xff
    125     ldr         r9,[r0,r1]                  @ 1 value
    126     vdup.32     d26,r12                     @ 0 value
    127     and         r11,#0xff
    128     ldr         r2,[r0,r1,lsl #1]           @ 2 value
    129     vdup.32     d27,r9                      @ 1value
    130     and         r12,#0xff
    131     vdup.32     d28,r2                      @ 2 value
    132     and         r9,#0xff
    133     and         r2,#0xff
    134 
    135     add         r12,r12,r2
    136     subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
    137     rsbmi       r9,r9,#0
    138     @dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
    139 
    140     add         r8,r8,r11
    141     subs        r8,r8,r10,lsl #1
    142     rsbmi       r8,r8,#0                    @ dp0 value is stored in r8
    143     @  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
    144 
    145 
    146 
    147     add         r3,r1,r1,lsl #1             @ r3 = 3 * stride
    148     add         r14,r0,#3                   @ r14 points at column 3 of row q0
    149 
    150 
    151     ldrb        r2,[r14,-r3]                @ -3 value
    152     ldrb        r10,[r14,-r1,lsl #1]        @ -2 value
    153     ldrb        r11,[r14,-r1]               @ -1 value
    154     ldrb        r12,[r14,#0]                @ 0 value
    155     ldrb        r3,[r14,r1]                 @ 1 value
    156     ldrb        r4,[r14,r1,lsl #1]          @ 2 value
    157 
    158 
    159     add         r12,r12,r4
    160     subs        r12,r12,r3,lsl #1           @ dq3 value is stored in r12
    161     rsbmi       r12,r12,#0
    162     @    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
    163 
    164 
    165     add         r2,r2,r11
    166     subs        r11,r2,r10,lsl #1
    167     rsbmi       r11,r11,#0                  @ dp3 value is stored in r11
    168     @    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )@
    169 
    170 
    171 
    172     add         r3,r8,r9                    @ r3 has the d0 value
    173     add         r4,r11,r12                  @ r4 has the d3 value
    174 
    175 
    176     @    d0 = dp0 + dq0@
    177     @    d3 = dp3 + dq3@
    178 
    179     add         r14,r8,r11                  @ r14 has the value dp
    180     add         r12,r12,r9                  @ r12 has the value  dq
    181     @    dp = dp0 + dp3@
    182     @   dq = dq0 + dq3@
    183 
    184     add         r11, r3, r4                 @ r11 has the value d
    185 
    186     @   d = d0 + d3@
    187 
    188 
    189     cmp         r11,r5
    190     bge         l1.2404                     @ d >= beta : edge is left untouched
    191 
    192     @    if(d < beta)
    193 
    194 
    195     @ registers holding live values here : r0,r1,r3,r4,r5,r6,r12,r14
    196 
    197     @ registers free for use: r2,r7,r8,r9,r10,r11
    198 
        @ strong filter tests for column 0. the neon instructions interleaved
        @ with them already compute the strong filter outputs (d4 = q0',
        @ d5 = q1', d3 = q2', d2 = p0'), each clipped to the input +/- 2*tc.
    199     asr         r10,r5,#2                   @ r10 = beta >> 2
    200     vqadd.u8    d30,d26,d1                  @ d30 = q0 + 2*tc
    201     cmp         r10,r3,lsl #1
    202     vqsub.u8    d31,d26,d1                  @ d31 = q0 - 2*tc
    203     ble         l1.1840                     @ (beta >> 2) <= 2 * d0 : normal filter
    204     add         r10,r1,r1,lsl #1            @ r10 = 3 * stride
    205     vaddl.u8    q3,d25,d26                  @ q3 = p0 + q0
    206     ldr         r2,[r0,-r1,lsl #2]          @ has the -4 value
    207     ldrb        r7,[r0,-r1]                 @ has the -1 value
    208     vdup.32     d22,r2                      @ -4 value
    209     vaddw.u8    q4,q3,d27                   @ q4 = p0 + q0 + q1
    210     ldrb        r3,[r0,#0]                  @ r3 has the 0 value
    211     vqadd.u8    d16,d27,d1                  @ d16 = q1 + 2*tc
    212     and         r2,#0xff
    213     vmul.i16    q6,q4,d0[0]                 @ q6 = 2 * (p0 + q0 + q1)
    214     ldr         r8,[r0,r10]                 @ has the 3 value
    215     vaddl.u8    q5,d24,d28                  @ q5 = p1 + q2
    216     subs        r2,r2,r7
    217     vqsub.u8    d17,d27,d1                  @ d17 = q1 - 2*tc
    218     vdup.32     d29,r8                      @ 3 value
    219     and         r8,#0xff
    220     vadd.i16    q6,q6,q5
    221     rsbmi       r2,r2,#0                    @ r2 = abs(p3 - p0), column 0
    222     vrshrn.i16  d20,q6,#3                   @ d20 = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
    223     subs        r8,r8,r3
    224     rsbmi       r8,r8,#0                    @ r8 = abs(q3 - q0), column 0
    225     vmin.u8     d18,d20,d30
    226     add         r8,r8,r2
    227 
    228     cmp         r8,r5,asr #3
    229     bge         l1.1840                     @ flatness sum >= (beta >> 3) : normal filter
    230     vaddw.u8    q7,q4,d28                   @ q7 = p0 + q0 + q1 + q2
    231     subs        r7,r3,r7
    232     vmax.u8     d4,d18,d31                  @ d4 = q0'
    233     rsbmi       r7,r7,#0                    @ r7 = abs(q0 - p0), column 0
    234     vqadd.u8    d30,d28,d1                  @ d30 = q2 + 2*tc
    235     mov         r10,#5
    236     vrshrn.i16  d21,q7,#2                   @ d21 = (p0 + q0 + q1 + q2 + 2) >> 2
    237     mul         r10,r10,r6
    238     vqsub.u8    d31,d28,d1                  @ d31 = q2 - 2*tc
    239     add         r10,#1                      @ r10 = 5 * tc + 1
    240     cmp         r7,r10,asr #1
    241     vmin.u8     d18,d21,d16
    242     bge         l1.1840                     @ abs(q0 - p0) >= (5*tc + 1) >> 1 : normal filter
    243 
    244 
    245     @ strong filter condition, tested above with d0 on column 0 and below with d3 on column 3:
    246     @        if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
    247     @            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
    248     vmax.u8     d5,d18,d17                  @ d5 = q1'
    249     asr         r10,r5,#2                   @ r10 = beta >> 2
    250     vaddl.u8    q8,d29,d28                  @ q8 = q3 + q2
    251     cmp         r10,r4,lsl #1
    252     ble         l1.1840                     @ (beta >> 2) <= 2 * d3 : normal filter
    253 
    254     add         r10,r1,r1,lsl #1            @ r10 = 3 * stride
    255     vmul.i16    q8,q8,d0[0]                 @ q8 = 2 * (q3 + q2)
    256     add         r4,r0,#3                    @ r4 points at column 3 of row q0
    257 
    258 
    259     ldrb        r2,[r4,-r1,lsl #2]          @ has the -4 value (column 3)
    260     vadd.i16    q8,q8,q7
    261     ldrb        r7,[r4,-r1]                 @ has the -1 value (column 3)
    262     vrshrn.i16  d19,q8,#3                   @ d19 = (p0 + q0 + q1 + 3*q2 + 2*q3 + 4) >> 3
    263     ldrb        r3,[r4,#0]                  @ has the 0 value (column 3)
    264     ldrb        r8,[r4,r10]                 @ has the 3 value (column 3)
    265     @   ubfx   r7,r2,#24,#8           @ has the -1 value
    266     @  and    r2,#0xff               @ has the -4 value
    267     @  ubfx   r8,r3,#24,#8           @ has the 3 value
    268     @  and    r3,#0xff               @ r4 has the 0 value
    269 
    270 
    271 
    272     subs        r8,r8,r3
    273     vmin.u8     d18,d19,d30
    274     rsbmi       r8,r8,#0                    @ r8 = abs(q3 - q0), column 3
    275     vaddl.u8    q3,d25,d24                  @ q3 = p0 + p1
    276     subs        r2,r2,r7
    277     vmax.u8     d3,d18,d31                  @ d3 = q2'
    278     rsbmi       r2,r2,#0                    @ r2 = abs(p3 - p0), column 3
    279     vaddw.u8    q4,q3,d26                   @ q4 = p1 + p0 + q0
    280     add         r8,r8,r2
    281     vqadd.u8    d30,d25,d1                  @ d30 = p0 + 2*tc
    282     cmp         r8,r5,asr #3
    283     vqsub.u8    d31,d25,d1                  @ d31 = p0 - 2*tc
    284     bge         l1.1840                     @ flatness sum >= (beta >> 3) : normal filter
    285     vmul.i16    q6,q4,d0[0]                 @ q6 = 2 * (p1 + p0 + q0)
    286     subs        r7,r3,r7
    287     vqadd.u8    d16,d24,d1                  @ d16 = p1 + 2*tc
    288     rsbmi       r7,r7,#0                    @ r7 = abs(q0 - p0), column 3
    289     vaddl.u8    q5,d23,d27                  @ q5 = p2 + q1
    290     mov         r10,#5
    291     vqsub.u8    d17,d24,d1                  @ d17 = p1 - 2*tc
    292     mul         r10,r10,r6
    293     vadd.i16    q6,q6,q5
    294     add         r10,#1                      @ r10 = 5 * tc + 1
    295     vrshrn.i16  d20,q6,#3                   @ d20 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
    296     cmp         r7,r10,asr #1
    297     vaddw.u8    q7,q4,d23                   @ q7 = p2 + p1 + p0 + q0
    298     bge         l1.1840                     @ abs(q0 - p0) >= (5*tc + 1) >> 1 : normal filter
    299     vmin.u8     d18,d20,d30
    300     mov         r2,#2                       @ de = 2 : strong filter
    301     vqadd.u8    d30,d23,d1                  @ d30 = p2 + 2*tc
    302     ldr         r4,[sp,#filter_p_offset]         @ loading the filter_flag_p
    303     vmax.u8     d2,d18,d31                  @ d2 = p0'
    304     ldr         r5,[sp,#filter_q_offset]         @ loading the filter_flag_q
    305     vrshrn.i16  d21,q7,#2                   @ d21 = (p2 + p1 + p0 + q0 + 2) >> 2
    306     b           end_dep_deq_decision_horz
    307     @ r2 has the value of de
    308     @ r6 has the value of tc
    309     @ r5 has the value of beta
    310     @ r14 has the value of dp
    311     @ r12 has the value of dq
    312     @ r0 has the value of source address
    313     @ r1 has the src stride
    314 
    315 l1.1840:
    316     mov         r2,#1                       @ de = 1 : normal filter
    317 
    318     mov         r11,r5                      @ r11 = beta, r5 is reused for flag q
    319     ldr         r4,[sp,#filter_p_offset]         @ loading the filter_flag_p
    320     ldr         r5,[sp,#filter_q_offset]         @ loading the filter_flag_q
    321 
    322     cmp         r6,#1
    323     moveq       r9,#0
    324     moveq       r10,#0
    325     beq         end_dep_deq_decision_horz   @ tc == 1 : dep = deq = 0
    326 
    327     and         r7,r4,r5
    328     cmp         r7,#1
    329     beq         both_flags_set_horz
    330     cmp         r4,#0
    331     beq         set_flag_dep_zero_horz
    332 
    333     @ flag p != 0 and (flag p & flag q) != 1 : only dep is evaluated
    334     add         r8,r11,r11,asr #1
    335     mov         r10,#0
    336     asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    337     cmp         r8,r14
    338     movgt       r9,#1                       @ dep = (dp < r8)
    339     movle       r9,#0
    340     b           end_dep_deq_decision_horz
    341 set_flag_dep_zero_horz:
    342     @ flag p == 0 : dep = 0, only deq is evaluated
    343     add         r8,r11,r11,asr #1
    344     mov         r9,#0
    345     asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    346     cmp         r8,r12
    347     movgt       r10,#1                      @ deq = (dq < r8)
    348     movle       r10,#0
    349     b           end_dep_deq_decision_horz
    350 
    351 both_flags_set_horz:
    352     add         r8,r11,r11,asr #1
    353     asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    354     cmp         r8,r14
    355     movgt       r9,#1                       @ dep = (dp < r8)
    356     movle       r9,#0
    357     cmp         r8,r12
    358     movgt       r10,#1                      @ deq = (dq < r8)
    359     movle       r10,#0
    360 end_dep_deq_decision_horz:
    361 
    362     @r0=source address
    363     @r1=stride
    364     @ r2 =de
    365     @ r4=flag p
    366     @r5= flag q
    367     @r6 =tc
    368     @ r9 =dep
    369     @ r10=deq
    370 
    371 
    372 
    373     @   add     r14,r1,r1,lsl #1
    374     @   lsl     r7,r6,#1
    375     @   vdup.8  d1,r7
    376     @   vmov.i16  d0,#0x2
        @ the two neon results below feed the strong filter only; the normal
        @ filter at l1.2408 reloads d18 and d31 before reading them
    377     vmin.u8     d18,d21,d16
    378     cmp         r2,#1
    379     vqsub.u8    d31,d23,d1                  @ d31 = p2 - 2*tc
    380     beq         l1.2408                     @ de == 1 : normal filter
    381     vaddl.u8    q4,d23,d22                  @ q4 = p2 + p3
    382     cmp         r5,#1
    383 
        @ when the q side is not enabled the p side flag must still be tested
        @ before the p side rows are written
    384     bne         check_flag_p_strong_horz
    385 
    386 strong_filtering_q:
    387     mov         r12,r0
    388     vst1.32     d4[0],[r12],r1              @ row q0 = q0'
    389     vst1.32     d5[0],[r12],r1              @ row q1 = q1'
    390     vst1.32     d3[0],[r12]                 @ row q2 = q2'
        check_flag_p_strong_horz:
    391     cmp         r4,#1
    392     bne         l1.2404                     @ p side not enabled : done
    393 strong_filtering_p:
    394     vmax.u8     d5,d18,d17                  @ d5 = p1'
    395     mov         r12,r0
    396     vmul.i16    q4,q4,d0[0]                 @ q4 = 2 * (p2 + p3)
    397     rsb         r11,r1,#0                   @ r11 = -stride
    398     vadd.i16    q8,q4,q7
    399     add         r12,r12,r11                 @ r12 = address of row p0
    400     vrshrn.i16  d19,q8,#3                   @ d19 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    401     vst1.32     d2[0],[r12],r11             @ row p0 = p0'
    402     vmin.u8     d18,d19,d30
    403     vst1.32     d5[0],[r12],r11             @ row p1 = p1'
    404     vmax.u8     d3,d18,d31                  @ d3 = p2'
    405     vst1.32     d3[0],[r12]                 @ row p2 = p2'
    406 
    407 l1.2404:
    408     vpop        {d8  -  d15}
    409     ldmfd       sp!, {r3-r12,pc}            @ restore r3-r12 and return
    410 
    411     @ register contents on entry to the normal filter below:
    412     @ r4  = flag p
    413     @ r5  = flag q
    414     @ r6  = tc
    415     @ r9  = dep
    416     @ r10 = deq
    417 
    418     @ d22       -4 value (loaded only on the strong filter test path, not read below)
    419 
    420     @ d23       -3 value
    421 
    422     @ d24       -2 value
    423 
    424     @ d25       -1 value
    425 
    426     @ d26        0 value
    427 
    428     @ d27        1 value
    429 
    430     @ d28        2 value
    431 
    432     @ d29        3 value (loaded only on the strong filter test path, not read below)
    433 
    434 l1.2408:
    435 
    436     vmov.i16    d0,#0x9
    437 
    438     vsubl.u8    q5,d26,d25                  @ q5 = q0 - p0
    439 
    440     vmul.i16    q5,q5,d0[0]                 @ q5 = 9 * (q0 - p0)
    441 
    442     vmov.i16    d0,#0x3
    443 
    444     vsubl.u8    q6,d27,d24                  @ q6 = q1 - p1
    445     vmul.i16    q6,q6,d0[0]                 @ q6 = 3 * (q1 - p1)
    446 
    447 
    448     vdup.8      d30,r6                      @ duplicating the +tc value
    449 
    450     rsb         r12,r6,#0
    451     vdup.8      d31,r12                     @ duplicating the -tc value
    452 
    453 
    454 
    455     vsub.i16    q5,q5,q6
    456 
    457 
    458 
    459     vrshr.s16   q5,q5,#4
    460     @   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
    461 
    462     vabs.s16    q4,q5
    463     vmovn.i16   d9,q4
    464     @ storing the absolute values of delta in d9
    465 
    466     vqmovn.s16  d10,q5
    467     @ storing the saturated 8 bit values of delta in d10
    468 
    469 
    470     vmin.s8     d11,d10,d30
    471     vmax.s8     d8,d31,d11                  @ d8 has the value  delta = clip3(delta, -tc, tc)@
    472 
    473 
    474     vmovl.u8    q3,d25
    475 
    476     vaddw.s8    q2,q3,d8                    @ q2 = p0 + delta
    477 
    478     vqmovun.s16 d12,q2                      @ d12 = temp_p0 = clip_u8(p0 + delta)
    479     vmovl.u8    q3,d26
    480     vsubw.s8    q2,q3,d8                    @ q2 = q0 - delta
    481     vqmovun.s16 d13,q2                      @ d13 = temp_q0 = clip_u8(q0 - delta)
    482 
    483 
    484     mov         r11,#0xa
    485     mul         r12,r11,r6
    486     vdup.8      d2,r12                      @ d2 has the 10*tc value
    487     vmov        d18,d24                     @ d18 = p1, stored unchanged when dep != 1
    488     vdup.8      d0,r6
    489     vshr.s8     d0,#1                       @ d0 = tc >> 1
    490     vneg.s8     d1,d0                       @ d1 = -(tc >> 1)
    491 
    492     cmp         r4,#1
    493     bne         l1.2724                     @ flag p != 1 : skip the p side
    494     cmp         r9,#1
    495     bne         l1.2700                     @ dep != 1 : p1 is not filtered
    496 
    497     @ d12 and d13 have the value temp_p0 and temp_q0
    498     vaddl.u8    q7,d23,d25
    499     vrshrn.u16  d14,q7,#1                   @ d14 = (p2 + p0 + 1) >> 1
    500     vsubl.u8    q7,d14,d24
    501     vaddw.s8    q7,q7,d8                    @ q7 = ((p2 + p0 + 1) >> 1) - p1 + delta
    502     vqshrn.s16  d14,q7,#1
    503     vmin.s8     d15,d14,d0
    504     vmax.s8     d14,d1,d15                  @ clip to +/- (tc >> 1)
    505 
    506     @ d14 has the delta p value
    507     vmovl.u8    q8,d24
    508     vaddw.s8    q8,q8,d14
    509     vqmovun.s16 d14,q8
    510 
    511     @  d14 =tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)@
    512     vcge.u8     d18,d9,d2                   @ mask = abs(delta) >= 10 * tc
    513     vbsl        d18,d24,d14                 @ keep p1 where the mask is set, else tmp_p1
    514 
    515 l1.2700:
    516     mov         r12,r0
    517     rsb         r11,r1,#0                   @ r11 = -stride
    518     add         r12,r11                     @ r12 = address of row p0
    519     vcge.u8     d19,d9,d2                   @ mask = abs(delta) >= 10 * tc
    520     vbsl        d19,d25,d12                 @ keep p0 where the mask is set, else temp_p0
    521     vst1.32     {d19[0]},[r12],r11          @ store row p0
    522     vst1.32     {d18[0]},[r12]              @ store row p1
    523 l1.2724:
    524     cmp         r5,#1
    525     bne         l1.2404                     @ flag q != 1 : done
    526     cmp         r10,#1
    527     vmov        d18, d27                    @ d18 = q1, stored unchanged when deq != 1
    528     bne         l1.2852                     @ deq != 1 : q1 is not filtered
    529 
    530     vaddl.u8    q7,d26,d28
    531     vrshrn.u16  d14,q7,#1                   @ d14 = (q0 + q2 + 1) >> 1
    532     vsubl.u8    q7,d14,d27
    533     vsubw.s8    q7,q7,d8                    @ q7 = ((q0 + q2 + 1) >> 1) - q1 - delta
    534     vqshrn.s16  d14,q7,#1
    535     vmin.s8     d15,d14,d0
    536     vmax.s8     d14,d1,d15                  @ clip to +/- (tc >> 1)
    537 @ d14 has the delta q value
    538     vmovl.u8    q8,d27
    539     vaddw.s8    q8,q8,d14
    540     vqmovun.s16 d14,q8                      @ d14 = tmp_q1 = clip_u8(q1 + delta_q)
    541     vcge.u8     d18,d9,d2                   @ mask = abs(delta) >= 10 * tc
    542     vbsl        d18,d27,d14                 @ keep q1 where the mask is set, else tmp_q1
    543 l1.2852:
    544     mov         r12,r0
    545     vcge.u8     d19,d9,d2                   @ mask = abs(delta) >= 10 * tc
    546     vbsl        d19,d26,d13                 @ keep q0 where the mask is set, else temp_q0
    547     vst1.32     {d19[0]},[r12],r1           @ store row q0
    548     vst1.32     {d18[0]},[r12]              @ store row q1
    549 
    550     vpop        {d8  -  d15}
    551     ldmfd       sp!, {r3-r12,r15}           @ restore r3-r12 and return
    552 
    553 
    554 
    555