Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/*******************************************************************************
     19 @* @file
     20 @*  ihevc_deblk_luma_horz.s
     21 @*
     22 @* @brief
     23 @*  contains the function definition for the hevc luma deblocking filter
     24 @*  applied across a horizontal edge. the function is hand-written arm
     25 @*  neon assembly in gnu assembler syntax.
     26 @*
     27 @*
     28 @* @author
     29 @*  anand s
     30 @*
     31 @* @par list of functions:
     32 @*  - ihevc_deblk_luma_horz_a9q()
     33 @*
     34 @* @remarks
     35 @*  none
     36 @*
     37 @*******************************************************************************/
     38 
     39 .text
     40 .align 4
     41 
     42 
     43 
     44 
     45 
     46 .extern gai4_ihevc_tc_table
     47 .extern gai4_ihevc_beta_table
     48 .globl ihevc_deblk_luma_horz_a9q
     49 
     50 gai4_ihevc_tc_table_addr:
     51 .long gai4_ihevc_tc_table  - ulbl1 - 8
     52 
     53 gai4_ihevc_beta_table_addr:
     54 .long gai4_ihevc_beta_table  - ulbl2 - 8
     55 
     56 .type ihevc_deblk_luma_horz_a9q, %function
     57 
     58 ihevc_deblk_luma_horz_a9q:
     59     stmfd       sp!, {r3-r12,lr}
     60     ldr         r4,[sp,#0x2c]
     61     ldr         r5,[sp,#0x30]
     62 
     63     add         r3,r3,r4
     64     add         r3,r3,#1
     65     ldr         r6, [sp,#0x34]
     66     asr         r3,r3,#1
     67     add         r7,r3,r5,lsl #1
     68     add         r3,r3,r6,lsl #1
     69     cmp         r7,#0x33
     70     movgt       r7,#0x33
     71     bgt         l1.1532
     72     cmp         r7,#0x0
     73     movlt       r7,#0x0                     @ r7 has the beta_index value
     74 l1.1532:
     75     @     bic      r2,r2,#1
     76     asr         r2,r2,#1
     77 
     78     add         r3,r3,r2,lsl #1
     79     cmp         r3,#0x35
     80     movgt       r3,#0x35
     81     bgt         l1.1564
     82     cmp         r3,#0x0
     83     movlt       r3,#0x0                     @ r3 has the tc_index value
     84 
     85     @    qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
     86     @    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
     87     @    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
     88 
     89 l1.1564:
     90     ldr         r2,gai4_ihevc_beta_table_addr
     91 ulbl2:
     92     add         r2,r2,pc
     93     ldr         r4,gai4_ihevc_tc_table_addr
     94 ulbl1:
     95     add         r4,r4,pc
     96 
     97     ldr         r5,[r2,r7,lsl #2]           @ beta
     98     ldr         r6,[r4,r3,lsl #2]           @ tc
     99 
    100 
    101 
    102     cmp         r6,#0
    103     beq         l1.2404
    104     vmov.i16    d0,#0x2
    105     lsl         r7,r6,#1
    106     add         r14,r1,r1,lsl #1
    107     ldr         r8,[r0,-r14]                @ -3 value
    108     vdup.8      d1,r7
    109     ldr         r10,[r0,-r1,lsl #1]         @-2 value
    110     vdup.32     d23,r8                      @ -3 value
    111     ldr         r11,[r0,-r1]                @-1 value
    112     vdup.32     d24,r10                     @ -2 value
    113     and         r8,#0xff
    114     ldr         r12,[r0,#0]                 @ 0 value
    115     vdup.32     d25, r11                    @-1 value
    116     and         r10,#0xff
    117     ldr         r9,[r0,r1]                  @ 1 value
    118     vdup.32     d26,r12                     @ 0 value
    119     and         r11,#0xff
    120     ldr         r2,[r0,r1,lsl #1]           @ 2 value
    121     vdup.32     d27,r9                      @ 1value
    122     and         r12,#0xff
    123     vdup.32     d28,r2                      @ 2 value
    124     and         r9,#0xff
    125     and         r2,#0xff
    126 
    127     add         r12,r12,r2
    128     subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
    129     rsbmi       r9,r9,#0
    130     @dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
    131 
    132     add         r8,r8,r11
    133     subs        r8,r8,r10,lsl #1
    134     rsbmi       r8,r8,#0                    @ dp0 value is stored in r8
    135     @  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
    136 
    137 
    138 
    139     add         r3,r1,r1,lsl #1
    140     add         r14,r0,#3
    141 
    142 
    143     ldrb        r2,[r14,-r3]                @ -2 value
    144     ldrb        r10,[r14,-r1,lsl #1]        @ -2 value
    145     ldrb        r11,[r14,-r1]               @ -1 value
    146     ldrb        r12,[r14,#0]                @ 0 value
    147     ldrb        r3,[r14,r1]                 @ 1 value
    148     ldrb        r4,[r14,r1,lsl #1]          @ 2 value
    149 
    150 
    151     add         r12,r12,r4
    152     subs        r12,r12,r3,lsl #1           @ dq3value is stored in r12
    153     rsbmi       r12,r12,#0
    154     @    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
    155 
    156 
    157     add         r2,r2,r11
    158     subs        r11,r2,r10,lsl #1
    159     rsbmi       r11,r11,#0                  @ dp3 value is stored in r8
    160     @    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )@
    161 
    162 
    163 
    164     add         r3,r8,r9                    @ r3 has the d0 value
    165     add         r4,r11,r12                  @ r4 has the d3 value
    166 
    167 
    168     @    d0 = dp0 + dq0@
    169     @    d3 = dp3 + dq3@
    170 
    171     add         r14,r8,r11                  @ r13 has the value dp
    172     add         r12,r12,r9                  @ r12 has the value  dq
    173     @    dp = dp0 + dp3@
    174     @   dq = dq0 + dq3@
    175 
    176     add         r11, r3, r4                 @ r3 has the value d
    177 
    178     @   d = d0 + d3@
    179 
    180 
    181     cmp         r11,r5
    182     bge         l1.2404
    183 
    184     @    if(d < beta)
    185 
    186 
    187     @ registers which cannont be altered : r3,r4 r5,r6,r12,r13,r0,r1,r11
    188 
    189     @ registers for use: r2,r7,r8,r9,r10,
    190 
    191     asr         r10,r5,#2
    192     vqadd.u8    d30,d26,d1
    193     cmp         r10,r3,lsl #1
    194     vqsub.u8    d31,d26,d1
    195     ble         l1.1840
    196     add         r10,r1,r1,lsl #1
    197     vaddl.u8    q3,d25,d26
    198     ldr         r2,[r0,-r1,lsl #2]          @ has the -4 value
    199     ldrb        r7,[r0,-r1]                 @ has the -1 value
    200     vdup.32     d22,r2                      @ -4 value
    201     vaddw.u8    q4,q3,d27
    202     ldrb        r3,[r0,#0]                  @ r4 has the 0 value
    203     vqadd.u8    d16,d27,d1
    204     and         r2,#0xff
    205     vmul.i16    q6,q4,d0[0]
    206     ldr         r8,[r0,r10]                 @ has the 3 value
    207     vaddl.u8    q5,d24,d28
    208     subs        r2,r2,r7
    209     vqsub.u8    d17,d27,d1
    210     vdup.32     d29,r8                      @ 3 value
    211     and         r8,#0xff
    212     vadd.i16    q6,q6,q5
    213     rsbmi       r2,r2,#0
    214     vrshrn.i16  d20,q6,#3
    215     subs        r8,r8,r3
    216     rsbmi       r8,r8,#0
    217     vmin.u8     d18,d20,d30
    218     add         r8,r8,r2
    219 
    220     cmp         r8,r5,asr #3
    221     bge         l1.1840
    222     vaddw.u8    q7,q4,d28
    223     subs        r7,r3,r7
    224     vmax.u8     d4,d18,d31
    225     rsbmi       r7,r7,#0
    226     vqadd.u8    d30,d28,d1
    227     mov         r10,#5
    228     vrshrn.i16  d21,q7,#2
    229     mul         r10,r10,r6
    230     vqsub.u8    d31,d28,d1
    231     add         r10,#1
    232     cmp         r7,r10,asr #1
    233     vmin.u8     d18,d21,d16
    234     bge         l1.1840
    235 
    236 
    237     @        if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
    238     @            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
    239 
    240     vmax.u8     d5,d18,d17
    241     asr         r10,r5,#2
    242     vaddl.u8    q8,d29,d28
    243     cmp         r10,r4,lsl #1
    244     ble         l1.1840
    245 
    246     add         r10,r1,r1,lsl #1
    247     vmul.i16    q8,q8,d0[0]
    248     add         r4,r0,#3
    249 
    250 
    251     ldrb        r2,[r4,-r1,lsl #2]
    252     vadd.i16    q8,q8,q7
    253     ldrb        r7,[r4,-r1]
    254     vrshrn.i16  d19,q8,#3
    255     ldrb        r3,[r4,#0]
    256     ldrb        r8,[r4,r10]
    257     @   ubfx   r7,r2,#24,#8           @ has the -1 value
    258     @  and    r2,#0xff               @ has the -4 value
    259     @  ubfx   r8,r3,#24,#8           @ has the 3 value
    260     @  and    r3,#0xff               @ r4 has the 0 value
    261 
    262 
    263 
    264     subs        r8,r8,r3
    265     vmin.u8     d18,d19,d30
    266     rsbmi       r8,r8,#0
    267     vaddl.u8    q3,d25,d24
    268     subs        r2,r2,r7
    269     vmax.u8     d3,d18,d31
    270     rsbmi       r2,r2,#0
    271     vaddw.u8    q4,q3,d26
    272     add         r8,r8,r2
    273     vqadd.u8    d30,d25,d1
    274     cmp         r8,r5,asr #3
    275     vqsub.u8    d31,d25,d1
    276     bge         l1.1840
    277     vmul.i16    q6,q4,d0[0]
    278     subs        r7,r3,r7
    279     vqadd.u8    d16,d24,d1
    280     rsbmi       r7,r7,#0
    281     vaddl.u8    q5,d23,d27
    282     mov         r10,#5
    283     vqsub.u8    d17,d24,d1
    284     mul         r10,r10,r6
    285     vadd.i16    q6,q6,q5
    286     add         r10,#1
    287     vrshrn.i16  d20,q6,#3
    288     cmp         r7,r10,asr #1
    289     vaddw.u8    q7,q4,d23
    290     bge         l1.1840
    291     vmin.u8     d18,d20,d30
    292     mov         r2,#2
    293     vqadd.u8    d30,d23,d1
    294     ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
    295     vmax.u8     d2,d18,d31
    296     ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
    297     vrshrn.i16  d21,q7,#2
    298     b           end_dep_deq_decision_horz
    299     @ r2 has the value of de
    300     @ r6 has teh value of tc
    301     @ r5 has the value of beta
    302     @ r14 has the value of dp
    303     @ r12 has the value of dq
    304     @ r0 has the value of source address
    305     @ r1 has the src stride
    306 
    307 l1.1840:
    308     mov         r2,#1
    309 
    310     mov         r11,r5
    311     ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
    312     ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
    313 
    314     cmp         r6,#1
    315     moveq       r9,#0
    316     moveq       r10,#0
    317     beq         end_dep_deq_decision_horz
    318 
    319     and         r7,r4,r5
    320     cmp         r7,#1
    321     beq         both_flags_set_horz
    322     cmp         r4,#0
    323     beq         set_flag_dep_zero_horz
    324 
    325 
    326     add         r8,r11,r11,asr #1
    327     mov         r10,#0
    328     asr         r8,#3
    329     cmp         r8,r14
    330     movgt       r9,#1
    331     movle       r9,#0
    332     b           end_dep_deq_decision_horz
    333 set_flag_dep_zero_horz:
    334 
    335     add         r8,r11,r11,asr #1
    336     mov         r9,#0
    337     asr         r8,#3
    338     cmp         r8,r12
    339     movgt       r10,#1
    340     movle       r10,#0
    341     b           end_dep_deq_decision_horz
    342 
    343 both_flags_set_horz:
    344     add         r8,r11,r11,asr #1
    345     asr         r8,#3
    346     cmp         r8,r14
    347     movgt       r9,#1
    348     movle       r9,#0
    349     cmp         r8,r12
    350     movgt       r10,#1
    351     movle       r10,#0
    352 end_dep_deq_decision_horz:
    353 
    354     @r0=source address
    355     @r1=stride
    356     @ r2 =de
    357     @ r4=flag p
    358     @r5= flag q
    359     @r6 =tc
    360     @ r9 =dep
    361     @ r10=deq
    362 
    363 
    364 
    365     @   add     r14,r1,r1,lsl #1
    366     @   lsl     r7,r6,#1
    367     @   vdup.8  d1,r7
    368     @   vmov.i16  d0,#0x2
    369     vmin.u8     d18,d21,d16
    370     cmp         r2,#1
    371     vqsub.u8    d31,d23,d1
    372     beq         l1.2408
    373     vaddl.u8    q4,d23,d22
    374     cmp         r5,#1
    375 
    376     bne         strong_filtering_p
    377 
    378 strong_filtering_q:
    379     mov         r12,r0
    380     vst1.32     d4[0],[r12],r1
    381     vst1.32     d5[0],[r12],r1
    382     vst1.32     d3[0],[r12]
    383     cmp         r4,#1
    384     bne         l1.2404
    385 strong_filtering_p:
    386     vmax.u8     d5,d18,d17
    387     mov         r12,r0
    388     vmul.i16    q4,q4,d0[0]
    389     rsb         r11,r1,#0
    390     vadd.i16    q8,q4,q7
    391     add         r12,r12,r11
    392     vrshrn.i16  d19,q8,#3
    393     vst1.32     d2[0],[r12],r11
    394     vmin.u8     d18,d19,d30
    395     vst1.32     d5[0],[r12],r11
    396     vmax.u8     d3,d18,d31
    397     vst1.32     d3[0],[r12]
    398 
    399 l1.2404:
    400     ldmfd       sp!, {r3-r12,pc}
    401 
    402     @ r4=flag p
    403     @r5= flag q
    404     @r6 =tc
    405     @ r9 =dep
    406     @ r10=deq
    407 
    408 
    409     @       d22          -4 value
    410 
    411     @d23        @ -3 value
    412 
    413     @   vdup.32 d24,r11         @ -2 value
    414 
    415     @   vdup.32 d25, r11        @-1 value
    416 
    417     @   vdup.32 d26,r11         @ 0 value
    418 
    419     @   vdup.32 d27,r11         @ 1value
    420 
    421     @   vdup.32 d28,r11         @ 2 value
    422 
    423     @   vdup.32 d29,r11         @ 3 value
    424 
    425 l1.2408:
    426 
    427     vmov.i16    d0,#0x9
    428 
    429     vsubl.u8    q5,d26,d25
    430 
    431     vmul.i16    q5,q5,d0[0]
    432 
    433     vmov.i16    d0,#0x3
    434 
    435     vsubl.u8    q6,d27,d24
    436     vmul.i16    q6,q6,d0[0]
    437 
    438 
    439     vdup.8      d30,r6                      @ duplicating the +tc value
    440 
    441     rsb         r12,r6,#0
    442     vdup.8      d31,r12                     @ duplicating the -tc value
    443 
    444 
    445 
    446     vsub.i16    q5,q5,q6
    447 
    448 
    449 
    450     vrshr.s16   q5,q5,#4
    451     @   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
    452 
    453     vabs.s16    q4,q5
    454     vmovn.i16   d9,q4
    455     @ storing the absolute values of delta in d9
    456 
    457     vqmovn.s16  d10,q5
    458     @ storing the clipped values of delta in d16
    459 
    460 
    461     vmin.s8     d11,d10,d30
    462     vmax.s8     d8,d31,d11                  @ d8 has the value  delta = clip3(delta, -tc, tc)@
    463 
    464 
    465     vmovl.u8    q3,d25
    466 
    467     vaddw.s8    q2,q3,d8
    468 
    469     vqmovun.s16 d12,q2
    470     vmovl.u8    q3,d26
    471     vsubw.s8    q2,q3,d8
    472     vqmovun.s16 d13,q2
    473 
    474 
    475     mov         r11,#0xa
    476     mul         r12,r11,r6
    477     vdup.8      d2,r12                      @ d2 has the 10*tc value
    478     vmov        d18,d24
    479     vdup.8      d0,r6
    480     vshr.s8     d0,#1
    481     vneg.s8     d1,d0
    482 
    483     cmp         r4,#1
    484     bne         l1.2724
    485     cmp         r9,#1
    486     bne         l1.2700
    487 
    488     @ d12 and d13 have the value temp_p0 and temp_q0
    489     vaddl.u8    q7,d23,d25
    490     vrshrn.u16  d14,q7,#1
    491     vsubl.u8    q7,d14,d24
    492     vaddw.s8    q7,q7,d8
    493     vqshrn.s16  d14,q7,#1
    494     vmin.s8     d15,d14,d0
    495     vmax.s8     d14,d1,d15
    496 
    497     @ d14 has the delta p value
    498     vmovl.u8    q8,d24
    499     vaddw.s8    q8,q8,d14
    500     vqmovun.s16 d14,q8
    501 
    502     @  d14 =tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)@
    503     vcge.u8     d18,d9,d2
    504     vbsl        d18,d24,d14
    505 
    506 l1.2700:
    507     mov         r12,r0
    508     rsb         r11,r1,#0
    509     add         r12,r11
    510     vcge.u8     d19,d9,d2
    511     vbsl        d19,d25,d12
    512     vst1.32     {d19[0]},[r12],r11
    513     vst1.32     {d18[0]},[r12]
    514 l1.2724:
    515     cmp         r5,#1
    516     bne         l1.2404
    517     cmp         r10,#1
    518     vmov        d18, d27
    519     bne         l1.2852
    520 
    521     vaddl.u8    q7,d26,d28
    522     vrshrn.u16  d14,q7,#1
    523     vsubl.u8    q7,d14,d27
    524     vsubw.s8    q7,q7,d8
    525     vqshrn.s16  d14,q7,#1
    526     vmin.s8     d15,d14,d0
    527     vmax.s8     d14,d1,d15
    528 @ d14 has the delta p value
    529     vmovl.u8    q8,d27
    530     vaddw.s8    q8,q8,d14
    531     vqmovun.s16 d14,q8
    532     vcge.u8     d18,d9,d2
    533     vbsl        d18,d27,d14
    534 l1.2852:
    535     mov         r12,r0
    536     vcge.u8     d19,d9,d2
    537     vbsl        d19,d26,d13
    538     vst1.32     {d19[0]},[r12],r1
    539     vst1.32     {d18[0]},[r12]
    540     ldmfd       sp!, {r3-r12,r15}
    541 
    542 
    543 
    544