Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   9641
      6 ;// Date:       Thursday, February 7, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 
     13         INCLUDE omxtypes_s.h
     14         INCLUDE armCOMM_s.h
     15 
     16         M_VARIANTS ARM1136JS
     17 
     18         IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
     19         IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
     20 
     21 
     22     IF ARM1136JS
     23 
     24 MASK_0      EQU 0x00000000      ;// all-zero word: SEL operand used to clear a byte lane of a flag word
     25 MASK_1      EQU 0x01010101      ;// 0x01 in each byte lane: initial filter flags, and the multiplier that copies a byte into all four lanes
     26 MASK_2      EQU 0xff00ff00      ;// keeps bytes 1 and 3 of a word; used by the 4x4 byte transposes
     27 LOOP_COUNT  EQU 0x11110000      ;// shift-register counter: doubled once per edge; carry-out on every 4th doubling, zero after the 16th
     28 
     29 ;// Declare input registers
        ;// NOTE: the RN aliases in this section deliberately overlap: many names map
        ;// onto the same physical register (for example pSrcDst, pQ0, tunpk4, apflg,
        ;// apqflg, pQ0a, pQ0b, Q1b and a are all r0). An alias is only meaningful
        ;// while no other alias of the same register is live, so instruction order
        ;// in the code that follows is significant.
     30 
     31 pSrcDst     RN 0
     32 srcdstStep  RN 1
     33 pAlphaArg   RN 2
     34 pBetaArg    RN 3
     35 
     36 pThresholds RN 14
     37 pBS         RN 9
     38 pQ0         RN 0
     39 bS          RN 2
     40 
     41 alpha       RN 6
     42 alpha0      RN 6
     43 alpha1      RN 8
     44 
     45 beta        RN 7
     46 beta0       RN 7
     47 beta1       RN 9
     48 
     49 ;// Declare Local/Temporary variables
     50 
     51 ;// Pixels
        ;// Each p_n / q_n register holds one pixel column packed as four bytes
        ;// (one byte per row), as produced by the transposes in LoopY / LoopX.
     52 p_0         RN 3
     53 p_1         RN 5
     54 p_2         RN 4
     55 p_3         RN 2
     56 q_0         RN 8
     57 q_1         RN 9
     58 q_2         RN 10
     59 q_3         RN 12
     60 
     61 ;// Unpacking
     62 mask        RN 11
     63 
     64 row0        RN 2
     65 row1        RN 4
     66 row2        RN 5
     67 row3        RN 3
     68 
     69 row4        RN 8
     70 row5        RN 9
     71 row6        RN 10
     72 row7        RN 12
     73 row8        RN 14
     74 row9        RN 7
     75 
     76 tunpk0      RN 8
     77 tunpk1      RN 9
     78 tunpk2      RN 10
     79 tunpk3      RN 12
     80 tunpk4      RN 0
     81 
     82 tunpk5      RN 1
     83 tunpk6      RN 14
     84 tunpk7      RN 2
     85 tunpk8      RN 5
     86 tunpk9      RN 6
     87 
     88 
     89 ;// Filtering
     90 
        ;// All five difference temporaries share r12; each is consumed by the SEL
        ;// that immediately follows its computation.
     91 dp0q0       RN 12
     92 dp1p0       RN 12
     93 dq1q0       RN 12
     94 dp2p0       RN 12
     95 dq2q0       RN 12
     96 
     97 ap0q0       RN 1
     98 filt        RN 2
     99 
    100 m00         RN 14
    101 m01         RN 11
    102 
    103 apflg       RN 0
    104 aqflg       RN 6
    105 apqflg      RN 0
    106 
    107 
    108 ;//Declarations for bSLT4 kernel
        ;// NOTE(review): presumably the register interface of
        ;// armVCM4P10_DeblockingLumabSLT4_unsafe -- confirm against that routine.
    109 
    110 tC0         RN 7
    111 ptC0        RN 1
    112 
    113 pQ0a        RN 0
    114 Stepa       RN 1
    115 maska       RN 14
    116 
    117 P0a         RN 1
    118 P1a         RN 8
    119 Q0a         RN 7
    120 Q1a         RN 11
    121 
    122 ;//Declarations for bSGE4 kernel
        ;// NOTE(review): presumably the register interface of
        ;// armVCM4P10_DeblockingLumabSGE4_unsafe -- confirm against that routine.
    123 
    124 pQ0b        RN 0
    125 Stepb       RN 1
    126 maskb       RN 14
    127 
    128 P0b         RN 6
    129 P1b         RN 7
    130 P2b         RN 1
    131 P3b         RN 3
    132 
    133 Q0b         RN 9
    134 Q1b         RN 0
    135 Q2b         RN 2
    136 Q3b         RN 3
    137 
    138 ;// Miscellaneous
    139 XY          RN 8
    140 t0          RN 3
    141 t1          RN 12
    142 t2          RN 14
    143 t7          RN 7
    144 t4          RN 4
    145 t5          RN 1
    146 t8          RN 6
    147 a           RN 0
    148 
    149 
    150 
    151         ;// Allocate stack memory
                ;// NOTE(review): pXYBS/ppBS and ppQ0Step/pStep are each accessed both as
                ;// a pair (M_STRD / M_LDRD on the first name) and as single words (M_STR
                ;// on either name). That appears to rely on the 4-byte slot declared
                ;// right after each "M_ALLOC8 ...,4" occupying the following word --
                ;// confirm against the M_ALLOC macros in armCOMM_s.h before reordering
                ;// or resizing any of these slots.
    152         M_ALLOC4 ppThresholds,4
    153         M_ALLOC4 pQ_3,4
    154         M_ALLOC4 pP_3,4
    155         M_ALLOC8 pAlphaBeta0,8
    156         M_ALLOC8 pAlphaBeta1,8
    157         M_ALLOC8 pXYBS,4
    158         M_ALLOC4 ppBS,4
    159         M_ALLOC8 ppQ0Step,4
    160         M_ALLOC4 pStep,4
    161 
    162         ;// Function header
                ;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I
                ;//   In-place luma deblocking across vertical edges. The loops below
                ;//   visit 16 rows, four rows at a time, and four edges per row group,
                ;//   the edges being 4 bytes apart.
                ;//   In:  r0 = pSrcDst, r1 = srcdstStep, r2 = pAlphaArg, r3 = pBetaArg,
                ;//        stack = pThresholds, pBS (fetched through M_ARG below)
                ;//   Out: r0 = OMX_Sts_NoErr
                ;//   NOTE(review): the ", r11" argument presumably tells M_START which
                ;//   registers to preserve -- see armCOMM_s.h.
    163         M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11
    164 
    165         ;//Input arguments on the stack
    166         M_ARG   ppThresholdsArg, 4
    167         M_ARG   ppBSArg, 4
    168 
    169         LDR     t4,=MASK_1
    170 
                ;// Read two alpha and two beta bytes. Index 0 is in force for the
                ;// first edge of every row group, index 1 for the remaining edges.
    171         LDRB    alpha0, [pAlphaArg]
    172         LDRB    beta0,  [pBetaArg]
    173         LDRB    alpha1, [pAlphaArg,#1]
    174         LDRB    beta1,  [pBetaArg,#1]
    175 
                ;// Multiplying a byte by 0x01010101 copies it into all four byte lanes,
                ;// ready for the per-byte USUB8 comparisons.
    176         MUL     alpha0, alpha0, t4
    177         MUL     beta0, beta0, t4
    178         MUL     alpha1, alpha1, t4
    179         MUL     beta1, beta1, t4
    180 
    181         M_STRD  alpha0, beta0, pAlphaBeta0
    182         M_STRD  alpha1, beta1, pAlphaBeta1
    183 
                ;// XY reuses r8 and pBS reuses r9 (alpha1 / beta1), which is why the
                ;// replicated thresholds were spilled first. alpha0 / beta0 stay live
                ;// in r6 / r7 as alpha / beta for the first edge.
    184         LDR     XY,=LOOP_COUNT
    185         M_LDR   pBS, ppBSArg
    186         M_LDR   pThresholds, ppThresholdsArg
    187         M_STR   srcdstStep, pStep
    188         M_STRD  XY, pBS, pXYBS
    189         M_STR   pThresholds, ppThresholds
    190 
    191         SUB     pQ0, pQ0, #4            ;// step back 4 bytes: pQ0 now addresses the four pixels left of the first edge
    192 LoopY
    193 ;//---------------Load Pixels-------------------
        ;// Entered at the start of each group of four rows, and again after an edge
        ;// that was not filtered. Reads four rows of four bytes at pQ0 and
        ;// transposes them so that p_0..p_3 each hold one pixel column.
    194 
    195 ;//----------------Pack p0-p3-----------------------
    196         LDR     mask, =MASK_2
    197 
    198         M_LDR   row0, [pQ0], srcdstStep
    199         M_LDR   row1, [pQ0], srcdstStep
    200         LDR     row2, [pQ0]
    201         LDR     row3, [pQ0, srcdstStep]
    202         SUB     pQ0, pQ0, srcdstStep, LSL #1    ;// undo the two post-increments: pQ0 is back on row 0
    203 
    204         ;// row0 = [r0p0 r0p1 r0p2 r0p3]
    205         ;// row1 = [r1p0 r1p1 r1p2 r1p3]
    206         ;// row2 = [r2p0 r2p1 r2p2 r2p3]
    207         ;// row3 = [r3p0 r3p1 r3p2 r3p3]
    208 
                ;// Transpose step 1: interleave bytes of row pairs. AND with MASK_2
                ;// keeps bytes 1 and 3; UXTAB16 adds the partner row's bytes into
                ;// bytes 0 and 2.
    209         AND     tunpk0, mask, row0
    210         AND     tunpk6, mask, row0, LSL#8
    211         UXTAB16 tunpk0, tunpk0, row1, ROR#8
    212         UXTAB16 tunpk6, tunpk6, row1
    213         AND     tunpk2, mask, row2
    214         AND     tunpk3, mask, row2, LSL#8
    215         UXTAB16 tunpk2, tunpk2, row3, ROR#8
    216         UXTAB16 tunpk3, tunpk3, row3
    217 
    218         ;// tunpk0 = [r0p0 r1p0 r0p2 r1p2]
    219         ;// tunpk6 = [r0p1 r1p1 r0p3 r1p3]
    220         ;// tunpk2 = [r2p0 r3p0 r2p2 r3p2]
    221         ;// tunpk3 = [r2p1 r3p1 r2p3 r3p3]
    222 
                ;// Transpose step 2: combine halfwords so each register is one column.
    223         PKHTB   p_0, tunpk0, tunpk2, ASR#16
    224         PKHTB   p_1, tunpk6, tunpk3, ASR#16
    225         PKHBT   p_2, tunpk2, tunpk0, LSL#16
    226         PKHBT   p_3, tunpk3, tunpk6, LSL#16
    227 
    228 
    229         ;// p_0 = [r0p0 r1p0 r2p0 r3p0]
    230         ;// p_1 = [r0p1 r1p1 r2p1 r3p1]
    231         ;// p_2 = [r0p2 r1p2 r2p2 r3p2]
    232         ;// p_3 = [r0p3 r1p3 r2p3 r3p3]
    233 
    234         M_STR   p_3, pP_3               ;// p_3 shares r2 with bS / filt, so it is kept on the stack
    235 
    236 ;//----------------Pack q0-q3-----------------------
        ;// One pass per edge: read bS, load and transpose the four q columns, build
        ;// the per-row filter flags, then dispatch on bS (< 4 or >= 4).
    237 LoopX
    238         LDRB    bS, [pBS], #4           ;// bS for this edge; pBS advances 4 bytes per edge
    239         M_STR   pQ0, ppQ0Step           ;// remember the p-side address for the store phase
    240         LDR     mask, =MASK_2
    241         CMP     bS, #0
    242         M_STR   pBS, ppBS
    243 
    244         LDR     row4, [pQ0, #4]!        ;// writeback: pQ0 already points at the q side if the BEQ below is taken
    245         BEQ.W   NoFilterBS0
    246         M_LDR   row5, [pQ0, srcdstStep]!
    247         M_LDR   row6, [pQ0, srcdstStep]!
    248         M_LDR   row7, [pQ0, srcdstStep]
    249 
    250         ;// row4 = [r0q3 r0q2 r0q1 r0q0]
    251         ;// row5 = [r1q3 r1q2 r1q1 r1q0]
    252         ;// row6 = [r2q3 r2q2 r2q1 r2q0]
    253         ;// row7 = [r3q3 r3q2 r3q1 r3q0]
    254 
                ;// tunpk4 is r0 and tunpk5 is r1, so pQ0 and srcdstStep are overwritten
                ;// from here on; both are reloaded from ppQ0Step before any store.
    255         AND     tunpk4, mask, row4
    256         CMP     bS, #4                  ;// result consumed by "BLT bSLT4" at the end of this block; these flags must stay intact until then
    257         AND     tunpk5, mask, row4, LSL#8
    258         UXTAB16 tunpk4, tunpk4, row5, ROR#8
    259         UXTAB16 tunpk5, tunpk5, row5
    260         AND     tunpk6, mask, row6
    261         AND     tunpk7, mask, row6, LSL#8
    262         UXTAB16 tunpk6, tunpk6, row7, ROR#8
    263         UXTAB16 tunpk7, tunpk7, row7
    264 
    265         ;// tunpk4 = [r0q0 r1q0 r0q2 r1q2]
    266         ;// tunpk5 = [r0q1 r1q1 r0q3 r1q3]
    267         ;// tunpk6 = [r2q0 r3q0 r2q2 r3q2]
    268         ;// tunpk7 = [r2q1 r3q1 r2q3 r3q3]
    269 
    270         PKHTB   q_3, tunpk4, tunpk6, ASR#16
    271         PKHTB   q_2, tunpk5, tunpk7, ASR#16
    272         PKHBT   q_1, tunpk6, tunpk4, LSL#16
    273         M_STR   q_3, pQ_3               ;// q_3 shares r12 with the difference temporaries, so it is kept on the stack
    274         PKHBT   q_0, tunpk7, tunpk5, LSL#16
    275 
    276 
    277         ;// q_0 = [r0q0 r1q0 r2q0 r3q0]
    278         ;// q_1 = [r0q1 r1q1 r2q1 r3q1]
    279         ;// q_2 = [r0q2 r1q2 r2q2 r3q2]
    280         ;// q_3 = [r0q3 r1q3 r2q3 r3q3]
    281 
    282 
    283 ;//--------------Filtering Decision -------------------
        ;// Idiom used for every test below: two USUB8s plus SEL yield the per-byte
        ;// absolute difference; a third USUB8 against the threshold sets the GE
        ;// flags for each lane where the difference is >= the threshold; SEL then
        ;// writes 0 into those lanes and keeps the previous flag byte elsewhere.
    284         LDR     m01, =MASK_1                ;//  01010101 mask
    285         MOV     m00, #MASK_0                ;//  00000000 mask
    286 
    287         ;// Check |p0-q0|<Alpha
    288         USUB8   dp0q0, p_0, q_0
    289         USUB8   a, q_0, p_0
    290         SEL     ap0q0, a, dp0q0
    291         USUB8   a, ap0q0, alpha
    292         SEL     filt, m00, m01
    293 
    294         ;// Check |p1-p0|<Beta
    295         USUB8   dp1p0, p_1, p_0
    296         USUB8   a, p_0, p_1
    297         SEL     a, a, dp1p0
    298         USUB8   a, a, beta
    299         SEL     filt, m00, filt
    300 
    301         ;// Check |q1-q0|<Beta
    302         USUB8   dq1q0, q_1, q_0
    303         USUB8   a, q_0, q_1
    304         SEL     a, a, dq1q0
    305         USUB8   a, a, beta
    306         SEL     filt, m00, filt
    307 
    308         ;// Check ap<Beta
    309         USUB8   dp2p0, p_2, p_0
    310         USUB8   a, p_0, p_2
    311         SEL     a, a, dp2p0
    312         USUB8   a, a, beta
    313         SEL     apflg, m00, filt            ;// apflg = filt && (ap<beta)
    314 
    315         ;// Check aq<Beta
                ;// Only the GE flags are produced here; the matching SEL is the first
                ;// instruction of bSGE4 / bSLT4. t2 is used instead of "a" because
                ;// r0 now holds apflg.
    316         USUB8   dq2q0, q_2, q_0
    317         USUB8   t2, q_0, q_2
    318         SEL     t2, t2, dq2q0
    319         USUB8   t2, t2, beta
    320         MOV     t7,#0                       ;// zero operand for that SEL; t7 is r7, so beta is overwritten here
    321 
    322 
    323         BLT     bSLT4                       ;// bS < 4, using the flags of "CMP bS, #4" above
    324 ;//-------------------Filter--------------------
    325 bSGE4
    326         ;//---------bSGE4 Execution---------------
    327         SEL     t1, t7, filt            ;// t1 = filt && (aq<beta); t7 is zero and GE still holds the aq test result
    328         CMP     filt, #0
    329         ORR     apqflg, apflg, t1, LSL #1   ;// ap flag in bit 0 and aq flag in bit 1 of every byte lane
                ;// No row needs filtering: reload pQ0 / srcdstStep (both were reused
                ;// as temporaries) and leave. NOTE(review): the trailing EQ is
                ;// presumably the condition under which M_LDRD executes -- see armCOMM_s.h.
    330         M_LDRD  pQ0, srcdstStep, ppQ0Step, EQ
    331         BEQ     NoFilterFilt0
    332 
                ;// NOTE(review): the kernel's register interface is not visible in
                ;// this file; P0b-P2b and Q0b-Q2b are assumed to name its outputs --
                ;// confirm against armVCM4P10_DeblockingLumabSGE4_unsafe.
    333         BL      armVCM4P10_DeblockingLumabSGE4_unsafe
    334 
    335         ;//---------Store result---------------
    336 
    337         LDR     maskb,=MASK_2
    338 
    339         ;// P0b = [r0p0 r1p0 r2p0 r3p0]
    340         ;// P1b = [r0p1 r1p1 r2p1 r3p1]
    341         ;// P2b = [r0p2 r1p2 r2p2 r3p2]
    342         ;// P3b = [r0p3 r1p3 r2p3 r3p3]
    343 
    344         M_LDR   P3b, pP_3               ;// p3 column saved earlier for this edge
    345         M_STR   Q0b, pP_3               ;// this edge's q0 becomes the next edge's p3
    346 
    347         ;//------Pack p0-p3------
    348         AND     tunpk0, maskb, P0b
    349         AND     tunpk2, maskb, P0b, LSL#8
    350         UXTAB16 tunpk0, tunpk0, P1b, ROR#8
    351         UXTAB16 tunpk2, tunpk2, P1b
    352 
    353         AND     tunpk3, maskb, P2b
    354         AND     tunpk8, maskb, P2b, LSL#8
    355         UXTAB16 tunpk3, tunpk3, P3b, ROR#8
    356         UXTAB16 tunpk8, tunpk8, P3b
    357 
    358         ;// tunpk0 = [r0p0 r0p1 r2p0 r2p1]
    359         ;// tunpk2 = [r1p0 r1p1 r3p0 r3p1]
    360         ;// tunpk3 = [r0p2 r0p3 r2p2 r2p3]
    361         ;// tunpk8 = [r1p2 r1p3 r3p2 r3p3]
    362 
    363         MOV     p_2, Q1b                ;// Q1b is r0: move it out before pQ0b is reloaded; it is also the next edge's p_2
    364         M_LDRD  pQ0b, Stepb, ppQ0Step
    365 
    366         PKHTB   row9, tunpk0, tunpk3, ASR#16
    367         PKHBT   row7, tunpk3, tunpk0, LSL#16
    368         PKHTB   row3, tunpk2, tunpk8, ASR#16
    369         PKHBT   row6, tunpk8, tunpk2, LSL#16
    370 
    371         ;// row9 = [r0p0 r0p1 r0p2 r0p3]
    372         ;// row3 = [r1p0 r1p1 r1p2 r1p3]
    373         ;// row7 = [r2p0 r2p1 r2p2 r2p3]
    374         ;// row6 = [r3p0 r3p1 r3p2 r3p3]
    375 
                ;// Store order is rows 0, 2, 3, 1. The first store leaves pQ0b on
                ;// row 1; the last advances it 4 bytes to the q side of row 1.
    376         M_STR   row9, [pQ0b], Stepb
    377         STR     row7, [pQ0b, Stepb]
    378         STR     row6, [pQ0b, Stepb, LSL #1]
    379         STR     row3, [pQ0b], #4
    380 
    381         M_LDR   Q3b, pQ_3               ;// Q3b is r3, which is also p_0: q3 is left in place as the next edge's p_0
    382 
    383         ;// Q0b = [r0q0 r1q0 r2q0 r3q0]
    384         ;// Q1b = [r0q1 r1q1 r2q1 r3q1]
    385         ;// Q2b = [r0q2 r1q2 r2q2 r3q2]
    386         ;// Q3b = [r0q3 r1q3 r2q3 r3q3]
    387 
    388         ;//------Pack q0-q3------
                ;// p_2 holds the Q1b copy made above.
    389         AND     tunpk0, maskb, p_2
    390         AND     tunpk2, maskb, p_2, LSL#8
    391         UXTAB16 tunpk0, tunpk0, Q0b, ROR#8
    392         UXTAB16 tunpk2, tunpk2, Q0b
    393 
    394         AND     tunpk3, maskb, Q3b
    395         AND     tunpk8, maskb, Q3b, LSL#8
    396         UXTAB16 tunpk3, tunpk3, Q2b, ROR#8
    397         UXTAB16 tunpk8, tunpk8, Q2b
    398 
    399         ;// tunpk0 = [r0q1 r0q0 r2q1 r2q0]
    400         ;// tunpk2 = [r1q1 r1q0 r3q1 r3q0]
    401         ;// tunpk3 = [r0q3 r0q2 r2q3 r2q2]
    402         ;// tunpk8 = [r1q3 r1q2 r3q3 r3q2]
    403 
    404         PKHTB   row8, tunpk3, tunpk0, ASR#16
    405         PKHBT   row7, tunpk0, tunpk3, LSL#16
    406         PKHTB   row4, tunpk8, tunpk2, ASR#16
    407         PKHBT   row6, tunpk2, tunpk8, LSL#16
    408 
    409         ;// row8 = [r0q0 r0q1 r0q2 r0q3]
    410         ;// row4 = [r1q0 r1q1 r1q2 r1q3]
    411         ;// row7 = [r2q0 r2q1 r2q2 r2q3]
    412         ;// row6 = [r3q0 r3q1 r3q2 r3q3]
    413 
                ;// pQ0b is on row 1 here, so rows 1, 2 and 3 are written first.
    414         STR     row4, [pQ0b]
    415         STR     row7, [pQ0b, Stepb]
    416         STR     row6, [pQ0b, Stepb, LSL #1]
    417 
    418         SUB     pQ0, pQ0b, Stepb        ;// back to row 0: pQ0 is now the p-side address of the next edge
    419         MOV     p_1, Q2b                ;// this edge's q2 is the next edge's p_1
    420 
    421         STR     row8, [pQ0]
    422 
    423         M_LDRD  XY, pBS, pXYBS
    424         M_LDR   pThresholds, ppThresholds
    425         M_LDRD  alpha, beta, pAlphaBeta1    ;// the remaining edges of this row group use the index-1 thresholds
    426 
    427         ADDS    XY, XY, XY              ;// advance the counter: C set after the 4th edge, Z set after the 16th
    428         ADD     pThresholds, #4         ;// this path read no threshold byte, so step past this edge's entry
    429         M_STR   pThresholds, ppThresholds
    430         M_STR   XY, pXYBS
    431         BCC     LoopX
    432         B       ExitLoopY
    433 
    434 ;//---------- Exit of LoopX --------------
    435 ;//---- for the case of no filtering -----
        ;// NoFilterFilt0 is reached when no row passed the alpha/beta tests; pQ0 has
        ;// just been reloaded from ppQ0Step and still needs the 4-byte advance.
        ;// NoFilterBS0 is reached when bS == 0; pQ0 was already advanced by the
        ;// writeback load in LoopX.
    436 
    437 NoFilterFilt0
    438         ADD     pQ0, pQ0, #4
    439 NoFilterBS0
    440         ;// Load counter for LoopX
    441         M_LDRD  XY, pBS, pXYBS
    442         M_LDR   pThresholds, ppThresholds
    443         M_LDRD  alpha, beta, pAlphaBeta1
    444 
    445         ;// Align the pointer
    446         ADDS    XY, XY, XY              ;// advance the counter: C set after the 4th edge, Z set after the 16th
    447         ADD     pThresholds, pThresholds, #4    ;// step past the threshold entry of the skipped edge
    448         M_STR   pThresholds, ppThresholds
    449         M_STR   XY, pXYBS
                ;// Continue at LoopY rather than LoopX: the p columns for the next
                ;// edge were not produced on this path, so they are re-read from
                ;// memory at the advanced pQ0.
    450         BCC     LoopY
    451         B       ExitLoopY
    452 
    453 bSLT4
    454         ;//---------bSLT4 Execution---------------
    455         SEL     aqflg, t7, filt            ;// aqflg = filt && (aq<beta)
    456         M_LDR   ptC0, ppThresholds
    457         CMP     filt, #0
                ;// No row needs filtering: reload pQ0 / srcdstStep and leave.
                ;// NOTE(review): the trailing EQ is presumably the condition under
                ;// which M_LDRD executes -- see armCOMM_s.h.
    458         M_LDRD  pQ0, srcdstStep, ppQ0Step, EQ
    459         BEQ     NoFilterFilt0
    460 
    461         LDRB    tC0, [ptC0], #4         ;// threshold byte for this edge; the post-increment moves to the next edge's entry
    462         M_STR   ptC0, ppThresholds
    463 
                ;// NOTE(review): the kernel's register interface is not visible in
                ;// this file; P0a, P1a, Q0a and Q1a are assumed to name its outputs --
                ;// confirm against armVCM4P10_DeblockingLumabSLT4_unsafe.
    464         BL      armVCM4P10_DeblockingLumabSLT4_unsafe
    465 
    466         ;//---------Store result---------------
    467         ;//--------Pack p1,p0,q1,q0------------
    468 
    469         ;//Load destination pointer
    470         LDR     maska,=MASK_2
    471         M_STR   Q0a, pP_3               ;// this edge's q0 becomes the next edge's p3
    472         MOV     p_1, q_2                ;// this edge's q2 is the next edge's p_1; copied before r10 is reused as tunpk2
    473 
    474         ;// P1a = [r0p1 r1p1 r2p1 r3p1]
    475         ;// P0a = [r0p0 r1p0 r2p0 r3p0]
    476         ;// Q0a = [r0q0 r1q0 r2q0 r3q0]
    477         ;// Q1a = [r0q1 r1q1 r2q1 r3q1]
    478 
    479         AND     tunpk1, maska, P0a
    480         AND     tunpk2, maska, P0a, LSL#8
    481         UXTAB16 tunpk1, tunpk1, P1a, ROR#8
    482         UXTAB16 tunpk2, tunpk2, P1a
    483 
    484         M_LDRD  pQ0a, Stepa, ppQ0Step   ;// Stepa shares r1 with P0a, so the reload waits until P0a has been consumed
    485 
    486         AND     tunpk9, maska, Q1a
    487         AND     tunpk3, maska, Q1a, LSL#8
    488         UXTAB16 tunpk9, tunpk9, Q0a, ROR#8
    489         UXTAB16 tunpk3, tunpk3, Q0a
    490 
    491         ;// tunpk1 = [r0p0 r0p1 r2p0 r2p1]
    492         ;// tunpk2 = [r1p0 r1p1 r3p0 r3p1]
    493         ;// tunpk9 = [r0q1 r0q0 r2q1 r2q0]
    494         ;// tunpk3 = [r1q1 r1q0 r3q1 r3q0]
    495 
                ;// Only the two pixels on each side of the edge are written back, as
                ;// halfwords at byte offsets +2 (p1 p0) and +4 (q0 q1) of every row.
    496         MOV     t4, tunpk1, LSR #16
    497         MOV     t0, tunpk9, LSR #16
    498 
    499         STRH    t4,[pQ0a, #2]!          ;//Stores [r0p0 r0p1]
    500         STRH    t0,[pQ0a, #2]           ;//Stores [r0q0 r0q1]
    501 
    502         MOV     t4, tunpk2, LSR #16
    503         MOV     t0, tunpk3, LSR #16
    504 
    505         M_STRH  t4,[pQ0a, Stepa]!       ;//Stores [r1p0 r1p1]
    506         STRH    t0,[pQ0a, #2]           ;//Stores [r1q0 r1q1]
    507 
    508         M_STRH  tunpk1,[pQ0a, Stepa]!   ;//Stores [r2p0 r2p1]
    509         STRH    tunpk2,[pQ0a, Stepa]    ;//Stores [r3p0 r3p1]
    510         STRH    tunpk9,[pQ0a, #2]!        ;//Stores [r2q0 r2q1]
    511         STRH    tunpk3,[pQ0a, Stepa]    ;//Stores [r3q0 r3q1]
    512 
    513         SUB     pQ0, pQ0a, Stepa, LSL #1    ;// pQ0a is at row 2, offset +4: back to row 0, the p-side address of the next edge
    514 
    515         ;// Load counter
    516         M_LDRD  XY, pBS, pXYBS
    517 
    518         ;// Reload Pixels
    519         M_LDR   p_0, pQ_3               ;// this edge's q3 (saved in LoopX) is the next edge's p_0
    520         MOV     p_2, Q1a                ;// this edge's q1 is the next edge's p_2
    521 
    522         M_LDRD  alpha, beta, pAlphaBeta1    ;// the remaining edges of this row group use the index-1 thresholds
    523 
    524         ADDS    XY, XY, XY              ;// advance the counter: C set after the 4th edge, Z set after the 16th
    525         M_STR   XY, pXYBS
    526         BCC     LoopX
    527 
    528 ;//-------- Common Exit of LoopY -----------------
    529         ;// Align the pointers
                ;// Only the fall-through from bSLT4 needs this reload; the branches
                ;// to ExitLoopY arrive with pThresholds already in its register.
    530         M_LDR   pThresholds, ppThresholds
    531 ExitLoopY
                ;// Four edges moved pQ0 16 bytes right and pBS / pThresholds 16 bytes
                ;// forward. Rewind pQ0 to the left edge and step down four rows;
                ;// pBS and pThresholds end up one byte past their row-group start.
    532         SUB     pQ0, pQ0, #16
    533         ADD     pQ0, pQ0, srcdstStep, LSL #2
    534         SUB     pBS, pBS, #15
    535         SUB     pThresholds, pThresholds, #15
    536         M_STR   pThresholds, ppThresholds
    537 
    538         M_LDRD  alpha, beta, pAlphaBeta0    ;// the first edge of the next row group uses the index-0 thresholds
    539 
                ;// Z still reflects the last "ADDS XY, XY, XY": it is set once the
                ;// counter has been shifted out completely (after the 16th edge).
                ;// The instructions since that ADDS are written without the S suffix.
    540         BNE     LoopY
    541         MOV     r0, #OMX_Sts_NoErr
    542 
    543         M_END
    544 ;//-----------------End Filter--------------------
    545 
    546     ENDIF
    547 
    548         END
    549 
    550