Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   12290
      6 ;// Date:       Wednesday, April 9, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 
     13         INCLUDE omxtypes_s.h
     14         INCLUDE armCOMM_s.h
     15 
     16         M_VARIANTS CortexA8
     17 
     18         IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
     19         IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
     20 
     21         IF CortexA8
     22 
     23 LOOP_COUNT  EQU 0x11000000      ;// Seed for XY, doubled once per edge by ADDS XY,XY,XY: bit 28 carries out on the 4th doubling (ends LoopX), bit 24 on the 8th leaving 0 (ends LoopY)
     24 
     25 
     26 ;// Function arguments
     27 
     28 pSrcDst     RN 0                ;// pixel pointer; moved as the loops advance
     29 srcdstStep  RN 1                ;// distance in bytes between consecutive rows
     30 pAlpha      RN 2
     31 pBeta       RN 3
     32 
     33 pThresholds RN 5                ;// loaded from stack argument ppThresholds
     34 pBS         RN 4                ;// loaded from stack argument ppBS
     35 bS10        RN 12               ;// halfword holding two consecutive bS bytes
     36 
     37 pAlpha_0    RN 2                ;// same register as pAlpha: address of alpha[0]
     38 pBeta_0     RN 3                ;// same register as pBeta: address of beta[0]
     39 
     40 pAlpha_1    RN 7                ;// set to pAlpha_0 + 1
     41 pBeta_1     RN 8                ;// set to pBeta_0 + 1
     42 
     43 pTmp        RN 10               ;// second row pointer, recomputed as pSrcDst + srcdstStep
     44 pTmpStep    RN 11               ;// set to 2 * srcdstStep
     45 
     46 ;// Loop
     47 
     48 XY          RN 9                ;// loop-control shift pattern, initialised from LOOP_COUNT
     49 
     50 ;// Rows input: each dRowN shares its D register with the pixel column that the 8x8 transpose leaves in it
     51 dRow0       DN D7.U8            ;// = dP_3
     52 dRow1       DN D8.U8            ;// = dQ_0
     53 dRow2       DN D5.U8            ;// = dP_1
     54 dRow3       DN D10.U8           ;// = dQ_2
     55 dRow4       DN D6.U8            ;// = dP_2
     56 dRow5       DN D9.U8            ;// = dQ_1
     57 dRow6       DN D4.U8            ;// = dP_0
     58 dRow7       DN D11.U8           ;// = dQ_3
     59 
     60 ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
     61 ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
     62 
     63 ;// Rows output (bS < 4 path): registers stored after transposing back to rows
     64 dRown0      DN D7.U8
     65 dRown1      DN D24.U8
     66 dRown2      DN D30.U8
     67 dRown3      DN D10.U8
     68 dRown4      DN D6.U8
     69 dRown5      DN D25.U8
     70 dRown6      DN D29.U8
     71 dRown7      DN D11.U8
     72 
     73 ;// dP_0n       DN D29.U8
     74 ;// dP_1n       DN D30.U8
     75 ;// dP_2n       DN D31.U8
     76 ;//
     77 ;// dQ_0n       DN D24.U8   ;!!;Temp2
     78 ;// dQ_1n       DN D25.U8   ;!!;Temp2
     79 ;// dQ_2n       DN D28.U8   ;!!;dQ_0t
     80 ;//
     81 ;// dRown0 - dP_3,  dRown1 - dQ_0n
     82 ;// dRown2 - dP_1n, dRown3 - dQ_2
     83 ;// dRown4 - dP_2,  dRown5 - dQ_1n
     84 ;// dRown6 - dP_0n, dRown7 - dQ_3
     85         ;// Rows output (bS >= 4 path): differs from dRownN in rows 3 and 4 (D28 = dQ_2n, D31 = dP_2n)
     86 dRow0n      DN D7.U8
     87 dRow1n      DN D24.U8
     88 dRow2n      DN D30.U8
     89 dRow3n      DN D28.U8
     90 dRow4n      DN D31.U8
     91 dRow5n      DN D25.U8
     92 dRow6n      DN D29.U8
     93 dRow7n      DN D11.U8
     94 
     95 ;// dRow0n - dP_3,  dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
     96 ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
     97 
     98 ;// Pixels: one D register per pixel column, one lane per row
     99 dP_0        DN D4.U8
    100 dP_1        DN D5.U8
    101 dP_2        DN D6.U8
    102 dP_3        DN D7.U8
    103 dQ_0        DN D8.U8
    104 dQ_1        DN D9.U8
    105 dQ_2        DN D10.U8
    106 dQ_3        DN D11.U8
    107 
    108 
    109 ;// Filtering Decision
    110 dAlpha      DN D0.U8            ;// alpha value replicated to all lanes
    111 dBeta       DN D2.U8            ;// beta value replicated to all lanes
    112 
    113 dFilt       DN D16.U8           ;// per-lane filter enable mask
    114 dAqflg      DN D12.U8           ;// shares D12 with dAp1p0
    115 dApflg      DN D17.U8           ;// shares D17 with dAq2q0
    116 
    117 dAp0q0      DN D13.U8           ;// |p0 - q0|
    118 dAp1p0      DN D12.U8           ;// |p1 - p0|, then max with |q1 - q0|, then its compare against beta
    119 dAq1q0      DN D18.U8           ;// |q1 - q0|
    120 dAp2p0      DN D19.U8           ;// |p2 - p0|, then its compare against beta
    121 dAq2q0      DN D17.U8           ;// |q2 - q0|, then its compare against beta
    122 
    123 ;// bSLT4
    124 dTC0        DN D18.U8           ;// dTC0/dTC1/dTC01/dTCs/dTC/dTemp are not referenced in this file; presumably used by the bSLT4 helper - confirm
    125 dTC1        DN D19.U8
    126 dTC01       DN D18.U8
    127 
    128 dTCs        DN D31.S8
    129 dTC         DN D31.U8
    130 
    131 dMask_0     DN D14.U8           ;// initialised to 0 by this function
    132 dMask_1     DN D15.U8           ;// initialised to 1 by this function
    133 
    134 Mask_0      RN 6                ;// core register holding 0, used to clear halves of dFilt
    135 
    136 dTemp       DN D19.U8
    137 
    138 ;// Computing P0,Q0 (aliases in this group are not referenced in this file; presumably they mirror the _unsafe helpers - confirm)
    139 qDq0p0      QN Q10.S16
    140 qDp1q1      QN Q11.S16
    141 qDelta      QN Q10.S16  ; reuse qDq0p0
    142 dDelta      DN D20.S8
    143 
    144 
    145 ;// Computing P1,Q1 (the result names dQ_0n, dQ_1n, dP_0n, dP_1n are the ones this file transposes and stores)
    146 dRp0q0      DN D24.U8
    147 
    148 dMaxP       DN D23.U8
    149 dMinP       DN D22.U8
    150 
    151 dMaxQ       DN D19.U8
    152 dMinQ       DN D21.U8
    153 
    154 dDeltaP     DN D26.U8
    155 dDeltaQ     DN D27.U8
    156 
    157 qP_0n       QN Q14.S16
    158 qQ_0n       QN Q12.S16
    159 
    160 dQ_0n       DN D24.U8
    161 dQ_1n       DN D25.U8
    162 dP_0n       DN D29.U8
    163 dP_1n       DN D30.U8
    164 
    165 ;// bSGE4 (the qS*/dHS*/qTemp*/d?_0t aliases are not referenced in this file; presumably they mirror the bSGE4 helper - confirm)
    166 
    167 qSp0q0      QN Q10.U16
    168 
    169 qSp2q1      QN Q11.U16
    170 qSp0q0p1    QN Q12.U16
    171 qSp3p2      QN Q13.U16
    172 dHSp0q1     DN D28.U8
    173 
    174 qSq2p1      QN Q11.U16
    175 qSp0q0q1    QN Q12.U16
    176 qSq3q2      QN Q13.U16  ;!!
    177 dHSq0p1     DN D28.U8   ;!!
    178 
    179 qTemp1      QN Q11.U16  ;!!;qSp2q1
    180 qTemp2      QN Q12.U16  ;!!;qSp0q0p1
    181 
    182 dP_0t       DN D28.U8   ;!!;dHSp0q1
    183 dQ_0t       DN D22.U8   ;!!;Temp1
    184 
    185 dP_0n       DN D29.U8           ;// repeats the declaration in the P1,Q1 group (same register)
    186 dP_1n       DN D30.U8           ;// repeats the declaration in the P1,Q1 group (same register)
    187 dP_2n       DN D31.U8
    188 
    189 dQ_0n       DN D24.U8   ;!!;Temp2
    190 dQ_1n       DN D25.U8   ;!!;Temp2
    191 dQ_2n       DN D28.U8   ;!!;dQ_0t
    192 
    193 
    194         ;// Function header: in-place deblocking of 4 vertical luma edges (4 bytes apart, first one at pSrcDst) over 16 rows, done as 2 bands of 8 rows
    195         M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15
    196         ;// In: r0 = pSrcDst, r1 = srcdstStep, r2 = pAlpha, r3 = pBeta.  Out: r0 = OMX_Sts_NoErr.
    197         ;//Arguments on the stack
    198         M_ARG   ppThresholds, 4
    199         M_ARG   ppBS, 4
    200 
    201         ;// d0-dAlpha_0
    202         ;// d2-dBeta_0
    203 
    204         ADD         pAlpha_1, pAlpha_0, #1          ;// address of alpha[1]
    205         ADD         pBeta_1, pBeta_0, #1            ;// address of beta[1]
    206 
    207         VLD1        {dAlpha[]}, [pAlpha_0]          ;// replicate alpha[0] into every lane
    208         SUB         pSrcDst, pSrcDst, #4            ;// back up 4 bytes so an 8-byte row load spans p3..p0 | q0..q3
    209         VLD1        {dBeta[]}, [pBeta_0]            ;// replicate beta[0] into every lane
    210         ;// Fetch the two stack arguments
    211         M_LDR       pBS, ppBS
    212         M_LDR       pThresholds, ppThresholds
    213 
    214         MOV         Mask_0,#0                       ;// zero word used to clear halves of dFilt
    215 
    216         ;dMask_0-14
    217         ;dMask_1-15
    218         ;// dMask_0/dMask_1 are not read in this file; presumably consumed by the _unsafe helpers - confirm
    219         VMOV        dMask_0, #0
    220         VMOV        dMask_1, #1
    221 
    222         LDR         XY,=LOOP_COUNT                  ;// loop-control shift pattern, advanced by ADDS XY,XY,XY
    223 
    224         ADD         pTmpStep, srcdstStep, srcdstStep ;// 2 * srcdstStep: pSrcDst walks even rows, pTmp odd rows
    225 
    226         ;// p0-p3 - d4-d7
    227         ;// q0-q3 - d8-d11
    228 LoopY                                               ;// one pass per band of 8 rows
    229 LoopX                                               ;// one pass per vertical edge within the band
    230         LDRH        bS10, [pBS], #4                 ;// two bS bytes: low byte gates lanes 0-3, high byte lanes 4-7; +4 steps to the next edge
    231 
    232         CMP         bS10, #0
    233         BEQ         NoFilterBS0                     ;// both bS bytes zero: skip load/filter/store for this segment
    234 
    235         ;// Load 8 rows of data
    236         ADD         pTmp, pSrcDst, srcdstStep       ;// pTmp -> row 1
    237         VLD1        dRow0, [pSrcDst], pTmpStep
    238         VLD1        dRow1, [pTmp], pTmpStep
    239         VLD1        dRow2, [pSrcDst], pTmpStep
    240         VZIP.8      dRow0, dRow1                    ;// transpose steps are interleaved with the remaining loads
    241         VLD1        dRow3, [pTmp], pTmpStep
    242         VLD1        dRow4, [pSrcDst], pTmpStep
    243         VZIP.8      dRow2, dRow3
    244         VLD1        dRow5, [pTmp], pTmpStep
    245         VLD1        dRow6, [pSrcDst], pTmpStep
    246         VLD1        dRow7, [pTmp], pTmpStep
    247         VZIP.8      dRow4, dRow5
    248         VZIP.16     dRow1, dRow3
    249 
    250         ;// Row layout as loaded, before any VZIP (lane 7 on the left, lane 0 on the right):
    251         ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
    252         ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
    253         ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
    254         ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
    255         ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
    256         ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
    257         ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
    258         ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
    259 
    260         ;// 8x8 Transpose
    261 
    262         VZIP.8      dRow6, dRow7
    263 
    264         SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// rewind the 8 rows the loads advanced over
    265         VZIP.16     dRow0, dRow2
    266         VZIP.16     dRow5, dRow7
    267 
    268 
    269         VZIP.16     dRow4, dRow6
    270         VZIP.32     dRow1, dRow5
    271         VZIP.32     dRow2, dRow6
    272         VZIP.32     dRow3, dRow7
    273         VZIP.32     dRow0, dRow4
    274 
    275 
    276         ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
    277         ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
    278 
    279         ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0]
    280         ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0]
    281         ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0]
    282         ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0]
    283 
    284         ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0]
    285         ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0]
    286         ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0]
    287         ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0]
    288         ;// Filter decision masks; the core-register TSTs are interleaved and their flags survive the NEON ops in between
    289         VABD        dAp0q0, dP_0, dQ_0              ;// |p0 - q0|
    290         VABD        dAp1p0, dP_1, dP_0              ;// |p1 - p0|
    291 
    292         VABD        dAq1q0, dQ_1, dQ_0              ;// |q1 - q0|
    293         VABD        dAp2p0, dP_2, dP_0              ;// |p2 - p0|
    294 
    295         TST         bS10, #0xff                     ;// Z set when the low bS byte is 0
    296         VCGT        dFilt, dAlpha, dAp0q0           ;// alpha > |p0 - q0|
    297 
    298         VMAX        dAp1p0, dAq1q0, dAp1p0          ;// max(|q1 - q0|, |p1 - p0|)
    299         VABD        dAq2q0, dQ_2, dQ_0              ;// |q2 - q0|
    300 
    301         VMOVEQ.U32  dFilt[0], Mask_0                ;// low bS byte is 0: clear the mask for lanes 0-3
    302         TST         bS10, #0xff00                   ;// Z set when the high bS byte is 0
    303 
    304         VCGT        dAp2p0, dBeta, dAp2p0           ;// beta > |p2 - p0|
    305         VCGT        dAp1p0, dBeta, dAp1p0           ;// beta > max(|q1 - q0|, |p1 - p0|)
    306 
    307         VMOVEQ.U32  dFilt[1], Mask_0                ;// high bS byte is 0: clear the mask for lanes 4-7
    308 
    309         VCGT        dAq2q0, dBeta, dAq2q0           ;// beta > |q2 - q0|
    310         VAND        dFilt, dFilt, dAp1p0            ;// final per-lane filter enable mask
    311         TST         bS10, #4                        ;// bit 2 of the low bS byte selects the bS >= 4 path
    312 
    313         VAND        dAqflg, dFilt, dAq2q0           ;// writes D12 (dAp1p0), which was already folded into dFilt
    314         VAND        dApflg, dFilt, dAp2p0           ;// writes D17 (dAq2q0), which was read by the previous VAND
    315 
    316         BNE         bSGE4
    317 bSLT4
    318         ;// bS < 4 Filtering
    319         ;// The code below stores dP_1n, dP_0n, dQ_0n, dQ_1n together with the loaded dP_3, dP_2, dQ_2, dQ_3
    320         BL          armVCM4P10_DeblockingLumabSLT4_unsafe
    321 
    322         ;// Transpose
    323 
    324         VZIP.8      dP_3,  dP_2
    325         VZIP.8      dP_1n, dP_0n
    326         VZIP.8      dQ_0n, dQ_1n
    327         VZIP.8      dQ_2,  dQ_3
    328 
    329 
    330         VZIP.16     dP_3,  dP_1n
    331         ADD         pTmp, pSrcDst, srcdstStep       ;// pTmp -> row 1 for the stores
    332         VZIP.16     dQ_0n, dQ_2
    333         VZIP.16     dQ_1n, dQ_3
    334         VZIP.16     dP_2,  dP_0n
    335 
    336         VZIP.32     dP_3,  dQ_0n
    337         VZIP.32     dP_1n, dQ_2
    338         VZIP.32     dP_2,  dQ_1n
    339         VZIP.32     dP_0n, dQ_3
    340 
    341         ;// dRown0 - dP_3,  dRown1 - dQ_0n
    342         ;// dRown2 - dP_1n, dRown3 - dQ_2
    343         ;// dRown4 - dP_2,  dRown5 - dQ_1n
    344         ;// dRown6 - dP_0n, dRown7 - dQ_3
    345 
    346         VST1        dRown0, [pSrcDst], pTmpStep
    347         VST1        dRown1, [pTmp], pTmpStep
    348         VST1        dRown2, [pSrcDst], pTmpStep
    349         VST1        dRown3, [pTmp], pTmpStep
    350         ;1
    351         VST1        dRown4, [pSrcDst], pTmpStep
    352         VST1        dRown5, [pTmp], pTmpStep
    353         ADDS        XY, XY, XY                      ;// advance loop pattern; C and Z are consumed by BCC LoopX / BNE LoopY
    354         VST1        dRown6, [pSrcDst], pTmpStep
    355         ADD         pThresholds, pThresholds, #2    ;// NOTE(review): +2 here vs +4 on the other two paths; presumably the bSLT4 helper already advanced pThresholds by 2 - confirm
    356         VST1        dRown7, [pTmp], srcdstStep      ;// pTmp is recomputed before its next use, so this post-increment is unused
    357 
    358         SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// rewind the 8 rows the stores advanced over
    359         VLD1        {dAlpha[]}, [pAlpha_1]          ;// edges after the first of a band use alpha[1]
    360         ADD         pSrcDst, pSrcDst, #4            ;// next edge is 4 bytes to the right
    361         VLD1        {dBeta[]}, [pBeta_1]            ;// edges after the first of a band use beta[1]
    362 
    363         BCC         LoopX                           ;// carry clear: more edges remain in this band
    364         B           ExitLoopY
    365 
    366 NoFilterBS0                                         ;// skip path: only advance pointers and loop state
    367         ADD         pSrcDst, pSrcDst, #4            ;// next edge is 4 bytes to the right
    368         ADDS        XY, XY, XY                      ;// advance loop pattern
    369         VLD1        {dAlpha[]}, [pAlpha_1]
    370         ADD         pThresholds, pThresholds, #4    ;// step thresholds to the next edge
    371         VLD1        {dBeta[]}, [pBeta_1]
    372         BCC         LoopX
    373         B           ExitLoopY
    374 bSGE4
    375         ;// bS >= 4 Filtering
    376         ;// The code below stores dP_2n, dP_1n, dP_0n, dQ_0n, dQ_1n, dQ_2n together with the loaded dP_3 and dQ_3
    377         BL          armVCM4P10_DeblockingLumabSGE4_unsafe
    378 
    379         ;// Transpose
    380 
    381         VZIP.8      dP_3,  dP_2n
    382         VZIP.8      dP_1n, dP_0n
    383         VZIP.8      dQ_0n, dQ_1n
    384         VZIP.8      dQ_2n, dQ_3
    385 
    386         VZIP.16     dP_3,  dP_1n
    387         ADD         pTmp, pSrcDst, srcdstStep       ;// pTmp -> row 1 for the stores
    388         VZIP.16     dQ_0n, dQ_2n
    389         VZIP.16     dQ_1n, dQ_3
    390         VZIP.16     dP_2n, dP_0n
    391 
    392         VZIP.32     dP_3,  dQ_0n
    393         VZIP.32     dP_1n, dQ_2n
    394         VZIP.32     dP_2n, dQ_1n
    395         VZIP.32     dP_0n, dQ_3
    396 
    397         ;// dRow0n - dP_3,  dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
    398         ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
    399 
    400         VST1        dRow0n, [pSrcDst], pTmpStep
    401         VST1        dRow1n, [pTmp], pTmpStep
    402         VST1        dRow2n, [pSrcDst], pTmpStep
    403         VST1        dRow3n, [pTmp], pTmpStep
    404         VST1        dRow4n, [pSrcDst], pTmpStep
    405         VST1        dRow5n, [pTmp], pTmpStep
    406         ADDS        XY,XY,XY                        ;// advance loop pattern
    407         VST1        dRow6n, [pSrcDst], pTmpStep
    408         ADD         pThresholds, pThresholds, #4    ;// step thresholds to the next edge
    409         VST1        dRow7n, [pTmp], pTmpStep
    410 
    411         SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// rewind the 8 rows the stores advanced over
    412         VLD1        {dAlpha[]}, [pAlpha_1]
    413         ADD         pSrcDst, pSrcDst, #4            ;// next edge is 4 bytes to the right
    414         VLD1        {dBeta[]}, [pBeta_1]
    415 
    416         BCC         LoopX                           ;// falls through to ExitLoopY once carry is set
    417 
    418 ExitLoopY                                           ;// reached after the 4th edge of a band
    419         SUB         pBS, pBS, #14                   ;// 4 edges * 4 bytes - 14 = net +2 per band
    420         SUB         pThresholds, pThresholds, #14   ;// same net +2, given +4 per edge
    421         SUB         pSrcDst, pSrcDst, #16           ;// undo the four 4-byte column steps
    422         VLD1        {dAlpha[]}, [pAlpha_0]          ;// first edge of a band uses alpha[0]
    423         ADD         pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// move down 8 rows to the next band
    424         VLD1        {dBeta[]}, [pBeta_0]            ;// first edge of a band uses beta[0]
    425         BNE         LoopY                           ;// Z is still from the last ADDS XY; set only once XY has shifted to 0
    426 
    427         MOV         r0, #OMX_Sts_NoErr
    428 
    429         M_END
    430 
    431     ENDIF
    432 
    433 
    434         END
    435 
    436 
    437