Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27         INCLUDE omxtypes_s.h
     28         INCLUDE armCOMM_s.h
     29 
     30         M_VARIANTS CortexA8
     31 
     32         IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
     33         IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
     34 
     35         IF CortexA8
     36 
     37 LOOP_COUNT  EQU 0x11000000
        ;// LOOP_COUNT is the initial value of the XY loop counter. The function
        ;// doubles XY with "ADDS XY, XY, XY" once per edge processed:
        ;//   - the 4th and 8th doublings shift a set bit out of bit 31 (carry set),
        ;//     which ends the inner LoopX after 4 iterations;
        ;//   - the 8th doubling also leaves XY == 0 (Z set), which ends LoopY after
        ;//     2 passes.
     38 
     39 
     40 ;// Function arguments
        ;// The first four arguments are named directly on r0-r3; ppThresholds and
        ;// ppBS are declared as stack arguments (M_ARG) in the function body and
        ;// loaded into pThresholds / pBS.
     41 
     42 pSrcDst     RN 0
     43 srcdstStep  RN 1
     44 pAlpha      RN 2
     45 pBeta       RN 3
     46 
     47 pThresholds RN 5
     48 pBS         RN 4
        ;// bS10 receives one halfword loaded from pBS (two consecutive bS bytes).
     49 bS10        RN 12
     50 
        ;// pAlpha_0 / pBeta_0 are the same registers as pAlpha / pBeta (r2 / r3).
     51 pAlpha_0    RN 2
     52 pBeta_0     RN 3
     53 
        ;// pAlpha_1 / pBeta_1 are set to pAlpha_0 + 1 / pBeta_0 + 1 by the function.
     54 pAlpha_1    RN 7
     55 pBeta_1     RN 8
     56 
        ;// pTmp walks the odd rows (pSrcDst + srcdstStep); pTmpStep holds
        ;// 2 * srcdstStep, the post-increment used for both row pointers.
     57 pTmp        RN 10
     58 pTmpStep    RN 11
     59 
     60 ;// Loop
     61 
     62 XY          RN 9
     63 
     64 ;// Rows input
        ;// dRow0..dRow7 name the D registers that the eight 8-byte rows are loaded
        ;// into. The registers are chosen so that, after the in-place VZIP
        ;// transpose in the function, each one already is the pixel-column
        ;// register (dP_x / dQ_x) listed in the mapping comment below.
     65 dRow0       DN D7.U8
     66 dRow1       DN D8.U8
     67 dRow2       DN D5.U8
     68 dRow3       DN D10.U8
     69 dRow4       DN D6.U8
     70 dRow5       DN D9.U8
     71 dRow6       DN D4.U8
     72 dRow7       DN D11.U8
     73 
     74 ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
     75 ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
     76 
     77 ;// Rows output
        ;// dRown0..dRown7 are the store registers for the bS < 4 path: P3, P2, Q2
        ;// and Q3 are stored from their input registers (D7, D6, D10, D11), while
        ;// P1, P0, Q0 and Q1 are stored from the "n" registers
        ;// (D30, D29, D24, D25).
     78 dRown0      DN D7.U8
     79 dRown1      DN D24.U8
     80 dRown2      DN D30.U8
     81 dRown3      DN D10.U8
     82 dRown4      DN D6.U8
     83 dRown5      DN D25.U8
     84 dRown6      DN D29.U8
     85 dRown7      DN D11.U8
     86 
     87 ;// dP_0n       DN D29.U8
     88 ;// dP_1n       DN D30.U8
     89 ;// dP_2n       DN D31.U8
     90 ;//
     91 ;// dQ_0n       DN D24.U8   ;!!;Temp2
     92 ;// dQ_1n       DN D25.U8   ;!!;Temp2
     93 ;// dQ_2n       DN D28.U8   ;!!;dQ_0t
     94 ;//
     95 ;// dRown0 - dP_3,  dRown1 - dQ_0n
     96 ;// dRown2 - dP_1n, dRown3 - dQ_2
     97 ;// dRown4 - dP_2,  dRown5 - dQ_1n
     98 ;// dRown6 - dP_0n, dRown7 - dQ_3
     99 
        ;// dRow0n..dRow7n are the store registers for the bS >= 4 path: here P2
        ;// and Q2 are also stored from "n" registers (D31, D28); only P3 and Q3
        ;// are stored from their input registers.
    100 dRow0n      DN D7.U8
    101 dRow1n      DN D24.U8
    102 dRow2n      DN D30.U8
    103 dRow3n      DN D28.U8
    104 dRow4n      DN D31.U8
    105 dRow5n      DN D25.U8
    106 dRow6n      DN D29.U8
    107 dRow7n      DN D11.U8
    108 
    109 ;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
    110 ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
    111 
    112 ;// Pixels
        ;// One D register per pixel column after the transpose: p0..p3 in D4..D7,
        ;// q0..q3 in D8..D11, each holding that column for 8 rows.
    113 dP_0        DN D4.U8
    114 dP_1        DN D5.U8
    115 dP_2        DN D6.U8
    116 dP_3        DN D7.U8
    117 dQ_0        DN D8.U8
    118 dQ_1        DN D9.U8
    119 dQ_2        DN D10.U8
    120 dQ_3        DN D11.U8
    121 
    122 
    123 ;// Filtering Decision
        ;// dAlpha / dBeta hold one alpha / beta byte replicated to all lanes.
    124 dAlpha      DN D0.U8
    125 dBeta       DN D2.U8
    126 
        ;// NOTE: several names below share a physical register, so the order in
        ;// which the function reads and writes them matters:
        ;//   dAqflg and dAp1p0 are both D12; dApflg and dAq2q0 are both D17.
    127 dFilt       DN D16.U8
    128 dAqflg      DN D12.U8
    129 dApflg      DN D17.U8
    130 
        ;// Absolute differences: dAxy = |x - y| per lane (later overwritten in
        ;// place with the result of the compare against beta).
    131 dAp0q0      DN D13.U8
    132 dAp1p0      DN D12.U8
    133 dAq1q0      DN D18.U8
    134 dAp2p0      DN D19.U8
    135 dAq2q0      DN D17.U8
    136 
    137 ;// bSLT4
        ;// The names from here down to dDeltaQ / qQ_0n are not referenced by the
        ;// code in this file; presumably they describe the register usage of the
        ;// imported armVCM4P10_DeblockingLumabSLT4_unsafe helper -- TODO confirm.
    138 dTC0        DN D18.U8
    139 dTC1        DN D19.U8
    140 dTC01       DN D18.U8
    141 
    142 dTCs        DN D31.S8
    143 dTC         DN D31.U8
    144 
        ;// dMask_0 / dMask_1 are initialised by the function to all-0 / all-1
        ;// bytes; Mask_0 (r6) is set to 0 and used to clear halves of dFilt.
    145 dMask_0     DN D14.U8
    146 dMask_1     DN D15.U8
    147 
    148 Mask_0      RN 6
    149 
    150 dTemp       DN D19.U8
    151 
    152 ;// Computing P0,Q0
    153 qDq0p0      QN Q10.S16
    154 qDp1q1      QN Q11.S16
    155 qDelta      QN Q10.S16  ; reuse qDq0p0
    156 dDelta      DN D20.S8
    157 
    158 
    159 ;// Computing P1,Q1
    160 dRp0q0      DN D24.U8
    161 
    162 dMaxP       DN D23.U8
    163 dMinP       DN D22.U8
    164 
    165 dMaxQ       DN D19.U8
    166 dMinQ       DN D21.U8
    167 
    168 dDeltaP     DN D26.U8
    169 dDeltaQ     DN D27.U8
    170 
    171 qP_0n       QN Q14.S16
    172 qQ_0n       QN Q12.S16
    173 
        ;// Filtered outputs read back by this file after the bS < 4 helper call.
    174 dQ_0n       DN D24.U8
    175 dQ_1n       DN D25.U8
    176 dP_0n       DN D29.U8
    177 dP_1n       DN D30.U8
    178 
    179 ;// bSGE4
        ;// As above, the q* / dHS* / *t names are not referenced in this file;
        ;// presumably they belong to armVCM4P10_DeblockingLumabSGE4_unsafe
        ;// -- TODO confirm.
    180 
    181 qSp0q0      QN Q10.U16
    182 
    183 qSp2q1      QN Q11.U16
    184 qSp0q0p1    QN Q12.U16
    185 qSp3p2      QN Q13.U16
    186 dHSp0q1     DN D28.U8
    187 
    188 qSq2p1      QN Q11.U16
    189 qSp0q0q1    QN Q12.U16
    190 qSq3q2      QN Q13.U16  ;!!
    191 dHSq0p1     DN D28.U8   ;!!
    192 
    193 qTemp1      QN Q11.U16  ;!!;qSp2q1
    194 qTemp2      QN Q12.U16  ;!!;qSp0q0p1
    195 
    196 dP_0t       DN D28.U8   ;!!;dHSp0q1
    197 dQ_0t       DN D22.U8   ;!!;Temp1
    198 
        ;// Filtered outputs read back by this file after the bS >= 4 helper call.
        ;// dP_0n, dP_1n, dQ_0n and dQ_1n repeat the bSLT4 declarations above with
        ;// the same registers; dP_2n (D31) and dQ_2n (D28) are added.
    199 dP_0n       DN D29.U8
    200 dP_1n       DN D30.U8
    201 dP_2n       DN D31.U8
    202 
    203 dQ_0n       DN D24.U8   ;!!;Temp2
    204 dQ_1n       DN D25.U8   ;!!;Temp2
    205 dQ_2n       DN D28.U8   ;!!;dQ_0t
    206 
    207 
    208         ;// Function header
                ;//
                ;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I  (CortexA8 / NEON variant)
                ;//
                ;// Deblocks vertical edges in place. Each LoopX iteration handles one
                ;// edge over 8 rows: it loads an 8x8 byte tile (4 pixels on each side
                ;// of the edge), transposes it so every pixel column sits in one D
                ;// register, builds the per-row filter masks, calls one of the two
                ;// imported filter helpers, transposes back and stores the tile.
                ;//
                ;// In:    r0 = pSrcDst, r1 = srcdstStep, r2 = pAlpha, r3 = pBeta
                ;//        stack: ppThresholds, ppBS
                ;// Out:   r0 = OMX_Sts_NoErr (this routine performs no argument checks)
                ;//
                ;// Loops: LoopX runs 4 times per pass (pSrcDst += 4 each time),
                ;//        LoopY runs 2 passes (pSrcDst += 8 * srcdstStep between them).
                ;//        Both are driven by the XY counter; see the ADDS XY sites.
                ;//
                ;// The "r11, d15" operands are presumably the highest core / NEON
                ;// registers M_START must preserve -- confirm in armCOMM_s.h.
    209         M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15
    210 
    211         ;//Arguments on the stack
    212         M_ARG   ppThresholds, 4
    213         M_ARG   ppBS, 4
    214 
    215         ;// d0-dAlpha_0
    216         ;// d2-dBeta_0
    217 
                ;// Pointers to the second alpha / beta byte, used for every edge
                ;// after the first one in a pass.
    218         ADD         pAlpha_1, pAlpha_0, #1
    219         ADD         pBeta_1, pBeta_0, #1
    220 
                ;// Replicate alpha[0] / beta[0] to all lanes for the first edge.
                ;// pSrcDst is moved back 4 bytes so each 8-byte row load covers
                ;// p3..p0 and q0..q3 around the edge.
    221         VLD1        {dAlpha[]}, [pAlpha_0]
    222         SUB         pSrcDst, pSrcDst, #4
    223         VLD1        {dBeta[]}, [pBeta_0]
    224 
    225         M_LDR       pBS, ppBS
    226         M_LDR       pThresholds, ppThresholds
    227 
                ;// Mask_0 = 0 is the value moved into dFilt to disable filtering
                ;// for a group of four rows whose bS byte is zero.
    228         MOV         Mask_0,#0
    229 
    230         ;dMask_0-14
    231         ;dMask_1-15
    232 
                ;// dMask_0 / dMask_1 are not read again in this file; presumably
                ;// they are inputs to the imported helpers -- TODO confirm.
    233         VMOV        dMask_0, #0
    234         VMOV        dMask_1, #1
    235 
    236         LDR         XY,=LOOP_COUNT
    237 
                ;// Rows are accessed through two pointers (even rows via pSrcDst,
                ;// odd rows via pTmp), each stepping by 2 * srcdstStep.
    238         ADD         pTmpStep, srcdstStep, srcdstStep
    239 
    240         ;// p0-p3 - d4-d7
    241         ;// q0-q3 - d8-d11
    242 LoopY
    243 LoopX
                ;// Load two bS bytes (one per group of four rows) and step pBS by 4.
    244         LDRH        bS10, [pBS], #4
    245 
                ;// Both bS bytes zero: nothing to filter for this tile.
    246         CMP         bS10, #0
    247         BEQ         NoFilterBS0
    248 
    249         ;// Load 8 rows of data
                ;// The first stages of the transpose (VZIP) are interleaved with
                ;// the loads.
    250         ADD         pTmp, pSrcDst, srcdstStep
    251         VLD1        dRow0, [pSrcDst], pTmpStep
    252         VLD1        dRow1, [pTmp], pTmpStep
    253         VLD1        dRow2, [pSrcDst], pTmpStep
    254         VZIP.8      dRow0, dRow1
    255         VLD1        dRow3, [pTmp], pTmpStep
    256         VLD1        dRow4, [pSrcDst], pTmpStep
    257         VZIP.8      dRow2, dRow3
    258         VLD1        dRow5, [pTmp], pTmpStep
    259         VLD1        dRow6, [pSrcDst], pTmpStep
    260         VLD1        dRow7, [pTmp], pTmpStep
    261         VZIP.8      dRow4, dRow5
    262         VZIP.16     dRow1, dRow3
    263 
    264 
                ;// Row contents as loaded from memory (before any VZIP):
    265         ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
    266         ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
    267         ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
    268         ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
    269         ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
    270         ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
    271         ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
    272         ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
    273 
    274         ;// 8x8 Transpose
    275 
    276         VZIP.8      dRow6, dRow7
    277 
                ;// The four post-incremented loads advanced pSrcDst by
                ;// 8 * srcdstStep; rewind it to the top row of the tile.
    278         SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
    279         VZIP.16     dRow0, dRow2
    280         VZIP.16     dRow5, dRow7
    281 
    282 
    283         VZIP.16     dRow4, dRow6
    284         VZIP.32     dRow1, dRow5
    285         VZIP.32     dRow2, dRow6
    286         VZIP.32     dRow3, dRow7
    287         VZIP.32     dRow0, dRow4
    288 
    289 
    290         ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
    291         ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
    292 
    293         ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0]
    294         ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0]
    295         ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0]
    296         ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0]
    297 
    298         ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0]
    299         ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0]
    300         ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0]
    301         ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0]
    302 
                ;// Filter decision. Scalar TST instructions are interleaved with
                ;// the NEON work; none of the NEON instructions change the flags,
                ;// so each TST result is still valid at the following
                ;// VMOVEQ / BNE.
                ;//   dFilt  = (alpha > |p0-q0|) & (beta > max(|p1-p0|, |q1-q0|)),
                ;//            cleared for rows 0-3 / 4-7 when the low / high byte
                ;//            of bS10 is zero
                ;//   dApflg = dFilt & (beta > |p2-p0|)
                ;//   dAqflg = dFilt & (beta > |q2-q0|)
    303         VABD        dAp0q0, dP_0, dQ_0
    304         VABD        dAp1p0, dP_1, dP_0
    305 
    306         VABD        dAq1q0, dQ_1, dQ_0
    307         VABD        dAp2p0, dP_2, dP_0
    308 
                ;// Z set if the low bS byte is zero (consumed by VMOVEQ dFilt[0]).
    309         TST         bS10, #0xff
    310         VCGT        dFilt, dAlpha, dAp0q0
    311 
    312         VMAX        dAp1p0, dAq1q0, dAp1p0
    313         VABD        dAq2q0, dQ_2, dQ_0
    314 
                ;// Low bS byte == 0: clear the mask for rows 0-3.
    315         VMOVEQ.U32  dFilt[0], Mask_0
                ;// Z set if the high bS byte is zero (consumed by VMOVEQ dFilt[1]).
    316         TST         bS10, #0xff00
    317 
    318         VCGT        dAp2p0, dBeta, dAp2p0
    319         VCGT        dAp1p0, dBeta, dAp1p0
    320 
                ;// High bS byte == 0: clear the mask for rows 4-7.
    321         VMOVEQ.U32  dFilt[1], Mask_0
    322 
    323         VCGT        dAq2q0, dBeta, dAq2q0
    324         VAND        dFilt, dFilt, dAp1p0
                ;// Bit 2 of bS10 selects the strong-filter path (consumed by the
                ;// BNE below). Only the low bS byte is tested here.
    325         TST         bS10, #4
    326 
                ;// Order matters: dAqflg aliases dAp1p0 (D12, already consumed
                ;// above) and dApflg aliases dAq2q0 (D17), which must be read by
                ;// the first VAND before the second VAND overwrites it.
    327         VAND        dAqflg, dFilt, dAq2q0
    328         VAND        dApflg, dFilt, dAp2p0
    329 
    330         BNE         bSGE4
    331 bSLT4
    332         ;// bS < 4 Filtering
    333 
                ;// Imported helper; its register contract is not visible here. The
                ;// code below reads the results from dP_1n, dP_0n, dQ_0n, dQ_1n and
                ;// stores dP_3, dP_2, dQ_2, dQ_3 from their input registers.
    334         BL          armVCM4P10_DeblockingLumabSLT4_unsafe
    335 
    336         ;// Transpose
    337 
    338         VZIP.8      dP_3,  dP_2
    339         VZIP.8      dP_1n, dP_0n
    340         VZIP.8      dQ_0n, dQ_1n
    341         VZIP.8      dQ_2,  dQ_3
    342 
    343 
    344         VZIP.16     dP_3,  dP_1n
    345         ADD         pTmp, pSrcDst, srcdstStep
    346         VZIP.16     dQ_0n, dQ_2
    347         VZIP.16     dQ_1n, dQ_3
    348         VZIP.16     dP_2,  dP_0n
    349 
    350         VZIP.32     dP_3,  dQ_0n
    351         VZIP.32     dP_1n, dQ_2
    352         VZIP.32     dP_2,  dQ_1n
    353         VZIP.32     dP_0n, dQ_3
    354 
    355         ;// dRown0 - dP_3,  dRown1 - dQ_0n
    356         ;// dRown2 - dP_1n, dRown3 - dQ_2
    357         ;// dRown4 - dP_2,  dRown5 - dQ_1n
    358         ;// dRown6 - dP_0n, dRown7 - dQ_3
    359 
    360         VST1        dRown0, [pSrcDst], pTmpStep
    361         VST1        dRown1, [pTmp], pTmpStep
    362         VST1        dRown2, [pSrcDst], pTmpStep
    363         VST1        dRown3, [pTmp], pTmpStep
    364         ;1
    365         VST1        dRown4, [pSrcDst], pTmpStep
    366         VST1        dRown5, [pTmp], pTmpStep
                ;// Advance the loop counter; C and Z stay live until the BCC below
                ;// (and the BNE at ExitLoopY) because nothing in between sets
                ;// flags.
    367         ADDS        XY, XY, XY
    368         VST1        dRown6, [pSrcDst], pTmpStep
                ;// NOTE(review): this path adds 2 to pThresholds while the
                ;// NoFilterBS0 and bSGE4 paths add 4; presumably the bSLT4 helper
                ;// advances pThresholds by the other 2 -- confirm against
                ;// armVCM4P10_DeblockingLumabSLT4_unsafe.
    369         ADD         pThresholds, pThresholds, #2
                ;// pTmp is not used again before being recomputed, so the
                ;// post-increment value on this last store does not matter.
    370         VST1        dRown7, [pTmp], srcdstStep
    371 
                ;// Rewind to the top row, step 4 pixels right to the next edge,
                ;// and switch to alpha[1] / beta[1].
    372         SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
    373         VLD1        {dAlpha[]}, [pAlpha_1]
    374         ADD         pSrcDst, pSrcDst, #4
    375         VLD1        {dBeta[]}, [pBeta_1]
    376 
    377         BCC         LoopX
    378         B           ExitLoopY
    379 
    380 NoFilterBS0
                ;// Tile skipped: same bookkeeping as the filtering paths (next
                ;// edge, counter, alpha[1] / beta[1], thresholds) without any
                ;// load or store of pixel data.
    381         ADD         pSrcDst, pSrcDst, #4
    382         ADDS        XY, XY, XY
    383         VLD1        {dAlpha[]}, [pAlpha_1]
    384         ADD         pThresholds, pThresholds, #4
    385         VLD1        {dBeta[]}, [pBeta_1]
    386         BCC         LoopX
    387         B           ExitLoopY
    388 bSGE4
    389         ;// bS >= 4 Filtering
    390 
                ;// Imported helper; its register contract is not visible here. The
                ;// code below reads the results from dP_2n, dP_1n, dP_0n, dQ_0n,
                ;// dQ_1n, dQ_2n and stores dP_3 / dQ_3 from their input registers.
    391         BL          armVCM4P10_DeblockingLumabSGE4_unsafe
    392 
    393         ;// Transpose
    394 
    395         VZIP.8      dP_3,  dP_2n
    396         VZIP.8      dP_1n, dP_0n
    397         VZIP.8      dQ_0n, dQ_1n
    398         VZIP.8      dQ_2n, dQ_3
    399 
    400         VZIP.16     dP_3,  dP_1n
    401         ADD         pTmp, pSrcDst, srcdstStep
    402         VZIP.16     dQ_0n, dQ_2n
    403         VZIP.16     dQ_1n, dQ_3
    404         VZIP.16     dP_2n, dP_0n
    405 
    406         VZIP.32     dP_3,  dQ_0n
    407         VZIP.32     dP_1n, dQ_2n
    408         VZIP.32     dP_2n, dQ_1n
    409         VZIP.32     dP_0n, dQ_3
    410 
    411         ;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
    412         ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
    413 
    414         VST1        dRow0n, [pSrcDst], pTmpStep
    415         VST1        dRow1n, [pTmp], pTmpStep
    416         VST1        dRow2n, [pSrcDst], pTmpStep
    417         VST1        dRow3n, [pTmp], pTmpStep
    418         VST1        dRow4n, [pSrcDst], pTmpStep
    419         VST1        dRow5n, [pTmp], pTmpStep
                ;// Advance the loop counter; flags stay live until BCC / BNE below.
    420         ADDS        XY,XY,XY
    421         VST1        dRow6n, [pSrcDst], pTmpStep
    422         ADD         pThresholds, pThresholds, #4
    423         VST1        dRow7n, [pTmp], pTmpStep
    424 
                ;// Rewind to the top row, step 4 pixels right to the next edge,
                ;// and switch to alpha[1] / beta[1].
    425         SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
    426         VLD1        {dAlpha[]}, [pAlpha_1]
    427         ADD         pSrcDst, pSrcDst, #4
    428         VLD1        {dBeta[]}, [pBeta_1]
    429 
    430         BCC         LoopX
    431 
    432 ExitLoopY
                ;// End of one pass of four edges. pBS advanced by 4 * 4 = 16, so
                ;// -14 leaves it 2 bytes past its start-of-pass value; the same
                ;// adjustment is applied to pThresholds. pSrcDst undoes the four
                ;// +4 steps and moves down 8 rows. alpha[0] / beta[0] are reloaded
                ;// for the first edge of the next pass.
    433         SUB         pBS, pBS, #14
    434         SUB         pThresholds, pThresholds, #14
    435         SUB         pSrcDst, pSrcDst, #16
    436         VLD1        {dAlpha[]}, [pAlpha_0]
    437         ADD         pSrcDst, pSrcDst, srcdstStep, LSL #3
    438         VLD1        {dBeta[]}, [pBeta_0]
                ;// Z comes from the last ADDS XY: XY is non-zero after the first
                ;// pass and zero after the second.
    439         BNE         LoopY
    440 
    441         MOV         r0, #OMX_Sts_NoErr
    442 
    443         M_END
    444 
    445     ENDIF
    446 
    447 
    448         END
    449 
    450 
    451