Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27         INCLUDE omxtypes_s.h
     28         INCLUDE armCOMM_s.h
     29 
     30         M_VARIANTS CortexA8
     31 
     32 
     33     IF  CortexA8
     34 
     35 pThresholds RN 5        ;// r5: address the bSLT4 routine loads its two thresholds from
     36 
     37 ;// Pixels - inputs of both routines; neither routine in this file writes D4-D11
     38 dP_0        DN D4.U8
     39 dP_1        DN D5.U8
     40 dP_2        DN D6.U8
     41 dP_3        DN D7.U8
     42 dQ_0        DN D8.U8
     43 dQ_1        DN D9.U8
     44 dQ_2        DN D10.U8
     45 dQ_3        DN D11.U8
     46 
     47 
     48 ;// Filtering Decision
     49 dAlpha      DN D0.U8    ;// only read by the bSGE4 routine
     50 
     51 dFilt       DN D16.U8   ;// mask: lanes where it is clear keep the unfiltered P0/Q0
     52 dAqflg      DN D12.U8   ;// q-side mask used by the VBIF merges (narrowed in place by bSGE4)
     53 dApflg      DN D17.U8   ;// p-side mask used by the VBIF merges (narrowed in place by bSGE4)
     54 
     55 dAp0q0      DN D13.U8   ;// presumably |p0 - q0| prepared by the caller - confirm; only read in bSGE4
     56 
     57 ;// bSLT4
     58 dTC0        DN D18.U8
     59 dTC1        DN D19.U8
     60 dTC01       DN D18.U8   ;// same register as dTC0
     61 
     62 dTCs        DN D31.S8   ;// signed view of the same register as dTC
     63 dTC         DN D31.U8
     64 
     65 dMask_0     DN D14.U8   ;// presumably an all-zero byte vector supplied by the caller - confirm
     66 dMask_1     DN D15.U8   ;// presumably 1 in every byte (used for "& 1" and "+ 2") - confirm
     67 
     68 dTemp       DN D19.U8   ;// same register as dTC1 and dMaxQ
     69 
     70 ;// Computing P0,Q0
     71 qDq0p0      QN Q10.S16
     72 qDp1q1      QN Q11.S16
     73 qDelta      QN Q10.S16  ; reuse qDq0p0
     74 dDelta      DN D20.S8   ;// D20 is the low half of Q10 (qDelta)
     75 
     76 
     77 ;// Computing P1,Q1
     78 dRp0q0      DN D24.U8
     79 
     80 dMaxP       DN D23.U8   ;// high half of Q11 (qDp1q1)
     81 dMinP       DN D22.U8   ;// low half of Q11 (qDp1q1)
     82 
     83 dMaxQ       DN D19.U8   ;// same register as dTemp
     84 dMinQ       DN D21.U8   ;// high half of Q10 (qDelta)
     85 
     86 dDeltaP     DN D26.U8
     87 dDeltaQ     DN D27.U8
     88 
     89 qP_0n       QN Q14.S16
     90 qQ_0n       QN Q12.S16
     91 
     92 dQ_0n       DN D24.U8   ;// low half of Q12 (qQ_0n); also the register of dRp0q0
     93 dQ_1n       DN D25.U8
     94 dP_0n       DN D29.U8   ;// high half of Q14 (qP_0n)
     95 dP_1n       DN D30.U8
     96 
     97 ;// bSGE4 - the ";!!" notes below mark names that share a register with another alias
     98 
     99 qSp0q0      QN Q10.U16
    100 
    101 qSp2q1      QN Q11.U16
    102 qSp0q0p1    QN Q12.U16
    103 qSp3p2      QN Q13.U16
    104 dHSp0q1     DN D28.U8
    105 
    106 qSq2p1      QN Q11.U16
    107 qSp0q0q1    QN Q12.U16
    108 qSq3q2      QN Q13.U16  ;!!;qSp3p2
    109 dHSq0p1     DN D28.U8   ;!!;dHSp0q1
    110 
    111 qTemp1      QN Q11.U16  ;!!;qSp2q1
    112 qTemp2      QN Q12.U16  ;!!;qSp0q0p1
    113 
    114 dP_0t       DN D28.U8   ;!!;dHSp0q1
    115 dQ_0t       DN D22.U8   ;!!;Temp1
    116 
    117 dP_0n       DN D29.U8   ;// repeats the bSLT4 definition (same register)
    118 dP_1n       DN D30.U8   ;// repeats the bSLT4 definition (same register)
    119 dP_2n       DN D31.U8
    120 
    121 dQ_0n       DN D24.U8   ;!!;Temp2
    122 dQ_1n       DN D25.U8   ;!!;Temp2
    123 dQ_2n       DN D28.U8   ;!!;dHSq0p1 / dP_0t (dQ_0t itself lives in D22)
    124 
    125 ;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe
    126 ;//
    127 ;// Inputs - Pixels             - p0-p3: D4-D7, q0-q3: D8-D11
    128 ;//        - Filter masks       - filt: D16, aqflg: D12, apflg: D17
    129 ;//        - Additional Params  - pThresholds: r5, dMask_0: D14, dMask_1: D15
    130 ;//
    131 ;// Outputs - Pixels            - P0-P1: D29-D30, Q0-Q1: D24-D25
    132 ;//         - Additional Params - pThresholds: r5 (advanced by the two post-incremented loads)
    133 
    134 ;// Registers Corrupted         - D18-D31
    135 ;// The input registers above are not written; p3 (D7) and q3 (D11) are not read here.
    136 ;// Each output is merged with the unfiltered pixel under its mask by the final VBIFs.
    137         M_START armVCM4P10_DeblockingLumabSLT4_unsafe
    138 
    139 
    140         ;// qDp1q1-11 = p1 - q1
    141         VSUBL       qDp1q1, dP_1, dQ_1
    142         VLD1        {dTC0[]}, [pThresholds]!        ;// first threshold, replicated to every lane
    143         ;// qDq0p0-10 = q0 - p0
    144         VSUBL       qDq0p0, dQ_0, dP_0
    145         VLD1        {dTC1[]}, [pThresholds]!        ;// second threshold, replicated to every lane
    146 
    147         ;// qDp1q1 = (p1 - q1) >> 2
    148         VSHR        qDp1q1, qDp1q1, #2
    149 
    150         ;// dTC01 = (dTC1 << 4) | dTC0
    151         ;// dTC01-18: lanes 0-3 come from dTC0, lanes 4-7 from dTC1
    152         VEXT        dTC01, dTC0, dTC1, #4
    153         ;// dTemp-19 (dTC1 has been consumed above, so D19 is free to reuse)
    154         VAND        dTemp, dApflg, dMask_1
    155 
    156         VBIF        dTC01, dMask_0, dFilt           ;// lanes with dFilt clear take dMask_0 instead
    157 
    158 
    159         ;// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
    160         ;// dDelta = (qDp1q1 >> 2 + qDq0p0 + 1)>> 1
    161 
    162         ;// qDelta-qDq0p0-10
    163         VRHADD      qDelta, qDp1q1, qDq0p0
    164         VRHADD      dRp0q0, dP_0, dQ_0              ;// dRp0q0-24 = (p0 + q0 + 1) >> 1
    165         VADD        dTC, dTC01, dTemp
    166 
    167         ;// dTC = dTC01 + (dApflg & 1) + (dAqflg & 1)
    168 
    169         VAND        dTemp, dAqflg, dMask_1
    170         VQADD       dMaxP, dP_1, dTC01              ;// overwrites half of qDp1q1, which is dead by now
    171         VQMOVN      dDelta, qDelta                  ;// saturating narrow of the 16-bit delta to bytes
    172         VADD        dTC, dTC, dTemp
    173 
    174         ;// dMaxP = QADD(dP_1, dTC01)
    175         ;// dMinP = QSUB(dP_1, dTC01)
    176 
    177         ;// dMaxP-d23
    178         ;// dMinP-d22
    179         VQSUB       dMinP, dP_1, dTC01
    180 
    181         ;// dDelta-d20
    182 
    183         ;// dMaxQ = QADD(dQ_1, dTC01)
    184         ;// dMinQ = QSUB(dQ_1, dTC01)
    185 
    186         ;// dMaxQ-19
    187         ;// dMinQ-21
    188         VQADD       dMaxQ, dQ_1, dTC01              ;// D19: last dTemp value was consumed by the VADD above
    189         VHADD       dDeltaP, dRp0q0, dP_2           ;// dDeltaP-26 = (dRp0q0 + p2) >> 1
    190         VMIN        dDelta, dDelta, dTCs            ;// upper clip: dDelta = min(dDelta, dTC)
    191 
    192         ;// dTCs = -dTCs, the lower clip bound applied to dDelta by the VMAX further down
    193         VNEG        dTCs, dTCs
    194 
    195         VQSUB       dMinQ, dQ_1, dTC01
    196 
    197         ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
    198         ;// delta = armClip(-tC0, tC0, delta);
    199         ;// pQ0[-2*Step] = (OMX_U8)(p1 + delta);
    200 
    201         ;// dDeltaP = (dP_2 + dRp0q0)>>1;
    202         ;// dP_1n = armClip(dP_1 - dTC01, dP_1 + dTC01, dDeltaP);
    203         ;// dP_1n = armClip(MinP, MaxP, dDeltaP);
    204 
    205         ;// delta = (q2 + ((p0+q0+1)>>1) - (q1<<1))>>1;
    206         ;// delta = armClip(-tC0, tC0, delta);
    207         ;// pQ0[1*Step] = (OMX_U8)(q1 + delta);
    208 
    209         ;// dDeltaQ = (dQ_2 + dRp0q0)>>1;
    210         ;// dQ_1n = armClip(dQ_1 - dTC01, dQ_1 + dTC01, dDeltaQ);
    211         ;// dQ_1n = armClip(MinQ, MaxQ, dDeltaQ);
    212 
    213         ;// dDeltaQ-27
    214         VHADD       dDeltaQ, dRp0q0, dQ_2
    215 
    216         ;// Result registers written below:
    217 
    218         ;// dP_0n - 29
    219         ;// dP_1n - 30
    220         ;// dQ_0n - 24
    221         ;// dQ_1n - 25
    222 
    223         ;// dP_1n = max(dDeltaP, dMinP); the dMaxP bound is applied by the VMIN further down
    224         ;// dDelta = max(dDelta, -dTC), completing the clip of dDelta to [-dTC, dTC]
    225 
    226         VMAX        dP_1n, dDeltaP, dMinP
    227         VMAX        dDelta, dDelta, dTCs
    228 
    229         ;// pQ0[-1*Step] = (OMX_U8)armClip(0, 255, dP_0 + delta);
    230         ;// pQ0[0*Step] = (OMX_U8)armClip(0, 255, dQ_0 - delta);
    231 
    232         ;// dP_0n = (OMX_U8)armClip(0, 255, dP_0 + dDelta);
    233         ;// dQ_0n = (OMX_U8)armClip(0, 255, dQ_0 - dDelta);
    234 
    235         ;// qP_0n - 14
    236         ;// qQ_0n - 12 (overwrites dRp0q0 in D24, which was last read by the VHADD above)
    237 
    238         VMOVL       qP_0n, dP_0
    239         VMOVL       qQ_0n, dQ_0
    240 
    241         VADDW       qP_0n, qP_0n, dDelta
    242         VSUBW       qQ_0n, qQ_0n, dDelta
    243 
    244         VQMOVUN     dP_0n, qP_0n                    ;// saturate the signed 16-bit sums to unsigned bytes
    245         VQMOVUN     dQ_0n, qQ_0n
    246 
    247         VMAX        dQ_1n, dDeltaQ, dMinQ
    248 
    249         VMIN        dP_1n, dP_1n, dMaxP
    250         VMIN        dQ_1n, dQ_1n, dMaxQ
    251         VBIF        dP_0n, dP_0, dFilt              ;// keep the original pixel where the mask is clear
    252 
    253         VBIF        dP_1n, dP_1, dApflg
    254         VBIF        dQ_0n, dQ_0, dFilt
    255         VBIF        dQ_1n, dQ_1, dAqflg
    256 
    257         M_END
    258 
    259 ;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
    260 ;//
    261 ;// Inputs - Pixels             - p0-p3: D4-D7, q0-q3: D8-D11
    262 ;//        - Filter masks       - filt: D16, aqflg: D12, apflg: D17
    263 ;//        - Additional Params  - alpha: D0, dMask_1: D15, dAp0q0: D13
    264 ;//
    265 ;// Outputs - Pixels            - P0-P2: D29-D31, Q0-Q2: D24,D25,D28
    266 ;//         - aqflg (D12) and apflg (D17) are overwritten: each is ANDed with the (alpha>>2)+2 test
    267 ;// Registers Corrupted         - D18-D31 (D18 itself is not written here), plus D12 and D17
    268 
    269         M_START armVCM4P10_DeblockingLumabSGE4_unsafe
    270 
    271 
    272         ;// ap<beta && armAbs(p0-q0)<((alpha>>2)+2)
    273         ;// aq<beta && armAbs(p0-q0)<((alpha>>2)+2)
    274 
    275         ;// ( dApflg & dAp0q0 < (dAlpha >> 2 + 2) )
    276         ;// ( dAqflg & dAp0q0 < (dAlpha >> 2 + 2) )
    277 
    278         ;// ( dApflg = dApflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) )
    279         ;// ( dAqflg = dAqflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) )
    280 
    281         ;// P Filter
    282 
    283         VSHR        dTemp, dAlpha, #2               ;// dTemp = alpha >> 2
    284         VADD        dTemp, dTemp, dMask_1
    285 
    286         ;// qSp0q0-10 = q0 + p0 (kept live for the Q filter as well)
    287         VADDL       qSp0q0, dQ_0, dP_0
    288         VADD        dTemp, dTemp, dMask_1           ;// second add of dMask_1 supplies the "+ 2"
    289 
    290         ;// qSp2q1-11
    291         ;// qSp0q0p1-12
    292         VADDL       qSp2q1, dP_2, dQ_1
    293         VADDW       qSp0q0p1, qSp0q0, dP_1
    294 
    295         VCGT        dTemp, dTemp, dAp0q0            ;// per-lane mask: (alpha>>2)+2 > dAp0q0
    296         VSHR        qSp2q1, #1
    297 
    298         ;// pQ0[-1*Step] = (OMX_U8)((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3);
    299         ;// pQ0[-1*Step] = ( ( (p0 + q0 + p1) + (p2 + q1)>>1 ) >> 1 + 1 ) >> 1
    300 
    301         ;// dP_0n = ( ( (qSp0q0 + dP_1) + qSp2q1>>1 ) >> 1 + 1 ) >> 1
    302         ;// dP_0n = ( ( qSp0q0p1 + qSp2q1>>1 ) >> 1 + 1 ) >> 1
    303         ;// dP_0n = ( qTemp1 + 1 ) >> 1
    304 
    305         ;// pQ0[-2*Step] = (OMX_U8)((p2 + p1 + p0 + q0 + 2)>>2);
    306 
    307         ;// dP_1n = (OMX_U8)((dP_2 + qSp0q0p1 + 2)>>2);
    308         ;// dP_1n = (OMX_U8)((qTemp2 + 2)>>2);
    309 
    310         ;// pQ0[-3*Step] = (OMX_U8)((2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3);
    311         ;// pQ0[-3*Step] = (OMX_U8)(( (p3 + p2) + (p1 + p0 + q0 + p2) >> 1 + 2)>>2);
    312 
    313         ;// dP_2n = (OMX_U8)(( qSp3p2 + (dP_2 + qSp0q0p1) >> 1 + 2) >> 2);
    314         ;// dP_2n = (OMX_U8)(( qSp3p2 + qTemp2 >> 1 + 2) >> 2);
    315 
    316         ;// qTemp1-qSp2q1-11
    317         ;// qTemp2-qSp0q0p1-12 (the VHADD must read qSp0q0p1 before the VADDW overwrites it)
    318         VHADD       qTemp1, qSp0q0p1, qSp2q1
    319         VADDW       qTemp2, qSp0q0p1, dP_2
    320 
    321         ;// qSp3p2-13
    322         VADDL       qSp3p2, dP_3, dP_2
    323 
    324         VAND        dApflg, dApflg, dTemp           ;// narrows the caller's apflg in place
    325         VHADD       dHSp0q1, dP_0, dQ_1
    326         VSRA        qSp3p2, qTemp2, #1              ;// qSp3p2 += qTemp2 >> 1
    327         ;// dHSp0q1-28
    328         VAND        dAqflg, dAqflg, dTemp           ;// narrows the caller's aqflg in place
    329 
    330         ;// dP_0n-29
    331         ;// dP_0t-dHSp0q1-28: ((p0 + q1)>>1 + p1 + 1)>>1, used for P0 where dApflg is clear
    332         VQRSHRN     dP_0n, qTemp1, #1
    333         VRHADD      dP_0t, dHSp0q1, dP_1
    334 
    335         ;// dP_1n-30
    336         VQRSHRN     dP_1n, qTemp2, #2
    337 
    338         VADDL       qSq2p1, dQ_2, dP_1              ;// Q11 reused: qTemp1 was consumed by dP_0n
    339         VADDW       qSp0q0q1, qSp0q0, dQ_1          ;// Q12 reused: qTemp2 was consumed by dP_1n
    340 
    341         VBIF        dP_0n, dP_0t, dApflg
    342 
    343         ;// Q Filter
    344 
    345         ;// pQ0[0*Step] = (OMX_U8)((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3);
    346         ;// pQ0[0*Step] = ( ( (p0 + q0 + q1) + (q2 + p1)>>1 ) >> 1 + 1 ) >> 1
    347 
    348         ;// dQ_0n = ( ( (qSp0q0 + dQ_1) + qSq2p1>>1 ) >> 1 + 1 ) >> 1
    349         ;// dQ_0n = ( ( qSp0q0q1 + qSq2p1>>1 ) >> 1 + 1 ) >> 1
    350         ;// dQ_0n = ( qTemp1 + 1 ) >> 1
    351 
    352         ;// pQ0[1*Step] = (OMX_U8)((q2 + q1 + q0 + p0 + 2)>>2);
    353 
    354         ;// dQ_1n = (OMX_U8)((dQ_2 + qSp0q0q1 + 2)>>2);
    355         ;// dQ_1n = (OMX_U8)((qTemp2 + 2)>>2);
    356 
    357         ;// pQ0[2*Step] = (OMX_U8)((2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3);
    358         ;// pQ0[2*Step] = (OMX_U8)(( (q3 + q2) + (q1 + p0 + q0 + q2) >> 1 + 2)>>2);
    359 
    360         ;// dQ_2n = (OMX_U8)(( qSq3q2 + (dQ_2 + qSp0q0q1) >> 1 + 2) >> 2);
    361         ;// dQ_2n = (OMX_U8)(( qSq3q2 + qTemp2 >> 1 + 2) >> 2);
    362 
    363         ;// qTemp1-qSp2q1-11
    364         ;// qTemp2-qSp0q0p1-12
    365         ;// qSq2p1-11
    366         ;// qSp0q0q1-12
    367 
    368 
    369         ;// qTemp2-qSp0q0p1-12
    370         ;// qTemp1-qSq2p1-11
    371         ;// qSq3q2-13 (Q13 is reused once dP_2n has been narrowed out of qSp3p2)
    372         ;// dP_2n-31
    373 
    374         VQRSHRN     dP_2n, qSp3p2, #2
    375         VADDL       qSq3q2, dQ_3, dQ_2
    376 
    377         VSHR        qSq2p1, #1
    378 
    379         VHADD       qTemp1, qSp0q0q1, qSq2p1
    380         VADDW       qTemp2, qSp0q0q1, dQ_2
    381 
    382         ;// dHSq0p1-28 (D28 reused: dP_0t was consumed by the VBIF into dP_0n above)
    383         VHADD       dHSq0p1, dQ_0, dP_1
    384 
    385         VBIF        dP_0n, dP_0, dFilt              ;// keep the original p0 where dFilt is clear
    386         VBIF        dP_1n, dP_1, dApflg
    387 
    388         VSRA        qSq3q2, qTemp2, #1              ;// qSq3q2 += qTemp2 >> 1
    389 
    390         ;// dQ_1-Temp2-25
    391         ;// dQ_0-Temp2-24
    392         VQRSHRN     dQ_1n, qTemp2, #2
    393         VQRSHRN     dQ_0n, qTemp1, #1
    394 
    395         ;// dQ_0t-Temp1-22: ((q0 + p1)>>1 + q1 + 1)>>1, used for Q0 where dAqflg is clear
    396         VRHADD      dQ_0t, dHSq0p1, dQ_1
    397         VBIF        dQ_1n, dQ_1, dAqflg
    398 
    399         VBIF        dP_2n, dP_2, dApflg
    400         VBIF        dQ_0n, dQ_0t, dAqflg
    401         VQRSHRN     dQ_2n, qSq3q2, #2               ;// D28 reused: dHSq0p1 was consumed by dQ_0t
    402         VBIF        dQ_0n, dQ_0, dFilt              ;// keep the original q0 where dFilt is clear
    403         VBIF        dQ_2n, dQ_2, dAqflg
    404 
    405         M_END
    406 
    407     ENDIF
    408 
    409 
    410         END
    411