Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   12290
      6 ;// Date:       Wednesday, April 9, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 
     13         INCLUDE omxtypes_s.h
     14         INCLUDE armCOMM_s.h
     15 
     16         M_VARIANTS CortexA8
     17 
     18         IF CortexA8
     19 
     20         IMPORT  armVCM4P10_DeblockingChromabSGE4_unsafe
     21         IMPORT  armVCM4P10_DeblockingChromabSLT4_unsafe
     22 
     23 LOOP_COUNT  EQU 0x40000000
     24 MASK_3      EQU 0x03030303
     25 MASK_4      EQU 0x04040404
     26 
     27 ;// Function arguments
     28 
     29 pSrcDst     RN 0
     30 srcdstStep  RN 1
     31 pAlpha      RN 2
     32 pBeta       RN 3
     33 
     34 pThresholds RN 5
     35 pBS         RN 4
     36 bS3210      RN 6
     37 pSrcDst_P   RN 10
     38 pSrcDst_Q   RN 12
     39 
     40 pTmp        RN 10
     41 pTmp2       RN 12
     42 step        RN 14
     43 
     44 ;// Loop
     45 
     46 XY          RN 7
     47 
     48 ;// Rows input
     49 dRow0       DN D7.U8
     50 dRow1       DN D8.U8
     51 dRow2       DN D5.U8
     52 dRow3       DN D10.U8
     53 dRow4       DN D6.U8
     54 dRow5       DN D9.U8
     55 dRow6       DN D4.U8
     56 dRow7       DN D11.U8
     57 
     58 
     59 ;// Pixels
     60 dP_0        DN D4.U8
     61 dP_1        DN D5.U8
     62 dP_2        DN D6.U8
     63 dQ_0        DN D8.U8
     64 dQ_1        DN D9.U8
     65 dQ_2        DN D10.U8
     66 
     67 ;// Filtering Decision
     68 dAlpha      DN D0.U8
     69 dBeta       DN D2.U8
     70 
     71 dFilt       DN D16.U8
     72 dAqflg      DN D12.U8
     73 dApflg      DN D17.U8
     74 
     75 dAp0q0      DN D13.U8
     76 dAp1p0      DN D12.U8
     77 dAq1q0      DN D18.U8
     78 dAp2p0      DN D19.U8
     79 dAq2q0      DN D17.U8
     80 
     81 qBS3210     QN Q13.U16
     82 dBS3210     DN D26
     83 dMask_bs    DN D27
     84 dFilt_bs    DN D26.U16
     85 
     86 ;// bSLT4
     87 dMask_0     DN D14.U8
     88 dMask_1     DN D15.U8
     89 dMask_4     DN D1.U16
     90 
     91 Mask_4      RN 8
     92 Mask_3      RN 9
     93 
     94 dTemp       DN D19.U8
     95 
     96 ;// Result
     97 dP_0t       DN D13.U8
     98 dQ_0t       DN D31.U8
     99 
    100 dP_0n       DN D29.U8
    101 dQ_0n       DN D24.U8
    102 
    103 
    104         ;// Function header
    105         M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r12, d15
    106 
    107         ;//Arguments on the stack
    108         M_ARG   ppThresholds, 4
    109         M_ARG   ppBS, 4
    110 
    111         ;// d0-dAlpha_0
    112         ;// d2-dBeta_0
    113 
    114         ;load alpha1,beta1 somewhere to avoid more loads
    115         VLD1        {dAlpha[]}, [pAlpha]!
    116         SUB         pSrcDst, pSrcDst, #4
    117         VLD1        {dBeta[]}, [pBeta]!
    118 
    119         M_LDR       pBS, ppBS
    120         M_LDR       pThresholds, ppThresholds
    121 
    122         LDR         Mask_4, =MASK_4
    123         LDR         Mask_3, =MASK_3
    124 
    125         ;dMask_0-14
    126         ;dMask_1-15
    127         ;dMask_4-19
    128 
    129         VMOV        dMask_0, #0
    130         VMOV        dMask_1, #1
    131         VMOV        dMask_4, #4
    132 
    133         LDR         XY, =LOOP_COUNT
    134 
    135         ;// p0-p3 - d4-d7
    136         ;// q0-q3 - d8-d11
    137 
    138 
    139 LoopY
    140         LDR         bS3210, [pBS], #8
    141         ADD         pTmp, pSrcDst, srcdstStep
    142         ADD         step, srcdstStep, srcdstStep
    143 
    144         ;1
    145         VLD1        dRow0, [pSrcDst], step
    146         ;1
    147         VLD1        dRow1, [pTmp], step
    148         VLD1        dRow2, [pSrcDst], step
    149         VLD1        dRow3, [pTmp], step
    150         VLD1        dRow4, [pSrcDst], step
    151         VLD1        dRow5, [pTmp], step
    152         VLD1        dRow6, [pSrcDst], step
    153         VLD1        dRow7, [pTmp], step
    154 
    155 
    156         ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
    157         ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
    158         ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
    159         ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
    160         ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
    161         ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
    162         ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
    163         ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
    164 
    165         ;// 8x8 Transpose
    166         VZIP.8      dRow0, dRow1
    167         VZIP.8      dRow2, dRow3
    168         VZIP.8      dRow4, dRow5
    169         VZIP.8      dRow6, dRow7
    170 
    171         VZIP.16     dRow0, dRow2
    172         VZIP.16     dRow1, dRow3
    173         VZIP.16     dRow4, dRow6
    174         VZIP.16     dRow5, dRow7
    175 
    176         VZIP.32     dRow0, dRow4
    177         VZIP.32     dRow2, dRow6
    178         VZIP.32     dRow3, dRow7
    179         VZIP.32     dRow1, dRow5
    180 
    181 
    182         ;Realign the pointers
    183 
    184         CMP         bS3210, #0
    185         VABD        dAp2p0, dP_2, dP_0
    186         VABD        dAp0q0, dP_0, dQ_0
    187         BEQ         NoFilterBS0
    188 
    189         VABD        dAp1p0, dP_1, dP_0
    190         VABD        dAq1q0, dQ_1, dQ_0
    191 
    192         VMOV.U32    dBS3210[0], bS3210
    193         VCGT        dFilt, dAlpha, dAp0q0
    194         VMAX        dAp1p0, dAq1q0, dAp1p0
    195         VMOVL       qBS3210, dBS3210.U8
    196         VABD        dAq2q0, dQ_2, dQ_0
    197         VCGT        dMask_bs.S16, dBS3210.S16, #0
    198 
    199         VCGT        dAp1p0, dBeta, dAp1p0
    200         VCGT        dAp2p0, dBeta, dAp2p0
    201         VAND        dFilt, dMask_bs.U8
    202 
    203         TST         bS3210, Mask_3
    204 
    205         VCGT        dAq2q0, dBeta, dAq2q0
    206         VAND        dFilt, dFilt, dAp1p0
    207 
    208         VAND        dAqflg, dFilt, dAq2q0
    209         VAND        dApflg, dFilt, dAp2p0
    210 
    211         ;// bS < 4 Filtering
    212         BLNE        armVCM4P10_DeblockingChromabSLT4_unsafe
    213 
    214         TST         bS3210, Mask_4
    215 
    216         SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
    217         VTST        dFilt_bs, dFilt_bs, dMask_4
    218 
    219         ;// bS == 4 Filtering
    220         BLNE        armVCM4P10_DeblockingChromabSGE4_unsafe
    221 
    222         VBIT        dP_0n, dP_0t, dFilt_bs
    223         VBIT        dQ_0n, dQ_0t, dFilt_bs
    224 
    225         ;// Result Storage
    226         ADD         pSrcDst_P, pSrcDst, #3
    227         VBIF        dP_0n, dP_0, dFilt
    228 
    229         ADD         pTmp2, pSrcDst_P, srcdstStep
    230         ADD         step, srcdstStep, srcdstStep
    231         VBIF        dQ_0n, dQ_0, dFilt
    232 
    233         ADDS        XY, XY, XY
    234 
    235         VST1        {dP_0n[0]}, [pSrcDst_P], step
    236         VST1        {dP_0n[1]}, [pTmp2], step
    237         VST1        {dP_0n[2]}, [pSrcDst_P], step
    238         VST1        {dP_0n[3]}, [pTmp2], step
    239         VST1        {dP_0n[4]}, [pSrcDst_P], step
    240         VST1        {dP_0n[5]}, [pTmp2], step
    241         VST1        {dP_0n[6]}, [pSrcDst_P], step
    242         VST1        {dP_0n[7]}, [pTmp2], step
    243 
    244         ADD         pSrcDst_Q, pSrcDst, #4
    245         ADD         pTmp, pSrcDst_Q, srcdstStep
    246 
    247         VST1        {dQ_0n[0]}, [pSrcDst_Q], step
    248         VST1        {dQ_0n[1]}, [pTmp], step
    249         VST1        {dQ_0n[2]}, [pSrcDst_Q], step
    250         VST1        {dQ_0n[3]}, [pTmp], step
    251         VST1        {dQ_0n[4]}, [pSrcDst_Q], step
    252         VST1        {dQ_0n[5]}, [pTmp], step
    253         VST1        {dQ_0n[6]}, [pSrcDst_Q], step
    254         VST1        {dQ_0n[7]}, [pTmp], step
    255 
    256         ADD         pSrcDst, pSrcDst, #4
    257 
    258         BNE         LoopY
    259 
    260         MOV         r0, #OMX_Sts_NoErr
    261 
    262         M_EXIT
    263 
    264 NoFilterBS0
    265         VLD1        {dAlpha[]}, [pAlpha]
    266         ADD         pSrcDst, pSrcDst, #4
    267         SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
    268         ADDS        XY, XY, XY
    269         VLD1        {dBeta[]}, [pBeta]
    270         ADD         pThresholds, pThresholds, #4
    271         BNE         LoopY
    272 
    273         MOV         r0, #OMX_Sts_NoErr
    274 
    275         M_END
    276 
    277         ENDIF
    278 
    279 
    280         END
    281 
    282 
    283