Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27         INCLUDE omxtypes_s.h
     28         INCLUDE armCOMM_s.h
     29 
     30         M_VARIANTS CortexA8
     31 
     32         IF CortexA8
     33 
     34         IMPORT  armVCM4P10_DeblockingChromabSGE4_unsafe
     35         IMPORT  armVCM4P10_DeblockingChromabSLT4_unsafe
     36 
     37 LOOP_COUNT  EQU 0x40000000      ;// initial XY; XY is doubled (ADDS) once per pass and becomes zero on the second pass
     38 MASK_3      EQU 0x03030303      ;// TSTed against the bS word before the conditional bSLT4 call
     39 MASK_4      EQU 0x04040404      ;// TSTed against the bS word before the conditional bSGE4 call
     40 
     41 ;// Function arguments (the first four arrive in r0-r3)
     42 
     43 pSrcDst     RN 0
     44 srcdstStep  RN 1
     45 pAlpha      RN 2
     46 pBeta       RN 3
     47 ;// The next two are fetched from the stack arguments ppThresholds / ppBS with M_LDR
     48 pThresholds RN 5
     49 pBS         RN 4
     50 bS3210      RN 6                ;// one 32-bit word read from pBS per pass
     51 pSrcDst_P   RN 10               ;// store pointer for the dP_0n bytes (pSrcDst + 3)
     52 pSrcDst_Q   RN 12               ;// store pointer for the dQ_0n bytes (pSrcDst + 4)
     53 ;// pTmp / pTmp2 name the same registers as pSrcDst_P / pSrcDst_Q (r10 / r12)
     54 pTmp        RN 10
     55 pTmp2       RN 12
     56 step        RN 14               ;// set to 2 * srcdstStep inside the loop
     57 
     58 ;// Loop counter (see LOOP_COUNT)
     59 
     60 XY          RN 7
     61 
     62 ;// Rows input: the eight 8-byte rows as loaded, before the 8x8 transpose
     63 dRow0       DN D7.U8
     64 dRow1       DN D8.U8
     65 dRow2       DN D5.U8
     66 dRow3       DN D10.U8
     67 dRow4       DN D6.U8
     68 dRow5       DN D9.U8
     69 dRow6       DN D4.U8
     70 dRow7       DN D11.U8
     71 
     72 ;// The names below address the same D registers (D4-D6, D8-D10) once the transpose has run
     73 ;// Pixels
     74 dP_0        DN D4.U8
     75 dP_1        DN D5.U8
     76 dP_2        DN D6.U8
     77 dQ_0        DN D8.U8
     78 dQ_1        DN D9.U8
     79 dQ_2        DN D10.U8
     80 
     81 ;// Filtering Decision
     82 dAlpha      DN D0.U8            ;// one byte from pAlpha replicated to all lanes
     83 dBeta       DN D2.U8            ;// one byte from pBeta replicated to all lanes
     84 
     85 dFilt       DN D16.U8           ;// per-byte filter-enable mask
     86 dAqflg      DN D12.U8           ;// dFilt AND (|q2-q0| < beta); shares D12 with dAp1p0
     87 dApflg      DN D17.U8           ;// dFilt AND (|p2-p0| < beta); shares D17 with dAq2q0
     88 ;// Absolute differences; several of them are overwritten in place by their compare results
     89 dAp0q0      DN D13.U8
     90 dAp1p0      DN D12.U8
     91 dAq1q0      DN D18.U8
     92 dAp2p0      DN D19.U8
     93 dAq2q0      DN D17.U8
     94 ;// bS handling: D26 and D27 are the low and high halves of Q13
     95 qBS3210     QN Q13.U16
     96 dBS3210     DN D26
     97 dMask_bs    DN D27
     98 dFilt_bs    DN D26.U16
     99 
    100 ;// bSLT4 (dMask_0 / dMask_1 are initialised but not read in this file - presumably used by the imported routines)
    101 dMask_0     DN D14.U8
    102 dMask_1     DN D15.U8
    103 dMask_4     DN D1.U16
    104 
    105 Mask_4      RN 8
    106 Mask_3      RN 9
    107 
    108 dTemp       DN D19.U8           ;// same register as dAp2p0; not referenced in this file
    109 
    110 ;// Result (dP_0t shares D13 with dAp0q0; presumably the *t / *n values come from the imported routines - confirm)
    111 dP_0t       DN D13.U8
    112 dQ_0t       DN D31.U8
    113 ;// dP_0n / dQ_0n are the registers whose lanes are written back to memory
    114 dP_0n       DN D29.U8
    115 dQ_0n       DN D24.U8
    116         ;// omxVCM4P10_FilterDeblockingChroma_VerEdge_I
    117         ;//   Two passes, 4 bytes apart: each pass loads 8 bytes from 8 rows around an edge and rewrites the p0/q0 bytes in place.
    118         ;//   In: r0=pSrcDst r1=srcdstStep r2=pAlpha r3=pBeta, on the stack: pThresholds, pBS.  Out: r0 = OMX_Sts_NoErr.
    119         M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r12, d15
    120 
    121         ;//Arguments on the stack
    122         M_ARG   ppThresholds, 4
    123         M_ARG   ppBS, 4
    124 
    125         ;// d0 = dAlpha: pAlpha[0] in every lane (first pass)
    126         ;// d2 = dBeta:  pBeta[0] in every lane  (first pass)
    127 
    128         ;// The writeback leaves pAlpha/pBeta at element [1]; both exits of a pass reload dAlpha/dBeta from there
    129         VLD1        {dAlpha[]}, [pAlpha]!
    130         SUB         pSrcDst, pSrcDst, #4            ;// back up 4 bytes so each 8-byte row load spans p3..q3
    131         VLD1        {dBeta[]}, [pBeta]!
    132 
    133         M_LDR       pBS, ppBS
    134         M_LDR       pThresholds, ppThresholds
    135 
    136         LDR         Mask_4, =MASK_4
    137         LDR         Mask_3, =MASK_3
    138 
    139         ;// dMask_0 = d14
    140         ;// dMask_1 = d15
    141         ;// dMask_4 = d1 (16-bit lanes)
    142 
    143         VMOV        dMask_0, #0
    144         VMOV        dMask_1, #1
    145         VMOV        dMask_4, #4
    146 
    147         LDR         XY, =LOOP_COUNT                 ;// ADDS XY, XY, XY gives zero on the second pass -> two passes
    148 
    149         ;// After the transpose: p0-p3 - d4-d7
    150         ;//                      q0-q3 - d8-d11
    151 
    152 
    153 LoopY
    154         LDR         bS3210, [pBS], #8               ;// four bS bytes for this edge; pBS moves on by 8 bytes
    155         ADD         pTmp, pSrcDst, srcdstStep
    156         ADD         step, srcdstStep, srcdstStep
    157 
    158         ;// Even rows are read through pSrcDst, odd rows through pTmp, each pointer stepping two rows
    159         VLD1        dRow0, [pSrcDst], step
    160         ;// (pSrcDst ends up 8 rows down; it is rewound before the stores and on the NoFilterBS0 exit)
    161         VLD1        dRow1, [pTmp], step
    162         VLD1        dRow2, [pSrcDst], step
    163         VLD1        dRow3, [pTmp], step
    164         VLD1        dRow4, [pSrcDst], step
    165         VLD1        dRow5, [pTmp], step
    166         VLD1        dRow6, [pSrcDst], step
    167         VLD1        dRow7, [pTmp], step
    168 
    169 
    170         ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
    171         ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
    172         ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
    173         ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
    174         ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
    175         ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
    176         ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
    177         ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
    178 
    179         ;// 8x8 Transpose
    180         VZIP.8      dRow0, dRow1
    181         VZIP.8      dRow2, dRow3
    182         VZIP.8      dRow4, dRow5
    183         VZIP.8      dRow6, dRow7
    184 
    185         VZIP.16     dRow0, dRow2
    186         VZIP.16     dRow1, dRow3
    187         VZIP.16     dRow4, dRow6
    188         VZIP.16     dRow5, dRow7
    189 
    190         VZIP.32     dRow0, dRow4
    191         VZIP.32     dRow2, dRow6
    192         VZIP.32     dRow3, dRow7
    193         VZIP.32     dRow1, dRow5
    194 
    195 
    196         ;// Filtering decision; the NEON instructions between a flag-setting ARM instruction and its branch leave the flags alone
    197 
    198         CMP         bS3210, #0
    199         VABD        dAp2p0, dP_2, dP_0
    200         VABD        dAp0q0, dP_0, dQ_0
    201         BEQ         NoFilterBS0                     ;// all four bS bytes are zero: nothing to filter on this edge
    202 
    203         VABD        dAp1p0, dP_1, dP_0
    204         VABD        dAq1q0, dQ_1, dQ_0
    205 
    206         VMOV.U32    dBS3210[0], bS3210
    207         VCGT        dFilt, dAlpha, dAp0q0           ;// |p0-q0| < alpha
    208         VMAX        dAp1p0, dAq1q0, dAp1p0
    209         VMOVL       qBS3210, dBS3210.U8             ;// widen the bS bytes to 16-bit lanes
    210         VABD        dAq2q0, dQ_2, dQ_0
    211         VCGT        dMask_bs.S16, dBS3210.S16, #0   ;// lanes whose bS is greater than zero
    212 
    213         VCGT        dAp1p0, dBeta, dAp1p0           ;// max(|p1-p0|, |q1-q0|) < beta
    214         VCGT        dAp2p0, dBeta, dAp2p0           ;// |p2-p0| < beta
    215         VAND        dFilt, dMask_bs.U8
    216 
    217         TST         bS3210, Mask_3                  ;// any bS byte with bit 0 or bit 1 set? consumed by BLNE below
    218 
    219         VCGT        dAq2q0, dBeta, dAq2q0           ;// |q2-q0| < beta
    220         VAND        dFilt, dFilt, dAp1p0
    221 
    222         VAND        dAqflg, dFilt, dAq2q0
    223         VAND        dApflg, dFilt, dAp2p0
    224 
    225         ;// bS < 4 Filtering
    226         BLNE        armVCM4P10_DeblockingChromabSLT4_unsafe
    227         ;// NOTE(review): NoFilterBS0 steps pThresholds by 4; nothing on this path does when the call above is skipped - confirm against the callee
    228         TST         bS3210, Mask_4                  ;// any bS byte with bit 2 set? consumed by BLNE below
    229 
    230         SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3    ;// rewind the 8 rows consumed by the loads
    231         VTST        dFilt_bs, dFilt_bs, dMask_4     ;// all-ones in the 16-bit lanes whose bS has bit 2 set
    232 
    233         ;// bS == 4 Filtering
    234         BLNE        armVCM4P10_DeblockingChromabSGE4_unsafe
    235 
    236         VBIT        dP_0n, dP_0t, dFilt_bs          ;// take the *t result in the lanes selected by dFilt_bs
    237         VBIT        dQ_0n, dQ_0t, dFilt_bs
    238 
    239         ;// Result Storage: lanes where dFilt is clear keep the original p0 / q0
    240         ADD         pSrcDst_P, pSrcDst, #3
    241         VBIF        dP_0n, dP_0, dFilt
    242 
    243         ADD         pTmp2, pSrcDst_P, srcdstStep
    244         ADD         step, srcdstStep, srcdstStep
    245         VBIF        dQ_0n, dQ_0, dFilt
    246 
    247         ADDS        XY, XY, XY                      ;// loop flags for BNE LoopY; nothing below alters them
    248 
    249         VST1        {dP_0n[0]}, [pSrcDst_P], step
    250         VST1        {dP_0n[1]}, [pTmp2], step
    251         VST1        {dP_0n[2]}, [pSrcDst_P], step
    252         VST1        {dP_0n[3]}, [pTmp2], step
    253         VST1        {dP_0n[4]}, [pSrcDst_P], step
    254         VST1        {dP_0n[5]}, [pTmp2], step
    255         VST1        {dP_0n[6]}, [pSrcDst_P], step
    256         VST1        {dP_0n[7]}, [pTmp2], step
    257         VLD1        {dAlpha[]}, [pAlpha]            ;// fix: next pass must use pAlpha[1], exactly as the NoFilterBS0 exit does
    258         ADD         pSrcDst_Q, pSrcDst, #4
    259         ADD         pTmp, pSrcDst_Q, srcdstStep
    260         VLD1        {dBeta[]}, [pBeta]              ;// fix: likewise pBeta[1]; dAlpha/dBeta are no longer needed by this pass
    261         VST1        {dQ_0n[0]}, [pSrcDst_Q], step
    262         VST1        {dQ_0n[1]}, [pTmp], step
    263         VST1        {dQ_0n[2]}, [pSrcDst_Q], step
    264         VST1        {dQ_0n[3]}, [pTmp], step
    265         VST1        {dQ_0n[4]}, [pSrcDst_Q], step
    266         VST1        {dQ_0n[5]}, [pTmp], step
    267         VST1        {dQ_0n[6]}, [pSrcDst_Q], step
    268         VST1        {dQ_0n[7]}, [pTmp], step
    269 
    270         ADD         pSrcDst, pSrcDst, #4            ;// move 4 bytes right to the next edge
    271 
    272         BNE         LoopY
    273 
    274         MOV         r0, #OMX_Sts_NoErr
    275 
    276         M_EXIT
    277         ;// Exit taken when the whole bS word is zero: no stores, only the bookkeeping for the next pass
    278 NoFilterBS0
    279         VLD1        {dAlpha[]}, [pAlpha]            ;// pAlpha[1] for the next pass
    280         ADD         pSrcDst, pSrcDst, #4            ;// move 4 bytes right to the next edge
    281         SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3    ;// rewind the 8 rows consumed by the loads
    282         ADDS        XY, XY, XY
    283         VLD1        {dBeta[]}, [pBeta]              ;// pBeta[1] for the next pass
    284         ADD         pThresholds, pThresholds, #4    ;// skip the four threshold bytes belonging to this edge
    285         BNE         LoopY
    286 
    287         MOV         r0, #OMX_Sts_NoErr
    288 
    289         M_END
    290 
    291         ENDIF
    292 
    293 
    294         END
    295 
    296 
    297