Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 ;// Description:
     27 ;// H.264 inverse quantize and transform module
     28 ;//
     29 ;//
     30 
     31 
     32 
     33 ;// Include standard headers
     34 
     35         INCLUDE omxtypes_s.h
     36         INCLUDE armCOMM_s.h
     37 
     38 ;// Import symbols required from other files
     39 ;// (For example tables)
     40 
     41         IMPORT armVCM4P10_UnpackBlock4x4
     42         IMPORT armVCM4P10_TransformResidual4x4
     43         IMPORT armVCM4P10_QPDivTable
     44         IMPORT armVCM4P10_VMatrixU16
     45         IMPORT armVCM4P10_QPModuloTable
     46 
     47         M_VARIANTS CortexA8
     48 
     49 ;// Set debugging level
     50 ;//DEBUG_ON    SETL {TRUE}
     51 
     52 
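     53 ;// Static Function: armVCM4P10_DequantLumaAC4x4 (no standalone definition here; it is inlined into omxVCM4P10_DequantTransformResidualFromPairAndAdd)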
     54 
     55 ;// Guarding implementation by the processor name
     56 
     57 
     58 
     59 ;// Guarding implementation by the processor name
     60 
     61 
     62 
     63 
     64 
     65 
     66 ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
     67 
     68 ;// Guarding implementation by the processor name
     69 
     70 
     71 
     72 ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
     73 
     74 ;// Guarding implementation by the processor name
     75 
     76     IF  CortexA8
     77 
     78 
     79 ;// ARM Registers
        ;// Several aliases below name the same physical register (r1, r2, r3, r5, r10).
        ;// Each sharing relies on the earlier value being dead, or already copied
        ;// elsewhere, before the later alias is written.
     80 
     81 ;//Input Registers
     82 ppSrc       RN  0                   ;// still in r0 when armVCM4P10_UnpackBlock4x4 is called
     83 pPred       RN  1                   ;// copied to pPredTemp on entry
     84 pDC         RN  2                   ;// copied to pDCTemp on entry
     85 pDst        RN  3                   ;// copied to pDstTemp on entry
     86 
     87 
     88 ;//Output Registers
     89 result      RN  0                   ;// status code returned in r0
     90 
     91 ;//Local Scratch Registers
     92 
     93 ;//Registers used in armVCM4P10_DequantLumaAC4x4
     94 pQPdiv      RN  10                  ;// address of armVCM4P10_QPDivTable
     95 pQPmod      RN  11                  ;// address of armVCM4P10_QPModuloTable
     96 pVRow       RN  2                   ;// address into armVCM4P10_VMatrixU16 (reuses r2 after pDC was copied)
     97 QPmod       RN  12                  ;// signed byte read from the modulo table, added to pVRow as a byte offset
     98 shift       RN  14                  ;// signed byte read from the div table; r14 (lr) is used as plain scratch here
     99 index0      RN  1                   ;// VTBL byte-index constant for rows 0/2 (reuses r1 after QP)
    100 index1      RN  10                  ;// VTBL byte-index constant for rows 1/3 (reuses r10 after pQPdiv)
    101 
    102 ;//Registers used in DequantTransformResidualFromPairAndAdd
    103 pDelta      RN  4                   ;// address of the stack buffer pBuffer
    104 pDeltaTmp   RN  6                   ;// not referenced by the code in this file
    105 AC          RN  5                   ;//Load from stack
    106 pPredTemp   RN  7                   ;// working copy of pPred
    107 pDCTemp     RN  8                   ;// working copy of pDC
    108 pDstTemp    RN  9                   ;// working copy of pDst
    109 pDeltaArg1  RN  1                   ;// second argument (r1) for armVCM4P10_UnpackBlock4x4
    110 pDeltaArg0  RN  0                   ;// not referenced by the code in this file
    111 QP          RN  1                   ;//Load from stack
    112 DCval       RN  10                  ;// 16-bit value loaded from [pDCTemp]
    113 predstep    RN  1                   ;// loaded from the stack just before the prediction rows are read
    114 dstStep     RN  10                  ;// loaded from the stack just before the prediction rows are read
    115 PredVal1    RN  3                   ;// 4 prediction bytes (one row); reuses r3 after pDst was copied
    116 PredVal2    RN  5                   ;// 4 prediction bytes (next row); reuses r5 after AC was tested
    117 
    118 
    119 
    120 
    121 ;// Neon Registers
        ;// As with the ARM aliases, many names below overlay the same D/Q register.
        ;// D0-D3 carry the 4x4 block (one row per D register) through every stage.
    122 
    123 ;// Registers used in armVCM4P10_DequantLumaAC4x4
    124 
    125 dVmatrix            DN  D6.8        ;// 8 bytes loaded from pVRow; table operand of VTBL
    126 dindexRow0          DN  D7.32       ;// index0 duplicated into both 32-bit lanes
    127 dindexRow1          DN  D9.32       ;// index1 duplicated into both 32-bit lanes
    128 dByteIndexRow0      DN  D7.8        ;// byte view of dindexRow0, used as the VTBL index operand
    129 dByteIndexRow1      DN  D9.8        ;// byte view of dindexRow1, used as the VTBL index operand
    130 dVRow0              DN  D8.8        ;// VTBL result (byte view)
    131 dVRow1              DN  D4.8        ;// VTBL result (byte view)
    132 dVRow0U16           DN  D8.U16      ;// the same VTBL results viewed as four U16 multipliers
    133 dVRow1U16           DN  D4.U16
    134 dVRow2U16           DN  D8.U16      ;// same register as dVRow0U16: row 2 is scaled like row 0
    135 dVRow3U16           DN  D4.U16      ;// same register as dVRow1U16: row 3 is scaled like row 1
    136 
    137 dShift              DN  D5.U16      ;// shift duplicated into all four 16-bit lanes
    138 dSrcRow0            DN  D0.I16      ;// coefficient rows loaded from pDelta
    139 dSrcRow1            DN  D1.I16
    140 dSrcRow2            DN  D2.I16
    141 dSrcRow3            DN  D3.I16
    142 dDqntRow0           DN  D0.I16      ;// products are written back over the source rows
    143 dDqntRow1           DN  D1.I16
    144 dDqntRow2           DN  D2.I16
    145 dDqntRow3           DN  D3.I16
    146 
    147 ;// Registers used in TransformResidual4x4
    148 
    149 ;// Packed Input pixels
    150 dIn0                DN  D0.S16      ;// same registers as dDqntRow0..3
    151 dIn1                DN  D1.S16
    152 dIn2                DN  D2.S16
    153 dIn3                DN  D3.S16
    154 qIn01               QN  Q0.32       ;// D0:D1 viewed as 32-bit lanes for the second transpose step
    155 qIn23               QN  Q1.32       ;// D2:D3 viewed as 32-bit lanes for the second transpose step
    156 
    157 ;// Intermediate calculations
    158 dZero               DN  D4.S16      ;// set to 0 after the dequant multiplies have consumed D4
    159 de0                 DN  D5.S16
    160 de1                 DN  D6.S16
    161 de2                 DN  D7.S16      ;// same register as dIn1RS
    162 de3                 DN  D8.S16      ;// same register as dIn3RS
    163 dIn1RS              DN  D7.S16      ;// dIn1 halved
    164 dIn3RS              DN  D8.S16      ;// dIn3 halved
    165 df0                 DN  D0.S16      ;// first-pass results go back into D0-D3
    166 df1                 DN  D1.S16
    167 df2                 DN  D2.S16
    168 df3                 DN  D3.S16
    169 qf01                QN  Q0.32
    170 qf23                QN  Q1.32
    171 dg0                 DN  D5.S16      ;// second pass reuses the D5-D8 temporaries
    172 dg1                 DN  D6.S16
    173 dg2                 DN  D7.S16      ;// same register as df1RS
    174 dg3                 DN  D8.S16      ;// same register as df3RS
    175 df1RS               DN  D7.S16      ;// df1 halved
    176 df3RS               DN  D8.S16      ;// df3 halved
    177 
    178 ;// Output pixels
    179 dh0                 DN  D0.S16      ;// second-pass results, again in D0-D3
    180 dh1                 DN  D1.S16
    181 dh2                 DN  D2.S16
    182 dh3                 DN  D3.S16
    183 
    184 ;// Registers used in DequantTransformResidualFromPairAndAdd
    185 
    186 dDeltaRow0          DN  D0.S16      ;// residual rows: same registers as dh0..3
    187 dDeltaRow1          DN  D1.S16
    188 dDeltaRow2          DN  D2.S16
    189 dDeltaRow3          DN  D3.S16
    190 qDeltaRow01         QN  Q0.S16      ;// rows 0 and 1 as one Q register
    191 qDeltaRow23         QN  Q1.S16      ;// rows 2 and 3 as one Q register
    192 
    193 dPredValRow01       DN  D4.U8       ;// 8 prediction bytes: rows 0 and 1
    194 dPredValRow23       DN  D5.U8       ;// 8 prediction bytes: rows 2 and 3
    195 
    196 qSumRow01           QN  Q3.S16      ;// Q3 overlays D6:D7
    197 qSumRow23           QN  Q4.S16      ;// Q4 overlays D8:D9
    198 dDstRow01           DN  D0.U8       ;// narrowed output bytes for rows 0 and 1
    199 dDstRow23           DN  D1.U8       ;// narrowed output bytes for rows 2 and 3
    200 dDstRow0            DN  D0.32[0]    ;// one 32-bit lane = 4 output bytes = one destination row
    201 dDstRow1            DN  D0.32[1]
    202 dDstRow2            DN  D1.32[0]
    203 dDstRow3            DN  D1.32[1]
    204 
    205 
    206     ;// Allocate stack memory required by the function
            ;// pBuffer (32 bytes) is read back below as four rows of four 16-bit coefficients.
    207         M_ALLOC8 pBuffer, 32
    208 
    209 
    210     ;// Write function header
            ;//
            ;// omxVCM4P10_DequantTransformResidualFromPairAndAdd
            ;//
            ;// Produces a 4x4 block of 16-bit residuals, adds it to a 4x4 block of
            ;// 8-bit prediction samples and stores the unsigned-saturated 8-bit sums.
            ;//
            ;// In:   r0     ppSrc    - left untouched for armVCM4P10_UnpackBlock4x4 (AC path only)
            ;//       r1     pPred    - prediction rows; 4 bytes are read per row
            ;//       r2     pDC      - address of a 16-bit DC value; tested against 0 on the AC path,
            ;//                         dereferenced unconditionally on the DC-only path
            ;//       r3     pDst     - destination rows; 4 bytes are written per row
            ;//       stack  predStep - post-increment applied to the prediction pointer between rows
            ;//              dstStep  - post-increment applied to the destination pointer between rows
            ;//              QP       - byte index into armVCM4P10_QPModuloTable and armVCM4P10_QPDivTable
            ;//              AC       - 0 selects the DC-only path
            ;// Out:  r0 = OMX_Sts_NoErr on every path (no argument validation is done in this file)
            ;//
            ;// AC != 0 : unpack -> scale coefficients -> (if pDC != 0, coefficient 0 = *pDC)
            ;//           -> two butterfly passes with transposes -> rounding shift right by 6
            ;// AC == 0 : every residual = (*pDC + 32) >> 6
            ;//
            ;// NOTE(review): M_START/M_END come from armCOMM_s.h (not shown). The "r11,d9"
            ;// arguments presumably make them save/restore the callee-saved core registers,
            ;// the return address (r14 is used as scratch below) and D8-D9 - confirm there.
    211         M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9
    212 
    213         ;// Define stack arguments
    214         M_ARG   predStepOnStack, 4
    215         M_ARG   dstStepOnStack,4
    216         M_ARG   QPOnStack, 4
    217         M_ARG   ACOnStack,4
    218 
    219 
    220         M_ADR   pDelta,pBuffer                              ;// pDelta = address of the 32-byte stack buffer
    221         M_LDR   AC,ACOnStack
    222 
    223 
    224         ;// Save registers r1,r2,r3 before function call
                ;// (r1-r3 are also reused as scratch later, so the copies are needed on both paths)
    225         MOV     pPredTemp,pPred
    226         MOV     pDCTemp,pDC
    227         MOV     pDstTemp,pDst
    228 
    229         CMP     AC,#0
    230         BEQ     DCcase                                      ;// AC == 0: skip unpack/dequant/transform
    231         MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
    232 
                ;// r0 still holds ppSrc; the callee presumably fills the buffer at r1 - confirm in its source
    233         BL      armVCM4P10_UnpackBlock4x4
    234 
    235         ;//--------------------------------------------------------
    236         ;// armVCM4P10_DequantLumaAC4x4 : static function inlined
    237         ;//--------------------------------------------------------
    238 
    239         ;//BL      armVCM4P10_DequantLumaAC4x4
    240         M_LDR   QP,QPOnStack                                ;// Set up r1 for armVCM4P10_DequantLumaAC4x4
    241 
    242         LDR    pQPmod,=armVCM4P10_QPModuloTable
    243         LDR    pQPdiv,=armVCM4P10_QPDivTable
    244         LDR    pVRow,=armVCM4P10_VMatrixU16
    245 
    246 
    247         LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6
    248         LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
    249 
                ;// QP and pQPdiv are dead from here on, so r10 and r1 are reused for the VTBL constants
    250         LDR    index1,=0x03020504                   ;// byte indexes 4,5,2,3 -> halfwords 2 and 1 of dVmatrix
    251         LDR    index0,=0x05040100                   ;// Indexes into dVmatrix
    252         ADD    pVRow,pVRow,QPmod                    ;// advance by the byte offset read from the modulo table
    253         VDUP   dindexRow0,index0
    254         VDUP   dindexRow1,index1
    255         VDUP   dShift,shift                         ;// same shift count in all four 16-bit lanes
    256 
    257         ;// Load all 4x4 pVRow[] values
    258         VLD1   dVmatrix,[pVRow]                     ;// dVmatrix = [0d|0c|0b|0a]
    259 
    260 
    261         VTBL   dVRow0,dVmatrix,dByteIndexRow0       ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
    262         VTBL   dVRow1,dVmatrix,dByteIndexRow1       ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]
                ;// The flags set here are consumed by LDRSHNE and VMOVNE further down;
                ;// nothing in between writes the condition flags, so do not insert flag-setting code.
    263         CMP     pDCTemp,#0
    264         ;// Load all the 4x4 'src' values
    265         VLD1   { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta]
    266 
    267         VSHL   dVRow0U16,dVRow0U16,dShift           ;// multipliers <<= shift
    268         VSHL   dVRow1U16,dVRow1U16,dShift
    269         LDRSHNE DCval,[pDCTemp]                     ;// only when pDC != 0: fetch the DC value
    270 
    271 
    272         ;// Multiply src[] with pVRow[]
    273         VMUL    dDqntRow0,dSrcRow0,dVRow0U16
    274         VMUL    dDqntRow1,dSrcRow1,dVRow1U16
    275         VMUL    dDqntRow2,dSrcRow2,dVRow2U16        ;// dVRow2U16 is the same register as dVRow0U16
    276         VMUL    dDqntRow3,dSrcRow3,dVRow3U16        ;// dVRow3U16 is the same register as dVRow1U16
    277 
    278 
    279 
    280         ;//-------------------------------------------------------------
    281         ;// TransformResidual4x4 : Inlined to avoid Load/Stores
    282         ;//-------------------------------------------------------------
    283 
    284 
    285         ;//BL      armVCM4P10_TransformResidual4x4
    286         ;//STRHNE  DCval,[pDelta]
    287         VMOVNE    dIn0[0],DCval                     ;// only when pDC != 0: coefficient 0 is replaced by the DC value
    288 
    289 
    290 
    291         ;//*****************************************************************
    292         ;// Transpose the input pixels : perform Row ops as Col ops
    293         ;//*****************************************************************
    294 
                ;// 16-bit VTRN on each row pair, then a 32-bit VTRN across the Q registers
    295         VTRN    dIn0,dIn1
    296         VTRN    dIn2,dIn3
    297         VTRN    qIn01,qIn23
    298 
    299 
    300         VMOV    dZero,#0                                    ;// Used to right shift by 1
    301 
    302 
    303         ;//****************************************
    304         ;// Row Operations (Performed on columns)
    305         ;//****************************************
    306 
    307 
    308         VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2
    309         VSUB        de1,dIn0,dIn2                        ;//  e1 = d0 - d2
    310         VHADD       dIn1RS,dIn1,dZero                   ;//  d1>>1 : halving add with dZero (all zeros)
    311         VHADD       dIn3RS,dIn3,dZero                   ;//  d3>>1
    312         VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1>>1) - d3
    313         VADD        de3,dIn1,dIn3RS                        ;//  e3 = d1 + (d3>>1)
    314         VADD        df0,de0,de3                         ;//  f0 = e0 + e3
    315         VADD        df1,de1,de2                            ;//  f1 = e1 + e2
    316         VSUB        df2,de1,de2                            ;//  f2 = e1 - e2
    317         VSUB        df3,de0,de3                            ;//  f3 = e0 - e3
    318 
    319 
    320 
    321         ;//*****************************************************************
    322         ;// Transpose the resultant matrix
    323         ;//*****************************************************************
    324 
    325         VTRN    df0,df1
    326         VTRN    df2,df3
    327         VTRN    qf01,qf23
    328 
    329 
    330         ;//*******************************
    331         ;// Column Operations
    332         ;//*******************************
    333 
                ;// Same butterfly as the row pass, applied to f0..f3 and producing h0..h3
    334 
    335         VADD        dg0,df0,df2                         ;//  g0 = f0 + f2
    336         VSUB        dg1,df0,df2                            ;//  g1 = f0 - f2
    337         VHADD       df1RS,df1,dZero                     ;//  f1>>1 : halving add with dZero (all zeros)
    338         VHADD       df3RS,df3,dZero                     ;//  f3>>1
    339         VSUB        dg2,df1RS,df3                       ;//  g2 = (f1>>1) - f3
    340         VADD        dg3,df1,df3RS                        ;//  g3 = f1 + (f3>>1)
    341         VADD        dh0,dg0,dg3                         ;//  h0 = g0 + g3
    342         VADD        dh1,dg1,dg2                            ;//  h1 = g1 + g2
    343         VSUB        dh2,dg1,dg2                            ;//  h2 = g1 - g2
    344         VSUB        dh3,dg0,dg3                            ;//  h3 = g0 - g3
    345 
    346 
    347         ;//************************************************
    348         ;// Calculate final value (colOp[i][j] + 32)>>6
    349         ;//************************************************
    350 
                ;// VRSHR is a rounding shift, so the +32 is part of the instruction
    351         VRSHR       dh0,#6
    352         VRSHR       dh1,#6
    353         VRSHR       dh2,#6
    354         VRSHR       dh3,#6
    355 
    356 
    357         B       OutDCcase                               ;// residuals are in D0-D3; skip the DC-only setup
    358 
    359 
    360 DCcase
    361         ;// Calculate the Transformed DCvalue : (DCval+32)>>6
                ;// pDCTemp is dereferenced without a zero check on this path
    362         LDRSH   DCval,[pDCTemp]
    363         ADD     DCval,DCval,#32
    364         ASR     DCval,DCval,#6
    365 
    366         VDUP    dDeltaRow0, DCval                       ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
    367         VDUP    dDeltaRow1, DCval                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
    368         VDUP    dDeltaRow2, DCval                        ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
    369         VDUP    dDeltaRow3, DCval                        ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
    370 
    371 
    372 OutDCcase
                ;// Both paths arrive here with the four residual rows in D0-D3 (Q0, Q1)
    373         M_LDR   predstep,predStepOnStack
    374         M_LDR   dstStep,dstStepOnStack
    375 
    376         LDR     PredVal1,[pPredTemp],predstep           ;// 4 prediction bytes of row 0, then step to the next row
    377         LDR     PredVal2,[pPredTemp],predstep           ;// row 1
    378         VMOV    dPredValRow01,PredVal1,PredVal2         ;// pack rows 0 and 1 into one D register
    379 
    380         LDR     PredVal1,[pPredTemp],predstep           ;// row 2
    381         LDR     PredVal2,[pPredTemp]                    ;// row 3 (last row, no post-increment)
    382         VMOV    dPredValRow23,PredVal1,PredVal2         ;// pack rows 2 and 3 into one D register
    383 
    384 
    385         VADDW   qSumRow01,qDeltaRow01,dPredValRow01     ;// widen the 8-bit prediction and add the 16-bit residual
    386         VADDW   qSumRow23,qDeltaRow23,dPredValRow23
    387         VQMOVUN dDstRow01,qSumRow01                     ;// saturate the signed 16-bit sums to unsigned 8-bit
    388         VQMOVUN dDstRow23,qSumRow23
    389 
    390 
    391         VST1    dDstRow0,[pDstTemp],dstStep             ;// one 32-bit lane (4 bytes) per destination row
    392         VST1    dDstRow1,[pDstTemp],dstStep
    393         VST1    dDstRow2,[pDstTemp],dstStep
    394         VST1    dDstRow3,[pDstTemp]                     ;// last row, no post-increment
    395 
    396         ;// Set return value
    397         MOV     result,#OMX_Sts_NoErr
    398 
            ;// "End" is not branched to from the code in this file
    399 End
    400 
    401 
    402         ;// Write function tail
    403 
    404         M_END
    405 
    406     ENDIF                                                    ;//CORTEXA8
    407 
    408 
    409 
    410     END
    411