Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   12290
      6 ;// Date:       Wednesday, April 9, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 ;// Description:
     13 ;// H.264 inverse quantize and transform module
     14 ;//
     15 ;//
     16 
     17 
     18 
     19 ;// Include standard headers
     20 
     21         INCLUDE omxtypes_s.h
     22         INCLUDE armCOMM_s.h
     23 
     24 ;// Import symbols required from other files
     25 ;// (For example tables)
     26 
     27         IMPORT armVCM4P10_UnpackBlock4x4
     28         IMPORT armVCM4P10_TransformResidual4x4
     29         IMPORT armVCM4P10_QPDivTable
     30         IMPORT armVCM4P10_VMatrixU16
     31         IMPORT armVCM4P10_QPModuloTable
     32 
     33         M_VARIANTS CortexA8
     34 
     35 ;// Set debugging level
     36 ;//DEBUG_ON    SETL {TRUE}
     37 
     38 
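     39 ;// Static Function: armVCM4P10_DequantLumaAC4x4 (no separate definition in this file; its body is inlined into omxVCM4P10_DequantTransformResidualFromPairAndAdd)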
     40 
     41 ;// Guarding implementation by the processor name
     42 
     43 
     44 
     45 ;// Guarding implementation by the processor name
     46 
     47 
     48 
     49 
     50 
     51 
     52 ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
     53 
     54 ;// Guarding implementation by the processor name
     55 
     56 
     57 
     58 ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
     59 
     60 ;// Guarding implementation by the processor name
     61 
     62     IF  CortexA8
     63 
     64 
     65 ;// ARM Registers
     66 ;// Several names below alias the same physical register; an alias is only usable once the previous value held in that register is dead.
     67 ;//Input Registers (r0-r3 on entry)
     68 ppSrc       RN  0                   ;// r0: no visible instruction writes it before the BL to armVCM4P10_UnpackBlock4x4
     69 pPred       RN  1                   ;// r1: prediction pointer; copied to pPredTemp (r7) at entry
     70 pDC         RN  2                   ;// r2: DC coefficient pointer; copied to pDCTemp (r8) at entry
     71 pDst        RN  3                   ;// r3: destination pointer; copied to pDstTemp (r9) at entry
     72 
     73 
     74 ;//Output Registers
     75 result      RN  0                   ;// r0: return value, set to OMX_Sts_NoErr
     76 
     77 ;//Local Scratch Registers
     78 
     79 ;//Registers used in armVCM4P10_DequantLumaAC4x4 (inlined dequantization step)
     80 pQPdiv      RN  10                  ;// r10: address of armVCM4P10_QPDivTable
     81 pQPmod      RN  11                  ;// r11: address of armVCM4P10_QPModuloTable
     82 pVRow       RN  2                   ;// r2: address of armVCM4P10_VMatrixU16, then advanced by QPmod (shares r2 with pDC, already copied to r8)
     83 QPmod       RN  12                  ;// r12: signed byte loaded from the modulo table at index QP
     84 shift       RN  14                  ;// r14: signed byte loaded from the div table at index QP; r14 is LR, so this is only written after the BL
     85 index0      RN  1                   ;// r1: packed VTBL byte indexes for rows 0/2 (shares r1 with QP)
     86 index1      RN  10                  ;// r10: packed VTBL byte indexes for rows 1/3 (shares r10 with pQPdiv)
     87 
     88 ;//Registers used in DequantTransformResidualFromPairAndAdd
     89 pDelta      RN  4                   ;// r4: address of the pBuffer stack scratch area
     90 pDeltaTmp   RN  6                   ;// r6: defined but not referenced by any instruction in this file
     91 AC          RN  5                   ;//Load from stack; zero selects the DC-only path
     92 pPredTemp   RN  7                   ;// r7: working copy of pPred, post-incremented by predstep
     93 pDCTemp     RN  8                   ;// r8: working copy of pDC
     94 pDstTemp    RN  9                   ;// r9: working copy of pDst, post-incremented by dstStep
     95 pDeltaArg1  RN  1                   ;// r1: set to pDelta before the call to armVCM4P10_UnpackBlock4x4
     96 pDeltaArg0  RN  0                   ;// r0: defined but not referenced by any instruction in this file
     97 QP          RN  1                   ;//Load from stack; byte index into the QP div/modulo tables
     98 DCval       RN  10                  ;// r10: DC value, loaded as a signed halfword from pDCTemp
     99 predstep    RN  1                   ;// r1: prediction row stride, loaded from the stack
    100 dstStep     RN  10                  ;// r10: destination row stride, loaded from the stack
    101 PredVal1    RN  3                   ;// r3: one 32-bit word of prediction data (shares r3 with pDst, already copied to r9)
    102 PredVal2    RN  5                   ;// r5: one 32-bit word of prediction data (shares r5 with AC, already tested)
    103 
    104 
    105 
    106 
    107 ;// Neon Registers
    108 ;// Many aliases name the same D register with a different datatype or at a later stage; Qn overlays D(2n) and D(2n+1).
    109 ;// Registers used in armVCM4P10_DequantLumaAC4x4
    110 
    111 dVmatrix            DN  D6.8        ;// 8 bytes loaded from pVRow; table operand of VTBL
    112 dindexRow0          DN  D7.32       ;// 32-bit view: target of VDUP from index0
    113 dindexRow1          DN  D9.32       ;// 32-bit view: target of VDUP from index1
    114 dByteIndexRow0      DN  D7.8        ;// byte view of the same D7, used as the VTBL index vector
    115 dByteIndexRow1      DN  D9.8        ;// byte view of the same D9, used as the VTBL index vector
    116 dVRow0              DN  D8.8        ;// VTBL result (byte view)
    117 dVRow1              DN  D4.8        ;// VTBL result (byte view)
    118 dVRow0U16           DN  D8.U16      ;// same D8 viewed as four 16-bit multipliers
    119 dVRow1U16           DN  D4.U16      ;// same D4 viewed as four 16-bit multipliers
    120 dVRow2U16           DN  D8.U16      ;// identical to dVRow0U16: row 2 reuses row 0's multipliers
    121 dVRow3U16           DN  D4.U16      ;// identical to dVRow1U16: row 3 reuses row 1's multipliers
    122 
    123 dShift              DN  D5.U16      ;// shift amount broadcast to every 16-bit lane
    124 dSrcRow0            DN  D0.I16
    125 dSrcRow1            DN  D1.I16
    126 dSrcRow2            DN  D2.I16
    127 dSrcRow3            DN  D3.I16
    128 dDqntRow0           DN  D0.I16      ;// dequantized rows overwrite the source rows in place (D0-D3)
    129 dDqntRow1           DN  D1.I16
    130 dDqntRow2           DN  D2.I16
    131 dDqntRow3           DN  D3.I16
    132 
    133 ;// Registers used in TransformResidual4x4
    134 
    135 ;// Packed input: the dequantized coefficient rows (same D0-D3 as dDqntRow0-3)
    136 dIn0                DN  D0.S16
    137 dIn1                DN  D1.S16
    138 dIn2                DN  D2.S16
    139 dIn3                DN  D3.S16
    140 qIn01               QN  Q0.32       ;// D0:D1 viewed as 32-bit lanes for the final VTRN of the transpose
    141 qIn23               QN  Q1.32       ;// D2:D3 viewed as 32-bit lanes for the final VTRN of the transpose
    142 
    143 ;// Intermediate calculations
    144 dZero               DN  D4.S16      ;// all-zero vector; shares D4 with dVRow1, so it is written only after the multiplies
    145 de0                 DN  D5.S16
    146 de1                 DN  D6.S16
    147 de2                 DN  D7.S16
    148 de3                 DN  D8.S16
    149 dIn1RS              DN  D7.S16      ;// halved dIn1; same D7 as de2, which is computed from it in place
    150 dIn3RS              DN  D8.S16      ;// halved dIn3; same D8 as de3, which is computed from it in place
    151 df0                 DN  D0.S16
    152 df1                 DN  D1.S16
    153 df2                 DN  D2.S16
    154 df3                 DN  D3.S16
    155 qf01                QN  Q0.32
    156 qf23                QN  Q1.32
    157 dg0                 DN  D5.S16
    158 dg1                 DN  D6.S16
    159 dg2                 DN  D7.S16
    160 dg3                 DN  D8.S16
    161 df1RS               DN  D7.S16      ;// halved df1; same D7 as dg2
    162 df3RS               DN  D8.S16      ;// halved df3; same D8 as dg3
    163 
    164 ;// Transform output rows (residual values; the prediction has not been added yet)
    165 dh0                 DN  D0.S16
    166 dh1                 DN  D1.S16
    167 dh2                 DN  D2.S16
    168 dh3                 DN  D3.S16
    169 
    170 ;// Registers used in DequantTransformResidualFromPairAndAdd
    171 
    172 dDeltaRow0          DN  D0.S16      ;// residual rows: same D0-D3 as dh0-dh3
    173 dDeltaRow1          DN  D1.S16
    174 dDeltaRow2          DN  D2.S16
    175 dDeltaRow3          DN  D3.S16
    176 qDeltaRow01         QN  Q0.S16      ;// rows 0 and 1 as one Q register (D0:D1)
    177 qDeltaRow23         QN  Q1.S16      ;// rows 2 and 3 as one Q register (D2:D3)
    178 
    179 dPredValRow01       DN  D4.U8       ;// 8 prediction bytes: rows 0 and 1
    180 dPredValRow23       DN  D5.U8       ;// 8 prediction bytes: rows 2 and 3
    181 
    182 qSumRow01           QN  Q3.S16      ;// D6:D7
    183 qSumRow23           QN  Q4.S16      ;// D8:D9
    184 dDstRow01           DN  D0.U8       ;// narrowed result bytes for rows 0 and 1
    185 dDstRow23           DN  D1.U8       ;// narrowed result bytes for rows 2 and 3
    186 dDstRow0            DN  D0.32[0]    ;// 32-bit lanes: each holds the 4 output bytes of one row
    187 dDstRow1            DN  D0.32[1]
    188 dDstRow2            DN  D1.32[0]
    189 dDstRow3            DN  D1.32[1]
    190 
    191 
    192     ;// Allocate stack memory required by the function: 32 bytes named pBuffer, i.e. one 4x4 block of 16-bit coefficients
    193         M_ALLOC8 pBuffer, 32
    194     ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
    195     ;// AC != 0: unpack, dequantize and inverse-transform a 4x4 block. AC == 0: fill the block with (DC+32)>>6. Either way the prediction is then added and 4 rows of 4 bytes are stored.
    196     ;// Write function header (r11,d9 are the register-save arguments given to M_START; presumably "preserve up to r11 / d9" - confirm in armCOMM_s.h)
    197         M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9
    198 
    199         ;// Define stack arguments (4 bytes each)
    200         M_ARG   predStepOnStack, 4                          ;// loaded into predstep: post-increment between prediction row loads
    201         M_ARG   dstStepOnStack,4                            ;// loaded into dstStep: post-increment between destination row stores
    202         M_ARG   QPOnStack, 4                                ;// loaded into QP: index into the QP div/modulo tables
    203         M_ARG   ACOnStack,4                                 ;// loaded into AC: zero selects the DC-only path
    204 
    205 
    206         M_ADR   pDelta,pBuffer                              ;// pDelta = address of the stack scratch block
    207         M_LDR   AC,ACOnStack
    208 
    209 
    210         ;// Copy r1,r2,r3 (pPred, pDC, pDst) to r7,r8,r9: r1-r3 are overwritten below (argument setup, pVRow, PredVal1)
    211         MOV     pPredTemp,pPred
    212         MOV     pDCTemp,pDC
    213         MOV     pDstTemp,pDst
    214 
    215         CMP     AC,#0
    216         BEQ     DCcase                                      ;// AC == 0: skip unpack/dequant/transform entirely
    217         MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
    218 
    219         BL      armVCM4P10_UnpackBlock4x4                   ;// external routine; the code below reads its result as 16 halfwords at pDelta
    220 
    221         ;//--------------------------------------------------------
    222         ;// armVCM4P10_DequantLumaAC4x4 : static function inlined
    223         ;//--------------------------------------------------------
    224 
    225         ;//BL      armVCM4P10_DequantLumaAC4x4
    226         M_LDR   QP,QPOnStack                                ;// r1 = QP for the inlined armVCM4P10_DequantLumaAC4x4 step
    227 
    228         LDR    pQPmod,=armVCM4P10_QPModuloTable
    229         LDR    pQPdiv,=armVCM4P10_QPDivTable
    230         LDR    pVRow,=armVCM4P10_VMatrixU16
    231 
    232 
    233         LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6 : used below as a byte offset into the V matrix
    234         LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
    235 
    236         LDR    index1,=0x03020504                   ;// byte lanes {4,5,2,3}: selects halfwords 2 and 1 (overwrites pQPdiv, already used)
    237         LDR    index0,=0x05040100                   ;// Indexes into dVmatrix: byte lanes {0,1,4,5} select halfwords 0 and 2 (overwrites QP, already used)
    238         ADD    pVRow,pVRow,QPmod                    ;// pVRow -> multipliers for this QP
    239         VDUP   dindexRow0,index0                    ;// replicate the 4 index bytes into both halves of the D register
    240         VDUP   dindexRow1,index1
    241         VDUP   dShift,shift                         ;// same shift in every 16-bit lane
    242 
    243         ;// Load the pVRow[] multipliers
    244         VLD1   dVmatrix,[pVRow]                     ;// 8 bytes are loaded; the VTBL indexes only select bytes 0..5 (pVRow[0..2])
    245 
    246 
    247         VTBL   dVRow0,dVmatrix,dByteIndexRow0       ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
    248         VTBL   dVRow1,dVmatrix,dByteIndexRow1       ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]
    249         CMP     pDCTemp,#0                          ;// flags stay live for LDRSHNE and VMOVNE below: NE means pDC was supplied
    250         ;// Load all the 4x4 'src' values
    251         VLD1   { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta]
    252 
    253         VSHL   dVRow0U16,dVRow0U16,dShift           ;// multiplier << shift, per lane
    254         VSHL   dVRow1U16,dVRow1U16,dShift
    255         LDRSHNE DCval,[pDCTemp]                     ;// DCval = *pDC only when pDC != NULL (r10 is free: index1 was already broadcast)
    256 
    257 
    258         ;// Multiply src[] with pVRow[] (rows 2 and 3 reuse the row 0 and row 1 multiplier registers)
    259         VMUL    dDqntRow0,dSrcRow0,dVRow0U16
    260         VMUL    dDqntRow1,dSrcRow1,dVRow1U16
    261         VMUL    dDqntRow2,dSrcRow2,dVRow2U16
    262         VMUL    dDqntRow3,dSrcRow3,dVRow3U16
    263 
    264 
    265 
    266         ;//-------------------------------------------------------------
    267         ;// TransformResidual4x4 : Inlined to avoid Load/Stores
    268         ;//-------------------------------------------------------------
    269 
    270 
    271         ;//BL      armVCM4P10_TransformResidual4x4
    272         ;//STRHNE  DCval,[pDelta]
    273         VMOVNE    dIn0[0],DCval                     ;// if pDC != NULL, coefficient [0][0] is replaced by *pDC after the multiply, so it is not scaled here
    274 
    275 
    276 
    277         ;//*****************************************************************
    278         ;// Transpose the input coefficients : perform Row ops as Col ops
    279         ;//*****************************************************************
    280 
    281         VTRN    dIn0,dIn1                           ;// the three VTRNs together transpose the 4x4 matrix of 16-bit values
    282         VTRN    dIn2,dIn3
    283         VTRN    qIn01,qIn23
    284 
    285 
    286         VMOV    dZero,#0                                    ;// Zero operand for VHADD: (x + 0) >> 1 gives the right shift by 1
    287 
    288 
    289         ;//****************************************
    290         ;// Row Operations (Performed on columns)
    291         ;//****************************************
    292 
    293 
    294         VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2
    295         VSUB        de1,dIn0,dIn2                        ;//  e1 = d0 - d2
    296         VHADD       dIn1RS,dIn1,dZero                   ;//  d1>>1 (halving add with the zero vector)
    297         VHADD       dIn3RS,dIn3,dZero                   ;//  d3>>1
    298         VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1>>1) - d3
    299         VADD        de3,dIn1,dIn3RS                        ;//  e3 = d1 + (d3>>1)
    300         VADD        df0,de0,de3                         ;//  f0 = e0 + e3
    301         VADD        df1,de1,de2                            ;//  f1 = e1 + e2
    302         VSUB        df2,de1,de2                            ;//  f2 = e1 - e2
    303         VSUB        df3,de0,de3                            ;//  f3 = e0 - e3
    304 
    305 
    306 
    307         ;//*****************************************************************
    308         ;// Transpose the resultant matrix
    309         ;//*****************************************************************
    310 
    311         VTRN    df0,df1
    312         VTRN    df2,df3
    313         VTRN    qf01,qf23
    314 
    315 
    316         ;//*******************************
    317         ;// Column Operations
    318         ;//*******************************
    319 
    320 
    321         VADD        dg0,df0,df2                         ;//  g0 = f0 + f2
    322         VSUB        dg1,df0,df2                            ;//  g1 = f0 - f2
    323         VHADD       df1RS,df1,dZero                     ;//  f1>>1 (halving add with the zero vector)
    324         VHADD       df3RS,df3,dZero                     ;//  f3>>1
    325         VSUB        dg2,df1RS,df3                       ;//  g2 = (f1>>1) - f3
    326         VADD        dg3,df1,df3RS                        ;//  g3 = f1 + (f3>>1)
    327         VADD        dh0,dg0,dg3                         ;//  h0 = g0 + g3
    328         VADD        dh1,dg1,dg2                            ;//  h1 = g1 + g2
    329         VSUB        dh2,dg1,dg2                            ;//  h2 = g1 - g2
    330         VSUB        dh3,dg0,dg3                            ;//  h3 = g0 - g3
    331 
    332 
    333         ;//************************************************
    334         ;// Calculate final value (colOp[i][j] + 32)>>6
    335         ;//************************************************
    336 
    337         VRSHR       dh0,#6                              ;// rounding shift right supplies the +32 before the >>6
    338         VRSHR       dh1,#6
    339         VRSHR       dh2,#6
    340         VRSHR       dh3,#6
    341 
    342 
    343         B       OutDCcase                               ;// skip the DC-only fill; D0-D3 already hold the residual rows
    344 
    345 
    346 DCcase
    347         ;// Calculate the Transformed DCvalue : (DCval+32)>>6
    348         LDRSH   DCval,[pDCTemp]                         ;// NOTE(review): unlike the AC path, pDC is dereferenced here with no NULL test - AC == 0 with pDC == NULL would fault; confirm callers never do that
    349         ADD     DCval,DCval,#32
    350         ASR     DCval,DCval,#6
    351 
    352         VDUP    dDeltaRow0, DCval                       ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
    353         VDUP    dDeltaRow1, DCval                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
    354         VDUP    dDeltaRow2, DCval                        ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
    355         VDUP    dDeltaRow3, DCval                        ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
    356 
    357     ;// Common tail for both paths: D0-D3 hold the 4x4 residual as 16-bit values
    358 OutDCcase
    359         M_LDR   predstep,predStepOnStack
    360         M_LDR   dstStep,dstStepOnStack                  ;// r10 is free here: DCval has already been consumed on both paths
    361 
    362         LDR     PredVal1,[pPredTemp],predstep           ;// row 0: one 32-bit word (4 prediction bytes), then advance by predstep
    363         LDR     PredVal2,[pPredTemp],predstep           ;// row 1
    364         VMOV    dPredValRow01,PredVal1,PredVal2         ;// pack rows 0 and 1 into one D register
    365 
    366         LDR     PredVal1,[pPredTemp],predstep           ;// row 2
    367         LDR     PredVal2,[pPredTemp]                    ;// row 3: last row, no post-increment
    368         VMOV    dPredValRow23,PredVal1,PredVal2         ;// pack rows 2 and 3 into one D register
    369 
    370 
    371         VADDW   qSumRow01,qDeltaRow01,dPredValRow01     ;// widen the prediction bytes and add the 16-bit residual
    372         VADDW   qSumRow23,qDeltaRow23,dPredValRow23
    373         VQMOVUN dDstRow01,qSumRow01                     ;// saturate the signed sums to unsigned bytes (overwrites D0, already consumed)
    374         VQMOVUN dDstRow23,qSumRow23                     ;// overwrites D1, already consumed
    375 
    376 
    377         VST1    dDstRow0,[pDstTemp],dstStep             ;// store one 32-bit lane (4 bytes) per row, then advance by dstStep
    378         VST1    dDstRow1,[pDstTemp],dstStep
    379         VST1    dDstRow2,[pDstTemp],dstStep
    380         VST1    dDstRow3,[pDstTemp]                     ;// last row, no post-increment
    381 
    382         ;// Set return value: always OMX_Sts_NoErr; no error path exists in this routine
    383         MOV     result,#OMX_Sts_NoErr
    384 
    385 End
    386 
    387 
    388         ;// Write function tail
    389 
    390         M_END
    391 
    392     ENDIF                                                    ;//CORTEXA8
    393 
    394 
    395 
    396     END
    397