Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   12290
      6 ;// Date:       Wednesday, April 9, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 ;// Description:
     13 ;// H.264 inverse quantize and transform module
     14 ;//
     15 ;//
     16 
     17 ;// Include standard headers
     18 
     19         INCLUDE omxtypes_s.h
     20         INCLUDE armCOMM_s.h
     21 
     22 ;// Import/Export symbols required from/to other files
     23 ;// (For example tables)
     24 
     25         IMPORT armVCM4P10_UnpackBlock4x4
     26         IMPORT armVCM4P10_QPDivTable
     27         IMPORT armVCM4P10_VMatrixQPModTable
     28 
     29         M_VARIANTS CortexA8
     30 
     31 ;// Set debugging level
     32 ;//DEBUG_ON    SETL {TRUE}
     33 
     34 
     35 ;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
     36 ;//
     37 ;// Transforms and dequantizes one 4x4 block of 16-bit values in place.
     38 ;//   r0 (pData) : address of the 16 values; they are read, then overwritten
     39 ;//   r1 (QP)    : byte index into armVCM4P10_QPDivTable and
     40 ;//                armVCM4P10_VMatrixQPModTable
     41 ;// Each output is (t * Scale + 2) >> 2 narrowed to 16 bits, where t is the
     42 ;// row/column butterfly result and Scale = QPModTable[QP] << QPDivTable[QP].
     43 ;//
     44 ;// Guarding implementation by the processor name
     45 
     46     IF  CortexA8
     47 
     48 ;//Input Registers
     49 pData               RN  0
     50 QP                  RN  1
     51 
     52 
     53 ;//Local Scratch Registers
     54 
     55 ;// ARM Registers
     56 
     57 pQPDivTable         RN  2
     58 pQPModTable         RN  3
     59 Shift               RN  4
     60 Scale               RN  5
     61 
     62 ;// NEON Registers
     63 ;// NOTE: the alias groups below name the same D0-D7 registers once per processing stage
     64 ;// Packed Input pixels
     65 dIn0                DN  D0.S16
     66 dIn1                DN  D1.S16
     67 dIn2                DN  D2.S16
     68 dIn3                DN  D3.S16
     69 
     70 ;// Intermediate calculations (row stage)
     71 dRowSum1            DN  D4.S16
     72 dRowSum2            DN  D5.S16
     73 dRowDiff1           DN  D6.S16
     74 dRowDiff2           DN  D7.S16
     75 
     76 ;// Row operated pixels (same registers as dIn0-dIn3)
     77 dRowOp0             DN  D0.S16
     78 dRowOp1                DN  D1.S16
     79 dRowOp2                DN  D2.S16
     80 dRowOp3                DN  D3.S16
     81 qRowOp01            QN  Q0.32                          ;// Q0 = D0:D1, used for the 32-bit transpose step
     82 qRowOp23            QN  Q1.32                          ;// Q1 = D2:D3
     83 
     84 ;// Intermediate calculations (column stage, same registers as the row stage)
     85 dColSum1            DN  D4.S16
     86 dColSum2            DN  D5.S16
     87 dColDiff1           DN  D6.S16
     88 dColDiff2           DN  D7.S16
     89 
     90 ;// Column operated pixels
     91 dColOp0             DN  D0.S16
     92 dColOp1                DN  D1.S16
     93 dColOp2                DN  D2.S16
     94 dColOp3                DN  D3.S16
     95 
     96 ;// Temporary scratch variables
     97 
     98 dScale              DN  D5.S16                         ;// same register as dRowSum2/dColSum2 (D5)
     99 qRound0             QN  Q3.S32                         ;// Q3 = D6:D7, same registers as dColDiff1/dColDiff2
    100 qRound1             QN  Q4.S32                         ;// Q4 = D8:D9
    101 qRound2             QN  Q5.S32                         ;// Q5 = D10:D11
    102 qRound3             QN  Q6.S32                         ;// Q6 = D12:D13
    103 
    104 ;// InvTransformed and Dequantized pixels
    105 dOut0               DN  D0.S16
    106 dOut1                DN  D1.S16
    107 dOut2                DN  D2.S16
    108 dOut3                DN  D3.S16
    109 
    110 
    111     ;// Allocate stack memory required by the function
    112     ;// (nothing is allocated in this function)
    113 
    114     ;// Write function header
    115     M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13
    116     ;// NOTE(review): r5,d13 presumably name the highest ARM/NEON registers to preserve (r4-r5 and D8-D13 are written below) - confirm in armCOMM_s.h
    117     ;******************************************************************
    118     ;// The strategy used in implementing the transform is as follows:*
    119     ;// Load the 4x4 block into 4 D-registers                         *
    120     ;// Transpose the 4x4 matrix                                      *
    121     ;// Perform the row operations (on columns) using SIMD            *
    122     ;// Transpose the 4x4 result matrix                               *
    123     ;// Perform the column operations                                 *
    124     ;******************************************************************
    125 
    126         ;// Load all the 4x4 pixels in Transposed form
    127         ;// (VLD4 de-interleaves: dInN receives 16-bit elements N, N+4, N+8, N+12)
    128         VLD4    {dIn0,dIn1,dIn2,dIn3},[pData]
    129         LDR     pQPDivTable, =armVCM4P10_QPDivTable        ;// QP Division look-up-table base pointer
    130         LDR     pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
    131 
    132         ;****************************************
    133         ;// Row Operations (Performed on columns)
    134         ;****************************************
    135         ;// Scale factor calculation is done using ARM instructions
    136         ;// interleaved with NEON instructions in order to dual issue
    137 
    138         VADD    dRowSum1,dIn0,dIn1                     ;// s1 = in0 + in1
    139         VADD    dRowSum2,dIn2,dIn3                     ;// s2 = in2 + in3
    140         VSUB    dRowDiff1,dIn0,dIn1                    ;// d1 = in0 - in1
    141         LDRSB   Shift, [pQPDivTable, QP]               ;// ARM CODE: Shift = pQPDivTable[QP] (sign-extended byte)
    142         VSUB    dRowDiff2,dIn2,dIn3                    ;// d2 = in2 - in3
    143         LDRSB   Scale, [pQPModTable, QP]               ;// ARM CODE: Scale = pQPModTable[QP] (sign-extended byte)
    144         VADD    dRowOp0,dRowSum1,dRowSum2              ;// op0 = s1 + s2 (overwrites D0; dIn0 is no longer needed)
    145         VSUB    dRowOp1,dRowSum1,dRowSum2              ;// op1 = s1 - s2
    146         VSUB    dRowOp2,dRowDiff1,dRowDiff2            ;// op2 = d1 - d2
    147         LSL     Scale, Scale, Shift                    ;// ARM CODE: Scale = Scale << Shift (assumes Shift >= 0 - TODO confirm table contents)
    148         VADD    dRowOp3,dRowDiff1,dRowDiff2            ;// op3 = d1 + d2
    149 
    150         ;****************************************
    151         ;// Transpose the resultant matrix
    152         ;****************************************
    153         ;// Two 16-bit 2x2 transposes followed by one 32-bit 2x2 transpose give the full 4x4 transpose
    154         VTRN    dRowOp0,dRowOp1                        ;// 2x2 transpose of 16-bit elements in D0/D1
    155         VTRN    dRowOp2,dRowOp3                        ;// 2x2 transpose of 16-bit elements in D2/D3
    156         VTRN    qRowOp01,qRowOp23                      ;// 2x2 transpose of 32-bit elements across Q0/Q1
    157 
    158         ;****************************************
    159         ;// Column Operations
    160         ;****************************************
    161         ;// Same butterfly as the row stage, applied to the transposed data
    162         VADD    dColSum1,dRowOp0,dRowOp1               ;// s1 = r0 + r1
    163         VADD    dColSum2,dRowOp2,dRowOp3               ;// s2 = r2 + r3
    164         VSUB    dColDiff1,dRowOp0,dRowOp1              ;// d1 = r0 - r1
    165         VSUB    dColDiff2,dRowOp2,dRowOp3              ;// d2 = r2 - r3
    166         VADD    dColOp0,dColSum1,dColSum2              ;// op0 = s1 + s2
    167         VSUB    dColOp1,dColSum1,dColSum2              ;// op1 = s1 - s2
    168         VSUB    dColOp2,dColDiff1,dColDiff2            ;// op2 = d1 - d2
    169         VADD    dColOp3,dColDiff1,dColDiff2            ;// op3 = d1 + d2
    170 
    171         ;//----------------------------------------------------------------------
    172         ;//
    173         ;// <Dequantize> improves on the c-reference code
    174         ;// Both the  cases i.e., Shift>=0 and Shift<0 cases are covered together
    175         ;// We do not subtract 2 from Shift as in C reference, instead perform a
    176         ;// Scale << Shift once in the beginning and do a right shift by a
    177         ;// constant 2 after the Multiplication. The value of Round would be 2
    178         ;//
    179         ;// By doing this we avoid the branches required and also
    180         ;// reduce the code size substantially
    181         ;//
    182         ;//----------------------------------------------------------------------
    183 
    184         ;// dScale is D5: the column sums held there are no longer needed at this point
    185         VDUP    dScale, Scale                            ;// ARM -> NEON  copy 'scale' to vector (low 16 bits of Scale into each lane)
    186 
    187         ;// qRound0 (D6:D7) replaces the column differences, which were consumed above
    188         VMOV    qRound0,#2                               ;// Set the Round Value
    189         VMOV    qRound1,#2
    190         VMOV    qRound2,#2
    191         VMOV    qRound3,#2
    192         ;// Widening multiply-accumulate: 16-bit x 16-bit products added to the 32-bit rounding constants
    193         VMLAL   qRound0,dColOp0,dScale                   ;// pDst[i] * Scale + Round
    194         VMLAL   qRound1,dColOp1,dScale
    195         VMLAL   qRound2,dColOp2,dScale
    196         VMLAL   qRound3,dColOp3,dScale
    197         ;// Narrowing shift: keeps the low 16 bits of each shifted 32-bit lane (no saturation)
    198         VSHRN   dOut0,qRound0,#2                          ;// Right shift by 2 & (OMX_S16)Value
    199         VSHRN   dOut1,qRound1,#2
    200         VSHRN   dOut2,qRound2,#2
    201         VSHRN   dOut3,qRound3,#2
    202 
    203         ;***************************
    204         ;// Store all the 4x4 pixels
    205         ;***************************
    206         ;// Linear (non-interleaving) store of D0-D3 back over the input block
    207         VST1  {dOut0,dOut1,dOut2,dOut3}, [pData]
    208 
    209 
    210         ;// Set return value
    211         ;// (no status value is loaded here; pData/r0 is not modified by the code above)
    212         ;// Write function tail
    213         M_END
    214 
    215     ENDIF                                                           ;//CORTEXA8
    216 ;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
    217 ;//   Calls armVCM4P10_UnpackBlock4x4 with the incoming r0/r1 unchanged, then
    218 ;//   armVCM4P10_InvTransformDequantLumaDC4x4 with r0 = pDst and r1 = the value
    219 ;//   received in r2, and finally returns OMX_Sts_NoErr in r0.
    220 
    221 ;//Input Registers
    222 ppSrc               RN  0
    223 pDst                RN  1
    224 QPR2                RN  2
    225 
    226 ;//Output Registers
    227 result              RN  0
    228 
    229 ;//Local Scratch Registers
    230 pDstR4              RN  4
    231 pDstR0              RN  0
    232 QPR1                RN  1
    233 QPR5                RN  5
    234 
    235 ;// Guarding implementation by the processor name
    236 
    237     IF CortexA8
    238 
    239     ;// Allocate stack memory required by the function
    240     ;// (nothing is allocated in this function)
    241     ;// NOTE(review): the r5 argument presumably makes M_START/M_END preserve r4-r5 - confirm in armCOMM_s.h
    242     ;// Write function header
    243         M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
    244         ;// Keep pDst and QP in r4/r5 across the first call
    245         MOV     pDstR4,pDst                         ;// Saving register r1
    246         MOV     QPR5,QPR2                           ;// Saving register r2
    247         BL      armVCM4P10_UnpackBlock4x4           ;// r0 = ppSrc, r1 = pDst as received
    248         ;// NOTE(review): assumes armVCM4P10_UnpackBlock4x4 preserves r4 and r5 - confirm against its definition
    249         MOV     pDstR0,pDstR4                       ;// Setting up register r0
    250         MOV     QPR1,QPR5                           ;// Setting up register r1
    251         BL      armVCM4P10_InvTransformDequantLumaDC4x4
    252 
    253 
    254         ;// Set return value
    255         MOV     result,#OMX_Sts_NoErr
    256 
    257         ;// Write function tail
    258         M_END
    259 
    260 
    261     ENDIF                                                           ;//CortexA8
    262 
    263 
    264     END