Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 ;// Description:
     27 ;// H.264 inverse quantize and transform module
     28 ;//
     29 ;//
     30 
     31 ;// Include standard headers
     32 
     33         INCLUDE omxtypes_s.h
     34         INCLUDE armCOMM_s.h
     35 
     36 ;// Import/Export symbols required from/to other files
     37 ;// (For example tables)
     38 
     39         IMPORT armVCM4P10_UnpackBlock4x4
     40         IMPORT armVCM4P10_QPDivTable
     41         IMPORT armVCM4P10_VMatrixQPModTable
     42 
     43         M_VARIANTS CortexA8                     ;// Names the build variant tested by the "IF CortexA8" guards below (macro is not defined in this file; presumably from armCOMM_s.h - confirm)
     44 
     45 ;// Set debugging level (the switch below is left commented out, i.e. not set by this file)
     46 ;//DEBUG_ON    SETL {TRUE}
     47 
     48 
     49 ;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
     50 ;//
     51 ;// In-place 4x4 inverse transform followed by dequantization of sixteen
     52 ;// signed 16-bit coefficients, implemented with NEON.
     53 ;//
     54 ;// In:   r0 (pData) = address of the 16 coefficients; the results are stored back here
     55 ;//       r1 (QP)    = byte index into armVCM4P10_QPDivTable and
     56 ;//                    armVCM4P10_VMatrixQPModTable
     57 ;// Out:  the 16 halfwords at pData are overwritten; no return value is written explicitly
     58 ;// Guarding implementation by the processor name
     59 
     60     IF  CortexA8
     61 
     62 ;//Input Registers
     63 pData               RN  0
     64 QP                  RN  1
     65 
     66 
     67 ;//Local Scratch Registers
     68 
     69 ;// ARM Registers
     70 
     71 pQPDivTable         RN  2
     72 pQPModTable         RN  3
     73 Shift               RN  4
     74 Scale               RN  5
     75 
     76 ;// NEON Registers
     77 
     78 ;// Packed Input pixels (D0-D3 are reused below for every later stage of the block)
     79 dIn0                DN  D0.S16
     80 dIn1                DN  D1.S16
     81 dIn2                DN  D2.S16
     82 dIn3                DN  D3.S16
     83 
     84 ;// Intermediate calculations for the row pass (D4-D7)
     85 dRowSum1            DN  D4.S16
     86 dRowSum2            DN  D5.S16
     87 dRowDiff1           DN  D6.S16
     88 dRowDiff2           DN  D7.S16
     89 
     90 ;// Row operated pixels; qRowOp01/qRowOp23 are the Q-register views of D0:D1 and D2:D3
     91 dRowOp0             DN  D0.S16
     92 dRowOp1                DN  D1.S16
     93 dRowOp2                DN  D2.S16
     94 dRowOp3                DN  D3.S16
     95 qRowOp01            QN  Q0.32
     96 qRowOp23            QN  Q1.32
     97 
     98 ;// Intermediate calculations for the column pass (same registers as the row-pass temporaries)
     99 dColSum1            DN  D4.S16
    100 dColSum2            DN  D5.S16
    101 dColDiff1           DN  D6.S16
    102 dColDiff2           DN  D7.S16
    103 
    104 ;// Column operated pixels
    105 dColOp0             DN  D0.S16
    106 dColOp1                DN  D1.S16
    107 dColOp2                DN  D2.S16
    108 dColOp3                DN  D3.S16
    109 
    110 ;// Temporary scratch variables
    111 ;// NOTE: dScale shares D5 with dColSum2 and qRound0 (Q3 = D6:D7) shares D6/D7 with dColDiff1/2, so they may only be written after the column pass
    112 dScale              DN  D5.S16
    113 qRound0             QN  Q3.S32
    114 qRound1             QN  Q4.S32
    115 qRound2             QN  Q5.S32
    116 qRound3             QN  Q6.S32
    117 
    118 ;// InvTransformed and Dequantized pixels
    119 dOut0               DN  D0.S16
    120 dOut1                DN  D1.S16
    121 dOut2                DN  D2.S16
    122 dOut3                DN  D3.S16
    123 
    124 
    125     ;// Allocate stack memory required by the function (nothing is allocated here)
    126 
    127 
    128     ;// Write function header; r5 and d13 are the highest ARM/NEON registers used below (presumably the macro saves up to these - confirm in armCOMM_s.h)
    129     M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13
    130 
    131     ;******************************************************************
    132     ;// The strategy used in implementing the transform is as follows:*
    133     ;// Load the 4x4 block into 4 D-registers                         *
    134     ;// Transpose the 4x4 matrix                                      *
    135     ;// Perform the row operations (on columns) using SIMD            *
    136     ;// Transpose the 4x4 result matrix                               *
    137     ;// Perform the column operations                                 *
    138     ;******************************************************************
    139 
    140         ;// Load all the 4x4 pixels in Transposed form
    141         ;// (VLD4 de-interleaves: dInK receives element K of each group of four consecutive halfwords)
    142         VLD4    {dIn0,dIn1,dIn2,dIn3},[pData]
    143         LDR     pQPDivTable, =armVCM4P10_QPDivTable        ;// QP Division look-up-table base pointer
    144         LDR     pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
    145 
    146         ;****************************************
    147         ;// Row Operations (Performed on columns)
    148         ;****************************************
    149         ;// Scale factor calculation is done using ARM instructions
    150         ;// Interleaved with NEON instructions in order to Dual issue
    151 
    152         VADD    dRowSum1,dIn0,dIn1                     ;// s1 = in0 + in1
    153         VADD    dRowSum2,dIn2,dIn3                     ;// s2 = in2 + in3
    154         VSUB    dRowDiff1,dIn0,dIn1                    ;// d1 = in0 - in1
    155         LDRSB   Shift, [pQPDivTable, QP]               ;// ARM CODE: Shift = pQPDivTable[QP]
    156         VSUB    dRowDiff2,dIn2,dIn3                    ;// d2 = in2 - in3
    157         LDRSB   Scale, [pQPModTable, QP]               ;// ARM CODE: Scale = pQPModTable[QP]
    158         VADD    dRowOp0,dRowSum1,dRowSum2              ;// op0 = s1 + s2
    159         VSUB    dRowOp1,dRowSum1,dRowSum2              ;// op1 = s1 - s2
    160         VSUB    dRowOp2,dRowDiff1,dRowDiff2            ;// op2 = d1 - d2
    161         LSL     Scale, Scale, Shift                    ;// ARM CODE: Scale = Scale << Shift
    162         VADD    dRowOp3,dRowDiff1,dRowDiff2            ;// op3 = d1 + d2
    163 
    164         ;****************************************
    165         ;// Transpose the resultant matrix
    166         ;****************************************
    167         ;// 4x4 halfword transpose: swap 16-bit lanes within each D pair, then 32-bit lanes across the Q pair
    168         VTRN    dRowOp0,dRowOp1
    169         VTRN    dRowOp2,dRowOp3
    170         VTRN    qRowOp01,qRowOp23
    171 
    172         ;****************************************
    173         ;// Column Operations
    174         ;****************************************
    175         ;// Same add/subtract pattern as the row pass, applied to the transposed data
    176         VADD    dColSum1,dRowOp0,dRowOp1
    177         VADD    dColSum2,dRowOp2,dRowOp3
    178         VSUB    dColDiff1,dRowOp0,dRowOp1
    179         VSUB    dColDiff2,dRowOp2,dRowOp3
    180         VADD    dColOp0,dColSum1,dColSum2
    181         VSUB    dColOp1,dColSum1,dColSum2
    182         VSUB    dColOp2,dColDiff1,dColDiff2
    183         VADD    dColOp3,dColDiff1,dColDiff2
    184 
    185         ;//----------------------------------------------------------------------
    186         ;//
    187         ;// <Dequantize> improves on the C reference code.
    188         ;// Both cases, i.e. Shift>=0 and Shift<0, are covered together:
    189         ;// we do not subtract 2 from Shift as in the C reference; instead we perform
    190         ;// Scale << Shift once at the beginning and do a right shift by a
    191         ;// constant 2 after the multiplication. The value of Round is therefore 2.
    192         ;//
    193         ;// By doing this we avoid the branches otherwise required and also
    194         ;// reduce the code size substantially.
    195         ;//
    196         ;//----------------------------------------------------------------------
    197 
    198         ;// dScale (D5) overwrites dColSum2, which is no longer needed at this point
    199         VDUP    dScale, Scale                            ;// ARM -> NEON  copy 'scale' to vector
    200 
    201         ;// Preload each 32-bit accumulator lane with the rounding constant
    202         VMOV    qRound0,#2                               ;// Set the Round Value
    203         VMOV    qRound1,#2
    204         VMOV    qRound2,#2
    205         VMOV    qRound3,#2
    206         ;// Widening multiply-accumulate: 16-bit x 16-bit products are added to the 32-bit lanes
    207         VMLAL   qRound0,dColOp0,dScale                   ;// pDst[i] * Scale + Round
    208         VMLAL   qRound1,dColOp1,dScale
    209         VMLAL   qRound2,dColOp2,dScale
    210         VMLAL   qRound3,dColOp3,dScale
    211         ;// Narrowing shift: the low 16 bits of (acc >> 2) are kept, without saturation
    212         VSHRN   dOut0,qRound0,#2                          ;// Right shift by 2 & (OMX_S16)Value
    213         VSHRN   dOut1,qRound1,#2
    214         VSHRN   dOut2,qRound2,#2
    215         VSHRN   dOut3,qRound3,#2
    216 
    217         ;***************************
    218         ;// Store all the 4x4 pixels
    219         ;***************************
    220         ;// Plain (non-interleaving) store of D0-D3 back over the input block
    221         VST1  {dOut0,dOut1,dOut2,dOut3}, [pData]
    222 
    223 
    224         ;// Set return value (no register is written here)
    225 
    226         ;// Write function tail
    227         M_END
    228 
    229     ENDIF                                                           ;//CORTEXA8
    230 
    231 
    232 
    232 ;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
    233 ;// Calls armVCM4P10_UnpackBlock4x4, then armVCM4P10_InvTransformDequantLumaDC4x4 on pDst with QP, and returns OMX_Sts_NoErr
    234 ;// Arguments on entry
    235 ppSrc               RN  0
    236 pDst                RN  1
    237 QPR2                RN  2
    238 
    239 ;// Value handed back to the caller
    240 result              RN  0
    241 
    242 ;// Working registers: r4/r5 keep pDst and QP across the first call, r0/r1 carry them into the second call
    243 QPSaved             RN  5
    244 pDstSaved           RN  4
    245 QPArg               RN  1
    246 pBlock              RN  0
    247 
    248 ;// Only assembled when the CortexA8 variant is selected
    249 
    250     IF CortexA8
    251 
    252     ;// No stack locals are declared for this function
    253 
    254 
    255     ;// Function prologue (r5 is the highest ARM register used below)
    256         M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
    257 
    258         MOV     QPSaved,QPR2                        ;// r5 = QP, parked before the first call
    259         MOV     pDstSaved,pDst                      ;// r4 = pDst, parked before the first call
    260         BL      armVCM4P10_UnpackBlock4x4           ;// r0 (ppSrc) and r1 (pDst) are still as received
    261 
    262         MOV     QPArg,QPSaved                       ;// r1 = QP for the transform routine
    263         MOV     pBlock,pDstSaved                    ;// r0 = pDst for the transform routine
    264         BL      armVCM4P10_InvTransformDequantLumaDC4x4
    265 
    266 
    267         ;// Report success
    268         MOV     result,#OMX_Sts_NoErr
    269 
    270         ;// Function epilogue
    271         M_END
    272 
    273 
    274     ENDIF                                                           ;//CortexA8
    276 
    277 
    278     END
    279