Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   9641
     21 ;// Date:       Thursday, February 7, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 ;// Description:
     27 ;// H.264 inverse quantize and transform module
     28 ;//
     29 ;//
     30 
     31 ;// Include standard headers
     32 
     33         INCLUDE omxtypes_s.h
     34         INCLUDE armCOMM_s.h
     35 
     36 ;// Import/Export symbols required from/to other files
     37 ;// (For example tables)
     38 
     39         IMPORT armVCM4P10_UnpackBlock4x4
     40         IMPORT armVCM4P10_QPDivTable
     41         IMPORT armVCM4P10_VMatrixQPModTable
     42 
     43         M_VARIANTS ARM1136JS
     44 
     45 ;// Set debugging level
     46 ;//DEBUG_ON    SETL {TRUE}
     47 
     48 
     49 ;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
     50 
     51 
     52 ;// Guarding implementation by the processor name
     53 
     54     IF  ARM1136JS
     55 
     56 
     57 ;//Input Registers
     58 pData               RN  0
     59 QP                  RN  1
     60 ;// pData: base of the 4x4 block (read by LDMIA, written by STMIA); QP: byte index into the QP tables
     61 ;//Output Registers
     62 ;// (none are named: the results are stored back through pData)
     63 
     64 ;//Local Scratch Registers - NOTE: the alias groups below intentionally share physical registers
     65 ;// (in00/trRow00/rowOp00/trCol00/colOp00/out00 are all r2): one name per processing stage
     66 ;// Packed Input pixels
     67 in00                RN  2                   ;// Src[0] & Src[1]
     68 in02                RN  3                   ;// Src[2] & Src[3]
     69 in10                RN  4                   ;// Src[4] & Src[5]
     70 in12                RN  5                   ;// Src[6] & Src[7]
     71 in20                RN  6                   ;// Src[8] & Src[9]
     72 in22                RN  7                   ;// Src[10] & Src[11]
     73 in30                RN  8                   ;// Src[12] & Src[13]
     74 in32                RN  9                   ;// Src[14] & Src[15]
     75 
     76 ;// Transpose for Row operations (Rows to cols)
     77 trRow00             RN  2
     78 trRow10             RN  10
     79 trRow02             RN  3
     80 trRow12             RN  5
     81 trRow20             RN  11
     82 trRow30             RN  12
     83 trRow32             RN  14
     84 trRow22             RN  7
     85 
     86 ;// Intermediate calculations (sums/differences of the row pass)
     87 rowSum1             RN  4
     88 rowSum2             RN  6
     89 rowDiff1            RN  8
     90 rowDiff2            RN  9
     91 
     92 
     93 ;// Row operated pixels
     94 rowOp00             RN  2
     95 rowOp10             RN  10
     96 rowOp20             RN  11
     97 rowOp30             RN  12
     98 rowOp02             RN  3
     99 rowOp12             RN  5
    100 rowOp22             RN  7
    101 rowOp32             RN  14
    102 
    103 ;// Transpose for column operations
    104 trCol00             RN  2
    105 trCol02             RN  3
    106 trCol10             RN  4
    107 trCol12             RN  5
    108 trCol20             RN  6
    109 trCol22             RN  7
    110 trCol30             RN  8
    111 trCol32             RN  9
    112 
    113 ;// Intermediate calculations (sums/differences of the column pass)
    114 colSum1             RN  10
    115 colSum2             RN  11
    116 colDiff1            RN  12
    117 colDiff2            RN  14
    118 
    119 
    120 ;// Column operated pixels
    121 colOp00             RN  2
    122 colOp02             RN  3
    123 colOp10             RN  4
    124 colOp12             RN  5
    125 colOp20             RN  6
    126 colOp22             RN  7
    127 colOp30             RN  8
    128 colOp32             RN  9
    129 
    130 ;// Temporary scratch variables
    131 pQPDivTable         RN  0                   ;// r0: loaded only after pData has been spilled to the stack
    132 pQPModTable         RN  11
    133 Shift               RN  10
    134 Scale               RN  14
    135 Round               RN  0                   ;// r0 again: set after the last pQPDivTable access
    136 
    137 temp1               RN  10
    138 temp2                RN  11
    139 temp3               RN  12
    140 temp4               RN  1                   ;// r1, shared with QP: first written after the last QP-indexed load
    141 
    142 
    143 
    144 ;// InvTransformed and Dequantized pixels
    145 out00               RN  2
    146 out02               RN  3
    147 out10               RN  4
    148 out12               RN  5
    149 out20               RN  6
    150 out22               RN  7
    151 out30               RN  8
    152 out32               RN  9
    153 ;// armVCM4P10_InvTransformDequantLumaDC4x4 (static helper)
    154 ;//   In : r0 = pData, 4x4 block held as 8 words of packed 16-bit pairs (modified in place)
    155 ;//        r1 = QP, used as a byte index into armVCM4P10_QPDivTable / armVCM4P10_VMatrixQPModTable
    156 ;//   Scratch: r0-r12 and r14 are all written (saving/restoring is presumably done by M_START/M_END)
    157     ;// Allocate stack memory required by the function (pDataOnStack: slot used below to spill pData)
    158         M_ALLOC4    pDataOnStack, 4
    159     ;// M_START/M_END are macros defined outside this file; the r11 argument presumably
    160     ;// names the highest register to preserve - TODO confirm in armCOMM_s.h. Write function header
    161         M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11
    162 
    163         ;******************************************************************
    164         ;// The strategy used in implementing the transform is as follows:*
    165         ;// Load the 4x4 block into 8 registers                           *
    166         ;// Transpose the 4x4 matrix                                      *
    167         ;// Perform the row operations (on columns) using SIMD            *
    168         ;// Transpose the 4x4 result matrix                               *
    169         ;// Perform the column operations, then dequantize (scale, round) *
    170         ;// Store the 4x4 block at one go                                 *
    171         ;******************************************************************
    172 
    173         ;// Load all the 4x4 pixels
    174 
    175         LDMIA   pData,{in00,in02,in10,in12,in20,in22,in30,in32}
    176 
    177         ;//*****************************************************************
    178         ;//
    179         ;// Transpose the matrix in order to perform row ops as column ops
    180         ;// Input:   in[][] = original matrix
    181         ;// Output:  trRow[][]= transposed matrix
    182         ;// Step1: Obtain the LL part of the transposed matrix
    183         ;// Step2: Obtain the HL part
    184         ;// step3: Obtain the LH part
    185         ;// Step4: Obtain the HH part
    186         ;//
    187         ;//*****************************************************************
    188 
    189         ;// LL 2x2 transposed matrix
    190         ;//   d0 d1 - -
    191         ;//   d4 d5 - -
    192         ;//   -  -  - -
    193         ;//   -  -  - -
    194 
    195         PKHTB   trRow10,in10,in00,ASR #16               ;// [5 4] = [f5:f1]
    196         PKHBT   trRow00,in00,in10,LSL #16               ;// [1 0] = [f4:f0]
    197 
    198         ;// HL 2x2 transposed matrix
    199         ;//    -   -   - -
    200         ;//    -   -   - -
    201         ;//    d8  d9  - -
    202         ;//   d12 d13  - -
    203 
    204 
    205          PKHTB   trRow30,in12,in02,ASR #16              ;// [13 12] = [7 3]
    206          PKHBT   trRow20,in02,in12,LSL #16              ;// [9 8] = [6 2]
    207 
    208         ;// LH 2x2 transposed matrix
    209         ;//   - - d2 d3
    210         ;//   - - d6 d7
    211         ;//   - - -  -
    212         ;//   - - -  -
    213 
    214         PKHBT   trRow02,in20,in30,LSL #16               ;// [3 2] = [f12:f8]
    215         PKHTB   trRow12,in30,in20,ASR #16               ;// [7 6] = [f13:f9]
    216 
    217 
    218 
    219 
    220         ;// HH 2x2 transposed matrix
    221         ;//    - -   -   -
    222         ;//    - -   -   -
    223         ;//    - -  d10 d11
    224         ;//    - -  d14 d15
    225 
    226         PKHTB   trRow32,in32,in22,ASR #16               ;// [15 14] = [15 11]
    227         PKHBT   trRow22,in22,in32,LSL #16               ;// [11 10] = [14 10]
    228 
    229 
    230         ;****************************************
    231         ;// Row Operations (Performed on columns)
    232         ;****************************************
    233         ;// Each SADD16/SSUB16 works on both 16-bit halves at once, so every
    234         ;// instruction below processes two rows of the original matrix in parallel
    235         ;// SIMD operations on first two columns(two rows of the original matrix)
    236 
    237         SADD16      rowSum1,trRow00,trRow10                ;// (c0+c1)
    238         SADD16      rowSum2,trRow20,trRow30                ;// (c2+c3)
    239         SSUB16      rowDiff1,trRow00,trRow10               ;// (c0-c1)
    240         SSUB16      rowDiff2,trRow20,trRow30               ;// (c2-c3)
    241         SADD16      rowOp00,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
    242         SSUB16      rowOp10,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
    243         SSUB16      rowOp20,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
    244         SADD16      rowOp30,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)
    245 
    246 
    247         ;// SIMD operations on next two columns(next two rows of the original matrix)
    248 
    249         SADD16      rowSum1,trRow02,trRow12                ;// (c0+c1)
    250         SADD16      rowSum2,trRow22,trRow32                ;// (c2+c3)
    251         SSUB16      rowDiff1,trRow02,trRow12               ;// (c0-c1)
    252         SSUB16      rowDiff2,trRow22,trRow32               ;// (c2-c3)
    253         SADD16      rowOp02,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
    254         SSUB16      rowOp12,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
    255         SSUB16      rowOp22,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
    256         SADD16      rowOp32,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)
    257 
    258 
    259 
    260         ;*****************************************************************
    261         ;// Transpose the resultant matrix
    262         ;// Input:  rowOp[][]
    263         ;// Output: trCol[][]
    264         ;*****************************************************************
    265 
    266         ;// LL 2x2 transposed matrix
    267         ;//   d0 d1 - -
    268         ;//   d4 d5 - -
    269         ;//   -  -  - -
    270         ;//   -  -  - -
    271 
    272         PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// [5 4] = [f5:f1]
    273         PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// [1 0] = [f4:f0]
    274 
    275         ;// HL 2x2 transposed matrix
    276         ;//    -   -   - -
    277         ;//    -   -   - -
    278         ;//    d8  d9  - -
    279         ;//   d12 d13  - -
    280 
    281 
    282          PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// [13 12] = [7 3]
    283          PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// [9 8] = [6 2]
    284 
    285         ;// LH 2x2 transposed matrix
    286         ;//   - - d2 d3
    287         ;//   - - d6 d7
    288         ;//   - - -  -
    289         ;//   - - -  -
    290 
    291         PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// [3 2] = [f12:f8]
    292         PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// [7 6] = [f13:f9]
    293 
    294 
    295 
    296 
    297         ;// HH 2x2 transposed matrix
    298         ;//    - -   -   -
    299         ;//    - -   -   -
    300         ;//    - -  d10 d11
    301         ;//    - -  d14 d15
    302 
    303         PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// [15 14] = [15 11]
    304         PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// [11 10] = [14 10]
    305 
    306 
    307         ;*******************************
    308         ;// Column Operations
    309         ;*******************************
    310 
    311         ;//--------------------------------------------------------------------------------------
    312         ;// Store pData(RN0) on stack and restore it only at the final store back
    313         ;// This frees up a register (RN0) which is used to reduce number of intermediate stalls
    314         ;//--------------------------------------------------------------------------------------
    315         M_STR       pData,pDataOnStack
    316 
    317 
    318         ;// SIMD operations on first two columns(two rows of the original matrix)
    319 
    320         SADD16      colSum1,trCol00,trCol10                ;// (c0+c1)
    321         SADD16      colSum2,trCol20,trCol30                ;// (c2+c3)
    322         SSUB16      colDiff1,trCol00,trCol10               ;// (c0-c1)
    323         SSUB16      colDiff2,trCol20,trCol30               ;// (c2-c3)
    324         SADD16      colOp00,colSum1,colSum2                ;// (c0+c1+c2+c3)
    325         SSUB16      colOp10,colSum1,colSum2                ;// (c0+c1-c2-c3)
    326         SSUB16      colOp20,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
    327         SADD16      colOp30,colDiff1,colDiff2              ;// (c0-c1+c2-c3)
    328 
    329 
    330         ;// SIMD operations on next two columns(next two rows of the original matrix)
    331         ;// The table loads are interleaved here; each one overwrites a register only after its last use
    332         LDR         pQPDivTable, =armVCM4P10_QPDivTable    ;// QP Division look-up-table base pointer
    333         SADD16      colSum1,trCol02,trCol12                ;// (c0+c1)
    334         SADD16      colSum2,trCol22,trCol32                ;// (c2+c3)
    335         SSUB16      colDiff1,trCol02,trCol12               ;// (c0-c1)
    336         SSUB16      colDiff2,trCol22,trCol32               ;// (c2-c3)
    337         SADD16      colOp02,colSum1,colSum2                ;// (c0+c1+c2+c3)
    338         SSUB16      colOp12,colSum1,colSum2                ;// (c0+c1-c2-c3)
    339         LDR         pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer (r11, colSum2 is dead)
    340         LDRSB       Shift, [pQPDivTable, QP]               ;// Shift = pQPDivTable[QP]  (r10, colSum1 is dead)
    341         SSUB16      colOp22,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
    342         SADD16      colOp32,colDiff1,colDiff2              ;// (c0-c1+c2-c3)
    343 
    344 
    345         LDRSB       Scale, [pQPModTable, QP]               ;// Scale = pQPModTable[QP]  (r14, colDiff2 is dead)
    346 
    347         ;//----------------------------------------------------------------------
    348         ;//
    349         ;// <Dequantize> improves on the c-reference code
    350         ;// Both the  cases i.e., Shift>=0 and Shift<0 cases are covered together
    351         ;// We do not subtract 2 from Shift as in C reference, instead perform a
    352         ;// Scale << Shift once in the beginning and do a right shift by a
    353         ;// constant 2 after the Multiplication. The value of Round would be 2
    354         ;//
    355         ;// By doing this we avoid the Branches required and also
    356         ;// reduce the code size substantially
    357         ;//
    358         ;//----------------------------------------------------------------------
    359 
    360         MOV         Round, #2                               ;// Round = 2
    361         LSL         Scale, Scale, Shift                     ;// Scale = Scale << Shift
    362         ;// Bottom-half results (Temp1/Temp3) get an explicit ASR #2; top-half results (Temp2/Temp4)
    363         ;// are shifted implicitly: PKHBT with LSL #14 keeps bits [17:2], i.e. (value >> 2)[15:0]
    364         ;// Row 1
    365         SMLABB  temp1, colOp00, Scale, Round                ;// Temp1 = B(c0w0) * Scale + Round
    366         SMLABB  temp3, colOp02, Scale, Round                ;// Temp3 = B(c1w0) * Scale + Round
    367         SMLATB  temp2, colOp00, Scale, Round                ;// Temp2 = T(c0w0) * Scale + Round
    368         SMLATB  temp4, colOp02, Scale, Round                ;// Temp4 = T(c1w0) * Scale + Round (overwrites QP in r1)
    369 
    370         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
    371         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
    372         PKHBT   out00,  temp1, temp2, LSL #14               ;// c0w0  = | Temp2 >> 2 | Temp1 |
    373         PKHBT   out02,  temp3, temp4, LSL #14               ;// c1w0  = | Temp4 >> 2 | Temp3 |
    374 
    375 
    376         ;// Row 2
    377         SMLABB  temp1, colOp10, Scale, Round                ;// Temp1 = B(c0w1) * Scale + Round
    378         SMLABB  temp3, colOp12, Scale, Round                ;// Temp3 = B(c1w1) * Scale + Round
    379         SMLATB  temp2, colOp10, Scale, Round                ;// Temp2 = T(c0w1) * Scale + Round
    380         SMLATB  temp4, colOp12, Scale, Round                ;// Temp4 = T(c1w1) * Scale + Round
    381 
    382         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
    383         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
    384         PKHBT   out10,  temp1, temp2, LSL #14               ;// c0w1  = | Temp2 >> 2 | Temp1 |
    385         PKHBT   out12,  temp3, temp4, LSL #14               ;// c1w1  = | Temp4 >> 2 | Temp3 |
    386 
    387         ;// Row 3
    388         SMLABB  temp1, colOp20, Scale, Round                ;// Temp1 = B(c0w2) * Scale + Round
    389         SMLABB  temp3, colOp22, Scale, Round                ;// Temp3 = B(c1w2) * Scale + Round
    390         SMLATB  temp2, colOp20, Scale, Round                ;// Temp2 = T(c0w2) * Scale + Round
    391         SMLATB  temp4, colOp22, Scale, Round                ;// Temp4 = T(c1w2) * Scale + Round
    392 
    393         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
    394         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
    395         PKHBT   out20,  temp1, temp2, LSL #14               ;// c0w2  = | Temp2 >> 2 | Temp1 |
    396         PKHBT   out22,  temp3, temp4, LSL #14               ;// c1w2  = | Temp4 >> 2 | Temp3 |
    397 
    398         ;// Row 4
    399         SMLABB  temp1, colOp30, Scale, Round                ;// Temp1 = B(c0w3) * Scale + Round
    400         SMLABB  temp3, colOp32, Scale, Round                ;// Temp3 = B(c1w3) * Scale + Round
    401         SMLATB  temp2, colOp30, Scale, Round                ;// Temp2 = T(c0w3) * Scale + Round
    402         SMLATB  temp4, colOp32, Scale, Round                ;// Temp4 = T(c1w3) * Scale + Round
    403         ;// Round (r0) has had its last use above, so r0 can take pData back
    404         M_LDR   pData,pDataOnStack                          ;// Restore pData pointer from stack
    405         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
    406         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
    407         PKHBT   out30,  temp1, temp2, LSL #14               ;// c0w3  = | Temp2 >> 2 | Temp1 |
    408         PKHBT   out32,  temp3, temp4, LSL #14               ;// c1w3  = | Temp4 >> 2 | Temp3 |
    409 
    410 
    411 
    412         ;***************************
    413         ;// Store all the 4x4 pixels
    414         ;***************************
    415 
    416 store_coeff
    417         ;// (the store_coeff label above is not referenced anywhere else in this file)
    418         STMIA   pData,{out00,out02,out10,out12,out20,out22,out30,out32}
    419 
    420 
    421 
    422         ;// Set return value: none is set by this function
    423 
    424 
    425         ;// Write function tail
    426         M_END
    427 
    428     ENDIF                                                           ;//ARM1136JS
    429 
    430 
    431 ;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
    432 
    433 ;// Guarding implementation by the processor name
    434 
    435 
    436 
    437 
    438 ;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
    439 ;// Runs armVCM4P10_UnpackBlock4x4, then armVCM4P10_InvTransformDequantLumaDC4x4 on pDst; returns OMX_Sts_NoErr
    440 ;// Registers holding the arguments on entry
    441 ppSrc               RN  0
    442 pDst                RN  1
    443 QPR2                RN  2
    444 
    445 ;// Register holding the status code on exit
    446 result              RN  0
    447 
    448 ;// Registers used inside the function
    449 QPKeep              RN  5
    450 pDstKeep            RN  4
    451 QPArg               RN  1
    452 pBlockArg           RN  0
    453 
    454 ;// Assembled only when the ARM1136JS variant is selected
    455 
    456     IF ARM1136JS
    457 
    458     ;// No stack locals are allocated for this function
    459 
    460 
    461     ;// Function prologue
    462         M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
    463 
    464         MOV     QPKeep,QPR2                         ;// Park QP (r2) in r5 ahead of the first call
    465         MOV     pDstKeep,pDst                       ;// Park pDst (r1) in r4; r0/r1 pass through unchanged
    466         BL      armVCM4P10_UnpackBlock4x4
    467 
    468         MOV     QPArg,QPKeep                        ;// r1 = QP for the second call
    469         MOV     pBlockArg,pDstKeep                  ;// r0 = pDst, the block to transform in place
    470         BL      armVCM4P10_InvTransformDequantLumaDC4x4
    471 
    472 
    473         ;// Report success unconditionally
    474         MOV     result,#OMX_Sts_NoErr
    475 
    476         ;// Function epilogue
    477         M_END
    478 
    479 
    480     ENDIF                                                           ;//ARM1136JS
    481 
    482 
    483     END
    484