Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   9641
      6 ;// Date:       Thursday, February 7, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 ;// Description:
     13 ;// H.264 inverse quantize and transform module
     14 ;//
     15 ;//
     16 
     17 ;// Include standard headers
     18 
     19         INCLUDE omxtypes_s.h
     20         INCLUDE armCOMM_s.h
     21 
     22 ;// Import/Export symbols required from/to other files
     23 ;// (For example tables)
     24 
     25         IMPORT armVCM4P10_UnpackBlock4x4
     26         IMPORT armVCM4P10_QPDivTable
     27         IMPORT armVCM4P10_VMatrixQPModTable
     28 
     29         M_VARIANTS ARM1136JS
     30 
     31 ;// Set debugging level
     32 ;//DEBUG_ON    SETL {TRUE}
     33 
     34 
     35 ;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
     36 
     37 
     38 ;// Guarding implementation by the processor name
     39 
     40     IF  ARM1136JS
     41 
     42 
     43 ;//Input Registers
     44 pData               RN  0                   ;// Block pointer; shares r0 with pQPDivTable and Round, so it is spilled to the stack during the column stage
     45 QP                  RN  1                   ;// Index into both lookup tables; r1 is reused as temp4 after the last lookup
     46 
     47 ;//Output Registers
     48 ;// (none: the result is stored back through pData)
     49 
     50 ;//Local Scratch Registers
     51 ;// The same physical registers are renamed stage by stage below; an alias is only meaningful within its own stage
     52 ;// Packed input values (two 16-bit elements per register)
     53 in00                RN  2                   ;// Src[0] & Src[1]
     54 in02                RN  3                   ;// Src[2] & Src[3]
     55 in10                RN  4                   ;// Src[4] & Src[5]
     56 in12                RN  5                   ;// Src[6] & Src[7]
     57 in20                RN  6                   ;// Src[8] & Src[9]
     58 in22                RN  7                   ;// Src[10] & Src[11]
     59 in30                RN  8                   ;// Src[12] & Src[13]
     60 in32                RN  9                   ;// Src[14] & Src[15]
     61 
     62 ;// Transpose for Row operations (Rows to cols)
     63 trRow00             RN  2
     64 trRow10             RN  10
     65 trRow02             RN  3
     66 trRow12             RN  5
     67 trRow20             RN  11
     68 trRow30             RN  12
     69 trRow32             RN  14                  ;// r14 (link register) is used as scratch here
     70 trRow22             RN  7
     71 
     72 ;// Intermediate calculations (row stage sums and differences)
     73 rowSum1             RN  4
     74 rowSum2             RN  6
     75 rowDiff1            RN  8
     76 rowDiff2            RN  9
     77 
     78 
     79 ;// Row operated values (each overwrites the trRow alias held in the same register)
     80 rowOp00             RN  2
     81 rowOp10             RN  10
     82 rowOp20             RN  11
     83 rowOp30             RN  12
     84 rowOp02             RN  3
     85 rowOp12             RN  5
     86 rowOp22             RN  7
     87 rowOp32             RN  14
     88 
     89 ;// Transpose for column operations
     90 trCol00             RN  2
     91 trCol02             RN  3
     92 trCol10             RN  4
     93 trCol12             RN  5
     94 trCol20             RN  6
     95 trCol22             RN  7
     96 trCol30             RN  8
     97 trCol32             RN  9
     98 
     99 ;// Intermediate calculations (column stage sums and differences)
    100 colSum1             RN  10
    101 colSum2             RN  11
    102 colDiff1            RN  12
    103 colDiff2            RN  14
    104 
    105 
    106 ;// Column operated values (same registers as the trCol aliases)
    107 colOp00             RN  2
    108 colOp02             RN  3
    109 colOp10             RN  4
    110 colOp12             RN  5
    111 colOp20             RN  6
    112 colOp22             RN  7
    113 colOp30             RN  8
    114 colOp32             RN  9
    115 
    116 ;// Temporary scratch variables (dequantisation)
    117 pQPDivTable         RN  0                   ;// Base of armVCM4P10_QPDivTable; r0 becomes Round afterwards
    118 pQPModTable         RN  11                  ;// Base of armVCM4P10_VMatrixQPModTable; shares r11 with colSum2 and temp2
    119 Shift               RN  10                  ;// Signed byte read from pQPDivTable[QP]; shares r10 with colSum1 and temp1
    120 Scale               RN  14                  ;// Signed byte read from pQPModTable[QP], then shifted left by Shift
    121 Round               RN  0                   ;// Rounding constant (set to 2 in the code)
    122 
    123 temp1               RN  10
    124 temp2                RN  11
    125 temp3               RN  12
    126 temp4               RN  1                   ;// Overwrites QP, which is no longer needed by then
    127 
    128 
    129 
    130 ;// Inverse-transformed and dequantized values (same registers as the colOp aliases)
    131 out00               RN  2
    132 out02               RN  3
    133 out10               RN  4
    134 out12               RN  5
    135 out20               RN  6
    136 out22               RN  7
    137 out30               RN  8
    138 out32               RN  9
    139 
    140 
    141 
    142 
    143     ;// Allocate stack memory required by the function (slot pDataOnStack, used below to spill pData)
    144         M_ALLOC4    pDataOnStack, 4
    145     ;// In: r0 = pData (4x4 block, 16 packed 16-bit values), r1 = QP.  Out: the block is rewritten in place.
    146     ;// Write function header
    147         M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11
    148         ;// NOTE(review): M_START/M_END come from armCOMM_s.h; presumably they save/restore r4-r11 and the return address, since r14 is used as scratch below - confirm
    149         ;******************************************************************
    150         ;// The strategy used in implementing the transform is as follows:*
    151         ;// Load the 4x4 block into 8 registers                           *
    152         ;// Transpose the 4x4 matrix                                      *
    153         ;// Perform the row operations (on columns) using SIMD            *
    154         ;// Transpose the 4x4 result matrix                               *
    155         ;// Perform the column operations, then scale and round           *
    156         ;// Store the 4x4 block at one go                                 *
    157         ;******************************************************************
    158 
    159         ;// Load all 16 elements of the 4x4 block (two 16-bit elements per register)
    160 
    161         LDMIA   pData,{in00,in02,in10,in12,in20,in22,in30,in32}
    162 
    163         ;//*****************************************************************
    164         ;//
    165         ;// Transpose the matrix in order to perform row ops as column ops
    166         ;// Input:   in[][] = original matrix
    167         ;// Output:  trRow[][]= transposed matrix
    168         ;// Step1: Obtain the LL part of the transposed matrix
    169         ;// Step2: Obtain the HL part
    170         ;// Step3: Obtain the LH part
    171         ;// Step4: Obtain the HH part
    172         ;//
    173         ;//*****************************************************************
    174 
    175         ;// LL 2x2 transposed matrix
    176         ;//   d0 d1 - -
    177         ;//   d4 d5 - -
    178         ;//   -  -  - -
    179         ;//   -  -  - -
    180         ;// The PKHTB is issued first because the PKHBT overwrites in00 (trRow00 is the same register)
    181         PKHTB   trRow10,in10,in00,ASR #16               ;// [5 4] = [f5:f1]
    182         PKHBT   trRow00,in00,in10,LSL #16               ;// [1 0] = [f4:f0]
    183 
    184         ;// HL 2x2 transposed matrix
    185         ;//    -   -   - -
    186         ;//    -   -   - -
    187         ;//    d8  d9  - -
    188         ;//   d12 d13  - -
    189 
    190         ;// Must run before the LH step, which overwrites in02 and in12 (trRow02/trRow12 share their registers)
    191          PKHTB   trRow30,in12,in02,ASR #16              ;// [13 12] = [7 3]
    192          PKHBT   trRow20,in02,in12,LSL #16              ;// [9 8] = [6 2]
    193 
    194         ;// LH 2x2 transposed matrix
    195         ;//   - - d2 d3
    196         ;//   - - d6 d7
    197         ;//   - - -  -
    198         ;//   - - -  -
    199 
    200         PKHBT   trRow02,in20,in30,LSL #16               ;// [3 2] = [f12:f8]
    201         PKHTB   trRow12,in30,in20,ASR #16               ;// [7 6] = [f13:f9]
    202 
    203 
    204 
    205 
    206         ;// HH 2x2 transposed matrix
    207         ;//    - -   -   -
    208         ;//    - -   -   -
    209         ;//    - -  d10 d11
    210         ;//    - -  d14 d15
    211         ;// The PKHTB is issued first because the PKHBT overwrites in22 (trRow22 is the same register)
    212         PKHTB   trRow32,in32,in22,ASR #16               ;// [15 14] = [15 11]
    213         PKHBT   trRow22,in22,in32,LSL #16               ;// [11 10] = [14 10]
    214 
    215 
    216         ;****************************************
    217         ;// Row Operations (Performed on columns)
    218         ;****************************************
    219         ;// Add/subtract-only 4-point butterfly; SADD16/SSUB16 process both 16-bit lanes of a register at once
    220 
    221         ;// SIMD operations on first two columns(two rows of the original matrix)
    222 
    223         SADD16      rowSum1,trRow00,trRow10                ;// (c0+c1)
    224         SADD16      rowSum2,trRow20,trRow30                ;// (c2+c3)
    225         SSUB16      rowDiff1,trRow00,trRow10               ;// (c0-c1)
    226         SSUB16      rowDiff2,trRow20,trRow30               ;// (c2-c3)
    227         SADD16      rowOp00,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
    228         SSUB16      rowOp10,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
    229         SSUB16      rowOp20,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
    230         SADD16      rowOp30,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)
    231 
    232 
    233         ;// SIMD operations on next two columns(next two rows of the original matrix)
    234         ;// rowSum1/rowSum2/rowDiff1/rowDiff2 are reused; the first half's results already sit in rowOp00..rowOp30
    235         SADD16      rowSum1,trRow02,trRow12                ;// (c0+c1)
    236         SADD16      rowSum2,trRow22,trRow32                ;// (c2+c3)
    237         SSUB16      rowDiff1,trRow02,trRow12               ;// (c0-c1)
    238         SSUB16      rowDiff2,trRow22,trRow32               ;// (c2-c3)
    239         SADD16      rowOp02,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
    240         SSUB16      rowOp12,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
    241         SSUB16      rowOp22,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
    242         SADD16      rowOp32,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)
    243 
    244 
    245 
    246         ;*****************************************************************
    247         ;// Transpose the resultant matrix
    248         ;// Input:  rowOp[][]
    249         ;// Output: trCol[][]
    250         ;*****************************************************************
    251         ;// Same four-step 2x2-quadrant transpose as above, applied to the row-stage output
    252         ;// LL 2x2 transposed matrix
    253         ;//   d0 d1 - -
    254         ;//   d4 d5 - -
    255         ;//   -  -  - -
    256         ;//   -  -  - -
    257 
    258         PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// [5 4] = [f5:f1]
    259         PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// [1 0] = [f4:f0]
    260 
    261         ;// HL 2x2 transposed matrix
    262         ;//    -   -   - -
    263         ;//    -   -   - -
    264         ;//    d8  d9  - -
    265         ;//   d12 d13  - -
    266 
    267 
    268          PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// [13 12] = [7 3]
    269          PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// [9 8] = [6 2]
    270 
    271         ;// LH 2x2 transposed matrix
    272         ;//   - - d2 d3
    273         ;//   - - d6 d7
    274         ;//   - - -  -
    275         ;//   - - -  -
    276 
    277         PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// [3 2] = [f12:f8]
    278         PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// [7 6] = [f13:f9]
    279 
    280 
    281 
    282 
    283         ;// HH 2x2 transposed matrix
    284         ;//    - -   -   -
    285         ;//    - -   -   -
    286         ;//    - -  d10 d11
    287         ;//    - -  d14 d15
    288 
    289         PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// [15 14] = [15 11]
    290         PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// [11 10] = [14 10]
    291 
    292 
    293         ;*******************************
    294         ;// Column Operations
    295         ;*******************************
    296 
    297         ;//--------------------------------------------------------------------------------------
    298         ;// Store pData(RN0) on stack and restore it only at the final store back
    299         ;// This frees up a register (RN0), reused below as pQPDivTable and then as Round
    300         ;//--------------------------------------------------------------------------------------
    301         M_STR       pData,pDataOnStack
    302 
    303 
    304         ;// SIMD operations on first two columns(two rows of the original matrix)
    305 
    306         SADD16      colSum1,trCol00,trCol10                ;// (c0+c1)
    307         SADD16      colSum2,trCol20,trCol30                ;// (c2+c3)
    308         SSUB16      colDiff1,trCol00,trCol10               ;// (c0-c1)
    309         SSUB16      colDiff2,trCol20,trCol30               ;// (c2-c3)
    310         SADD16      colOp00,colSum1,colSum2                ;// (c0+c1+c2+c3)
    311         SSUB16      colOp10,colSum1,colSum2                ;// (c0+c1-c2-c3)
    312         SSUB16      colOp20,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
    313         SADD16      colOp30,colDiff1,colDiff2              ;// (c0-c1+c2-c3)
    314 
    315 
    316         ;// SIMD operations on next two columns(next two rows of the original matrix)
    317         ;// The table-pointer and table-entry loads are interleaved with the arithmetic; each one reuses a register whose last reader has already issued
    318         LDR         pQPDivTable, =armVCM4P10_QPDivTable    ;// QP Division look-up-table base pointer
    319         SADD16      colSum1,trCol02,trCol12                ;// (c0+c1)
    320         SADD16      colSum2,trCol22,trCol32                ;// (c2+c3)
    321         SSUB16      colDiff1,trCol02,trCol12               ;// (c0-c1)
    322         SSUB16      colDiff2,trCol22,trCol32               ;// (c2-c3)
    323         SADD16      colOp02,colSum1,colSum2                ;// (c0+c1+c2+c3)
    324         SSUB16      colOp12,colSum1,colSum2                ;// (c0+c1-c2-c3)
    325         LDR         pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer (overwrites colSum2, no longer needed)
    326         LDRSB       Shift, [pQPDivTable, QP]               ;// Shift = pQPDivTable[QP] (overwrites colSum1, no longer needed)
    327         SSUB16      colOp22,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
    328         SADD16      colOp32,colDiff1,colDiff2              ;// (c0-c1+c2-c3)
    329 
    330         ;// colDiff2 has been consumed above, so r14 can now take Scale
    331         LDRSB       Scale, [pQPModTable, QP]               ;// Scale = pQPModTable[QP]
    332 
    333         ;//----------------------------------------------------------------------
    334         ;//
    335         ;// <Dequantize> improves on the c-reference code
    336         ;// Both the  cases i.e., Shift>=0 and Shift<0 cases are covered together
    337         ;// We do not subtract 2 from Shift as in C reference, instead perform a
    338         ;// Scale << Shift once in the beginning and do a right shift by a
    339         ;// constant 2 after the Multiplication. The value of Round would be 2
    340         ;// i.e. each element becomes (x * (Scale << Shift) + 2) >> 2, kept to 16 bits
    341         ;// By doing this we avoid the Branches required and also
    342         ;// reduce the code size substantially
    343         ;// NOTE(review): the table contents are not visible in this file; Scale << Shift is assumed to fit in 16 bits - confirm
    344         ;//----------------------------------------------------------------------
    345 
    346         MOV         Round, #2                               ;// Round = 2 (overwrites pQPDivTable, no longer needed)
    347         LSL         Scale, Scale, Shift                     ;// Scale = Scale << Shift
    348         ;// In each row below: bottom halfwords are shifted with ASR #2 explicitly; top halfwords get the same
    349         ;// shift from PKHBT ..., LSL #14, which places bits [17:2] of the product in the upper halfword
    350         ;// Row 1
    351         SMLABB  temp1, colOp00, Scale, Round                ;// Temp1 = B(c0w0) * Scale + Round
    352         SMLABB  temp3, colOp02, Scale, Round                ;// Temp3 = B(c1w0) * Scale + Round
    353         SMLATB  temp2, colOp00, Scale, Round                ;// Temp2 = T(c0w0) * Scale + Round
    354         SMLATB  temp4, colOp02, Scale, Round                ;// Temp4 = T(c1w0) * Scale + Round (temp4 overwrites QP)
    355 
    356         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
    357         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
    358         PKHBT   out00,  temp1, temp2, LSL #14               ;// c0w0  = | Temp2 >> 2 | Temp1 |
    359         PKHBT   out02,  temp3, temp4, LSL #14               ;// c1w0  = | Temp4 >> 2 | Temp3 |
    360 
    361 
    362         ;// Row 2
    363         SMLABB  temp1, colOp10, Scale, Round                ;// Temp1 = B(c0w1) * Scale + Round
    364         SMLABB  temp3, colOp12, Scale, Round                ;// Temp3 = B(c1w1) * Scale + Round
    365         SMLATB  temp2, colOp10, Scale, Round                ;// Temp2 = T(c0w1) * Scale + Round
    366         SMLATB  temp4, colOp12, Scale, Round                ;// Temp4 = T(c1w1) * Scale + Round
    367 
    368         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
    369         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
    370         PKHBT   out10,  temp1, temp2, LSL #14               ;// c0w1  = | Temp2 >> 2 | Temp1 |
    371         PKHBT   out12,  temp3, temp4, LSL #14               ;// c1w1  = | Temp4 >> 2 | Temp3 |
    372 
    373         ;// Row 3
    374         SMLABB  temp1, colOp20, Scale, Round                ;// Temp1 = B(c0w2) * Scale + Round
    375         SMLABB  temp3, colOp22, Scale, Round                ;// Temp3 = B(c1w2) * Scale + Round
    376         SMLATB  temp2, colOp20, Scale, Round                ;// Temp2 = T(c0w2) * Scale + Round
    377         SMLATB  temp4, colOp22, Scale, Round                ;// Temp4 = T(c1w2) * Scale + Round
    378 
    379         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
    380         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
    381         PKHBT   out20,  temp1, temp2, LSL #14               ;// c0w2  = | Temp2 >> 2 | Temp1 |
    382         PKHBT   out22,  temp3, temp4, LSL #14               ;// c1w2  = | Temp4 >> 2 | Temp3 |
    383 
    384         ;// Row 4
    385         SMLABB  temp1, colOp30, Scale, Round                ;// Temp1 = B(c0w3) * Scale + Round
    386         SMLABB  temp3, colOp32, Scale, Round                ;// Temp3 = B(c1w3) * Scale + Round
    387         SMLATB  temp2, colOp30, Scale, Round                ;// Temp2 = T(c0w3) * Scale + Round
    388         SMLATB  temp4, colOp32, Scale, Round                ;// Temp4 = T(c1w3) * Scale + Round
    389         ;// Round (r0) has had its last use above, so r0 can take pData back
    390         M_LDR   pData,pDataOnStack                          ;// Restore pData pointer from stack
    391         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
    392         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
    393         PKHBT   out30,  temp1, temp2, LSL #14               ;// c0w3  = | Temp2 >> 2 | Temp1 |
    394         PKHBT   out32,  temp3, temp4, LSL #14               ;// c1w3  = | Temp4 >> 2 | Temp3 |
    395 
    396 
    397 
    398         ;***************************
    399         ;// Store all 16 elements of the 4x4 block back over the input
    400         ;***************************
    401         ;// The label below is not branched to anywhere in this file
    402 store_coeff
    403 
    404         STMIA   pData,{out00,out02,out10,out12,out20,out22,out30,out32}
    405 
    406 
    407 
    408         ;// Set return value (none is set explicitly; r0 still holds pData at this point)
    409 
    410 
    411         ;// Write function tail
    412         M_END
    413 
    414     ENDIF                                                           ;//ARM1136JS
    415 
    416 
    417 ;// End of static function: armVCM4P10_InvTransformDequantLumaDC4x4
    418 
    419 ;// (The processor-name guard for the next function opens after its register aliases)
    420 
    421 
    422 
    423 
    424 ;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
    425 
    426 ;//Input Registers
    427 ppSrc               RN  0                   ;// Passed through unchanged to armVCM4P10_UnpackBlock4x4
    428 pDst                RN  1                   ;// Destination block; passed to the unpack call, then reused as the transform's pData
    429 QPR2                RN  2                   ;// QP as received in r2
    430 
    431 ;//Output Registers
    432 result              RN  0                   ;// Set to OMX_Sts_NoErr before returning
    433 
    434 ;//Local Scratch Registers
    435 pDstR4              RN  4                   ;// Copy of pDst kept in r4 across the first call
    436 pDstR0              RN  0                   ;// pDst moved into r0 for the second call
    437 QPR1                RN  1                   ;// QP moved into r1 for the second call
    438 QPR5                RN  5                   ;// Copy of QP kept in r5 across the first call
    439 
    440 ;// Guarding implementation by the processor name
    441 
    442     IF ARM1136JS
    443 
    444     ;// Allocate stack memory required by the function
    445 
    446 
    447     ;// Entry point: on entry r0 = ppSrc, r1 = pDst, r2 = QP (see the register aliases above)
    448         M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
    449         ;// Step 1: keep QP and pDst in r5/r4 so they are still available after the unpack call
    450         MOV     QPR5,QPR2                           ;// r5 = QP
    451         MOV     pDstR4,pDst                         ;// r4 = pDst
    452         BL      armVCM4P10_UnpackBlock4x4           ;// r0 (ppSrc) and r1 (pDst) are passed on as received
    453         ;// Step 2: inverse transform and dequantise the unpacked block in place
    454         MOV     QPR1,QPR5                           ;// r1 = QP
    455         MOV     pDstR0,pDstR4                       ;// r0 = pDst
    456         BL      armVCM4P10_InvTransformDequantLumaDC4x4
    457 
    458 
    459         ;// Step 3: always report success
    460         MOV     result,#OMX_Sts_NoErr
    461 
    462         ;// Function epilogue
    463         M_END
    464 
    465 
    466     ENDIF                                                           ;//ARM1136JS
    467 
    468 
    469     END