;// m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s

;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import/Export symbols required from/to other files
;// (For example tables)

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixQPModTable

        M_VARIANTS ARM1136JS

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4


;// Guarding implementation by the processor name

    IF  ARM1136JS


;//--------------------------------------------------------------------------
;// Register allocation for armVCM4P10_InvTransformDequantLumaDC4x4.
;//
;// Many of the names below alias the same physical register: each stage of
;// the routine reuses registers whose previous contents are no longer needed.
;// Any reordering of instructions in the routine must be checked against
;// this table, because a later-stage name may overwrite an earlier-stage one.
;//--------------------------------------------------------------------------

;// Input registers
pData               RN  0                   ;// Address of the 4x4 block (loaded from, then stored back to)
QP                  RN  1                   ;// Byte index into the two QP look-up tables

;//Output Registers
;// (none declared)


;//Local Scratch Registers

;// Packed Input pixels (two 16-bit elements per register)
in00                RN  2                   ;// Src[0] & Src[1]
in02                RN  3                   ;// Src[2] & Src[3]
in10                RN  4                   ;// Src[4] & Src[5]
in12                RN  5                   ;// Src[6] & Src[7]
in20                RN  6                   ;// Src[8] & Src[9]
in22                RN  7                   ;// Src[10] & Src[11]
in30                RN  8                   ;// Src[12] & Src[13]
in32                RN  9                   ;// Src[14] & Src[15]

;// Transpose for Row operations (Rows to cols)
;// r2, r3, r5 and r7 are shared with in00, in02, in12 and in22.
trRow00             RN  2
trRow10             RN  10
trRow02             RN  3
trRow12             RN  5
trRow20             RN  11
trRow30             RN  12
trRow32             RN  14                  ;// r14 is used as a general scratch register
trRow22             RN  7

;// Intermediate calculations (row pass); share r4, r6, r8, r9 with in10, in20, in30, in32
rowSum1             RN  4
rowSum2             RN  6
rowDiff1            RN  8
rowDiff2            RN  9


;// Row operated pixels (same registers as the trRow names they replace)
rowOp00             RN  2
rowOp10             RN  10
rowOp20             RN  11
rowOp30             RN  12
rowOp02             RN  3
rowOp12             RN  5
rowOp22             RN  7
rowOp32             RN  14

;// Transpose for column operations
trCol00             RN  2
trCol02             RN  3
trCol10             RN  4
trCol12             RN  5
trCol20             RN  6
trCol22             RN  7
trCol30             RN  8
trCol32             RN  9

;// Intermediate calculations (column pass); share r10, r11, r12, r14 with rowOp10..rowOp32
colSum1             RN  10
colSum2             RN  11
colDiff1            RN  12
colDiff2            RN  14


;// Column operated pixels (same registers as the trCol names they replace)
colOp00             RN  2
colOp02             RN  3
colOp10             RN  4
colOp12             RN  5
colOp20             RN  6
colOp22             RN  7
colOp30             RN  8
colOp32             RN  9

;// Temporary scratch variables
pQPDivTable         RN  0                   ;// Shares r0 with pData and Round
pQPModTable         RN  11                  ;// Shares r11 with colSum2
Shift               RN  10                  ;// Shares r10 with colSum1 and temp1
Scale               RN  14                  ;// Shares r14 with colDiff2
Round               RN  0                   ;// Shares r0 with pData and pQPDivTable

temp1               RN  10
temp2                RN  11
temp3               RN  12
temp4               RN  1                   ;// Shares r1 with QP


;// InvTransformed and Dequantized pixels (same registers as colOp00..colOp32)
out00               RN  2
out02               RN  3
out10               RN  4
out12               RN  5
out20               RN  6
out22               RN  7
out30               RN  8
out32               RN  9

    ;//--------------------------------------------------------------------------
    ;// armVCM4P10_InvTransformDequantLumaDC4x4
    ;//
    ;// Runs a 4-point sum/difference butterfly (all coefficients +1/-1) along
    ;// the rows and then the columns of a 4x4 block of 16-bit values, multiplies
    ;// every result by (Scale << Shift), adds 2, shifts right by 2 and writes the
    ;// sixteen 16-bit results back over the input block.
    ;//
    ;// In:    pData (r0) = address of the block; eight words are loaded from it
    ;//                     and eight words are stored back to it
    ;//        QP    (r1) = byte index used to read Shift from
    ;//                     armVCM4P10_QPDivTable and Scale from
    ;//                     armVCM4P10_VMatrixQPModTable
    ;// Out:   block at pData overwritten in place; no return value is set
    ;// Uses:  r0-r12 and r14 as working registers
    ;//        NOTE(review): M_START ...,r11 / M_END are assumed to save and
    ;//        restore r4-r11 and the return address (r14 is overwritten below);
    ;//        confirm against the macro definitions in armCOMM_s.h.
    ;//--------------------------------------------------------------------------

    ;// Allocate stack memory required by the function
    ;// (slot used to hold pData while r0 is reused as a scratch register)
        M_ALLOC4    pDataOnStack, 4

    ;// Write function header
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11

        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 8 registers                           *
        ;// Transpose the 4x4 matrix                                      *
        ;// Perform the row operations (on columns) using SIMD            *
        ;// Transpose the 4x4 result matrix                               *
        ;// Perform the column operations                                 *
        ;// Store the 4x4 block at one go                                 *
        ;******************************************************************

        ;// Load all the 4x4 pixels (two 16-bit elements per register)
        ;// Notation used in the comments below: [hi : lo] halfwords of a register.
        ;// SrcN is the N-th 16-bit element of the block, assuming the element at
        ;// the lower address of each pair lands in the low halfword
        ;// (little-endian layout) - TODO confirm for the target configuration.

        LDMIA   pData,{in00,in02,in10,in12,in20,in22,in30,in32}

        ;//*****************************************************************
        ;//
        ;// Transpose the matrix in order to perform row ops as column ops
        ;// Input:   in[][] = original matrix
        ;// Output:  trRow[][]= transposed matrix
        ;// Step1: Obtain the LL part of the transposed matrix
        ;// Step2: Obtain the HL part
        ;// Step3: Obtain the LH part
        ;// Step4: Obtain the HH part
        ;//
        ;// Within each pair below, the first instruction writes a fresh register
        ;// and the second overwrites one of its own sources, so the order of the
        ;// two instructions must not be swapped.
        ;//
        ;//*****************************************************************

        ;// LL 2x2 transposed matrix
        ;//   d0 d1 - -
        ;//   d4 d5 - -
        ;//   -  -  - -
        ;//   -  -  - -

        PKHTB   trRow10,in10,in00,ASR #16               ;// trRow10 = [Src5 : Src1]
        PKHBT   trRow00,in00,in10,LSL #16               ;// trRow00 = [Src4 : Src0]

        ;// HL 2x2 transposed matrix
        ;//    -   -   - -
        ;//    -   -   - -
        ;//    d8  d9  - -
        ;//   d12 d13  - -


         PKHTB   trRow30,in12,in02,ASR #16              ;// trRow30 = [Src7 : Src3]
         PKHBT   trRow20,in02,in12,LSL #16              ;// trRow20 = [Src6 : Src2]

        ;// LH 2x2 transposed matrix
        ;//   - - d2 d3
        ;//   - - d6 d7
        ;//   - - -  -
        ;//   - - -  -

        PKHBT   trRow02,in20,in30,LSL #16               ;// trRow02 = [Src12 : Src8]
        PKHTB   trRow12,in30,in20,ASR #16               ;// trRow12 = [Src13 : Src9]


        ;// HH 2x2 transposed matrix
        ;//    - -   -   -
        ;//    - -   -   -
        ;//    - -  d10 d11
        ;//    - -  d14 d15

        PKHTB   trRow32,in32,in22,ASR #16               ;// trRow32 = [Src15 : Src11]
        PKHBT   trRow22,in22,in32,LSL #16               ;// trRow22 = [Src14 : Src10]


        ;****************************************
        ;// Row Operations (Performed on columns)
        ;// Each 16-bit lane carries one row of the original block, so every
        ;// SADD16/SSUB16 processes two rows at once.
        ;// c0..c3 below are the four elements of the row held in a lane.
        ;****************************************


        ;// SIMD operations on first two columns(two rows of the original matrix)

        SADD16      rowSum1,trRow00,trRow10                ;// (c0+c1)
        SADD16      rowSum2,trRow20,trRow30                ;// (c2+c3)
        SSUB16      rowDiff1,trRow00,trRow10               ;// (c0-c1)
        SSUB16      rowDiff2,trRow20,trRow30               ;// (c2-c3)
        SADD16      rowOp00,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
        SSUB16      rowOp10,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
        SSUB16      rowOp20,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
        SADD16      rowOp30,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)


        ;// SIMD operations on next two columns(next two rows of the original matrix)

        SADD16      rowSum1,trRow02,trRow12                ;// (c0+c1)
        SADD16      rowSum2,trRow22,trRow32                ;// (c2+c3)
        SSUB16      rowDiff1,trRow02,trRow12               ;// (c0-c1)
        SSUB16      rowDiff2,trRow22,trRow32               ;// (c2-c3)
        SADD16      rowOp02,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
        SSUB16      rowOp12,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
        SSUB16      rowOp22,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
        SADD16      rowOp32,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)


        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;// Input:  rowOp[][]
        ;// Output: trCol[][]
        ;// Same PKHTB/PKHBT pairing as the first transpose; the second
        ;// instruction of each pair again overwrites one of its own sources.
        ;*****************************************************************

        ;// LL 2x2 transposed matrix
        ;//   d0 d1 - -
        ;//   d4 d5 - -
        ;//   -  -  - -
        ;//   -  -  - -

        PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// trCol10 = [hi(rowOp10) : hi(rowOp00)]
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// trCol00 = [lo(rowOp10) : lo(rowOp00)]

        ;// HL 2x2 transposed matrix
        ;//    -   -   - -
        ;//    -   -   - -
        ;//    d8  d9  - -
        ;//   d12 d13  - -


         PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// trCol30 = [hi(rowOp12) : hi(rowOp02)]
         PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// trCol20 = [lo(rowOp12) : lo(rowOp02)]

        ;// LH 2x2 transposed matrix
        ;//   - - d2 d3
        ;//   - - d6 d7
        ;//   - - -  -
        ;//   - - -  -

        PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// trCol02 = [lo(rowOp30) : lo(rowOp20)]
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// trCol12 = [hi(rowOp30) : hi(rowOp20)]


        ;// HH 2x2 transposed matrix
        ;//    - -   -   -
        ;//    - -   -   -
        ;//    - -  d10 d11
        ;//    - -  d14 d15

        PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// trCol32 = [hi(rowOp32) : hi(rowOp22)]
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// trCol22 = [lo(rowOp32) : lo(rowOp22)]


        ;*******************************
        ;// Column Operations
        ;*******************************

        ;//--------------------------------------------------------------------------------------
        ;// Store pData(RN0) on stack and restore it only at the final store back
        ;// This frees up a register (RN0) which is used to reduce number of intermediate stalls
        ;//--------------------------------------------------------------------------------------
        M_STR       pData,pDataOnStack


        ;// SIMD operations on first two columns(two rows of the original matrix)

        SADD16      colSum1,trCol00,trCol10                ;// (c0+c1)
        SADD16      colSum2,trCol20,trCol30                ;// (c2+c3)
        SSUB16      colDiff1,trCol00,trCol10               ;// (c0-c1)
        SSUB16      colDiff2,trCol20,trCol30               ;// (c2-c3)
        SADD16      colOp00,colSum1,colSum2                ;// (c0+c1+c2+c3)
        SSUB16      colOp10,colSum1,colSum2                ;// (c0+c1-c2-c3)
        SSUB16      colOp20,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
        SADD16      colOp30,colDiff1,colDiff2              ;// (c0-c1+c2-c3)


        ;// SIMD operations on next two columns(next two rows of the original matrix)
        ;// The table loads are interleaved with the arithmetic. Their destinations
        ;// share registers with colSum2 (r11), colSum1 (r10) and colDiff2 (r14), so
        ;// each load is placed after the last read of the value it overwrites.

        LDR         pQPDivTable, =armVCM4P10_QPDivTable    ;// QP Division look-up-table base pointer (r0; pData is on the stack)
        SADD16      colSum1,trCol02,trCol12                ;// (c0+c1)
        SADD16      colSum2,trCol22,trCol32                ;// (c2+c3)
        SSUB16      colDiff1,trCol02,trCol12               ;// (c0-c1)
        SSUB16      colDiff2,trCol22,trCol32               ;// (c2-c3)
        SADD16      colOp02,colSum1,colSum2                ;// (c0+c1+c2+c3)
        SSUB16      colOp12,colSum1,colSum2                ;// (c0+c1-c2-c3)
        LDR         pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer (overwrites colSum2)
        LDRSB       Shift, [pQPDivTable, QP]               ;// Shift = pQPDivTable[QP] (overwrites colSum1)
        SSUB16      colOp22,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
        SADD16      colOp32,colDiff1,colDiff2              ;// (c0-c1+c2-c3)


        LDRSB       Scale, [pQPModTable, QP]               ;// Scale = pQPModTable[QP] (overwrites colDiff2; last read of QP)

        ;//----------------------------------------------------------------------
        ;//
        ;// <Dequantize> improves on the C reference code
        ;// Both cases, i.e. Shift>=0 and Shift<0, are covered together
        ;// We do not subtract 2 from Shift as in the C reference; instead we
        ;// perform a Scale << Shift once in the beginning and do a right shift
        ;// by a constant 2 after the multiplication. The value of Round is 2
        ;//
        ;// By doing this we avoid the branches required and also
        ;// reduce the code size substantially
        ;//
        ;// Per output row: four 16x16 multiply-accumulates produce 32-bit values
        ;// temp1..temp4 = element * B(Scale) + Round. The low-halfword results
        ;// (temp1, temp3) are shifted right by 2 explicitly; the high-halfword
        ;// results (temp2, temp4) get the same >>2 through the LSL #14 of PKHBT,
        ;// which places bits [17:2] of the product in the top halfword.
        ;//
        ;//----------------------------------------------------------------------

        MOV         Round, #2                               ;// Round = 2 (overwrites pQPDivTable, already consumed)
        LSL         Scale, Scale, Shift                     ;// Scale = Scale << Shift


        ;// Row 1
        SMLABB  temp1, colOp00, Scale, Round                ;// temp1 = B(colOp00) * B(Scale) + Round (overwrites Shift)
        SMLABB  temp3, colOp02, Scale, Round                ;// temp3 = B(colOp02) * B(Scale) + Round
        SMLATB  temp2, colOp00, Scale, Round                ;// temp2 = T(colOp00) * B(Scale) + Round
        SMLATB  temp4, colOp02, Scale, Round                ;// temp4 = T(colOp02) * B(Scale) + Round (overwrites QP)

        ASR     temp1, temp1, #2                            ;// temp1 = temp1 >> 2
        ASR     temp3, temp3, #2                            ;// temp3 = temp3 >> 2
        PKHBT   out00,  temp1, temp2, LSL #14               ;// out00 = | temp2 >> 2 | temp1 |
        PKHBT   out02,  temp3, temp4, LSL #14               ;// out02 = | temp4 >> 2 | temp3 |


        ;// Row 2
        SMLABB  temp1, colOp10, Scale, Round                ;// temp1 = B(colOp10) * B(Scale) + Round
        SMLABB  temp3, colOp12, Scale, Round                ;// temp3 = B(colOp12) * B(Scale) + Round
        SMLATB  temp2, colOp10, Scale, Round                ;// temp2 = T(colOp10) * B(Scale) + Round
        SMLATB  temp4, colOp12, Scale, Round                ;// temp4 = T(colOp12) * B(Scale) + Round

        ASR     temp1, temp1, #2                            ;// temp1 = temp1 >> 2
        ASR     temp3, temp3, #2                            ;// temp3 = temp3 >> 2
        PKHBT   out10,  temp1, temp2, LSL #14               ;// out10 = | temp2 >> 2 | temp1 |
        PKHBT   out12,  temp3, temp4, LSL #14               ;// out12 = | temp4 >> 2 | temp3 |

        ;// Row 3
        SMLABB  temp1, colOp20, Scale, Round                ;// temp1 = B(colOp20) * B(Scale) + Round
        SMLABB  temp3, colOp22, Scale, Round                ;// temp3 = B(colOp22) * B(Scale) + Round
        SMLATB  temp2, colOp20, Scale, Round                ;// temp2 = T(colOp20) * B(Scale) + Round
        SMLATB  temp4, colOp22, Scale, Round                ;// temp4 = T(colOp22) * B(Scale) + Round

        ASR     temp1, temp1, #2                            ;// temp1 = temp1 >> 2
        ASR     temp3, temp3, #2                            ;// temp3 = temp3 >> 2
        PKHBT   out20,  temp1, temp2, LSL #14               ;// out20 = | temp2 >> 2 | temp1 |
        PKHBT   out22,  temp3, temp4, LSL #14               ;// out22 = | temp4 >> 2 | temp3 |

        ;// Row 4
        SMLABB  temp1, colOp30, Scale, Round                ;// temp1 = B(colOp30) * B(Scale) + Round
        SMLABB  temp3, colOp32, Scale, Round                ;// temp3 = B(colOp32) * B(Scale) + Round
        SMLATB  temp2, colOp30, Scale, Round                ;// temp2 = T(colOp30) * B(Scale) + Round
        SMLATB  temp4, colOp32, Scale, Round                ;// temp4 = T(colOp32) * B(Scale) + Round

        M_LDR   pData,pDataOnStack                          ;// Restore pData pointer from stack (overwrites Round after its last use)
        ASR     temp1, temp1, #2                            ;// temp1 = temp1 >> 2
        ASR     temp3, temp3, #2                            ;// temp3 = temp3 >> 2
        PKHBT   out30,  temp1, temp2, LSL #14               ;// out30 = | temp2 >> 2 | temp1 |
        PKHBT   out32,  temp3, temp4, LSL #14               ;// out32 = | temp4 >> 2 | temp3 |


        ;***************************
        ;// Store all the 4x4 pixels
        ;***************************

        ;// NOTE: the label below is not referenced anywhere in this file.
store_coeff

        STMIA   pData,{out00,out02,out10,out12,out20,out22,out30,out32}


        ;// Set return value
        ;// (none: this routine does not write a status code)


        ;// Write function tail
        M_END

    ENDIF                                                           ;//ARM1136JS


;// End of static function: armVCM4P10_InvTransformDequantLumaDC4x4

;// Guarding implementation by the processor name


;// Function: omxVCM4P10_TransformDequantLumaDCFromPair

;//--------------------------------------------------------------------------
;// omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Thin wrapper that chains two routines:
;//   1. armVCM4P10_UnpackBlock4x4 is called with r0 and r1 exactly as received.
;//   2. armVCM4P10_InvTransformDequantLumaDC4x4 is called with
;//      r0 = the destination pointer originally passed in r1 and
;//      r1 = the QP value originally passed in r2.
;// Always returns OMX_Sts_NoErr in r0.
;//
;// r4 and r5 hold the destination pointer and QP across the first call.
;// NOTE(review): this relies on armVCM4P10_UnpackBlock4x4 leaving r4 and r5
;// unchanged on return - confirm against that routine.
;//--------------------------------------------------------------------------

;// Arguments as received
argSrc              RN  0
argDst              RN  1
argQP               RN  2

;// Copies kept across the call to armVCM4P10_UnpackBlock4x4
keptDst             RN  4
keptQP              RN  5

;// Arguments handed to armVCM4P10_InvTransformDequantLumaDC4x4
xformData           RN  0
xformQP             RN  1

;// Status code returned to the caller
status              RN  0

;// Implementation is provided only for the ARM1136JS variant

    IF ARM1136JS

        ;// Function header
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        ;// Stage 1: unpack; r0 and r1 pass straight through to the callee
        MOV     keptQP,argQP                        ;// r5 = QP
        MOV     keptDst,argDst                      ;// r4 = destination block
        BL      armVCM4P10_UnpackBlock4x4

        ;// Stage 2: transform and dequantize the unpacked block in place
        MOV     xformQP,keptQP                      ;// r1 = QP
        MOV     xformData,keptDst                   ;// r0 = destination block
        BL      armVCM4P10_InvTransformDequantLumaDC4x4

        ;// Report success
        MOV     status,#OMX_Sts_NoErr

        ;// Function tail
        M_END

    ENDIF                                                           ;//ARM1136JS


    END