Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// (c) Copyright 2007 ARM Limited. All Rights Reserved.
      3 ;//
      4 ;// Description:
      5 ;// H.264 inverse quantize and transform module
      6 ;//
      7 ;//
      8 
      9 
     10 
     11 ;// Include standard headers
     12 
     13         INCLUDE omxtypes_s.h
     14         INCLUDE armCOMM_s.h
     15 
     16 ;// Import symbols required from other files
     17 ;// (For example tables)
     18 
     19         IMPORT armVCM4P10_UnpackBlock4x4
     20         IMPORT armVCM4P10_TransformResidual4x4
     21         IMPORT armVCM4P10_QPDivTable
     22         IMPORT armVCM4P10_VMatrixU16
     23         IMPORT armVCM4P10_QPModuloTable
     24 
     25     M_VARIANTS ARM1136JS, ARM1136JS_U
     26 
     27 ;// Set debugging level
     28 ;//DEBUG_ON    SETL {TRUE}
     29 
     30 
     31 ;// Static Function: armVCM4P10_DequantLumaAC4x4
     32 
     33 ;// Guarding implementation by the processor name
     34 
      35     IF  ARM1136JS
      36 ;// Variant assembled when ARM1136JS is set; pVRow[0..2] are fetched with three halfword loads
      37 ;//Input Registers
      38 pSrcDst       RN  0                 ;// Block of 8 words (16 halfwords); loaded, scaled and stored back in place
      39 QP            RN  1                 ;// Byte index into the QP modulo and QP div tables
      40 
      41 
      42 ;//Output Registers
      43 ;// (none: this body assigns no result register; the products are stored back through pSrcDst)
      44 
      45 ;//Local Scratch Registers
      46 pQPdiv          RN  4               ;// Address of armVCM4P10_QPDivTable
      47 pQPmod          RN  5               ;// Address of armVCM4P10_QPModuloTable
      48 pVRow           RN  2               ;// Address of armVCM4P10_VMatrixU16, then (after writeback) of the selected row
      49 QPmod           RN  6               ;// Byte offset of the selected row, read from the modulo table
      50 shift           RN  3               ;// Left-shift amount, read from the div table
      51 rowLuma01       RN  1               ;// Shares r1 with QP (QP is not needed after the two table lookups)
      52 rowLuma23       RN  4               ;// Shares r4 with pQPdiv
      53 ;// r5-r12 hold the 16 values, two halfwords per register; r5/r6 are shared with pQPmod/QPmod
      54 SrcDst00        RN  5
      55 SrcDst02        RN  6
      56 SrcDst10        RN  7
      57 SrcDst12        RN  8
      58 SrcDst20        RN  9
      59 SrcDst22        RN  10
      60 SrcDst30        RN  11
      61 SrcDst32        RN  12
      62 
      63 temp1           RN  2               ;// Shares r2 with pVRow
      64 temp2           RN  3               ;// Shares r3 with shift
      65 temp3           RN  14              ;// r14; also used as a scratch while building rowLuma01
      66 
      67 
      68         ;// Allocate stack memory required by the function
      69 ;// NOTE(review): M_START/M_END are macros not defined in this file (presumably armCOMM_s.h); r14 is overwritten below, so they presumably save/restore LR as well as r4-r11 - confirm
      70         ;// Write function header
      71         M_START armVCM4P10_DequantLumaAC4x4,r11
      72 
      73         LDR    pQPmod,=armVCM4P10_QPModuloTable
      74         LDR    pQPdiv,=armVCM4P10_QPDivTable
      75         LDR    pVRow,=armVCM4P10_VMatrixU16
      76 ;// Both lookups are signed-byte loads indexed by QP
      77         LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6
      78         LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
      79 ;// The '!' writeback leaves pVRow at the selected row, so #2 and #4 address its 2nd and 3rd halfwords
      80         LDRH    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [00|0a]
      81         LDRH    temp3,[pVRow,#2]                     ;// temp3     = [00|0b]
      82         LDRH    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c]
      83         ORR     rowLuma01,rowLuma01,temp3,LSL #16    ;// rowLuma01 = [0b|0a]
      84 ;// The LDMIA overwrites r5/r6 (pQPmod/QPmod); both were last read by the LDRSB/LDRH above
      85         ;// Load all the 16 'src' values
      86         LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
      87 
      88 
      89         ;//*********************************************************************************************
      90         ;//
      91         ;// 'Shift' ranges between [0,8]
      92         ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
      93         ;//
      94         ;//*********************************************************************************************
      95 
      96         LSL    rowLuma01,rowLuma01,shift            ;// rowLuma01 = [b<<Shift | a<<Shift]
      97         LSL    rowLuma23,rowLuma23,shift            ;// rowLuma23 = [00 | c<<Shift]; last use of shift (r3) before temp2 reuses it
      98 
      99 
     100         ;//**********************************************************************************************
     101         ;//
     102         ;// The loop is unrolled completely
     103         ;// All the 16 src values are held in 8 registers, two halfwords each : SrcDst<y><x> (loaded above)
     104         ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
     105         ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
     106         ;// These 3 values are held in rowLuma01 = [pVRow[1]|pVRow[0]] and rowLuma23 (bottom half = pVRow[2])
     107         ;// pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift fits into 16 bits and has been calculated above
     108         ;// Each product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is one SMULxy, where
     109         ;// x/y = B or T selects the bottom or top halfword of the first/second source register
     110         ;//
     111         ;// PKHBT packs the low halfwords of two products into one word; PKHBTs of one row are interleaved with multiplies of the next to avoid pipeline stalls
     112         ;//
     113         ;//**********************************************************************************************
     114 
     115 
     116         ;// Row 1
     117 
     118 
     119         SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
     120         SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)
     121 
     122         SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
     123         SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)
     124 
     125         PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values
     126 
     127 
     128         ;// Row 2
     129         SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
     130         SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)
     131 
     132         PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
     133         SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
     134         SMULBB  SrcDst12,SrcDst12,rowLuma23                    ;// pSrcDst[6] * (pVRow[2]<<Shift)
     135 
     136         PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values
     137 
     138 
     139         ;// Row 3
     140 
     141         SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
     142         SMULBB  SrcDst20,SrcDst20,rowLuma01                    ;// pSrcDst[8] * (pVRow[0]<<Shift)
     143 
     144         PKHBT   SrcDst12,SrcDst12,temp2,LSL #16               ;// Pack the next two product values
     145         SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
     146         SMULBB  SrcDst22,SrcDst22,rowLuma01                    ;// pSrcDst[10] * (pVRow[0]<<Shift)
     147 
     148         PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values
     149 
     150 
     151 
     152         ;// Row 4
     153 
     154         SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
     155         SMULBB  SrcDst30,SrcDst30,rowLuma23                    ;// pSrcDst[12] * (pVRow[2]<<Shift)
     156 ;// temp2 still holds the pSrcDst[11] product until its PKHBT below, so the last product goes to temp3
     157         SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
     158         SMULBB  SrcDst32,SrcDst32,rowLuma23                    ;// pSrcDst[14] * (pVRow[2]<<Shift)
     159 
     160         PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
     161         PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
     162         PKHBT   SrcDst32,SrcDst32,temp3,LSL #16
     163 
     164 ;// Store all 8 packed words back over the input (pSrcDst itself is not advanced)
     165         STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
     166 
     167 
     168         ;// Set return value
     169 ;// (no result register is assigned in this body)
     170 
     171 
     172         ;// Write function tail
     173         M_END
     174 
     175     ENDIF                                                    ;//ARM1136JS
    176 
    177 
    178 ;// Guarding implementation by the processor name
    179 
    180     IF  ARM1136JS_U
    181 
    182 ;//Input Registers
    183 pSrcDst       RN  0
    184 QP            RN  1
    185 
    186 
    187 ;//Output Registers
    188 
    189 
    190 ;//Local Scratch Registers
    191 pQPdiv          RN  4
    192 pQPmod          RN  5
    193 pVRow           RN  2
    194 QPmod           RN  6
    195 shift           RN  3
    196 rowLuma01       RN  1
    197 rowLuma23       RN  4
    198 
    199 SrcDst00        RN  5
    200 SrcDst02        RN  6
    201 SrcDst10        RN  7
    202 SrcDst12        RN  8
    203 SrcDst20        RN  9
    204 SrcDst22        RN  10
    205 SrcDst30        RN  11
    206 SrcDst32        RN  12
    207 
    208 temp1           RN  2
    209 temp2           RN  3
    210 temp3           RN  14
    211 
    212 
    213         ;// Allocate stack memory required by the function
    214 
    215         ;// Write function header
    216         M_START armVCM4P10_DequantLumaAC4x4,r11
    217 
    218         LDR    pQPmod,=armVCM4P10_QPModuloTable
    219         LDR    pQPdiv,=armVCM4P10_QPDivTable
    220         LDR    pVRow,=armVCM4P10_VMatrixU16
    221 
    222         LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6
    223         LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
    224 
    225         LDR    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [0b|0a]
    226         LDR    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [0d|0c]
    227 
    228         ;// Load all the 16 'src' values
    229         LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
    230 
    231 
    232         ;//*********************************************************************************************
    233         ;//
    234         ;// 'Shift' ranges between [0,8]
    235         ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
    236         ;//
    237         ;//*********************************************************************************************
    238 
    239         LSL    rowLuma01,rowLuma01,shift
    240         LSL    rowLuma23,rowLuma23,shift
    241 
    242 
    243         ;//**********************************************************************************************
    244         ;//
    245         ;// The idea is to unroll the Loop completely
    246         ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
    247         ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
    248         ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
    249         ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
    250         ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
    251         ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
    252         ;// Here we interleave the PKHBT operations for various rows  to avoide pipeline stalls
    253         ;//
    254         ;// We then pack the two 16 bit multiplication result into a word and store at one go
    255         ;//
    256         ;//**********************************************************************************************
    257 
    258 
    259         ;// Row 1
    260 
    261 
    262         SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
    263         SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)
    264 
    265         SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
    266         SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)
    267 
    268         PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values
    269 
    270 
    271         ;// Row 2
    272         SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
    273         SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)
    274 
    275         PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
    276         SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
    277         SMULBB  SrcDst12,SrcDst12,rowLuma23                    ;// pSrcDst[6] * (pVRow[2]<<Shift)
    278 
    279         PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values
    280 
    281 
    282         ;// Row 3
    283 
    284         SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
    285         SMULBB  SrcDst20,SrcDst20,rowLuma01                    ;// pSrcDst[8] * (pVRow[0]<<Shift)
    286 
    287         PKHBT   SrcDst12,SrcDst12,temp2,LSL #16               ;// Pack the next two product values
    288         SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
    289         SMULBB  SrcDst22,SrcDst22,rowLuma01                    ;// pSrcDst[10] * (pVRow[0]<<Shift)
    290 
    291         PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values
    292 
    293 
    294 
    295         ;// Row 4
    296 
    297         SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
    298         SMULBB  SrcDst30,SrcDst30,rowLuma23                    ;// pSrcDst[12] * (pVRow[2]<<Shift)
    299 
    300         SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
    301         SMULBB  SrcDst32,SrcDst32,rowLuma23                    ;// pSrcDst[14] * (pVRow[2]<<Shift)
    302 
    303         PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
    304         PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
    305         PKHBT   SrcDst32,SrcDst32,temp3,LSL #16
    306 
    307 
    308         STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
    309 
    310 
    311         ;// Set return value
    312 
    313 
    314 
    315         ;// Write function tail
    316         M_END
    317 
    318     ENDIF                                                    ;//ARM1136JS_U
    319 
    320 
    321 
    322 
    323 
    324 ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
    325 
    326 ;// Guarding implementation by the processor name
    327 
     328     IF  ARM1136JS
     329 ;// Builds a 4x4 block of 16-bit residuals in a stack buffer, adds it to the prediction and stores the clipped bytes to pDst
     330 ;//Input Registers
     331 ppSrc       RN  0                   ;// Left in r0 for the call to armVCM4P10_UnpackBlock4x4
     332 pPred       RN  1                   ;// Prediction; one 32-bit word is read per row
     333 pDC         RN  2                   ;// 16-bit DC value; NULL-checked on the AC path only, always dereferenced on the DC path
     334 pDst        RN  3                   ;// Destination; one 32-bit word is written per row
     335 
     336 
     337 ;//Output Registers
     338 result      RN  0                   ;// Always OMX_Sts_NoErr in this body
     339 
     340 ;//Local Scratch Registers
     341 pDelta      RN  4                   ;// Address of pBuffer; post-incremented by the LDMIAs of the final loop
     342 pDeltaTmp   RN  6                   ;// Not referenced in this body
     343 AC          RN  5                   ;//Load from stack
     344 pPredTemp   RN  7                   ;// Copy of pPred kept in r7 across the calls
     345 pDCTemp     RN  8                   ;// Copy of pDC kept in r8 across the calls
     346 pDstTemp    RN  9                   ;// Copy of pDst kept in r9 across the calls
     347 pDeltaArg1  RN  1
     348 pDeltaArg0  RN  0
     349 QP          RN  1                   ;//Load from stack
     350 DCval       RN  10
     351 DCvalCopy   RN  11                  ;// Second register of the DCval STRD pair (r10,r11)
     352 predstep    RN  1
     353 dstStep     RN  10                  ;// Shares r10 with DCval
     354 ycounter    RN  0
     355 PredVal1    RN  3
     356 PredVal2    RN  5                   ;// Shares r5 with AC
     357 DeltaVal1   RN  2
     358 DeltaVal2   RN  11                  ;// Shares r11 with DCvalCopy
     359 PredVal     RN  8                   ;// Shares r8 with pDCTemp
     360 tmpDeltaVal RN  6
     361 sum1        RN  12
     362 sum2        RN  14
     363 
     364 ;// AC != 0 : unpack -> dequantize -> (if pDC != NULL, coefficient 0 = *pDC) -> inverse transform, all in pBuffer
     365 ;// AC == 0 : pBuffer is filled with the single value (pDC[0] + 32) >> 6
     366     ;// Allocate stack memory required by the function
     367         M_ALLOC8 pBuffer, 32                                ;// 32 bytes = 16 halfword residuals
     368 ;// NOTE(review): M_ALLOC8 presumably guarantees 8-byte alignment, which the STRD stores in DCcase would rely on - confirm
     369 
     370     ;// Write function header
     371         M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11
     372 
     373         ;// Define stack arguments
     374         M_ARG   predStepOnStack, 4
     375         M_ARG   dstStepOnStack,4
     376         M_ARG   QPOnStack, 4
     377         M_ARG   ACOnStack,4
     378 ;// NOTE(review): presumably the 5th to 8th C arguments, in this order - confirm against the prototype
     379 
     380         M_ADR   pDelta,pBuffer
     381         M_LDR   AC,ACOnStack
     382 
     383 
     384         ;// Save registers r1,r2,r3 before function call
     385         MOV     pPredTemp,pPred
     386         MOV     pDCTemp,pDC
     387         MOV     pDstTemp,pDst
     388 
     389         CMP     AC,#0
     390         BEQ     DCcase                                      ;// AC == 0 : skip unpack/dequant/transform
     391         MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
     392 ;// r0 still holds ppSrc here
     393         BL      armVCM4P10_UnpackBlock4x4
     394 
     395         M_LDR   QP,QPOnStack                                ;// Set up r1 for DequantLumaAC4x4
     396         MOV     pDeltaArg0,pDelta                           ;// Set up r0 for DequantLumaAC4x4
     397 
     398         BL      armVCM4P10_DequantLumaAC4x4
     399 
     400 ;// If pDC is non-NULL, *pDC replaces the first halfword of pDelta; the two MOVs in between leave the CMP flags untouched
     401         CMP     pDCTemp,#0
     402         LDRSHNE DCval,[pDCTemp]
     403         MOV     pDeltaArg0,pDelta                           ;// Set up r0 for armVCM4P10_TransformResidual4x4
     404         MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_TransformResidual4x4
     405         STRHNE  DCval,[pDelta]
     406 ;// The same buffer is passed in both r0 and r1
     407         BL      armVCM4P10_TransformResidual4x4
     408         B       OutDCcase
     409 
     410 ;// AC == 0 : every residual is (pDC[0] + 32) >> 6; pDCTemp is dereferenced without a NULL check on this path
     411 DCcase
     412         LDRSH   DCval,[pDCTemp]
     413         ADD     DCval,DCval,#32
     414         ASR     DCval,DCval,#6
     415         PKHBT   DCval,DCval,DCval,LSL #16                  ;// Duplicating the Lower halfword
     416         MOV     DCvalCopy, DCval                           ;// Needed for STRD
     417         STRD    DCval, [pDelta, #0]                        ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
     418         STRD    DCval, [pDelta, #8]                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
     419         STRD    DCval, [pDelta, #16]                       ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
     420         STRD    DCval, [pDelta, #24]                       ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
     421 
     422 ;// Both paths join here with the 16 residuals in pBuffer
     423 OutDCcase
     424         M_LDR   predstep,predStepOnStack
     425         M_LDR   dstStep,dstStepOnStack
     426 ;// tmpDeltaVal = [B A], DeltaVal2 = [D C] : the four halfword residuals of the first row
     427         LDMIA   pDelta!,{tmpDeltaVal,DeltaVal2}             ;// Pre load
     428         MOV     ycounter,#4                                 ;// Counter for the PredPlusDeltaLoop
     429         LDR     PredVal,[pPredTemp]                         ;// Pre load
     430 
     431 PredPlusDeltaLoop
     432 ;// One row (4 pixels) per iteration; the GT-conditional loads and the BGT below all test the flags set by this SUBS
     433 
     434         SUBS    ycounter,ycounter,#1
     435         ADD     pPredTemp,pPredTemp,predstep                ;// Increment pPred ptr
     436 ;// Regroup the residuals as even/odd columns to match the byte lanes produced by UXTB16
     437         PKHBT   DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16     ;// Deltaval1 = [C A]
     438         PKHTB   DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16     ;// DeltaVal2 = [D B]
     439 
     440         UXTB16  PredVal1,PredVal                            ;// PredVal1 = [0c0a]
     441         UXTB16  PredVal2,PredVal,ROR #8                     ;// PredVal2 = [0d0b]
     442 
     443         LDRGT   PredVal,[pPredTemp]                         ;// Pre load
     444 
     445         QADD16  sum2,DeltaVal2,PredVal2                     ;// Add and saturate to 16 bits
     446         QADD16  sum1,DeltaVal1,PredVal1
     447 
     448         USAT16  sum2,#8,sum2                                ;// armClip(0,255,sum2)
     449         USAT16  sum1,#8,sum1
     450 
     451         LDMGTIA   pDelta!,{tmpDeltaVal,DeltaVal2}           ;// Pre load
     452 
     453         ORR     sum1,sum1,sum2,LSL #8                       ;// sum1 = [dcba]
     454         STR     sum1,[pDstTemp]
     455 
     456         ADD     pDstTemp,pDstTemp,dstStep                   ;// Increment pDst ptr
     457         BGT     PredPlusDeltaLoop
     458 
     459 
     460         ;// Set return value
     461         MOV     result,#OMX_Sts_NoErr
     462 
     463 End                                                          ;// Label is not branched to anywhere in this function
     464 
     465 
     466         ;// Write function tail
     467 
     468         M_END
     469 
     470     ENDIF                                                    ;//ARM1136JS
    471 
    472 
    473 ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
    474 
     475 ;// Guarding implementation by the processor name (no further guarded implementation appears before END)
    476 
    477 
    478 
    479 
    480     END
    481