Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   12290
      6 ;// Date:       Wednesday, April 9, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 
     13         INCLUDE omxtypes_s.h
     14         INCLUDE armCOMM_s.h
     15 
     16         M_VARIANTS CortexA8
     17 
     18 
     19 ;//-------------------------------------------------------
     20 ;// Jump table that implements a C-style switch on predMode
     21 ;// by the method of two levels of indexing: the table base is
        ;// loaded into pTable first, then the word at index predMode
        ;// is loaded straight into pc by the dispatch instruction.
        ;// Each entry is the address of one case label.
        ;// Entry order (index 0..3): VERT, HOR, DC, PLANE.
     22 ;//-------------------------------------------------------
     23 
     24     M_TABLE armVCM4P10_pIndexTable16x16
     25     DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR
     26     DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE
     27 
     28 
     29     IF CortexA8
     30 
     31     M_TABLE armVCM4P10_MultiplierTable16x16,1
            ;// Three rows of eight 16-bit constants, read consecutively by the
            ;// PLANE case (the pointer is post-incremented between the loads):
            ;//   row 0: per-lane weights applied to the above/left differences
            ;//          when accumulating the H and V gradient sums
            ;//   row 1: x = 0..7,  multiplied by b for the first 8 pixels of a row
            ;//   row 2: x = 8..15, multiplied by b for the last 8 pixels of a row
            ;// NOTE(review): the meaning of the second M_TABLE argument (1) is set
            ;// by the macro in armCOMM_s.h, which is not visible here -- confirm.
     32     DCW   7,  6,  5,  4,  3,  2,  1,  8
     33     DCW   0,  1,  2,  3,  4,  5,  6,  7
     34     DCW   8,  9, 10, 11, 12, 13, 14, 15
     35 
     36 ;//--------------------------------------------
     37 ;// Constants
     38 ;//--------------------------------------------
     39 BLK_SIZE        EQU 0x10            ;// 16: initial value of the row counter y
        ;// NOTE(review): none of the constants below is referenced by an
        ;// instruction in the CortexA8 code visible in this file -- confirm
        ;// whether they are still needed.
     40 MUL_CONST0      EQU 0x01010101
     41 MUL_CONST1      EQU 0x00060004
     42 MUL_CONST2      EQU 0x00070005
     43 MUL_CONST3      EQU 0x00030001
     44 MASK_CONST      EQU 0x00FF00FF
     45 
     46 ;//--------------------------------------------
     47 ;// Scratch variable
        ;//
        ;// Core-register aliases used as temporaries. Some aliases share a
        ;// physical register; they are never live at the same time in the
        ;// code below:
        ;//   r0  : return     (also pSrcLeft, declared with the input registers)
        ;//   r9  : pTable     and pMultTable
        ;//   r11 : count      (also pTmp2, declared with the input registers)
     48 ;//--------------------------------------------
     49 y               RN 12               ;// remaining-rows counter for the HOR and PLANE loops
     50 pc              RN 15               ;// named so the dispatch can load a case address into it
     51 
     52 return          RN 0                ;// status value written just before each case exits
     53 pTable          RN 9                ;// base of armVCM4P10_pIndexTable16x16
     54 count           RN 11               ;// DC case: number of available neighbours (0, 1 or 2)
     55 pMultTable      RN 9                ;// PLANE case: walks armVCM4P10_MultiplierTable16x16
     56 ; ----------------------------------------------
     57 ; Neon registers
        ;
        ; Symbolic names for the NEON registers, each with the datatype the
        ; instructions should use. Many names are different views of, or plain
        ; aliases for, the same physical register (a Q register overlaps two
        ; D registers: Qn = D(2n), D(2n+1)). Examples the code relies on:
        ;   Q0  : qAbove, qSum8, qSum0, qTmp   (D0/D1: dAbove0/1, dSum80/81, ...)
        ;   Q1  : qLeft, qSum1                 (D2/D3: dLeft0/1)
        ;   Q3  : qOut, qAbove15minus0         (D6/D7: dSumLeft/dSumAbove, dOut0/1)
        ;   Q4  : qDiffAbove, qAboveDiff       (D8 is also dSum)
        ;   Q6  : qB, qB1        Q7 : qC, qLeft15minus0
        ;   Q8  : qLeftDiff, qDiffLeft
        ;   Q11 : qH, qA, qHV, qHV0            Q12 : qV, qHV1, qMultiplier1
        ; The instruction order inside each prediction case depends on this
        ; overlap, so statements must not be reordered without checking it.
     58 ; ----------------------------------------------
     59 qAbove          QN Q0.U8
     60 qLeft           QN Q1.U8
     61 qSum8           QN Q0.U16
     62 dSum80          DN D0.U16
     63 dSum81          DN D1.U16
     64 dSum4           DN D0.U16
     65 dSum2           DN D0.U32
     66 dSum1           DN D0.U64
     67 qOut            QN Q3.U8
     68 dSumLeft        DN D6.U64
     69 dSumAbove       DN D7.U64
     70 dSum            DN D8.U64
     71 dSum0           DN D8.U8[0]
     72 
        ;// Names from here on are used by the PLANE case only.
     73 qH              QN Q11.S32
     74 qV              QN Q12.S32
     75 qA              QN Q11.S16
     76 qB              QN Q6.S16
     77 qC              QN Q7.S16
     78 
     79 qB0             QN Q5.S16
     80 qB1             QN Q6.S16
     81 dA1             DN D23.S16
     82 
     83 dH0             DN D22.S32
     84 dH1             DN D23.S32
     85 dV0             DN D24.S32
     86 dV1             DN D25.S32
     87 
     88 qHV             QN Q11.S64
     89 qHV0            QN Q11.S32
     90 qHV1            QN Q12.S64
     91 
     92 dHV00           DN D22.S32
     93 dHV01           DN D23.S32
     94 
     95 dHV0            DN D22.S16[0]
     96 dHV1            DN D23.S16[0]
     97 dHV10           DN D24.S64
     98 dHV11           DN D25.S64
     99 
    100 qSum0           QN Q0.S16
    101 qSum1           QN Q1.S16
    102 
    103 dOut0           DN D6.U8
    104 dOut1           DN D7.U8
    105 
    106 dLeft0          DN D2.U8
    107 dLeft1          DN D3.U8
    108 qConst          QN Q13.S16
    109 
    110 dAbove0         DN D0.U8
    111 dAbove1         DN D1.U8
    112 
    113 dRevLeft64      DN D12.U64
    114 dRevLeft        DN D12.U8
    115 dRevAbove64     DN D5.U64
    116 dRevAbove       DN D5.U8
    117 qLeftDiff       QN Q8.S16
    118 dLeftDiff1      DN D17.S16
    119 dLeftDiff64     DN D17.S64
    120 qDiffLeft       QN Q8.S16
    121 qDiffAbove      QN Q4.S16
    122 dAboveDiff1     DN D9.S16
    123 dAboveDiff64    DN D9.S64
    124 qAboveDiff      QN Q4.S16
    125 
    126 dAboveLeft      DN D4.U8
    127 
    128 dDiffLeft0      DN D16.S16
    129 dDiffLeft1      DN D17.S16
    130 dDiffAbove0     DN D8.S16
    131 dDiffAbove1     DN D9.S16
    132 
    133 qLeft15minus0   QN Q7.S16
    134 dLeft15minus0   DN D14.S16
    135 qAbove15minus0  QN Q3.S16
    136 dAbove15minus0  DN D6.S16
    137 
    138 qMultiplier     QN Q10.S16
    139 qMultiplier0    QN Q10.S16
    140 qMultiplier1    QN Q12.S16
    141 dMultiplier0    DN D20.S16
    142 dMultiplier1    DN D21.S16
    143 
    144 dBPlusCMult7    DN D1.S64
    145 dBPlusCMult7S16 DN D1.S16
    146 
    147 qTmp            QN Q0.U8
    148 
    149 ;//--------------------------------------------
    150 ;// Declare input registers
        ;//
        ;// r0-r3 hold the first four arguments on entry. r4-r7 are filled
        ;// by the function itself from the stack arguments LeftStep, DstStep,
        ;// PredMode and Availability (see the M_LDR sequence after M_START).
    151 ;//--------------------------------------------
    152 pSrcLeft        RN 0    ;// input pointer: left neighbours, one byte every leftStep bytes
    153 pSrcAbove       RN 1    ;// input pointer: 16 bytes of the row above
    154 pSrcAboveLeft   RN 2    ;// input pointer: one byte, read by the PLANE case only
    155 pDst            RN 3    ;// output pointer: 16 rows of 16 bytes, dstStep bytes apart
    156 leftStep        RN 4    ;// input variable (loaded from the stack)
    157 dstStep         RN 5    ;// input variable (loaded from the stack)
    158 predMode        RN 6    ;// input variable (loaded from the stack), index into the jump table
    159 availability    RN 7    ;// input variable (loaded from the stack), tested by the DC case only
    160 
        ;// Extra pointers/strides used to process two rows per step.
    161 pTmp            RN 8
    162 step            RN 10
    163 pTmp2           RN 11
    164 
    165 ;//-----------------------------------------------------------------------------------------------
    166 ;// omxVCM4P10_PredictIntra_16x16 starts
        ;//
        ;// Fills a 16x16 byte block at pDst with intra-predicted pixels.
        ;//
        ;// Register arguments:
        ;//   r0 pSrcLeft       left neighbours, read with stride leftStep
        ;//   r1 pSrcAbove      16 bytes of the row above
        ;//   r2 pSrcAboveLeft  single above-left byte (read by the PLANE case only)
        ;//   r3 pDst           destination, written with stride dstStep
        ;// Stack arguments (4 bytes each, declared in this order):
        ;//   LeftStep, DstStep, PredMode, Availability
        ;//
        ;// predMode selects the case through armVCM4P10_pIndexTable16x16
        ;// (index 0 VERT, 1 HOR, 2 DC, 3 PLANE). availability is tested only
        ;// by the DC case. Every case sets r0 to OMX_Sts_NoErr before leaving.
        ;//
        ;// NOTE(review): no argument validation is visible here; predMode is
        ;// used unchecked as a table index -- confirm callers guarantee 0..3.
    167 ;//-----------------------------------------------------------------------------------------------
    168 
    169         ;// Write function header
                ;// NOTE(review): r11 and d15 presumably tell M_START the highest core
                ;// and NEON registers to preserve (macro is in armCOMM_s.h) -- confirm.
    170         M_START omxVCM4P10_PredictIntra_16x16, r11, d15
    171 
    172         ;// Define stack arguments
    173         M_ARG    LeftStep,     4
    174         M_ARG    DstStep,      4
    175         M_ARG    PredMode,     4
    176         M_ARG    Availability, 4
    177 
    178         ;// M_STALL ARM1136JS=4
    179 
    180         LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// Load index table for switch case
    181 
    182         ;// Load argument from the stack
    183         M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
    184         M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
    185         M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
    186         M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
    187 
    188         MOV      y, #BLK_SIZE                        ;// Outer Loop Count
    189         LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode (pc = pTable[predMode])
    190 
    191 OMX_VC_16X16_VERT
                ;// Vertical prediction: every output row is a copy of the 16 bytes
                ;// at pSrcAbove. Two write pointers are used, pDst for rows
                ;// 0,2,4,... and pTmp (= pDst + dstStep) for rows 1,3,5,..., each
                ;// advancing by step = 2*dstStep. The 16 stores are fully unrolled.
    192         VLD1    qAbove,  [pSrcAbove]
    193         ADD     pTmp, pDst, dstStep
    194         ADD     step, dstStep, dstStep
    195         VST1    qAbove, [pDst], step
    196         VST1    qAbove, [pTmp], step
    197         VST1    qAbove, [pDst], step
    198         VST1    qAbove, [pTmp], step
    199         VST1    qAbove, [pDst], step
    200         VST1    qAbove, [pTmp], step
    201         VST1    qAbove, [pDst], step
    202         VST1    qAbove, [pTmp], step
    203         VST1    qAbove, [pDst], step
    204         VST1    qAbove, [pTmp], step
    205         VST1    qAbove, [pDst], step
    206         VST1    qAbove, [pTmp], step
    207         VST1    qAbove, [pDst], step
    208         VST1    qAbove, [pTmp], step
    209         VST1    qAbove, [pDst]                       ;// rows 14 and 15: last stores, no post-increment
    210         VST1    qAbove, [pTmp]
    211         MOV     return, #OMX_Sts_NoErr               ;// returnNoError
    212         M_EXIT
    213 
    214 OMX_VC_16X16_HOR
                ;// Horizontal prediction: output row r is the left neighbour of row r
                ;// replicated across all 16 bytes. Rows are handled in even/odd pairs:
                ;// pSrcLeft/pDst walk the even rows and pTmp/pTmp2 the odd rows, so
                ;// both strides are doubled up front.
    215         ADD     pTmp, pSrcLeft, leftStep
    216         ADD     leftStep, leftStep, leftStep
    217         ADD     pTmp2, pDst, dstStep
    218         ADD     dstStep, dstStep, dstStep
    219 LoopHor
                ;// One pass writes 8 rows (4 even/odd pairs). "{q[]}" loads a single
                ;// byte and replicates it into every lane of the register.
    220         VLD1     {qLeft[]}, [pSrcLeft], leftStep
    221         VLD1     {qTmp[]}, [pTmp], leftStep
    222         SUBS     y, y, #8                                  ;// 8 rows per pass; flags are consumed by the BNE below
    223         VST1     qLeft, [pDst], dstStep
    224         VST1     qTmp, [pTmp2], dstStep
    225         VLD1     {qLeft[]}, [pSrcLeft], leftStep
    226         VLD1     {qTmp[]}, [pTmp], leftStep
    227         VST1     qLeft, [pDst], dstStep
    228         VST1     qTmp, [pTmp2], dstStep
    229         VLD1     {qLeft[]}, [pSrcLeft], leftStep
    230         VLD1     {qTmp[]}, [pTmp], leftStep
    231         VST1     qLeft, [pDst], dstStep
    232         VST1     qTmp, [pTmp2], dstStep
    233         VLD1     {qLeft[]}, [pSrcLeft], leftStep
    234         VLD1     {qTmp[]}, [pTmp], leftStep
    235         VST1     qLeft, [pDst], dstStep
    236         VST1     qTmp, [pTmp2], dstStep
    237 
    238         BNE      LoopHor                                  ;// Repeat until y == 0: 2 passes of 8 rows = 16 rows
    239         MOV      return, #OMX_Sts_NoErr                   ;// return shares r0 with pSrcLeft, which is no longer needed
    240         M_EXIT
    241 
    242 OMX_VC_16X16_DC
                ;// DC prediction: every output byte is one value, chosen by which
                ;// neighbours the availability flags report:
                ;//   left only  : (sum of 16 left bytes  + 8)  >> 4
                ;//   upper only : (sum of 16 above bytes + 8)  >> 4
                ;//   both       : (sum of all 32 bytes   + 16) >> 5
                ;//   neither    : 128
                ;// count records how many of the two neighbours were summed.
    243         MOV      count, #0                                 ;// count = 0
    244         TST      availability, #OMX_VC_LEFT
    245         BEQ      UpperOrNoneAvailable                      ;// Left not available: skip the left-column sum
    246 
    247         ADD     pTmp, pSrcLeft, leftStep
    248         ADD     step, leftStep, leftStep
    249 
                ;// Gather the 16 left bytes into the lanes of qLeft, alternating
                ;// between the even-row pointer (pSrcLeft) and odd-row pointer (pTmp).
    250         VLD1    {qLeft[0]}, [pSrcLeft],step
    251         VLD1    {qLeft[1]}, [pTmp],step
    252         VLD1    {qLeft[2]}, [pSrcLeft],step
    253         VLD1    {qLeft[3]}, [pTmp],step
    254         VLD1    {qLeft[4]}, [pSrcLeft],step
    255         VLD1    {qLeft[5]}, [pTmp],step
    256         VLD1    {qLeft[6]}, [pSrcLeft],step
    257         VLD1    {qLeft[7]}, [pTmp],step
    258         VLD1    {qLeft[8]}, [pSrcLeft],step
    259         VLD1    {qLeft[9]}, [pTmp],step
    260         VLD1    {qLeft[10]},[pSrcLeft],step
    261         VLD1    {qLeft[11]},[pTmp],step
    262         VLD1    {qLeft[12]},[pSrcLeft],step
    263         VLD1    {qLeft[13]},[pTmp],step
    264         VLD1    {qLeft[14]},[pSrcLeft],step
    265         VLD1    {qLeft[15]},[pTmp]
    266 
                ;// Horizontal reduction 16 x u8 -> 8 x u16 -> 4 x u16 -> 2 x u32 -> 1 x u64.
    267         VPADDL   qSum8, qLeft
    268         ADD     count, count, #1                           ;// left contributes to the sum
    269         VPADD    dSum4, dSum80, dSum81
    270         VPADDL   dSum2, dSum4
    271         VPADDL   dSumLeft, dSum2                           ;// dSumLeft = sum of the 16 left bytes
    272         VRSHR    dSum, dSumLeft, #4                        ;// rounded average, used if left is the only neighbour
    273 
    274 UpperOrNoneAvailable
    275         TST      availability,  #OMX_VC_UPPER              ;// if(availability & #OMX_VC_UPPER)
    276         BEQ      BothOrNoneAvailable                       ;// Upper not available: skip the above-row sum
    277         VLD1     qAbove, [pSrcAbove]
    278         ADD      count, count, #1                          ;// if upper inc count by 1
    279         VPADDL   qSum8, qAbove                             ;// same reduction as for the left column
    280         VPADD    dSum4, dSum80, dSum81
    281         VPADDL   dSum2, dSum4
    282         VPADDL   dSumAbove, dSum2                          ;// dSumAbove = sum of the 16 above bytes
    283         VRSHR    dSum, dSumAbove, #4                       ;// rounded average, overwritten below if both are available
    284 
    285 BothOrNoneAvailable
    286         CMP      count, #2                                  ;// check if both available
    287         BNE      NoneAvailable
    288         VADD     dSum, dSumAbove, dSumLeft
    289         VRSHR    dSum, dSum, #5                             ;// (sumAbove + sumLeft + 16) >> 5
    290 
    291 
    292 NoneAvailable
                ;// Reached on every path. qOut overlaps dSumLeft/dSumAbove, so the
                ;// broadcast must come after both sums have been consumed.
    293         VDUP     qOut, dSum0                                ;// replicate the low byte of dSum into all 16 lanes
    294         CMP      count, #0                                  ;// check if none available
    295         ADD      pTmp, pDst, dstStep                        ;// ADD (not ADDS) keeps the flags for the BNE below
    296         ADD      step, dstStep, dstStep
    297         BNE      LoopDC
    298         VMOV     qOut, #128                                 ;// no neighbour available: use the constant 128
    299 LoopDC
                ;// Not a loop: 16 unrolled stores, pDst for even rows and pTmp for odd rows.
    300         VST1    qOut, [pDst], step
    301         VST1    qOut, [pTmp], step
    302         VST1    qOut, [pDst], step
    303         VST1    qOut, [pTmp], step
    304         VST1    qOut, [pDst], step
    305         VST1    qOut, [pTmp], step
    306         VST1    qOut, [pDst], step
    307         VST1    qOut, [pTmp], step
    308         VST1    qOut, [pDst], step
    309         VST1    qOut, [pTmp], step
    310         VST1    qOut, [pDst], step
    311         VST1    qOut, [pTmp], step
    312         VST1    qOut, [pDst], step
    313         VST1    qOut, [pTmp], step
    314         VST1    qOut, [pDst], step
    315         VST1    qOut, [pTmp], step
    316         MOV     return, #OMX_Sts_NoErr
    317         M_EXIT
    318 
    319 OMX_VC_16X16_PLANE
                ;// Plane prediction. With above[] = pSrcAbove[0..15], left[] = the 16
                ;// left bytes and AL = the above-left byte, the code computes
                ;//   H = sum over k=1..7 of k*(above[7+k] - above[7-k]) + 8*(above[15] - AL)
                ;//   V = the same expression on left[]
                ;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
                ;//   a = 16*(above[15] + left[15])
                ;// and writes, for x,y in 0..15, the value
                ;//   (a - 7*(b+c) + b*x + c*y + 16) >> 5   saturated to 0..255.
                ;// Several registers are reused under different names (see the
                ;// register declarations); the statement order below depends on that.
    320         LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
    321         VLD1    qAbove, [pSrcAbove]                         ;// pSrcAbove[x]      :0<= x <= 15
    322         VLD1    dAboveLeft[0],[pSrcAboveLeft]               ;// only lane 0 is loaded; only lane 0 of the results using it is consumed
    323         ADD     pTmp, pSrcLeft, leftStep
    324         ADD     step, leftStep, leftStep
                ;// Gather the 16 left bytes, even rows via pSrcLeft and odd rows via pTmp.
    325         VLD1    {qLeft[0]},  [pSrcLeft],step
    326         VLD1    {qLeft[1]},  [pTmp],step
    327         VLD1    {qLeft[2]},  [pSrcLeft],step
    328         VLD1    {qLeft[3]},  [pTmp],step
    329         VLD1    {qLeft[4]},  [pSrcLeft],step
    330         VLD1    {qLeft[5]},  [pTmp],step
    331         VLD1    {qLeft[6]},  [pSrcLeft],step
    332         VLD1    {qLeft[7]},  [pTmp],step
    333         VLD1    {qLeft[8]},  [pSrcLeft],step
    334         VLD1    {qLeft[9]},  [pTmp],step
    335         VLD1    {qLeft[10]}, [pSrcLeft],step
    336         VLD1    {qLeft[11]}, [pTmp],step
    337         VLD1    {qLeft[12]}, [pSrcLeft],step
    338         VLD1    {qLeft[13]}, [pTmp],step
    339         VLD1    {qLeft[14]}, [pSrcLeft],step
    340         VLD1    {qLeft[15]}, [pTmp]
    341 
                ;// Build the eight differences for H in lane order
                ;//   [a14-a0, a13-a1, a12-a2, a11-a3, a10-a4, a9-a5, a8-a6, a15-AL]
                ;// which matches the weight row 7,6,5,4,3,2,1,8.
    342         VREV64  dRevAbove, dAbove1                          ;// pSrcAbove[15:14:13:12:11:10:9:8]
    343         VSUBL   qAbove15minus0, dRevAbove, dAboveLeft       ;// qAbove15minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0]
    344         VSHR    dRevAbove64, dRevAbove64, #8                ;// pSrcAbove[14:13:12:11:10:9:8:X]
    345         VSUBL   qAboveDiff, dRevAbove, dAbove0              ;// lanes 0..6 = above[14-i] - above[i]; lane 7 is discarded below
    346 
    347         VSHL    dAboveDiff64, dAboveDiff64, #16             ;// drop the unwanted top lane, move lanes 0..2 up by one
    348         VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1 ;// = [a10-a4, a9-a5, a8-a6, a15-AL]
    349 
                ;// Same construction for V on the left column.
    350         VREV64  dRevLeft,dLeft1                             ;// pSrcLeft[15:14:13:12:11:10:9:8]
    351         VSUBL   qLeft15minus0,dRevLeft, dAboveLeft          ;// qLeft15minus0[0] = pSrcLeft[15] - pSrcAboveLeft[0]
    352         VSHR    dRevLeft64, dRevLeft64, #8                  ;// pSrcLeft[14:13:12:11:10:9:8:X]
    353         VSUBL   qLeftDiff,dRevLeft, dLeft0
    354 
    355         ;// Multiplier lanes 0..7 = 7,6,5,4,3,2,1,8 (table row 0)
    356         VLD1    qMultiplier, [pMultTable]!
    357 
    358         VSHL    dLeftDiff64, dLeftDiff64, #16
    359         VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1
    360 
                ;// Weighted differences, widened to 32 bits: 4 partial sums each for H and V.
    361         VMULL   qH,dDiffAbove0, dMultiplier0
    362         VMULL   qV,dDiffLeft0,  dMultiplier0
    363         VMLAL   qH,dDiffAbove1, dMultiplier1
    364         VMLAL   qV,dDiffLeft1,  dMultiplier1
    365 
                ;// Reduce the partial sums: afterwards the low 64-bit half of qHV is H
                ;// and the high half is V. Then 5*HV is formed as HV + (HV << 2).
    366         VPADD   dHV00,dH1,dH0
    367         VPADD   dHV01,dV1,dV0
    368         VPADDL  qHV, qHV0
    369         VSHL    qHV1,qHV,#2
    370         VADD    qHV,qHV,qHV1
    371 
    372         ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]
    373         VRSHR   qHV,qHV,#6
    374 
    375         ;// HV1 = [c*7|b*7]   (computed as 8*HV - HV)
    376         VSHL    qHV1,qHV,#3
    377         VSUB    qHV1,qHV1,qHV
    378 
    379         ;// Multiplier0 lanes 0..7 = 0,1,2,...,7 (table row 1)
    380         VLD1    qMultiplier0, [pMultTable]!
    381         VDUP    qB, dHV0                                    ;// broadcast b; must precede the write to qA (same register as qHV)
    382         VDUP    qC, dHV1                                    ;// broadcast c
    383 
                ;// a = 16 * (above[15] + left[15]), broadcast to all lanes.
    384         VADDL   qA,dAbove1,dLeft1
    385         VSHL    qA,qA, #4
    386         VDUP    qA,dA1[3]
    387         VADD    dBPlusCMult7, dHV10, dHV11                  ;// 7*b + 7*c; must precede the load into qMultiplier1 (same register as qHV1)
    388 
    389         ;// Multiplier1 lanes 0..7 = 8,9,10,...,15 (table row 2)
    390         VLD1    qMultiplier1, [pMultTable]
    391         ;// Const = a - 7*(b+c)
    392         VDUP    qConst, dBPlusCMult7S16[0]
    393         VSUB    qConst, qA, qConst
    394 
    395         ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
    396         VMUL    qB0,qB,qMultiplier0
    397 
    398         ;// B1 = [8*b|9*b|10*b|11*b|....|15*b]   (overwrites qB, which shares its register)
    399         VMUL    qB1,qB,qMultiplier1
    400 
                ;// Row 0 accumulators: Const + b*x for x = 0..7 and x = 8..15.
    401         VADD    qSum0, qB0, qConst
    402         VADD    qSum1, qB1, qConst
    403 
    404         ;// Loops for 16 times (y counts down from BLK_SIZE, one row per pass)
    405 LoopPlane
    406         ;// (b*x + c*y + C)>>5, rounded and saturated to unsigned 8 bits
    407         VQRSHRUN dOut0, qSum0,#5
    408         VQRSHRUN dOut1, qSum1,#5
    409         SUBS     y, y, #1
    410         VST1     qOut,[pDst],dstStep                        ;// qOut is the dOut0/dOut1 pair
    411         VADD     qSum0,qSum0,qC                             ;// advance to the next row: add c
    412         VADD     qSum1,qSum1,qC
    413         BNE      LoopPlane
    414 
    415         MOV      return, #OMX_Sts_NoErr
    416 
    417         M_END
    418 
    419         ENDIF ;// CortexA8
    420 
    421         END
    422 ;-----------------------------------------------------------------------------------------------
    423 ; omxVCM4P10_PredictIntra_16x16 ends
    424 ;-----------------------------------------------------------------------------------------------
    425