;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe

        M_VARIANTS CortexA8

    IF CortexA8

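;// Description:
;// Computes the half-pel horizontal + vertical ("diagonal", H.264 sample 'j')
;// luma interpolation for a 4x4 block: the 6-tap FIR [1,-5,20,20,-5,1] is
;// applied horizontally to 9 source rows, then vertically to the 16-bit
;// intermediate results, and the accumulator is rounded and scaled by
;// (Acc + 512) >> 10 and saturated to 8 bits. The "_unsafe" suffix marks an
;// internal helper: pDst/dstStep are not written here; the four output rows
;// are returned to the caller in d0, d2, d4, d6 (dAcc0-dAcc3).
;//
;// The C sketch below models the computation (the name RefHalfDiagHorVer4x4
;// is hypothetical; it assumes pSrc already points at the first filter tap,
;// i.e. 2 pixels left and 2 rows above the block, and unlike the assembly it
;// stores the result instead of returning it in registers):
;//
;//   void RefHalfDiagHorVer4x4(const unsigned char *pSrc, int srcStep,
;//                             unsigned char *pDst, int dstStep)
;//   {
;//       short h[9][4];                      /* horizontal 6-tap results */
;//       for (int y = 0; y < 9; y++) {
;//           const unsigned char *p = pSrc + y * srcStep;
;//           for (int x = 0; x < 4; x++)     /* a - 5b + 20c + 20d - 5e + f */
;//               h[y][x] = (short)(p[x] - 5*p[x+1] + 20*p[x+2]
;//                                 + 20*p[x+3] - 5*p[x+4] + p[x+5]);
;//       }
;//       for (int y = 0; y < 4; y++) {       /* vertical 6-tap + rounding */
;//           for (int x = 0; x < 4; x++) {
;//               int a = h[y][x] - 5*h[y+1][x] + 20*h[y+2][x]
;//                     + 20*h[y+3][x] - 5*h[y+4][x] + h[y+5][x];
;//               a = (a + 512) >> 10;
;//               pDst[y*dstStep + x] =
;//                   (unsigned char)(a < 0 ? 0 : a > 255 ? 255 : a);
;//           }
;//       }
;//   }
;//
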
        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16
qCoeff5         QN 14.S32
qCoeff20        QN 15.S32

qSrc01          QN 0.U8
dSrc0           DN 0.U8
dSrc1           DN 1.U8

dSrcb           DN 4.U8
dSrcc           DN 2.U8
dSrcd           DN 3.U8
dSrce           DN 5.U8
dSrcf           DN 1.U8

qSrcb           QN 2.S16
qSrcc           QN 1.S16
dSrcB           DN 4.S16
dSrcC           DN 2.S16

qRes0           QN 5.S16
qRes1           QN 6.S16
qRes2           QN 7.S16
qRes3           QN 8.S16
qRes4           QN 9.S16
qRes5           QN 10.S16
qRes6           QN 11.S16
qRes7           QN 12.S16
qRes8           QN 13.S16

dRes0           DN 10.S16
dRes1           DN 12.S16
dRes2           DN 14.S16
dRes3           DN 16.S16
dRes4           DN 18.S16
dRes5           DN 20.S16
dRes6           DN 22.S16
dRes7           DN 24.S16
dRes8           DN 26.S16

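;// Register aliasing notes: each dResN is the low half of the matching qResN
;// (e.g. qRes0 = q5 = d10:d11, dRes0 = d10), so the widening VADDL writes a
;// full Q register while the later VMLA/VMUL/VSUB touch only the four 16-bit
;// lanes actually needed per row. Likewise dSrcB/dSrcC are the low halves of
;// qSrcb/qSrcc, and dSrcf shares d1 with dSrc1.
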
qAcc01          QN 5.S32
qAcc23          QN 6.S32
qAcc45          QN 2.S32
qAcc67          QN 3.S32
qSumBE          QN 0.S32
qSumCD          QN 1.S32

dTempAcc0       DN 0.U16
dTempAcc1       DN 2.U16
dTempAcc2       DN 4.U16
dTempAcc3       DN 6.U16

qTAcc0          QN 0.U16
qTAcc1          QN 1.U16
qTAcc2          QN 2.U16
qTAcc3          QN 3.U16

dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8

dTmp0           DN 8.S16
dTmp1           DN 9.S16
qTmp0           QN 4.S32

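;// Horizontal pass: for each of the 9 source rows, load 16 bytes, build the
;// shifted copies b..f with VEXT, and evaluate
;//     Res = (a+f) + 20*(c+d) - 5*(b+e)
;// in 16-bit lanes. Each row's load is issued one block early to hide the
;// load-use latency.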
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMOV        dCoeff20, #20
        VMOV        dCoeff5, #5

        ;// Row0
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes0, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMLA        dRes0, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted below

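;// Note: the commented-out VMLS is split into a VMUL here plus a VSUB in the
;// next block, deferring the -5*(b+e) term by one row. This looks like a
;// deliberate scheduling choice for the Cortex-A8 NEON pipeline (separating
;// the multiply from its dependent use); the arithmetic is unchanged.
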
        ;// Row1
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes1, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes0, dRes0, dTmp0         ;// complete Row0: Acc -= 5*(b+e)

        VMLA        dRes1, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes1, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted below

        ;// Row2
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes2, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes1, dRes1, dTmp0         ;// complete Row1: Acc -= 5*(b+e)

        VMLA        dRes2, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes2, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted below

        ;// Row3
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes3, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes2, dRes2, dTmp0         ;// complete Row2: Acc -= 5*(b+e)

        VMLA        dRes3, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes3, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted below

        ;// Row4
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes4, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes3, dRes3, dTmp0         ;// complete Row3: Acc -= 5*(b+e)

        VMLA        dRes4, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes4, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted below

        ;// Row5
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes5, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes4, dRes4, dTmp0         ;// complete Row4: Acc -= 5*(b+e)

        VMLA        dRes5, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes5, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted below

        ;// Row6
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes6, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes5, dRes5, dTmp0         ;// complete Row5: Acc -= 5*(b+e)

        VMLA        dRes6, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes6, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted below

        ;// Row7
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes7, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes6, dRes6, dTmp0         ;// complete Row6: Acc -= 5*(b+e)

        VMLA        dRes7, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes7, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted below

        ;// Row8
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes8, dSrc0, dSrcf         ;// Acc=a+f

        VSUB        dRes7, dRes7, dTmp0         ;// complete Row7: Acc -= 5*(b+e)

        VMLA        dRes8, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes8, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted below

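;// Vertical pass: apply the same 6-tap filter down the nine intermediate
;// rows dRes0-dRes8, widening to 32 bits (VADDL .S16 -> .S32), so the
;// coefficients are reloaded into Q registers. Each "Col" block filters
;// along the columns and yields one 4-pixel output row:
;//     Out[y] = (R[y]+R[y+5]) + 20*(R[y+2]+R[y+3]) - 5*(R[y+1]+R[y+4])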
        VMOV        qCoeff20, #20
        VMOV        qCoeff5, #5

        ;// Col0
        VADDL       qAcc01, dRes0, dRes5        ;// Acc = a+f
        VADDL       qSumCD, dRes2, dRes3        ;// c+d
        VADDL       qSumBE, dRes1, dRes4        ;// b+e

        VSUB        dRes8, dRes8, dTmp0         ;// complete Row8: Acc -= 5*(b+e)

        VMLA        qAcc01, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
;        VMLS        qAcc01, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e); subtracted below

        ;// Col1
        VADDL       qAcc23, dRes1, dRes6        ;// Acc = a+f
        VADDL       qSumCD, dRes3, dRes4        ;// c+d
        VADDL       qSumBE, dRes2, dRes5        ;// b+e
        VMLA        qAcc23, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc01, qAcc01, qTmp0       ;// complete Col0: Acc -= 5*(b+e)

;        VMLS        qAcc23, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e); subtracted below

        ;// Col2
        VADDL       qAcc45, dRes2, dRes7        ;// Acc = a+f
        VADDL       qSumCD, dRes4, dRes5        ;// c+d
        VADDL       qSumBE, dRes3, dRes6        ;// b+e
        VMLA        qAcc45, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc23, qAcc23, qTmp0       ;// complete Col1: Acc -= 5*(b+e)

;        VMLS        qAcc45, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e); subtracted below

        ;// Col3
        VADDL       qAcc67, dRes3, dRes8        ;// Acc = a+f
        VADDL       qSumCD, dRes5, dRes6        ;// c+d
        VADDL       qSumBE, dRes4, dRes7        ;// b+e
        VMLA        qAcc67, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc45, qAcc45, qTmp0       ;// complete Col2: Acc -= 5*(b+e)

        VMLS        qAcc67, qSumBE, qCoeff5     ;// Acc -= 5*(b+e); no deferral needed on the last block

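;// Final rounding and narrowing: VQRSHRUN computes (Acc + 512) >> 10 with a
;// saturating unsigned narrow to 16 bits, then VQMOVN saturates down to
;// 8 bits, leaving the four output rows in d0, d2, d4, d6 for the caller.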
        VQRSHRUN    dTempAcc0, qAcc01, #10      ;// (Acc + 512) >> 10, saturate to U16
        VQRSHRUN    dTempAcc1, qAcc23, #10
        VQRSHRUN    dTempAcc2, qAcc45, #10
        VQRSHRUN    dTempAcc3, qAcc67, #10

        VQMOVN      dAcc0, qTAcc0               ;// narrow to U8 with saturation
        VQMOVN      dAcc1, qTAcc1
        VQMOVN      dAcc2, qTAcc2
        VQMOVN      dAcc3, qTAcc3

        M_END

    ENDIF



    END