Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   12290
      6 ;// Date:       Wednesday, April 9, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 
     13         INCLUDE omxtypes_s.h
     14         INCLUDE armCOMM_s.h
     15 
     16         M_VARIANTS CortexA8                 ;// macro from armCOMM_s.h; presumably defines the CortexA8 flag tested by the IF below
     17 
     18         EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
     19 
     20 DEBUG_ON    SETL {FALSE}                    ;// not referenced by name elsewhere in this file; macros from armCOMM_s.h may read it
     21 
     22     IF CortexA8
     23 ;// armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        ;//
        ;// Loads four 16-byte rows from pSrc (pSrc += srcStep after each load) and
        ;// computes, per row, in signed 16-bit lanes:
        ;//     Acc = a + f + dCoeff20*(c + d) - dCoeff5*(b + e)
        ;// a is the row as loaded; b..f are the same row starting 1..5 bytes later
        ;// (VEXT #1..#5). Each Acc is then narrowed to U8 with VQRSHRUN #5.
        ;//
        ;// In:   pSrc, srcStep. dCoeff5 (D30) and dCoeff20 (D31) are only read here;
        ;//       presumably the caller preloads them with 5 and 20 - TODO confirm.
        ;// Out:  dAcc0, dAcc2, dAcc4, dAcc6 (D22, D24, D26, D28), one row each.
        ;//       The multiply/subtract steps use D operands (4 lanes), so only
        ;//       lanes 0-3 hold the full sum; lanes 4-7 hold just a + f, narrowed.
        ;//       Nothing is written to memory: pDst, dstStep and dResult* are
        ;//       declared but never used in this routine.
        ;//       pSrc is left advanced by 4*srcStep.
        ;// Regs: writes D8-D10 and D12-D29. No save/restore of them appears in this
        ;//       file; what M_START/M_END (armCOMM_s.h) do with the r11 argument is
        ;//       not visible here.
     24         M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11
     25 
     26 ;// Declare input registers
     27 pSrc            RN 0
     28 srcStep         RN 1
     29 pDst            RN 2                        ;// declared only; not used below
     30 dstStep         RN 3                        ;// declared only; not used below
     31 
     32 ;// Declare Neon registers
     33 dCoeff5         DN 30.S16                   ;// read-only here; never initialised in this file
     34 dCoeff20        DN 31.S16                   ;// read-only here; never initialised in this file
     35 ;// Source rows, 16 bytes each (Qn = D2n:D2n+1)
     36 qSrcA01         QN 11.U8
     37 qSrcB01         QN 12.U8
     38 qSrcC01         QN 13.U8
     39 qSrcD01         QN 14.U8
     40 ;// Low/high D halves of the source rows above
     41 dSrcA0          DN 22.U8
     42 dSrcA1          DN 23.U8
     43 dSrcB0          DN 24.U8
     44 dSrcB1          DN 25.U8
     45 dSrcC0          DN 26.U8
     46 dSrcC1          DN 27.U8
     47 dSrcD0          DN 28.U8
     48 dSrcD1          DN 29.U8
     49 ;// Shifted copies b (#1), e (#4), f (#5); the same registers are reused for every row
     50 dSrcb           DN 12.U8
     51 dSrce           DN 13.U8
     52 dSrcf           DN 10.U8
     53 ;// Shifted copy c (#2), one register per row
     54 dSrc0c          DN 14.U8
     55 dSrc1c          DN 16.U8
     56 dSrc2c          DN 18.U8
     57 dSrc3c          DN 20.U8
     58 ;// Shifted copy d (#3), one register per row
     59 dSrc0d          DN 15.U8
     60 dSrc1d          DN 17.U8
     61 dSrc2d          DN 19.U8
     62 dSrc3d          DN 21.U8
     63 ;// Pair sums: qTemp01 = c+d (D8:D9), qTemp23 = b+e (D12:D13, i.e. over dSrcb:dSrce)
     64 qTemp01         QN 4.S16
     65 qTemp23         QN 6.S16
     66 dTemp0          DN 8.S16                    ;// low half of qTemp01; also receives the VMUL product in rows A-C
     67 dTemp2          DN 12.S16                   ;// low half of qTemp23; same register as dSrcb
     68 ;// 16-bit accumulators; these are the same Q registers as the source rows
     69 qRes01          QN 11.S16
     70 qRes23          QN 12.S16
     71 qRes45          QN 13.S16
     72 qRes67          QN 14.S16
     73 ;// Low halves (lanes 0-3) of the accumulators
     74 dRes0           DN 22.S16
     75 dRes2           DN 24.S16
     76 dRes4           DN 26.S16
     77 dRes6           DN 28.S16
     78 ;// Narrowed U8 results, written over the low D register of each accumulator
     79 dAcc0           DN 22.U8
     80 dAcc2           DN 24.U8
     81 dAcc4           DN 26.U8
     82 dAcc6           DN 28.U8
     83 ;// U32 views of the result registers; not referenced in this routine
     84 dResult0        DN 22.U32
     85 dResult2        DN 24.U32
     86 dResult4        DN 26.U32
     87 dResult6        DN 28.U32
     88 ;// Row A: build the shifted copies, then the three pair sums
     89         VLD1        qSrcA01, [pSrc], srcStep    ;// Load A register [a0 a1 a2 a3 ..]
     90         ;// One cycle stall
     91         VEXT        dSrcf, dSrcA0, dSrcA1, #5   ;// [f0 f1 f2 f3 ..]
     92         VEXT        dSrcb, dSrcA0, dSrcA1, #1   ;// [b0 b1 b2 b3 ..]
     93 ;        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
     94         VEXT        dSrc0c, dSrcA0, dSrcA1, #2  ;// [c0 c1 c2 c3 ..]
     95         VEXT        dSrc0d, dSrcA0, dSrcA1, #3  ;// [d0 d1 d2 d3 ..]
     96         VEXT        dSrce, dSrcA0, dSrcA1, #4   ;// [e0 e1 e2 e3 ..]
     97         VADDL       qRes01, dSrcA0, dSrcf       ;// Acc=a+f
     98         VADDL       qTemp01, dSrc0c, dSrc0d     ;// c+d
     99         VADDL       qTemp23, dSrcb, dSrce       ;// b+e
    100 ;// Row B's load is issued here, between row A's sums and its multiplies
    101         VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
    102 ;        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
    103         VMLA        dRes0, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
    104 ;        VMLS        dRes0, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
    105         VMUL        dTemp0, dTemp2, dCoeff5 ;// TeRi: tmp = dCoeff5*(b+e); c+d in dTemp0 was already consumed by the VMLA above
    106 ;// Row B shifted copies (dSrcb/dSrce/dSrcf reused; dSrcb shares D12 with dTemp2, which was read above)
    107         VEXT        dSrcf, dSrcB0, dSrcB1, #5   ;// [f0 f1 f2 f3 ..]
    108         VEXT        dSrcb, dSrcB0, dSrcB1, #1   ;// [b0 b1 b2 b3 ..]
    109         VEXT        dSrc1c, dSrcB0, dSrcB1, #2  ;// [c0 c1 c2 c3 ..]
    110         VEXT        dSrc1d, dSrcB0, dSrcB1, #3  ;// [d0 d1 d2 d3 ..]
    111         VEXT        dSrce, dSrcB0, dSrcB1, #4   ;// [e0 e1 e2 e3 ..]
    112         VADDL       qRes23, dSrcB0, dSrcf       ;// Acc=a+f
    113 ;// Complete row A before qTemp01 (dTemp0) is overwritten by row B's c+d
    114         VSUB        dRes0, dRes0, dTemp0    ;// TeRi: row A Acc -= tmp (stands in for the VMLS commented out above)
    115 
    116         VADDL       qTemp01, dSrc1c, dSrc1d     ;// c+d
    117         VADDL       qTemp23, dSrcb, dSrce       ;// b+e
    118 ;// Row C's load is issued here, before row B's multiplies
    119         VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
    120 ;        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]
    121 
    122         VMLA        dRes2, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
    123 ;        VMLS        dRes2, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
    124         VMUL        dTemp0, dTemp2, dCoeff5 ;// TeRi: tmp = dCoeff5*(b+e) for row B
    125 ;// Row C shifted copies
    126         VEXT        dSrcf, dSrcC0, dSrcC1, #5   ;// [f0 f1 f2 f3 ..]
    127         VEXT        dSrcb, dSrcC0, dSrcC1, #1   ;// [b0 b1 b2 b3 ..]
    128         VEXT        dSrc2c, dSrcC0, dSrcC1, #2  ;// [c0 c1 c2 c3 ..]
    129         VEXT        dSrc2d, dSrcC0, dSrcC1, #3  ;// [d0 d1 d2 d3 ..]
    130         VEXT        dSrce, dSrcC0, dSrcC1, #4   ;// [e0 e1 e2 e3 ..]
    131         VADDL       qRes45, dSrcC0, dSrcf       ;// Acc=a+f
    132 ;// Complete row B before qTemp01 (dTemp0) is overwritten by row C's c+d
    133         VSUB        dRes2, dRes2, dTemp0  ;// TeRi: row B Acc -= tmp
    134 
    135         VADDL       qTemp01, dSrc2c, dSrc2d     ;// c+d
    136         VADDL       qTemp23, dSrcb, dSrce       ;// b+e
    137 ;// Row D's load is issued here, before row C's multiplies
    138         VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]
    139 
    140         VMLA        dRes4, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
    141 ;        VMLS        dRes4, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
    142         VMUL        dTemp0, dTemp2, dCoeff5      ;// TeRi: tmp = dCoeff5*(b+e) for row C; the subtraction is the VSUB below
    143 
    144 ;// Row D shifted copies
    145         VEXT        dSrcf, dSrcD0, dSrcD1, #5   ;// [f0 f1 f2 f3 ..]
    146         VEXT        dSrcb, dSrcD0, dSrcD1, #1   ;// [b0 b1 b2 b3 ..]
    147         VEXT        dSrc3c, dSrcD0, dSrcD1, #2  ;// [c0 c1 c2 c3 ..]
    148         VEXT        dSrc3d, dSrcD0, dSrcD1, #3  ;// [d0 d1 d2 d3 ..]
    149         VEXT        dSrce, dSrcD0, dSrcD1, #4   ;// [e0 e1 e2 e3 ..]
    150         VADDL       qRes67, dSrcD0, dSrcf       ;// Acc=a+f
    151 ;// Complete row C before qTemp01 (dTemp0) is overwritten by row D's c+d
    152         VSUB        dRes4, dRes4, dTemp0 ;// TeRi: row C Acc -= tmp
    153 
    154         VADDL       qTemp01, dSrc3c, dSrc3d     ;// c+d
    155         VADDL       qTemp23, dSrcb, dSrce       ;// b+e
    156         VMLA        dRes6, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
    157         VMLS        dRes6, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
    158 ;// Row D keeps the VMLS form; rows A-C use the VMUL/VSUB split (reason not recorded in this file)
    159         VQRSHRUN    dAcc0, qRes01, #5           ;// Acc = Sat ((Acc + 16) / 32)
    160         VQRSHRUN    dAcc2, qRes23, #5           ;// Acc = Sat ((Acc + 16) / 32)
    161         VQRSHRUN    dAcc4, qRes45, #5           ;// Acc = Sat ((Acc + 16) / 32)
    162         VQRSHRUN    dAcc6, qRes67, #5           ;// Acc = Sat ((Acc + 16) / 32)
    163 ;// Results stay in dAcc0/dAcc2/dAcc4/dAcc6; no store is issued in this routine
    164         M_END
    165 
    166     ENDIF
    167 
    168 
    169     END
    170 
    171 
    172 
    173 
    174 
    175 
    176 
    177 
    178 
    179 
    180 
    181 
    182 
    183 
    184 
    185 
    186 
    187 
    188 
    189 
    190 
    191 
    192 
    193 
    194 
    195 
    196 
    197 
    198 
    199 
    200 
    201 
    202 
    203 
    204 
    205 
    206 
    207 
    208 
    209 
    210 
    211 
    212 
    213 
    214 
    215 
    216 
    217 
    218 
    219 
    220 
    221 
    222 
    223 
    224 
    225 
    226 
    227 
    228 
    229