;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe

        M_VARIANTS CortexA8

    IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11

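;// Overview: this routine computes the half-pel "diagonal" luma samples for a
;// 4x4 block by applying the H.264/AVC 6-tap filter (1, -5, 20, 20, -5, 1)
;// first horizontally and then vertically. Nine source rows are filtered
;// horizontally into 16-bit intermediates (dRes0-dRes8); a second pass then
;// filters those nine rows vertically to produce the four output rows.
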
;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16
qCoeff5         QN 14.S32
qCoeff20        QN 15.S32

qSrc01          QN 0.U8
dSrc0           DN 0.U8
dSrc1           DN 1.U8

dSrcb           DN 4.U8
dSrcc           DN 2.U8
dSrcd           DN 3.U8
dSrce           DN 5.U8
dSrcf           DN 1.U8

qSrcb           QN 2.S16
qSrcc           QN 1.S16
dSrcB           DN 4.S16
dSrcC           DN 2.S16

qRes0           QN 5.S16
qRes1           QN 6.S16
qRes2           QN 7.S16
qRes3           QN 8.S16
qRes4           QN 9.S16
qRes5           QN 10.S16
qRes6           QN 11.S16
qRes7           QN 12.S16
qRes8           QN 13.S16

dRes0           DN 10.S16
dRes1           DN 12.S16
dRes2           DN 14.S16
dRes3           DN 16.S16
dRes4           DN 18.S16
dRes5           DN 20.S16
dRes6           DN 22.S16
dRes7           DN 24.S16
dRes8           DN 26.S16

qAcc01          QN 5.S32
qAcc23          QN 6.S32
qAcc45          QN 2.S32
qAcc67          QN 3.S32
qSumBE          QN 0.S32
qSumCD          QN 1.S32

dTempAcc0       DN 0.U16
dTempAcc1       DN 2.U16
dTempAcc2       DN 4.U16
dTempAcc3       DN 6.U16

qTAcc0          QN 0.U16
qTAcc1          QN 1.U16
qTAcc2          QN 2.U16
qTAcc3          QN 3.U16

dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8

dTmp0           DN 8.S16
dTmp1           DN 9.S16
qTmp0           QN 4.S32

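;// Horizontal pass: for each of the nine source rows, load 16 bytes, build the
;// shifted copies b..f with VEXT, and accumulate (a+f) + 20*(c+d) - 5*(b+e)
;// as widened 16-bit values for the four columns of interest.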
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMOV        dCoeff20, #20
        VMOV        dCoeff5, #5

        ;// Row0
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes0, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMLA        dRes0, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted from Acc below

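        ;// Note: each commented-out VMLS is implemented as a VMUL into dTmp0
        ;// (or qTmp0 in the vertical pass) plus a VSUB issued a few
        ;// instructions later; deferring the subtraction past the next
        ;// load/VEXT group is presumably a Cortex-A8 scheduling choice, and
        ;// the result is arithmetically identical.
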
        ;// Row1
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes1, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes0, dRes0, dTmp0         ;// Acc -= 5*(b+e) (deferred from Row0)

        VMLA        dRes1, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes1, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted from Acc below

        ;// Row2
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes2, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes1, dRes1, dTmp0

        VMLA        dRes2, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes2, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted from Acc below

        ;// Row3
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes3, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes2, dRes2, dTmp0

        VMLA        dRes3, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes3, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted from Acc below

        ;// Row4
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes4, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes3, dRes3, dTmp0

        VMLA        dRes4, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes4, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted from Acc below

        ;// Row5
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes5, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes4, dRes4, dTmp0

        VMLA        dRes5, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes5, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted from Acc below

        ;// Row6
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes6, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes5, dRes5, dTmp0

        VMLA        dRes6, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes6, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted from Acc below

        ;// Row7
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes7, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes6, dRes6, dTmp0

        VMLA        dRes7, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes7, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted from Acc below

        ;// Row8
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes8, dSrc0, dSrcf         ;// Acc=a+f

        VSUB        dRes7, dRes7, dTmp0

        VMLA        dRes8, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes8, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// Tmp = 5*(b+e); subtracted from Acc below

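        ;// Vertical pass: apply the same 6-tap filter down the columns of the
        ;// nine 16-bit intermediate rows. VADDL widens to 32-bit accumulators,
        ;// so the coefficients are reloaded into Q registers below.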
        VMOV        qCoeff20, #20
        VMOV        qCoeff5, #5

        ;// Col0
        VADDL       qAcc01, dRes0, dRes5        ;// Acc = a+f
        VADDL       qSumCD, dRes2, dRes3        ;// c+d
        VADDL       qSumBE, dRes1, dRes4        ;// b+e

        VSUB        dRes8, dRes8, dTmp0

        VMLA        qAcc01, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
;        VMLS        qAcc01, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e); subtracted from Acc below

        ;// Col1
        VADDL       qAcc23, dRes1, dRes6        ;// Acc = a+f
        VADDL       qSumCD, dRes3, dRes4        ;// c+d
        VADDL       qSumBE, dRes2, dRes5        ;// b+e
        VMLA        qAcc23, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc01, qAcc01, qTmp0

;        VMLS        qAcc23, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e); subtracted from Acc below

        ;// Col2
        VADDL       qAcc45, dRes2, dRes7        ;// Acc = a+f
        VADDL       qSumCD, dRes4, dRes5        ;// c+d
        VADDL       qSumBE, dRes3, dRes6        ;// b+e
        VMLA        qAcc45, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc23, qAcc23, qTmp0

;        VMLS        qAcc45, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// Tmp = 5*(b+e); subtracted from Acc below

        ;// Col3
        VADDL       qAcc67, dRes3, dRes8        ;// Acc = a+f
        VADDL       qSumCD, dRes5, dRes6        ;// c+d
        VADDL       qSumBE, dRes4, dRes7        ;// b+e
        VMLA        qAcc67, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc45, qAcc45, qTmp0

        VMLS        qAcc67, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)

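        ;// Each sample has now been filtered twice (gain 32 per pass, 1024 in
        ;// total), so round and shift right by 10 with unsigned saturation,
        ;// then narrow to 8 bits with saturation.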
        VQRSHRUN    dTempAcc0, qAcc01, #10
        VQRSHRUN    dTempAcc1, qAcc23, #10
        VQRSHRUN    dTempAcc2, qAcc45, #10
        VQRSHRUN    dTempAcc3, qAcc67, #10

        VQMOVN      dAcc0, qTAcc0
        VQMOVN      dAcc1, qTAcc1
        VQMOVN      dAcc2, qTAcc2
        VQMOVN      dAcc3, qTAcc3

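        ;// No store is performed here: the 4x4 result is left in dAcc0-dAcc3,
        ;// and pDst/dstStep are untouched, presumably for the calling function
        ;// to write out.
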
        M_END

    ENDIF



    END