Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_InterpolateLuma_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27 ;// Function:
     28 ;//     omxVCM4P10_InterpolateLuma
     29 ;//
     30 ;// This function implements omxVCM4P10_InterpolateLuma in ARM NEON assembly (CortexA8 variant).
     31 ;// Performs quarter-pel interpolation of an inter luma MB.
     32 ;// It is assumed that the frame is already padded when calling this function.
     33 ;// Parameters:
     34 ;// [in]    pSrc        Pointer to the source reference frame buffer
     35 ;// [in]    srcStep     Reference frame step in bytes
     36 ;// [in]    dstStep     Destination frame step in bytes. Must be a multiple of roi.width
     37 ;// [in]    dx          Fractional part of horizontal motion vector
     38 ;//                         component in 1/4 pixel unit; valid in the range [0,3]
     39 ;// [in]    dy          Fractional part of vertical motion vector
     40 ;//                         component in 1/4 pixel unit; valid in the range [0,3]
     41 ;// [in]    roi         Dimension of the interpolation region;the parameters roi.width and roi.height must
     42 ;//                         be equal to either 4, 8, or 16.
     43 ;// [out]   pDst        Pointer to the destination frame buffer.
     44 ;//                   if roi.width==4,  4-byte alignment required
     45 ;//                   if roi.width==8,  8-byte alignment required
     46 ;//                   if roi.width==16, 16-byte alignment required
     47 ;//
     48 ;// Return Value:
     49 ;// If the function runs without error, it returns OMX_Sts_NoErr.
     50 ;// It is assumed that the following conditions are satisfied before calling this function:
     51 ;//     pSrc and pDst are not NULL.
     52 ;//     srcStep and dstStep are >= roi.width.
     53 ;//     dx and dy are in the range [0-3].
     54 ;//     roi.width and roi.height are each one of {4, 8, 16}.
     55 ;//     If roi.width is equal to 4, pDst is 4 byte aligned.
     56 ;//     If roi.width is equal to 8, pDst is 8 byte aligned.
     57 ;//     If roi.width is equal to 16, pDst is 16 byte aligned.
     58 ;//     srcStep and dstStep are multiples of 8.
     59 ;//
     60 ;//
     61 
     62 
     63         INCLUDE omxtypes_s.h
     64         INCLUDE armCOMM_s.h
     65 
     66         M_VARIANTS CortexA8
     67 
     68         EXPORT omxVCM4P10_InterpolateLuma
     69 
     70 
     71     IF CortexA8
     72         IMPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
     73         IMPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
     74         IMPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
     75         IMPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
     76     ENDIF
     77 
     78 
     79 
     80 ;// Declare input registers (symbolic names for the ARM core registers used below)
     81 pSrc            RN 0                    ;// r0: source pointer; modified inside each case and restored from ppArgs
     82 srcStep         RN 1                    ;// r1: source step in bytes (per the file header)
     83 pDst            RN 2                    ;// r2: destination pointer; modified inside each case and restored from ppArgs
     84 dstStep         RN 3                    ;// r3: destination step in bytes (per the file header)
     85 iHeight         RN 4                    ;// r4: rows still to process; loaded from the stack slot ptrHeight, decremented by 4 per row of blocks
     86 iWidth          RN 5                    ;// r5: columns still to process; loaded from the stack slot ptrWidth, decremented by 4 per block
     87 
     88 ;// Declare other intermediate registers
     89 idx             RN 6                    ;// r6: value loaded from the stack slot ptridx
     90 idy             RN 7                    ;// r7: value loaded from the stack slot ptridy
     91 index           RN 6                    ;// r6 again: same register as idx; holds idx + 4*idy for the jump table
     92 Temp            RN 12                   ;// r12: scratch pointer set two rows below pSrc or pDst
     93 pArgs           RN 11                   ;// r11: address of the ppArgs save area
     94 
     95 
     96     IF CortexA8
     97 
     98         ;//
     99         ;// Luma interpolation is implemented by processing one 4x4 block of pixels at a time.
    100         ;// ppArgs is a 16-byte local; the loop stores {pSrc, srcStep, pDst, dstStep} in it around each block.
    101         M_ALLOC4    ppArgs, 16
    102 
    103         ;// Function header. NOTE(review): r11 and d15 presumably name the highest core/NEON registers the macro preserves - confirm in armCOMM_s.h
    104         M_START omxVCM4P10_InterpolateLuma, r11, d15
    105 
    106 pSrcBK          RN 8                    ;// r8: copy of pSrc taken before the first helper call in Cases 5, 7, d and f
    107 
    108 ;// Declare Neon registers
    109 dCoeff5         DN 30.S16               ;// d30: written with 5 during setup; not read in this file
    110 dCoeff20        DN 31.S16               ;// d31: written with 20 during setup; not read in this file
    111 
    112 ;// Registers used for implementing Horizontal interpolation
    113 dSrc0c          DN 14.U8                ;// dSrcNc = d14/d16/d18/d20: averaged with dAccHN in Case_1
    114 dSrc1c          DN 16.U8
    115 dSrc2c          DN 18.U8
    116 dSrc3c          DN 20.U8
    117 dSrc0d          DN 15.U8                ;// dSrcNd = d15/d17/d19/d21: averaged with dAccHN in Case_3
    118 dSrc1d          DN 17.U8
    119 dSrc2d          DN 19.U8
    120 dSrc3d          DN 21.U8
    121 dAccH0          DN 22.U8                ;// dAccHN = d22/d24/d26/d28: rows read back after the HalfHor helper returns
    122 dAccH1          DN 24.U8
    123 dAccH2          DN 26.U8
    124 dAccH3          DN 28.U8
    125 dResultH0       DN 22.U32               ;// dResultHN: the same D registers as dAccHN, viewed as 32-bit lanes for the 4-pixel stores
    126 dResultH1       DN 24.U32
    127 dResultH2       DN 26.U32
    128 dResultH3       DN 28.U32
    129 
    130 ;// Registers used for implementing Vertical interpolation
    131 dSrc0           DN 9.U8                 ;// dSrc0-dSrc4 = d9-d13: load targets in Case_0; read after the HalfVer helper in Case_4 and Case_c
    132 dSrc1           DN 10.U8
    133 dSrc2           DN 11.U8
    134 dSrc3           DN 12.U8
    135 dSrc4           DN 13.U8
    136 dAccV0          DN 0.U8                 ;// dAccVN = d0/d2/d4/d6: rows read back after the HalfVer helper returns
    137 dAccV1          DN 2.U8
    138 dAccV2          DN 4.U8
    139 dAccV3          DN 6.U8
    140 dResultV0       DN 0.U32                ;// dResultVN: the same D registers as dAccVN, viewed as 32-bit lanes for the 4-pixel stores
    141 dResultV1       DN 2.U32
    142 dResultV2       DN 4.U32
    143 dResultV3       DN 6.U32
    144 
    145 ;// Registers used for implementing Diagonal interpolation
    146 dTAcc0          DN 0.U8                 ;// dTAccN = d0/d2/d4/d6: the same D registers as dAccVN
    147 dTAcc1          DN 2.U8
    148 dTAcc2          DN 4.U8
    149 dTAcc3          DN 6.U8
    150 dTRes0          DN 0.32                 ;// dTResN: 32-bit view of dTAccN used by the 4-pixel stores
    151 dTRes1          DN 2.32
    152 dTRes2          DN 4.32
    153 dTRes3          DN 6.32
    154 dTResult0       DN 14.U8                ;// dTResultN = d14/d16/d18/d20: narrowing destinations; same D registers as dSrcNc
    155 dTResult1       DN 16.U8
    156 dTResult2       DN 18.U8
    157 dTResult3       DN 20.U8
    158 dTempP0         DN 18.S16               ;// dTempP0/dTempP1 = d18/d19, the two halves of qTempP01 (q9)
    159 dTempP1         DN 19.S16
    160 dTempQ0         DN 20.S16               ;// dTempQ0/dTempQ1 = d20/d21, the two halves of qTempQ01 (q10)
    161 dTempQ1         DN 21.S16
    162 dTempR0         DN 22.S16               ;// dTempR0/dTempR1 = d22/d23, the two halves of qTempR01 (q11)
    163 dTempR1         DN 23.S16
    164 dTempS0         DN 24.S16               ;// dTempS0/dTempS1 = d24/d25, the two halves of qTempS01 (q12)
    165 dTempS1         DN 25.S16
    166 qTempP01        QN 9.S16
    167 qTempQ01        QN 10.S16
    168 qTempR01        QN 11.S16
    169 qTempS01        QN 12.S16
    170 
    171 ;// Intermediate values for averaging (q7-q11, i.e. d14-d23; they overlap dTResultN and qTempP01-qTempR01)
    172 qRes2           QN 7.S16
    173 qRes3           QN 8.S16
    174 qRes4           QN 9.S16
    175 qRes5           QN 10.S16
    176 qRes6           QN 11.S16
    177 
    178 ;// For implementing copy (the same D registers as dSrc0-dSrc3, viewed as 32-bit lanes)
    179 dDst0            DN 9.32
    180 dDst1            DN 10.32
    181 dDst2            DN 11.32
    182 dDst3            DN 12.32
    183 
    184         ;// Define stack arguments (despite the ptr prefix, each slot is loaded below as a plain value)
    185         M_ARG       ptridx, 4
    186         M_ARG       ptridy, 4
    187         M_ARG       ptrWidth, 4
    188         M_ARG       ptrHeight, 4
    189 
    190         ;// Load the four stack arguments: idx, idy, then the width and height of the region
    191         M_LDR       idx, ptridx
    192         M_LDR       idy, ptridy
    193         M_LDR       iWidth, ptrWidth
    194         M_LDR       iHeight, ptrHeight
    195 
    196         ADD         index, idx, idy, LSL #2                 ;//  [index] = [idy][idx], i.e. idx + 4*idy; index and idx are both r6, so idx is overwritten
    197         M_ADR       pArgs, ppArgs                           ;// pArgs = address of the ppArgs save area
    198 
    199         ;// Move coefficients Neon registers
    200         VMOV        dCoeff20, #20                           ;// NOTE(review): dCoeff20/dCoeff5 are not read in this file; presumably consumed by the imported 4x4 helpers - confirm
    201         VMOV        dCoeff5, #5
    202 
    203 Block4x4WidthLoop
    204 Block4x4HeightLoop
    205         ;// Both loop labels mark the same address: every 4x4 block re-enters here
    206         STM         pArgs, {pSrc,srcStep,pDst,dstStep}      ;// save the block's arguments; they are reloaded by the LDM at Block4x4LoopEnd
    207 
    208         ;// switch table using motion vector as index (index = idx + 4*idy, one 4-byte branch per entry)
    209         ADD         pc, pc, index, LSL #2                   ;// assuming ARM state, pc reads as this instruction + 8, so index 0 selects "B Case_0"
    210         B           Case_f                                  ;// filler slot: skipped because of the pc+8 offset
    211         B           Case_0                                  ;// index 0
    212         B           Case_1
    213         B           Case_2
    214         B           Case_3
    215         B           Case_4                                  ;// index 4 (idy = 1 row starts here)
    216         B           Case_5
    217         B           Case_6
    218         B           Case_7
    219         B           Case_8                                  ;// index 8 (idy = 2 row starts here)
    220         B           Case_9
    221         B           Case_a
    222         B           Case_b
    223         B           Case_c                                  ;// index 12 (idy = 3 row starts here)
    224         B           Case_d
    225         B           Case_e
    226         B           Case_f                                  ;// index 15
    227 
    228 Case_0
    229         ;// Case G: index 0 (idx = 0, idy = 0) - the 4x4 block is copied without filtering
    230         M_PRINTF "Case 0 \n"
    231 
    232         ;// Loads a 4x4 block of .8 and stores as .32 (each VLD1 fills a whole D register, only lane 0 is stored)
    233         ADD         Temp, pSrc, srcStep, LSL #1             ;// Temp = pSrc + 2*srcStep (source row 2)
    234         VLD1        dSrc0, [pSrc], srcStep                  ;// row 0; pSrc then points at row 1
    235         VLD1        dSrc2, [Temp], srcStep                  ;// row 2; Temp then points at row 3
    236         VLD1        dSrc1, [pSrc]                           ;// row 1
    237         VLD1        dSrc3, [Temp]                           ;// row 3
    238 
    239         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    240         VST1        dDst0[0], [pDst], dstStep               ;// dDstN are the same registers as dSrcN; lane 0 = first 4 bytes of the row
    241         VST1        dDst2[0], [Temp], dstStep
    242         VST1        dDst1[0], [pDst]
    243         VST1        dDst3[0], [Temp]
    244         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    245         B           Block4x4LoopEnd
    246 Case_1
    247         ;// Case a: index 1 (idx = 1, idy = 0) - HalfHor output averaged with dSrcNc
    248         M_PRINTF "Case 1 \n"
    249 
    250         SUB         pSrc, pSrc, #2                          ;// the helper is given a pointer 2 bytes left of the block
    251         BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
    252         VRHADD      dAccH0, dAccH0, dSrc0c                  ;// rounding halving add: (a + b + 1) >> 1 per byte
    253         VRHADD      dAccH2, dAccH2, dSrc2c                  ;// NOTE(review): dSrcNc are not written in this file; presumably left by the helper - confirm
    254         VRHADD      dAccH1, dAccH1, dSrc1c
    255         VRHADD      dAccH3, dAccH3, dSrc3c
    256         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    257         VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row; dResultHN is the 32-bit view of dAccHN
    258         VST1        dResultH2[0], [Temp], dstStep
    259         VST1        dResultH1[0], [pDst]
    260         VST1        dResultH3[0], [Temp]
    261         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    262         B           Block4x4LoopEnd
    263 Case_2
    264         ;// Case b: index 2 (idx = 2, idy = 0) - HalfHor output stored as is
    265         M_PRINTF "Case 2 \n"
    266 
    267         SUB         pSrc, pSrc, #2                          ;// the helper is given a pointer 2 bytes left of the block
    268         BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
    269         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    270         VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row from lane 0 of d22/d24/d26/d28
    271         VST1        dResultH2[0], [Temp], dstStep
    272         VST1        dResultH1[0], [pDst]
    273         VST1        dResultH3[0], [Temp]
    274         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    275         B           Block4x4LoopEnd
    276 Case_3
    277         ;// Case c: index 3 (idx = 3, idy = 0) - HalfHor output averaged with dSrcNd
    278         M_PRINTF "Case 3 \n"
    279 
    280         SUB         pSrc, pSrc, #2                          ;// the helper is given a pointer 2 bytes left of the block
    281         BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
    282         VRHADD      dAccH0, dAccH0, dSrc0d                  ;// rounding halving add: (a + b + 1) >> 1 per byte
    283         VRHADD      dAccH2, dAccH2, dSrc2d                  ;// NOTE(review): dSrcNd are not written in this file; presumably left by the helper - confirm
    284         VRHADD      dAccH1, dAccH1, dSrc1d
    285         VRHADD      dAccH3, dAccH3, dSrc3d
    286         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    287         VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row; dResultHN is the 32-bit view of dAccHN
    288         VST1        dResultH2[0], [Temp], dstStep
    289         VST1        dResultH1[0], [pDst]
    290         VST1        dResultH3[0], [Temp]
    291         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    292         B           Block4x4LoopEnd
    293 Case_4
    294         ;// Case d: index 4 (idx = 0, idy = 1) - HalfVer output averaged with dSrc0-dSrc3
    295         M_PRINTF "Case 4 \n"
    296 
    297         SUB         pSrc, pSrc, srcStep, LSL #1             ;// the helper is given a pointer 2 rows above the block
    298         BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
    299         VRHADD      dAccV0, dAccV0, dSrc0                   ;// rounding halving add: (a + b + 1) >> 1 per byte
    300         VRHADD      dAccV2, dAccV2, dSrc2                   ;// NOTE(review): dSrc0-dSrc3 are presumably left by the helper - confirm
    301         VRHADD      dAccV1, dAccV1, dSrc1
    302         VRHADD      dAccV3, dAccV3, dSrc3
    303         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    304         VST1        dResultV0[0], [pDst], dstStep           ;// store 4 bytes per row; dResultVN is the 32-bit view of dAccVN
    305         VST1        dResultV2[0], [Temp], dstStep
    306         VST1        dResultV1[0], [pDst]
    307         VST1        dResultV3[0], [Temp]
    308         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    309         B           Block4x4LoopEnd
    310 Case_5
    311         ;// Case e: index 5 (idx = 1, idy = 1) - average of the HalfVer and HalfHor outputs
    312         M_PRINTF "Case 5 \n"
    313 
    314         MOV         pSrcBK, pSrc                            ;// keep the block's pSrc for the second helper call
    315         SUB         pSrc, pSrc, srcStep, LSL #1             ;// HalfVer is given a pointer 2 rows above the block
    316         BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
    317         SUB         pSrc, pSrcBK, #2                        ;// HalfHor is given a pointer 2 bytes left of the block
    318         BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
    319         VRHADD      dAccH0, dAccH0, dAccV0                  ;// relies on dAccVN (d0/d2/d4/d6) surviving the HalfHor call
    320         VRHADD      dAccH2, dAccH2, dAccV2
    321         VRHADD      dAccH1, dAccH1, dAccV1
    322         VRHADD      dAccH3, dAccH3, dAccV3
    323         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    324         VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row; dResultHN is the 32-bit view of dAccHN
    325         VST1        dResultH2[0], [Temp], dstStep
    326         VST1        dResultH1[0], [pDst]
    327         VST1        dResultH3[0], [Temp]
    328 
    329         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    330         B       Block4x4LoopEnd
    331 Case_6
    332         ;// Case f: index 6 (idx = 2, idy = 1) - HalfDiagHorVer output averaged with narrowed qRes2-qRes5
    333         M_PRINTF "Case 6 \n"
    334 
    335         SUB         pSrc, pSrc, srcStep, LSL #1             ;// the helper is given a pointer 2 rows above ...
    336         SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
    337         BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
    338         VQRSHRUN    dTResult0, qRes2, #5                    ;// rounded >> 5, saturated to unsigned 8-bit; d14 <- q7
    339         VQRSHRUN    dTResult1, qRes3, #5                    ;// d16 <- q8
    340         VQRSHRUN    dTResult2, qRes4, #5                    ;// d18 <- q9
    341         VQRSHRUN    dTResult3, qRes5, #5                    ;// d20 <- q10
    342         VRHADD      dTAcc0, dTAcc0, dTResult0               ;// rounding halving add with the helper's rows in d0/d2/d4/d6
    343         VRHADD      dTAcc2, dTAcc2, dTResult2
    344         VRHADD      dTAcc1, dTAcc1, dTResult1
    345         VRHADD      dTAcc3, dTAcc3, dTResult3
    346         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    347         VST1        dTRes0[0], [pDst], dstStep              ;// store 4 bytes per row; dTResN is the 32-bit view of dTAccN
    348         VST1        dTRes2[0], [Temp], dstStep
    349         VST1        dTRes1[0], [pDst]
    350         VST1        dTRes3[0], [Temp]
    351 
    352         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    353         B       Block4x4LoopEnd
    354 Case_7
    355         ;// Case g: index 7 (idx = 3, idy = 1) - as Case_5, but HalfVer starts one byte to the right
    356         M_PRINTF "Case 7 \n"
    357         MOV         pSrcBK, pSrc                            ;// keep the block's pSrc for the second helper call
    358         ADD         pSrc, pSrc, #1                          ;// HalfVer is given a pointer 1 byte right ...
    359         SUB         pSrc, pSrc, srcStep, LSL #1             ;// ... and 2 rows above the block
    360         BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
    361         SUB         pSrc, pSrcBK, #2                        ;// HalfHor is given a pointer 2 bytes left of the block
    362         BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
    363         VRHADD      dAccH0, dAccH0, dAccV0                  ;// relies on dAccVN (d0/d2/d4/d6) surviving the HalfHor call
    364         VRHADD      dAccH2, dAccH2, dAccV2
    365         VRHADD      dAccH1, dAccH1, dAccV1
    366         VRHADD      dAccH3, dAccH3, dAccV3
    367         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    368         VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row; dResultHN is the 32-bit view of dAccHN
    369         VST1        dResultH2[0], [Temp], dstStep
    370         VST1        dResultH1[0], [pDst]
    371         VST1        dResultH3[0], [Temp]
    372 
    373         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    374         B       Block4x4LoopEnd
    375 Case_8
    376         ;// Case h: index 8 (idx = 0, idy = 2) - HalfVer output stored as is
    377         M_PRINTF "Case 8 \n"
    378 
    379         SUB         pSrc, pSrc, srcStep, LSL #1             ;// the helper is given a pointer 2 rows above the block
    380         BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
    381         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    382         VST1        dResultV0[0], [pDst], dstStep           ;// store 4 bytes per row from lane 0 of d0/d2/d4/d6
    383         VST1        dResultV2[0], [Temp], dstStep
    384         VST1        dResultV1[0], [pDst]
    385         VST1        dResultV3[0], [Temp]
    386         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    387         B           Block4x4LoopEnd
    388 Case_9
    389         ;// Case i: index 9 (idx = 1, idy = 2) - HalfDiagVerHor output averaged with narrowed qTemp rows shifted by 2 elements
    390         M_PRINTF "Case 9 \n"
    391         SUB         pSrc, pSrc, srcStep, LSL #1             ;// the helper is given a pointer 2 rows above ...
    392         SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
    393         BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
    394         VEXT        dTempP0, dTempP0, dTempP1, #2           ;// low half of each qTemp pair becomes elements 2..5 of that pair (16-bit elements)
    395         VEXT        dTempQ0, dTempQ0, dTempQ1, #2
    396         VEXT        dTempR0, dTempR0, dTempR1, #2
    397         VEXT        dTempS0, dTempS0, dTempS1, #2
    398         ;// Order matters below: dTResult2 (d18) and dTResult3 (d20) overwrite halves of q9/q10 after those have been read
    399         VQRSHRUN    dTResult0, qTempP01, #5                 ;// rounded >> 5, saturated to unsigned 8-bit; d14 <- q9
    400         VQRSHRUN    dTResult1, qTempQ01, #5                 ;// d16 <- q10
    401         VQRSHRUN    dTResult2, qTempR01, #5                 ;// d18 <- q11
    402         VQRSHRUN    dTResult3, qTempS01, #5                 ;// d20 <- q12
    403 
    404         VRHADD      dTAcc0, dTAcc0, dTResult0               ;// rounding halving add with the helper's rows in d0/d2/d4/d6
    405         VRHADD      dTAcc2, dTAcc2, dTResult2
    406         VRHADD      dTAcc1, dTAcc1, dTResult1
    407         VRHADD      dTAcc3, dTAcc3, dTResult3
    408         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    409         VST1        dTRes0[0], [pDst], dstStep              ;// store 4 bytes per row; dTResN is the 32-bit view of dTAccN
    410         VST1        dTRes2[0], [Temp], dstStep
    411         VST1        dTRes1[0], [pDst]
    412         VST1        dTRes3[0], [Temp]
    413         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    414         B       Block4x4LoopEnd
    415 Case_a
    416         ;// Case j: index 10 (idx = 2, idy = 2) - HalfDiagHorVer output stored as is
    417         M_PRINTF "Case a \n"
    418 
    419         SUB         pSrc, pSrc, srcStep, LSL #1             ;// the helper is given a pointer 2 rows above ...
    420         SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
    421         BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
    422         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    423         VST1        dTRes0[0], [pDst], dstStep              ;// store 4 bytes per row from lane 0 of d0/d2/d4/d6
    424         VST1        dTRes2[0], [Temp], dstStep
    425         VST1        dTRes1[0], [pDst]
    426         VST1        dTRes3[0], [Temp]
    427         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    428         B       Block4x4LoopEnd
    429 Case_b
    430         ;// Case k: index 11 (idx = 3, idy = 2) - as Case_9, but the qTemp rows are shifted by 3 elements instead of 2
    431         M_PRINTF "Case b \n"
    432         SUB         pSrc, pSrc, srcStep, LSL #1             ;// the helper is given a pointer 2 rows above ...
    433         SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
    434         BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
    435         VEXT        dTempP0, dTempP0, dTempP1, #3           ;// low half of each qTemp pair becomes elements 3..6 of that pair (16-bit elements)
    436         VEXT        dTempQ0, dTempQ0, dTempQ1, #3
    437         VEXT        dTempR0, dTempR0, dTempR1, #3
    438         VEXT        dTempS0, dTempS0, dTempS1, #3
    439         ;// Order matters below: dTResult2 (d18) and dTResult3 (d20) overwrite halves of q9/q10 after those have been read
    440         VQRSHRUN    dTResult0, qTempP01, #5                 ;// rounded >> 5, saturated to unsigned 8-bit; d14 <- q9
    441         VQRSHRUN    dTResult1, qTempQ01, #5                 ;// d16 <- q10
    442         VQRSHRUN    dTResult2, qTempR01, #5                 ;// d18 <- q11
    443         VQRSHRUN    dTResult3, qTempS01, #5                 ;// d20 <- q12
    444 
    445         VRHADD      dTAcc0, dTAcc0, dTResult0               ;// rounding halving add with the helper's rows in d0/d2/d4/d6
    446         VRHADD      dTAcc2, dTAcc2, dTResult2
    447         VRHADD      dTAcc1, dTAcc1, dTResult1
    448         VRHADD      dTAcc3, dTAcc3, dTResult3
    449         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    450         VST1        dTRes0[0], [pDst], dstStep              ;// store 4 bytes per row; dTResN is the 32-bit view of dTAccN
    451         VST1        dTRes2[0], [Temp], dstStep
    452         VST1        dTRes1[0], [pDst]
    453         VST1        dTRes3[0], [Temp]
    454         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    455         B       Block4x4LoopEnd
    456 Case_c
    457         ;// Case n: index 12 (idx = 0, idy = 3) - as Case_4, but averaged with dSrc1-dSrc4 instead of dSrc0-dSrc3
    458         M_PRINTF "Case c \n"
    459 
    460         SUB         pSrc, pSrc, srcStep, LSL #1             ;// the helper is given a pointer 2 rows above the block
    461         BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
    462         VRHADD      dAccV0, dAccV0, dSrc1                   ;// rounding halving add: (a + b + 1) >> 1 per byte
    463         VRHADD      dAccV2, dAccV2, dSrc3                   ;// NOTE(review): dSrc1-dSrc4 are presumably left by the helper - confirm
    464         VRHADD      dAccV1, dAccV1, dSrc2
    465         VRHADD      dAccV3, dAccV3, dSrc4
    466         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    467         VST1        dResultV0[0], [pDst], dstStep           ;// store 4 bytes per row; dResultVN is the 32-bit view of dAccVN
    468         VST1        dResultV2[0], [Temp], dstStep
    469         VST1        dResultV1[0], [pDst]
    470         VST1        dResultV3[0], [Temp]
    471         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    472         B           Block4x4LoopEnd
    473 Case_d
    474         ;// Case p: index 13 (idx = 1, idy = 3) - as Case_5, but HalfHor starts one row lower
    475         M_PRINTF "Case d \n"
    476 
    477         MOV         pSrcBK, pSrc                            ;// keep the block's pSrc for the second helper call
    478         SUB         pSrc, pSrc, srcStep, LSL #1             ;// HalfVer is given a pointer 2 rows above the block
    479         BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
    480         ADD         pSrc, pSrcBK, srcStep                   ;// HalfHor is given a pointer 1 row below ... (relies on srcStep surviving the call)
    481         SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
    482         BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
    483         VRHADD      dAccH0, dAccH0, dAccV0                  ;// relies on dAccVN (d0/d2/d4/d6) surviving the HalfHor call
    484         VRHADD      dAccH2, dAccH2, dAccV2
    485         VRHADD      dAccH1, dAccH1, dAccV1
    486         VRHADD      dAccH3, dAccH3, dAccV3
    487         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    488         VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row; dResultHN is the 32-bit view of dAccHN
    489         VST1        dResultH2[0], [Temp], dstStep
    490         VST1        dResultH1[0], [pDst]
    491         VST1        dResultH3[0], [Temp]
    492         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    493         B       Block4x4LoopEnd
    494 Case_e
    495         ;// Case q: index 14 (idx = 2, idy = 3) - as Case_6, but narrowing qRes3-qRes6 instead of qRes2-qRes5
    496         M_PRINTF "Case e \n"
    497 
    498         SUB         pSrc, pSrc, srcStep, LSL #1             ;// the helper is given a pointer 2 rows above ...
    499         SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
    500         BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
    501         VQRSHRUN    dTResult0, qRes3, #5                    ;// rounded >> 5, saturated to unsigned 8-bit; d14 <- q8
    502         VQRSHRUN    dTResult1, qRes4, #5                    ;// d16 <- q9; d16 is half of q8, which has already been read
    503         VQRSHRUN    dTResult2, qRes5, #5                    ;// d18 <- q10; d18 is half of q9, which has already been read
    504         VQRSHRUN    dTResult3, qRes6, #5                    ;// d20 <- q11; d20 is half of q10, which has already been read
    505 
    506         VRHADD      dTAcc0, dTAcc0, dTResult0               ;// rounding halving add with the helper's rows in d0/d2/d4/d6
    507         VRHADD      dTAcc2, dTAcc2, dTResult2
    508         VRHADD      dTAcc1, dTAcc1, dTResult1
    509         VRHADD      dTAcc3, dTAcc3, dTResult3
    510         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    511         VST1        dTRes0[0], [pDst], dstStep              ;// store 4 bytes per row; dTResN is the 32-bit view of dTAccN
    512         VST1        dTRes2[0], [Temp], dstStep
    513         VST1        dTRes1[0], [pDst]
    514         VST1        dTRes3[0], [Temp]
    515         M_ADR       pArgs, ppArgs                           ;// pArgs = save area address for the LDM at Block4x4LoopEnd
    516         B       Block4x4LoopEnd
    517 Case_f
    518         ;// Case r: index 15 (idx = 3, idy = 3) - HalfVer one byte right, HalfHor one row lower, then averaged
    519         M_PRINTF "Case f \n"
    520         MOV         pSrcBK, pSrc                            ;// keep the block's pSrc for the second helper call
    521         ADD         pSrc, pSrc, #1                          ;// HalfVer is given a pointer 1 byte right ...
    522         SUB         pSrc, pSrc, srcStep, LSL #1             ;// ... and 2 rows above the block
    523         BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
    524         ADD         pSrc, pSrcBK, srcStep                   ;// HalfHor is given a pointer 1 row below ... (relies on srcStep surviving the call)
    525         SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
    526         BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
    527         VRHADD      dAccH0, dAccH0, dAccV0                  ;// relies on dAccVN (d0/d2/d4/d6) surviving the HalfHor call
    528         VRHADD      dAccH2, dAccH2, dAccV2
    529         VRHADD      dAccH1, dAccH1, dAccV1
    530         VRHADD      dAccH3, dAccH3, dAccV3
    531         ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
    532         VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row; dResultHN is the 32-bit view of dAccHN
    533         VST1        dResultH2[0], [Temp], dstStep
    534         VST1        dResultH1[0], [pDst]
    535         VST1        dResultH3[0], [Temp]
    536         M_ADR       pArgs, ppArgs                           ;// last case: no branch, execution falls through to Block4x4LoopEnd
    537 
    538 
    539 Block4x4LoopEnd
    540         ;// Common tail of every case: restore the saved arguments and step to the next 4x4 block
    541         ;// Width Loop
    542         ;//M_ADR       pArgs, ppArgs
    543         LDM         pArgs, {pSrc,srcStep,pDst,dstStep}  ;// Load arguments saved by the STM at the loop head
    544         SUBS        iWidth, iWidth, #4                      ;// 4 columns done; sets the flags tested by BGT below
    545         ADD         pSrc, pSrc, #4                          ;// next block to the right (ADD without S leaves the flags alone)
    546         ADD         pDst, pDst, #4
    547         BGT         Block4x4WidthLoop                       ;// loop while iWidth > 0
    548 
    549         ;// Height Loop
    550         SUBS        iHeight, iHeight, #4                    ;// 4 rows done; sets the flags tested by the final BGT
    551         M_LDR       iWidth, ptrWidth                        ;// reload the full width (assumes the M_LDR/M_ADR macros leave the flags untouched)
    552         M_ADR       pArgs, ppArgs
    553         ADD         pSrc, pSrc, srcStep, LSL #2             ;// down 4 rows ...
    554         ADD         pDst, pDst, dstStep, LSL #2
    555         SUB         pSrc, pSrc, iWidth                      ;// ... and back to the first column (pSrc/pDst had advanced by the full width)
    556         SUB         pDst, pDst, iWidth
    557         BGT         Block4x4HeightLoop                      ;// loop while iHeight > 0
    558 
    559 EndOfInterpolation
    560         MOV         r0, #0                                  ;// r0 = 0 on exit; the file header documents the return value as OMX_Sts_NoErr
    561         M_END
    562 
    563     ENDIF
    564         ;// End of CortexA8
    565 
    566     END
    567 
    568