Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   9641
     21 ;// Date:       Thursday, February 7, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27         INCLUDE omxtypes_s.h
     28         INCLUDE armCOMM_s.h
     29 
     30         EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
     31 
     32         M_VARIANTS ARM1136JS
     33 
     34 
     35 
     36     IF ARM1136JS
     37 
     38 
     39         M_ALLOC8 ppDstArgs, 8                           ;// Caller's pDst/dstStep pair, saved while r2/r3 address the intermediate buffer
     40         M_ALLOC8 pTempResult1, 8                        ;// Acc0/Acc1 of an even-Counter pass, kept for the following odd pass
     41         M_ALLOC8 pTempResult2, 8                        ;// Acc2/Acc3 of an even-Counter pass, kept for the following odd pass
     42         M_ALLOC4 ppSrc, 4                               ;// Not referenced by the code below
     43         M_ALLOC4 ppDst, 4                               ;// Not referenced by the code below
     44         M_ALLOC4 pDstStep, 4                            ;// Spill slot for dstStep (r3 is reused as Temp1)
     45         M_ALLOC4 pSrcStep, 4                            ;// Spill slot for srcStep (r1 is reused as Temp2/ValI)
     46         M_ALLOC4 pCounter, 4                            ;// Spill slot for Counter in the vertical pass (r11 is reused there)
     47 
     48         ;// Function header
     49         ;// Function:
     50         ;//     armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
     51         ;//
     52         ;// Implements diagonal interpolation for a block of size 4x4 by applying the 6-tap filter
     53         ;// (1,-5,20,20,-5,1) horizontally and then vertically. Input and output should be aligned.
     54         ;// The horizontal pass reads 10 rows of 9 bytes from r0 and writes 5 lines of 16 bytes at r8.
     55         ;// Registers used as input for this function
     56         ;// r0 = pSrc, r1 = srcStep, r2 = pDst, r3 = dstStep, r8 = intermediate-buf pointer
     57         ;//
     58         ;// Registers preserved for top level function
     59         ;// r2,r3 hold their entry values again on exit. M_START is given r6 as its register argument
     60         ;// (presumably saving r4-r6 and r14 - TODO confirm against armCOMM_s.h).
     61         ;// Registers modified by the function
     62         ;// r7,r8,r9,r10,r11,r12
     63         ;//
     64         ;// Output registers
     65         ;// None. NOTE(review): on exit r0 = intermediate-buffer base and r1 = 16, not their entry values.
     66 
     67         M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r6
     68 
     69 ;// Declare input registers
     70 pSrc            RN 0
     71 srcStep         RN 1
     72 pDst            RN 2
     73 dstStep         RN 3
     74 
     75 ;// Declare inner loop registers (several names alias one register; each alias is live only in its own phase)
     76 Acc0            RN 4
     77 Acc1            RN 5
     78 Acc2            RN 6
     79 Acc3            RN 7
     80 
     81 ValA            RN 4
     82 ValB            RN 5
     83 ValC            RN 6
     84 ValD            RN 7
     85 ValE            RN 8
     86 ValF            RN 9
     87 ValG            RN 12
     88 ValH            RN 14
     89 ValI            RN 1                                    ;// Shares r1 with srcStep
     90 
     91 Temp1           RN 3                                    ;// Shares r3 with dstStep
     92 Temp2           RN 1                                    ;// Shares r1 with srcStep
     93 Temp3           RN 12
     94 Temp4           RN 7                                    ;// Not referenced by the code below
     95 Temp5           RN 5                                    ;// Not referenced by the code below
     96 r0x0fe00fe0     RN 3                                    ;// [0 (16*255 - 16) 0 (16*255 - 16)]; not referenced by the code below
     97 r0x00ff00ff     RN 10                                   ;// [0 255 0 255] where 255 is offset
     98 Counter         RN 11
     99 pInterBuf       RN 8
    100 
    101 ValCA           RN 8
    102 ValDB           RN 9
    103 ValGE           RN 10
    104 ValHF           RN 11                                   ;// Shares r11 with Counter
    105 r0x00140001     RN 12                                   ;// Halfwords [20, 1]
    106 r0x0014fffb     RN 14                                   ;// Halfwords [20, -5]
    107 
    108 r0x0001fc00     RN 11                                   ;// Shares r11 with Counter
    109 
    110 Accx            RN 8
    111 Accy            RN 9
    112 Temp6           RN 14
    113 
    114         M_STRD      pDst, dstStep, ppDstArgs               ;// Save caller's pDst/dstStep; r2/r3 address the intermediate buffer next
    115 
    116         MOV         pDst, pInterBuf                        ;// Horizontal pass writes to the intermediate buffer
    117         MOV         dstStep, #16                           ;// Intermediate buffer line pitch is 16 bytes
    118 
    119         ;// Counter = 4; HeightLoop runs while Counter >= 0, i.e. 5 passes of two source rows each
    120         MOV         Counter, #4
    121         M_STR       dstStep, pDstStep                      ;// Spill both steps: r3 and r1 are overwritten as Temp1/Temp2 in the loop
    122         M_STR       srcStep, pSrcStep
    123         LDR         r0x00ff00ff, =0x00ff00ff               ;// [0 255 0 255] 255 is offset to avoid negative results
    124         ;// Horizontal pass: each iteration filters two rows at once, one row per halfword lane
    125 HeightLoop
    126 NextTwoRowsLoop
    127         LDR     ValD, [pSrc, srcStep]                   ;// Load row 1 [d1 c1 b1 a1]
    128         LDR     ValA, [pSrc], #4                        ;// Load row 0 [d0 c0 b0 a0]
    129         LDR     ValH, [pSrc, srcStep]                   ;// Load  [h1 g1 f1 e1]
    130         LDR     ValE, [pSrc], #4                        ;// Load  [h0 g0 f0 e0]
    131         LDRB    Temp2, [pSrc, srcStep]                  ;// Load byte i1 of row 1 (overwrites srcStep in r1)
    132         LDRB    Temp1, [pSrc], #-8                      ;// Load byte i0 of row 0 (overwrites dstStep in r3); pSrc back to row start
    133 
    134         PKHBT   ValB, ValA, ValD, LSL #16               ;// [b1 a1 b0 a0]
    135         PKHTB   ValD, ValD, ValA, ASR #16               ;// [d1 c1 d0 c0]
    136         UXTAB16 ValA, r0x00ff00ff, ValB                 ;// [00 a1 00 a0] + [0 255 0 255]
    137         UXTAB16 ValC, r0x00ff00ff, ValD                 ;// [00 c1 00 c0] + [0 255 0 255]
    138         PKHBT   ValI, Temp1, Temp2, LSL #16             ;// [00 i1 00 i0]
    139         PKHBT   ValF, ValE, ValH, LSL #16               ;// [f1 e1 f0 e0]
    140         PKHTB   ValH, ValH, ValE, ASR #16               ;// [h1 g1 h0 g0]
    141         UXTAB16 ValE, r0x00ff00ff, ValF                 ;// [00 e1 00 e0] + [0 255 0 255]
    142 
    143         ;// Calculate Acc0 (every accumulator ends up as filter result + 16*255 per halfword)
    144         ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
    145         UXTAB16 Temp1, ValC, ValD, ROR #8               ;// c + d + 255
    146         UXTAB16 Temp3, ValE, ValB, ROR #8               ;// e + b + 255
    147         RSB     Temp1, Temp3, Temp1, LSL #2             ;// 4*(c + d) - (b + e) + 3*255
    148         UXTAB16 Acc0, ValA, ValF, ROR #8                ;// a + f + 255
    149         ADD     Temp1, Temp1, Temp1, LSL #2             ;// x5: 20*(c + d) - 5*(b + e) + 15*255
    150         ADD     Acc0, Acc0, Temp1                       ;// Acc0 complete
    151 
    152         ;// Calculate Acc1
    153         ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
    154         UXTAB16 Temp1, ValE, ValD, ROR #8               ;// e + d + 255
    155         UXTAB16 Temp3, ValC, ValF, ROR #8               ;// c + f + 255
    156         RSB     Temp1, Temp3, Temp1, LSL #2             ;// 4*(d + e) - (c + f) + 3*255
    157         UXTAB16 ValG, r0x00ff00ff, ValH                 ;// [00 g1 00 g0] + [0 255 0 255]
    158         ADD     Temp1, Temp1, Temp1, LSL #2             ;// x5: 20*(d + e) - 5*(c + f) + 15*255
    159         UXTAB16 Acc1, ValG, ValB, ROR #8                ;// g + b + 255
    160         ADD     Acc1, Acc1, Temp1                       ;// Acc1 complete
    161 
    162         UXTAB16 Acc2, ValC, ValH, ROR #8                ;// c + h + 255 (outer taps of Acc2)
    163         ADD     ValI, r0x00ff00ff, ValI                 ;// [00 i1 00 i0] + [0 255 0 255]
    164 
    165         ;// Calculate Acc2
    166         ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
    167         UXTAB16 Temp1, ValG, ValD, ROR #8               ;// g + d + 255
    168         UXTAB16 Acc3, ValI, ValD, ROR #8                ;// i + d + 255 (outer taps of Acc3)
    169         UXTAB16 Temp2, ValE, ValF, ROR #8               ;// e + f + 255
    170 
    171         RSB     Temp1, Temp1, Temp2, LSL #2             ;// 4*(e + f) - (d + g) + 3*255
    172         UXTAB16 Temp2, ValG, ValF, ROR #8               ;// g + f + 255 (used for Acc3 below)
    173         ADD     Temp1, Temp1, Temp1, LSL #2             ;// x5: 20*(e + f) - 5*(d + g) + 15*255
    174         ADD     Acc2, Acc2, Temp1                       ;// Acc2 complete
    175 
    176         ;// Calculate Acc3
    177         ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
    178         UXTAB16 Temp1, ValE, ValH, ROR #8               ;// e + h + 255
    179         RSB     Temp1, Temp1, Temp2, LSL #2             ;// 4*(f + g) - (e + h) + 3*255
    180         ADD     Temp1, Temp1, Temp1, LSL #2             ;// x5: 20*(f + g) - 5*(e + h) + 15*255
    181         ADD     Acc3, Acc3, Temp1                       ;// Acc3 complete
    182 
    183         M_LDR   dstStep, pDstStep                       ;// Restore dstStep (16), overwritten as Temp1
    184         M_LDR   srcStep, pSrcStep                       ;// Restore srcStep, overwritten as Temp2/ValI
    185 
    186         ;// If Counter is even, keep Acc0-Acc3 in the temporary stack slots
    187         ;// If Counter is odd, interleave Acc0-Acc3 with the previous pass's Acc0-Acc3 and store both to the intermediate buf
    188         ANDS        Temp3, Counter, #1                  ;// Z is set when Counter is even
    189         BEQ         NoProcessing
    190 
    191         ;// Packing previous and current Acc0-Acc3 values
    192         M_LDRD      Accx, Accy, pTempResult1            ;// Previous pass's Acc0, Acc1
    193         PKHBT       Temp6, Accx, Acc0, LSL #16          ;//[0 a2 0 a0] = [0 a3 0 a2] [0 a1 0 a0]
    194         PKHTB       Acc0, Acc0, Accx, ASR #16           ;//[0 a3 0 a1] = [0 a1 0 a0] [0 a3 0 a2]
    195         STR         Acc0, [pDst, dstStep]               ;// Odd rows go to the next 16-byte line
    196         STR         Temp6, [pDst], #4                   ;// Even rows go to the current line; step to the next column
    197         PKHBT       Temp6, Accy, Acc1, LSL #16          ;//[0 b2 0 b0] = [0 b3 0 b2] [0 b1 0 b0]
    198         PKHTB       Acc1, Acc1, Accy, ASR #16            ;//[0 b3 0 b1] = [0 b1 0 b0] [0 b3 0 b2]
    199         M_LDRD      Accx, Accy, pTempResult2            ;// Previous pass's Acc2, Acc3
    200         STR         Acc1, [pDst, dstStep]
    201         STR         Temp6, [pDst], #4
    202 
    203         PKHBT       Temp6, Accx, Acc2, LSL #16          ;//[0 c2 0 c0] = [0 c3 0 c2] [0 c1 0 c0]
    204         PKHTB       Acc2, Acc2, Accx, ASR #16            ;//[0 c3 0 c1] = [0 c1 0 c0] [0 c3 0 c2]
    205         STR         Acc2, [pDst, dstStep]
    206         STR         Temp6, [pDst], #4
    207         PKHBT       Temp6, Accy, Acc3, LSL #16          ;//[0 d2 0 d0] = [0 d3 0 d2] [0 d1 0 d0]
    208         PKHTB       Acc3, Acc3, Accy, ASR #16            ;//[0 d3 0 d1] = [0 d1 0 d0] [0 d3 0 d2]
    209         STR         Acc3, [pDst, dstStep]
    210         STR         Temp6, [pDst], #-12                 ;// Rewind pDst to the start of the line
    211         ADD         pDst, pDst, dstStep, LSL #1         ;// Skip the two lines just written (32 bytes)
    212         B           AfterStore
    213 
    214 NoProcessing
    215         M_STRD      Acc0, Acc1, pTempResult1
    216         M_STRD      Acc2, Acc3, pTempResult2
    217 AfterStore
    218         SUBS        Counter, Counter, #1                ;// 5 passes x 2 rows = 10 source rows in total
    219         ADD         pSrc, pSrc, srcStep, LSL #1         ;// Next two source rows; ADD leaves the SUBS flags intact
    220         BPL         HeightLoop
    221 
    222         STR         Acc0, [pDst], #4                    ;//[0 a1 0 a0] of the final pass (Counter == 0), stored without interleaving
    223         STR         Acc1, [pDst], #4
    224         STR         Acc2, [pDst], #4
    225         STR         Acc3, [pDst], #-12
    226 
    227         ;//
    228         ;// Vertical interpolation of the intermediate buffer using multiplication
    229         ;//
    230 
    231         SUB         pSrc, pDst, dstStep, LSL #2         ;// pSrc = start of the intermediate buffer (4 lines of 16 bytes back)
    232         MOV         srcStep, #16                        ;// Intermediate buffer line pitch
    233         M_LDRD      pDst, dstStep, ppDstArgs            ;// Restore the caller's pDst/dstStep
    234 
    235         MOV         Counter, #4                         ;// One pass per output column
    236         LDR         r0x0014fffb, =0x0014fffb
    237         LDR         r0x00140001, =0x00140001
    238 
    239 HeightLoop1
    240         M_STR       Counter, pCounter                   ;// r11 is reused as ValHF and r0x0001fc00 below
    241 
    242         M_LDR       ValCA, [pSrc], srcStep               ;// Load  [0 c 0 a]
    243         M_LDR       ValDB, [pSrc], srcStep               ;// Load  [0 d 0 b]
    244         M_LDR       ValGE, [pSrc], srcStep               ;// Load  [0 g 0 e]
    245         M_LDR       ValHF, [pSrc], srcStep               ;// Load  [0 h 0 f]
    246         ;// a..i are the horizontally filtered values of rows 0..8 for the current column
    247 
    248         ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
    249         ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
    250         ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
    251         ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
    252 
    253         SMUAD       Acc0, ValCA, r0x00140001            ;// Acc0  = [0 c 0 a] * [0 20 0 1]   = 20*c + a
    254         SMUAD       Acc1, ValDB, r0x00140001            ;// Acc1  = [0 d 0 b] * [0 20 0 1]   = 20*d + b
    255         SMUADX      Acc2, ValGE, r0x0014fffb            ;// Acc2  = [0 g 0 e] * [0 -5 0 20]  = 20*e - 5*g (X swaps the constant's halves)
    256         SMUAD       Acc3, ValGE, r0x0014fffb            ;// Acc3  = [0 g 0 e] * [0 20 0 -5]  = 20*g - 5*e
    257 
    258         SMLAD       Acc0, ValDB, r0x0014fffb, Acc0      ;// Acc0 += [0 d 0 b] * [0 20 0 -5]  = 20*d - 5*b
    259         SMLADX      Acc1, ValGE, r0x00140001, Acc1      ;// Acc1 += [0 g 0 e] * [0 1 0 20]   = 20*e + g
    260         SMLADX      Acc2, ValHF, r0x00140001, Acc2      ;// Acc2 += [0 h 0 f] * [0 1 0 20]   = 20*f + h
    261         SMLADX      Acc3, ValHF, r0x0014fffb, Acc3      ;// Acc3 += [0 h 0 f] * [0 -5 0 20]  = 20*f - 5*h
    262 
    263         SMLABB      Acc0, ValGE, r0x0014fffb, Acc0      ;// Acc0 += e * -5
    264         SMLATB      Acc1, ValCA, r0x0014fffb, Acc1      ;// Acc1 += c * -5
    265         SMLATB      Acc2, ValCA, r0x00140001, Acc2      ;// Acc2 += c * 1
    266         SMLATB      Acc3, ValDB, r0x00140001, Acc3      ;// Acc3 += d * 1
    267 
    268         LDRH        ValCA, [pSrc], #4                   ;// Load i (row 8), the low halfword of line 4; pSrc += 4 moves to the next column
    269         SMLABB      Acc0, ValHF, r0x00140001, Acc0      ;// Acc0 += f * 1
    270         SMLABB      Acc1, ValHF, r0x0014fffb, Acc1      ;// Acc1 += f * -5
    271         SMLATB      Acc2, ValDB, r0x0014fffb, Acc2      ;// Acc2 += d * -5
    272         SMLABB      Acc3, ValCA, r0x00140001, Acc3      ;// Acc3 += i * 1
    273 
    274         LDR         r0x0001fc00, =0x0001fc00            ;// (0xff * 16 * 32) - 512: removes the offset, leaves +512 for rounding
    275         SUB         Acc0, Acc0, r0x0001fc00
    276         SUB         Acc1, Acc1, r0x0001fc00
    277         SUB         Acc2, Acc2, r0x0001fc00
    278         SUB         Acc3, Acc3, r0x0001fc00
    279 
    280         USAT        Acc0, #18, Acc0                     ;// Clamp to [0, 2^18 - 1] so the shift below yields 0..255
    281         USAT        Acc1, #18, Acc1
    282         USAT        Acc2, #18, Acc2
    283         USAT        Acc3, #18, Acc3
    284 
    285         MOV         Acc0, Acc0, LSR #10                 ;// Divide by 1024 (combined gain of the two 6-tap passes)
    286         M_STRB      Acc0, [pDst], dstStep               ;// Output row 0 of this column
    287         MOV         Acc1, Acc1, LSR #10
    288         M_STRB      Acc1, [pDst], dstStep               ;// Output row 1
    289         MOV         Acc2, Acc2, LSR #10
    290         M_STRB      Acc2, [pDst], dstStep               ;// Output row 2
    291         MOV         Acc3, Acc3, LSR #10
    292         M_STRB      Acc3, [pDst], dstStep               ;// Output row 3
    293 
    294 
    295         M_LDR       Counter, pCounter                   ;// r11 was overwritten by ValHF/r0x0001fc00
    296         SUB         pDst, pDst, dstStep, LSL #2         ;// Back to output row 0
    297         SUB         pSrc, pSrc, srcStep, LSL #2         ;// Back to line 0; the LDRH above already advanced pSrc by 4
    298         ADD         pDst, pDst, #1                      ;// Next output column
    299         SUBS        Counter, Counter, #1
    300         BGT         HeightLoop1
    301 End
    302         SUB         pDst, pDst, #4                      ;// Undo the four column steps: pDst is the caller's value again
    303         SUB         pSrc, pSrc, #16                     ;// pSrc = start of the intermediate buffer
    304 
    305         M_END
    306 
    307     ENDIF
    308 
    309     END
    310 
    311