;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe



    IF ARM1136JS

;// Function header

;// Function:
;//     armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
;//
;// Implements vertical half-pel interpolation for a block of size 4x4.
;// Input and output should be aligned.
;//
;// Registers used as input for this function
;// r0,r1,r2,r3 where r0 is the source pointer, r2 the destination pointer,
;// and r1,r3 the corresponding step sizes
;//
;// Registers preserved for top level function
;// r0,r1,r2,r3,r4,r5,r6,r14
;//
;// Registers modified by the function
;// r7,r8,r9,r10,r11,r12
;//
;// Output registers
;// None. Function will preserve r0-r3
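;//
;// Computation sketch (as reflected in the accumulator comments below, with
;// the seven successive source rows labelled A..G; two output rows are
;// produced per loop iteration, each from six taps):
;//
;//     out0[x] = ( A[x] + F[x] + 20*(C[x]+D[x]) - 5*(B[x]+E[x]) + 16 ) >> 5, clipped to [0,255]
;//     out1[x] = ( B[x] + G[x] + 20*(D[x]+E[x]) - 5*(C[x]+F[x]) + 16 ) >> 5, clipped to [0,255]
;//
;// To keep the packed 16-bit lanes non-negative, UXTAB16 folds an offset of
;// 255 into every term, so each accumulator ends up carrying 16*255; UQSUB16
;// then subtracts (16*255 - 16), which removes the offset and adds the
;// rounding constant 16 before the final shift and saturation.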
        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r6

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare inner loop registers
ValA            RN 5
ValA0           RN 4
ValA1           RN 5
ValAF0          RN 4
ValAF1          RN 5

ValB            RN 11

ValC            RN 5
ValC0           RN 4
ValC1           RN 5
ValCD0          RN 12
ValCD1          RN 14
ValCF0          RN 4
ValCF1          RN 5

ValD            RN 10

ValE            RN 7
ValE0           RN 6
ValE1           RN 7
ValEB0          RN 10
ValEB1          RN 11
ValED0          RN 6
ValED1          RN 7

ValF            RN 10

ValG            RN 14
ValG0           RN 12
ValG1           RN 14
ValGB0          RN 12
ValGB1          RN 14

Acc0            RN 4
Acc1            RN 5
Acc2            RN 6
Acc3            RN 7

Temp            RN 7
Height          RN 3
Step            RN 6

Counter         RN 8
r0x00ff00ff     RN 9        ;// [0 255 0 255] where 255 is offset
r0x0fe00fe0     RN 10       ;// [0 (16*255 - 16) 0 (16*255 - 16)]


        LDR     r0x00ff00ff, =0x00ff00ff            ;// [0 255 0 255] 255 is offset to avoid negative results
        MOV     Counter, #2

TwoRowsLoop
        M_LDR   ValC, [pSrc], srcStep               ;// Load  [c3 c2 c1 c0]
        M_LDR   ValD, [pSrc], srcStep               ;// Load  [d3 d2 d1 d0]
        M_LDR   ValE, [pSrc], srcStep               ;// Load  [e3 e2 e1 e0]
        SUB     pSrc, pSrc, srcStep, LSL #2
        LDR     ValB, [pSrc]                        ;// Load  [b3 b2 b1 b0]
        UXTAB16 ValC0, r0x00ff00ff, ValC            ;// [0 c2 0 c0] + [0 255 0 255]
        UXTAB16 ValC1, r0x00ff00ff, ValC, ROR #8    ;// [0 c3 0 c1] + [0 255 0 255]

        UXTAB16 ValE0, r0x00ff00ff, ValE            ;// [0 e2 0 e0] + [0 255 0 255]
        UXTAB16 ValE1, r0x00ff00ff, ValE, ROR #8    ;// [0 e3 0 e1] + [0 255 0 255]
        UXTAB16 ValCD0, ValC0, ValD                 ;// [0 c2 0 c0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16 ValCD1, ValC1, ValD, ROR #8         ;// [0 c3 0 c1] + [0 255 0 255] + [0 d3 0 d1]
        UXTAB16 ValEB0, ValE0, ValB                 ;// [0 e2 0 e0] + [0 255 0 255] + [0 b2 0 b0]
        RSB     ValCD0, ValEB0, ValCD0, LSL #2      ;// 4*(Off+C+D) - (Off+B+E)

        LDR     ValD, [pSrc, srcStep, LSL #1]       ;// Load  [d3 d2 d1 d0]
        UXTAB16 ValEB1, ValE1, ValB, ROR #8         ;// [0 e3 0 e1] + [0 255 0 255] + [0 b3 0 b1]
        RSB     ValCD1, ValEB1, ValCD1, LSL #2
        ;// One cycle stall
        UXTAB16 ValED0, ValE0, ValD                 ;// [0 e2 0 e0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16 ValED1, ValE1, ValD, ROR #8         ;// [0 e3 0 e1] + [0 255 0 255] + [0 d3 0 d1]

        LDR     ValF, [pSrc, srcStep, LSL #2]       ;// Load  [f3 f2 f1 f0]
        M_LDR   ValB, [pSrc], srcStep               ;// Load  [b3 b2 b1 b0]
        ADD     ValCD0, ValCD0, ValCD0, LSL #2      ;// 5 * [4*(Off+C+D) - (Off+B+E)]
        ADD     ValCD1, ValCD1, ValCD1, LSL #2
        UXTAB16 ValCF1, ValC1, ValF, ROR #8         ;// [0 c3 0 c1] + [0 255 0 255] + [0 f3 0 f1]
        UXTAB16 ValCF0, ValC0, ValF                 ;// [0 c2 0 c0] + [0 255 0 255] + [0 f2 0 f0]
        RSB     ValED1, ValCF1, ValED1, LSL #2

        SUB     ValA, pSrc, srcStep, LSL #1
        LDR     ValA, [ValA]                        ;// Load  [a3 a2 a1 a0]
        RSB     ValED0, ValCF0, ValED0, LSL #2      ;// 4*(Off+E+D) - (Off+C+F)
        ADD     ValED1, ValED1, ValED1, LSL #2
        ADD     ValED0, ValED0, ValED0, LSL #2      ;// 5 * [4*(Off+E+D) - (Off+C+F)]
        UXTAB16 ValA0, r0x00ff00ff, ValA            ;// [0 a2 0 a0] + [0 255 0 255]
        UXTAB16 ValA1, r0x00ff00ff, ValA, ROR #8    ;// [0 a3 0 a1] + [0 255 0 255]
        UXTAB16 ValAF0, ValA0, ValF                 ;// [0 a2 0 a0] + [0 255 0 255] + [0 f2 0 f0]
        UXTAB16 ValAF1, ValA1, ValF, ROR #8         ;// [0 a3 0 a1] + [0 255 0 255] + [0 f3 0 f1]

        LDR     r0x0fe00fe0, =0x0fe00fe0            ;// [0 (16*255 - 16) 0 (16*255 - 16)]
        ADD     Acc1, ValCD1, ValAF1

        LDR     ValG, [pSrc, srcStep, LSL #2]       ;// Load  [g3 g2 g1 g0]
        ADD     Acc0, ValCD0, ValAF0                ;// Acc0 = 16*Off + (A+F) + 20*(C+D) - 5*(B+E)
        UQSUB16 Acc1, Acc1, r0x0fe00fe0             ;// Acc1 -= (16*Off - 16)
        UQSUB16 Acc0, Acc0, r0x0fe00fe0
        UXTAB16 ValG0, r0x00ff00ff, ValG            ;// [0 g2 0 g0] + [0 255 0 255]
        UXTAB16 ValG1, r0x00ff00ff, ValG, ROR #8    ;// [0 g3 0 g1] + [0 255 0 255]
        UXTAB16 ValGB0, ValG0, ValB                 ;// [0 g2 0 g0] + [0 255 0 255] + [0 b2 0 b0]
        UXTAB16 ValGB1, ValG1, ValB, ROR #8         ;// [0 g3 0 g1] + [0 255 0 255] + [0 b3 0 b1]
        ADD     Acc2, ValED0, ValGB0                ;// Acc2 = 16*Off + (B+G) + 20*(D+E) - 5*(C+F)
        ADD     Acc3, ValED1, ValGB1
        UQSUB16 Acc3, Acc3, r0x0fe00fe0             ;// Acc3 -= (16*Off - 16)
        UQSUB16 Acc2, Acc2, r0x0fe00fe0
        USAT16  Acc1, #13, Acc1                     ;// Saturate to 8+5 = 13 bits
        USAT16  Acc0, #13, Acc0
        USAT16  Acc3, #13, Acc3
        USAT16  Acc2, #13, Acc2
        AND     Acc1, r0x00ff00ff, Acc1, LSR #5     ;// [0 a3 0 a1]
        AND     Acc0, r0x00ff00ff, Acc0, LSR #5     ;// [0 a2 0 a0]
        ORR     Acc0, Acc0, Acc1, LSL #8            ;// [a3 a2 a1 a0]
        AND     Acc3, r0x00ff00ff, Acc3, LSR #5     ;// [0 b3 0 b1]
        AND     Acc2, r0x00ff00ff, Acc2, LSR #5     ;// [0 b2 0 b0]

        M_STR   Acc0, [pDst], dstStep               ;// Store result & adjust pointer
        ORR     Acc2, Acc2, Acc3, LSL #8            ;// [b3 b2 b1 b0]
        M_STR   Acc2, [pDst], dstStep               ;// Store result & adjust pointer
        ADD     pSrc, pSrc, srcStep, LSL #1

        SUBS    Counter, Counter, #1
        BGT     TwoRowsLoop
End
        SUB     pDst, pDst, dstStep, LSL #2
        SUB     pSrc, pSrc, srcStep, LSL #2

        M_END

    ENDIF

    END