;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe

        M_VARIANTS ARM1136JS


    IF ARM1136JS

        ;// Stack slots used by the routine
        M_ALLOC8 ppDstArgs, 8           ;// caller's pDst/dstStep pair, parked during the vertical pass
        M_ALLOC4 ppSrc, 4               ;// pSrc at the top of the current 4-column strip
        M_ALLOC4 ppDst, 4               ;// pDst at the top of the current 4-column strip
        M_ALLOC4 pCounter, 4            ;// horizontal-pass row counter (its register is reused as ValCA)

        ;// Function header
        ;// Function:
        ;//     armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        ;//
        ;// Implements diagonal interpolation for a block of size 4x4. Input and output should
        ;// be aligned.
        ;//
        ;// The work is done in two passes:
        ;//   1. Vertical 6-tap filter (1, -5, 20, 20, -5, 1) over 12 source columns x 4 rows.
        ;//      Each result carries a bias of 16*255 and is written as a 16-bit lane into the
        ;//      intermediate buffer (row stride 24 bytes). Per 4-column strip, the even columns
        ;//      go to the first word and the odd columns to the second word.
        ;//   2. Horizontal 6-tap filter over each intermediate row, followed by bias removal,
        ;//      rounding, saturation and packing of four output bytes per row.
        ;//
        ;// Registers used as input for this function
        ;// r0,r1,r2,r3, r8 where r0,r2 input pointer and r1,r3 step size, r8 intermediate-buf pointer
        ;//
        ;// Registers preserved for top level function
        ;// r0,r1,r2,r3,r4,r5,r6,r14
        ;//
        ;// Registers modified by the function
        ;// r7,r8,r9,r10,r11,r12
        ;//
        ;// Output registers
        ;// None. Function will preserve r0-r3
        ;//
        ;// NOTE(review): in the code below r2/r3 do leave with their entry values, but r0 ends
        ;// up holding the intermediate-buffer address and r1 the value 24. The "preserve r0-r3"
        ;// statement above therefore does not match the visible code for r0/r1 - confirm what
        ;// the callers rely on.
        ;// NOTE(review): r4-r6 and r14 are used as scratch; presumably M_START/M_END (from
        ;// armCOMM_s.h) save and restore them - confirm against the macro definition.

        M_START armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe, r6

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare vertical-pass registers.
;// Many aliases share one physical register; the instruction order below is what keeps
;// each value alive exactly as long as it is needed, so do not reorder casually.
ValA            RN 5
ValA0           RN 4
ValA1           RN 5
ValAF0          RN 4
ValAF1          RN 5

ValB            RN 11

ValC            RN 5
ValC0           RN 4
ValC1           RN 5
ValCD0          RN 12
ValCD1          RN 14
ValCF0          RN 4
ValCF1          RN 5

ValD            RN 10

ValE            RN 7
ValE0           RN 6
ValE1           RN 7
ValEB0          RN 10
ValEB1          RN 11
ValED0          RN 6
ValED1          RN 7

ValF            RN 10

ValG            RN 14
ValG0           RN 12
ValG1           RN 14
ValGB0          RN 12
ValGB1          RN 14

Acc0            RN 4
Acc1            RN 5
Acc2            RN 6
Acc3            RN 7

Temp            RN 7
Step            RN 6

pInterBuf       RN 8
Counter         RN 8
r0x00ff00ff     RN 9                                        ;// [0 255 0 255] where 255 is offset
r0x0001fc00     RN 10                                       ;// 32*16*255 - 512: accumulated bias minus the rounding term


;// Declare horizontal-pass registers
ValCA           RN 8
ValDB           RN 9
ValGE           RN 10
ValHF           RN 11
r0x00140001     RN 12
r0x0014fffb     RN 14

r0x00000200     RN 12
r0x000000ff     RN 12

        ;// Park the caller's destination and redirect the vertical pass into the
        ;// intermediate buffer, whose rows are 24 bytes apart.
        M_STRD      pDst, dstStep, ppDstArgs
        MOV         pDst, pInterBuf
        MOV         dstStep, #24

        ;// Set up counter of format, [0]  [0]  [1 (height)]  [8 (width)]
        MOV         Counter, #1
        MOV         Temp, #8
        ADD         Counter, Temp, Counter, LSL #8          ;// [0 0 H W]

        LDR         r0x00ff00ff, =0x00ff00ff                ;// [0 255 0 255] 255 is offset to avoid negative results
WidthLoop
        ;// Remember the top of this 4-column strip so it can be reloaded after the rows are done
        M_STR       pSrc, ppSrc
        M_STR       pDst, ppDst
HeightLoop
TwoRowsLoop
        ;// Each pass produces two output rows from seven source rows A..G, where pSrc
        ;// points at row C on entry. Even and odd columns are kept in separate registers
        ;// (suffix 0 = columns 0 and 2, suffix 1 = columns 1 and 3) as 16-bit lanes.
        M_LDR       ValC, [pSrc], srcStep                   ;// Load  [c3 c2 c1 c0]
        M_LDR       ValD, [pSrc], srcStep                   ;// Load  [d3 d2 d1 d0]
        M_LDR       ValE, [pSrc], srcStep                   ;// Load  [e3 e2 e1 e0]
        SUB         pSrc, pSrc, srcStep, LSL #2             ;// Step back four rows: pSrc now at row B
        UXTAB16     ValC0, r0x00ff00ff, ValC                ;// [0 c2 0 c0] + [0 255 0 255]
        UXTAB16     ValC1, r0x00ff00ff, ValC, ROR #8        ;// [0 c3 0 c1] + [0 255 0 255]
        LDR         ValB, [pSrc]                            ;// Load  [b3 b2 b1 b0]
        UXTAB16     ValE0, r0x00ff00ff, ValE                ;// [0 e2 0 e0] + [0 255 0 255]
        UXTAB16     ValE1, r0x00ff00ff, ValE, ROR #8        ;// [0 e3 0 e1] + [0 255 0 255]
        UXTAB16     ValCD0, ValC0, ValD                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16     ValCD1, ValC1, ValD, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 d3 0 d1]
        UXTAB16     ValEB0, ValE0, ValB                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 b2 0 b0] (overwrites ValD's register)
        RSB         ValCD0, ValEB0, ValCD0, LSL #2          ;// 4*(Off+C+D) - (Off+B+E)

        LDR         ValD, [pSrc, srcStep, LSL #1]           ;// Reload [d3 d2 d1 d0]
        UXTAB16     ValEB1, ValE1, ValB, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 b3 0 b1] (overwrites ValB's register)
        RSB         ValCD1, ValEB1, ValCD1, LSL #2          ;// 4*(Off+C+D) - (Off+B+E), odd columns

        UXTAB16     ValED0, ValE0, ValD                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16     ValED1, ValE1, ValD, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 d3 0 d1]
        LDR         ValF, [pSrc, srcStep, LSL #2]           ;// Load  [f3 f2 f1 f0]
        M_LDR       ValB, [pSrc], srcStep                   ;// Reload [b3 b2 b1 b0]; pSrc back at row C
        ADD         ValCD0, ValCD0, ValCD0, LSL #2          ;// 5 * [4*(Off+C+D) - (Off+B+E)]
        ADD         ValCD1, ValCD1, ValCD1, LSL #2          ;// same, odd columns
        UXTAB16     ValCF1, ValC1, ValF, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 f3 0 f1]
        UXTAB16     ValCF0, ValC0, ValF                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 f2 0 f0]
        RSB         ValED1, ValCF1, ValED1, LSL #2          ;// 4*(Off+E+D) - (Off+C+F), odd columns

        SUB         ValA, pSrc, srcStep, LSL #1             ;// Address of row A (two rows above C)
        LDR         ValA, [ValA]                            ;// Load  [a3 a2 a1 a0]
        RSB         ValED0, ValCF0, ValED0, LSL #2          ;// 4*(Off+E+D) - (Off+C+F)
        ADD         ValED1, ValED1, ValED1, LSL #2          ;// 5 * [4*(Off+E+D) - (Off+C+F)], odd columns
        ADD         ValED0, ValED0, ValED0, LSL #2          ;// 5 * [4*(Off+E+D) - (Off+C+F)]
        UXTAB16     ValA0, r0x00ff00ff, ValA                ;// [0 a2 0 a0] + [0 255 0 255]
        UXTAB16     ValA1, r0x00ff00ff, ValA, ROR #8        ;// [0 a3 0 a1] + [0 255 0 255]
        UXTAB16     ValAF0, ValA0, ValF                     ;// [0 a2 0 a0] + [0 255 0 255] + [0 f2 0 f0]
        UXTAB16     ValAF1, ValA1, ValF, ROR #8             ;// [0 a3 0 a1] + [0 255 0 255] + [0 f3 0 f1]
        ADD         Acc1, ValCD1, ValAF1                    ;// Acc1 = 16*Off + (A+F) + 20*(C+D) - 5*(B+E), odd columns

        LDR         ValG, [pSrc, srcStep, LSL #2]           ;// Load  [g3 g2 g1 g0]
        ADD         Acc0, ValCD0, ValAF0                    ;// Acc0 = 16*Off + (A+F) + 20*(C+D) - 5*(B+E)
        STR         Acc1, [pDst, #4]                        ;// Store odd columns in the second word (no pointer update)
        M_STR       Acc0, [pDst], dstStep                   ;// Store even columns & advance to the next intermediate row
        UXTAB16     ValG0, r0x00ff00ff, ValG                ;// [0 g2 0 g0] + [0 255 0 255]
        UXTAB16     ValG1, r0x00ff00ff, ValG, ROR #8        ;// [0 g3 0 g1] + [0 255 0 255]
        UXTAB16     ValGB0, ValG0, ValB                     ;// [0 g2 0 g0] + [0 255 0 255] + [0 b2 0 b0]
        UXTAB16     ValGB1, ValG1, ValB, ROR #8             ;// [0 g3 0 g1] + [0 255 0 255] + [0 b3 0 b1]
        ADD         Acc2, ValED0, ValGB0                    ;// Acc2 = 16*Off + (B+G) + 20*(D+E) - 5*(C+F)
        ADD         Acc3, ValED1, ValGB1                    ;// Acc3 = same, odd columns

        STR         Acc3, [pDst, #4]                        ;// Store odd columns in the second word (no pointer update)
        M_STR       Acc2, [pDst], dstStep                   ;// Store even columns & advance to the next intermediate row

        SUBS        Counter, Counter, #1 << 8               ;// Height field: body runs twice per strip (2 x 2 rows = 4 rows)
        ADD         pSrc, pSrc, srcStep, LSL #1             ;// Move down two source rows (flags untouched)
        BPL         HeightLoop

        M_LDR       pSrc, ppSrc
        M_LDR       pDst, ppDst
        ADDS        Counter, Counter, #(1 << 8)-4           ;// Restore one height step and take 4 off the width; three strips = 12 columns
        ADD         pSrc, pSrc, #4                          ;// Next strip: 4 source bytes to the right
        ADD         pDst, pDst, #8                          ;// Next strip: 4 x 16-bit results to the right
        ADD         Counter, Counter, #1<<8                 ;// Re-arm the height field (flags from ADDS are kept)
        BPL         WidthLoop

        ;//
        ;// Horizontal interpolation using multiplication
        ;//

        SUB     pSrc, pDst, #24                             ;// pDst is 24 bytes past the start of the intermediate buffer here
        MOV     srcStep, #24
        M_LDRD  pDst, dstStep, ppDstArgs                    ;// Bring back the caller's destination pointer and step

        MOV     Counter, #4                                 ;// Four output rows
        LDR     r0x0014fffb, =0x0014fffb                    ;// [20 -5] as signed 16-bit lanes
        LDR     r0x00140001, =0x00140001                    ;// [20  1] as signed 16-bit lanes

HeightLoop1
        M_STR   Counter, pCounter                           ;// Counter shares its register with ValCA, so spill it


        ;// One intermediate row holds columns a..i (a = column 0). Even/odd columns are
        ;// interleaved per word as written by the vertical pass.
        LDR     ValCA, [pSrc], #4                   ;// Load  [0 c 0 a]
        LDR     ValDB, [pSrc], #4                   ;// Load  [0 d 0 b]
        LDR     ValGE, [pSrc], #4                   ;// Load  [0 g 0 e]
        LDR     ValHF, [pSrc], #4                   ;// Load  [0 h 0 f]

        ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
        ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
        ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
        ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
        SMUAD   Acc0, ValCA, r0x00140001            ;// Acc0  = 20*c + a
        SMUAD   Acc1, ValDB, r0x00140001            ;// Acc1  = 20*d + b
        SMUADX  Acc2, ValGE, r0x0014fffb            ;// Acc2  = -5*g + 20*e  (lanes crossed)
        SMUAD   Acc3, ValGE, r0x0014fffb            ;// Acc3  = 20*g - 5*e

        SMLAD   Acc0, ValDB, r0x0014fffb, Acc0      ;// Acc0 += 20*d - 5*b
        SMLADX  Acc1, ValGE, r0x00140001, Acc1      ;// Acc1 += g + 20*e     (lanes crossed)
        SMLADX  Acc2, ValHF, r0x00140001, Acc2      ;// Acc2 += h + 20*f     (lanes crossed)
        SMLADX  Acc3, ValHF, r0x0014fffb, Acc3      ;// Acc3 += -5*h + 20*f  (lanes crossed)

        SMLABB  Acc0, ValGE, r0x0014fffb, Acc0      ;// Acc0 += -5*e
        SMLATB  Acc1, ValCA, r0x0014fffb, Acc1      ;// Acc1 += -5*c
        SMLATB  Acc2, ValCA, r0x00140001, Acc2      ;// Acc2 += c
        SMLATB  Acc3, ValDB, r0x00140001, Acc3      ;// Acc3 += d

        LDRH    ValCA, [pSrc], #8                   ;// Load column i; 8 = srcStep - 16, so pSrc lands on the next row
        SMLABB  Acc0, ValHF, r0x00140001, Acc0      ;// Acc0 += f
        SMLABB  Acc1, ValHF, r0x0014fffb, Acc1      ;// Acc1 += -5*f
        SMLATB  Acc2, ValDB, r0x0014fffb, Acc2      ;// Acc2 += -5*d
        SMLABB  Acc3, ValCA, r0x00140001, Acc3      ;// Acc3 += i

        ;// Reloaded every row because the register is shared with ValGE.
        ;// NOTE(review): 0x0001fc00 looks encodable as an ARM data-processing immediate
        ;// (0x7F rotated), which would make this literal load unnecessary - confirm before changing.
        LDR     r0x0001fc00, =0x0001fc00            ;// (0xff * 16 * 32) - 512
        SUB     Acc0, Acc0, r0x0001fc00             ;// Drop the accumulated bias, keep +512 for rounding
        SUB     Acc1, Acc1, r0x0001fc00
        SUB     Acc2, Acc2, r0x0001fc00
        SUB     Acc3, Acc3, r0x0001fc00

        USAT    Acc0, #18, Acc0                     ;// Clamp to the unsigned 18-bit range ...
        USAT    Acc1, #18, Acc1
        USAT    Acc2, #18, Acc2
        USAT    Acc3, #18, Acc3

        MOV     Acc0, Acc0, LSR #10                 ;// ... so that >> 10 leaves an 8-bit result
        MOV     Acc1, Acc1, LSR #10
        MOV     Acc2, Acc2, LSR #10
        MOV     Acc3, Acc3, LSR #10

        M_LDR   Counter, pCounter
        ORR     Acc0, Acc0, Acc1, LSL #8            ;// [0 0 out1 out0]
        ORR     Acc2, Acc2, Acc3, LSL #8            ;// [0 0 out3 out2]
        SUBS    Counter, Counter, #1
        ORR     Acc0, Acc0, Acc2, LSL #16           ;// [out3 out2 out1 out0]
        M_STR   Acc0, [pDst], dstStep               ;// Store four output pixels & move to the next output row
        BGT     HeightLoop1
End
        SUB     pDst, pDst, dstStep, LSL #2         ;// Rewind pDst by the four rows just written
        SUB     pSrc, pSrc, srcStep, LSL #2         ;// Rewind pSrc by four intermediate rows (srcStep is 24 here)

        M_END

    ENDIF

        END