; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorVerQuarter
;--            function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA |.text|, CODE

;// h264bsdInterpolateHorVerQuarter register allocation

ref     RN 0

mb      RN 1
buff    RN 1

count   RN 2
x0      RN 2

y0      RN 3
x_2_0   RN 3
res     RN 3

x_3_1   RN 4
tmp1    RN 4

height  RN 5
x_6_4   RN 5
tmp2    RN 5

partW   RN 6
x_7_5   RN 6
tmp3    RN 6

partH   RN 7
tmp4    RN 7

tmp5    RN 8

tmp6    RN 9

tmpa    RN 10

mult_20_01  RN 11
tmpb        RN 11

mult_20_m5  RN 12
width       RN 12

plus16  RN 14


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateHorVerQuarter

;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust coefficients
;// like below. Calculation is done with 16-bit arithmetic.
;//
;// Reg       x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//         [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =     20  1     20 -5        -5         1
;// y_1 =     -5        20  1      1 20        -5
;// y_2 =      1        -5        -5 20      1 20
;// y_3 =                 1       20 -5     -5 20         1
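
;// For reference, a plain C sketch (not part of the build) of the 6-tap
;// FIR filter behind the staggered coefficient rows above; each output
;// y_k applies taps (1,-5,20,20,-5,1) to inputs x[k]..x[k+5]:
;//
;//     static int clip255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }
;//     /* half-pel sample from six neighbouring full-pel samples */
;//     int y_k = clip255((x[k] - 5*x[k+1] + 20*x[k+2] + 20*x[k+3]
;//                        - 5*x[k+4] + x[k+5] + 16) >> 5);
;//
;// The packed 16-bit multiplies (SMLAD/SMLATB/SMLADX/SMLABB) below
;// evaluate four such output rows in parallel from the register pairs.
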

h264bsdInterpolateHorVerQuarter
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #0x1e4

    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    LDR     width, [sp,#0x218]      ;// width
    ADD     tmpa, x0, partW         ;// (x0+partWidth)
    ADD     tmpa, tmpa, #5          ;// (x0+partW+5)
    CMP     tmpa, width
    BHI     do_fill                 ;// (x0+partW+5) > width

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     height, [sp,#0x21c]     ;// height
    ADD     tmp5, y0, partH         ;// (y0+partHeight)
    ADD     tmp5, tmp5, #5          ;// (y0+partH+5)
    CMP     tmp5, height
    BLS     skip_fill               ;// no overfill needed


do_fill
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     partW, [sp,#0x220]      ;// partWidth
    LDR     height, [sp,#0x21c]     ;// height
    ADD     tmp5, partH, #5         ;// tmp5 = partH + 5
    ADD     tmpa, partW, #5         ;// tmpa = partW + 5
    STMIB   sp, {height, tmpa}      ;// sp+4 = height, sp+8 = partWidth+5
    LDR     width, [sp,#0x218]      ;// width
    STR     tmp5, [sp,#0xc]         ;// sp+c = partHeight+5
    STR     tmpa, [sp,#0x10]        ;// sp+10 = partWidth+5
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
    BL      h264bsdFillBlock

    MOV     x0, #0
    STR     x0, [sp,#0x1ec]         ;// x0 = 0
    STR     x0, [sp,#0x1f0]         ;// y0 = 0
    ADD     ref, sp, #0x28          ;// ref = p1
    STR     tmpa, [sp,#0x218]       ;// width = partWidth+5


skip_fill
    LDR     x0, [sp,#0x1ec]         ;// x0
    LDR     y0, [sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    LDR     tmp6, [sp,#0x228]       ;// horVerOffset
    LDR     mb, [sp, #0x1e8]        ;// mb
    MLA     tmp5, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp5          ;// ref += y0*width+x0
    STR     ref, [sp, #0x1e4]       ;// store "ref" for vertical filtering
    AND     tmp6, tmp6, #2          ;// calculate ref for horizontal filter
    MOV     tmpa, #2
    ADD     tmp6, tmpa, tmp6, LSR #1
    MLA     ref, tmp6, width, ref
    ADD     ref, ref, #8            ;// ref = ref+8

    ;// pack values to count register
    ;// [31:28] loop_x (partWidth-1)
    ;// [27:24] loop_y (partHeight-1)
    ;// [23:20] partWidth-1
    ;// [19:16] partHeight-1
    ;// [15:00] width
    MOV     count, width
    SUB     partW, partW, #1
    SUB     partH, partH, #1
    ADD     tmp5, partH, partW, LSL #4
    ADD     count, count, tmp5, LSL #16


    LDR     mult_20_01, = 0x00140001    ;// constant multipliers
    LDR     mult_20_m5, = 0x0014FFFB    ;// constant multipliers
    MOV     plus16, #16                 ;// constant for add
    AND     tmp4, count, #0x000F0000    ;// partHeight-1
    AND     tmp6, count, #0x00F00000    ;// partWidth-1
    ADD     count, count, tmp4, LSL #8  ;// partH-1 to lower part of top byte
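
;// The same packing in C (sketch only; field layout as documented above).
;// Keeping both loop counters in the top byte lets SUBS/ADDS on the whole
;// register drive the loop branches through the carry and sign flags:
;//
;//     uint32_t count = width                   /* [15:0]          */
;//                    | ((partH - 1) << 16)     /* [19:16]         */
;//                    | ((partW - 1) << 20)     /* [23:20]         */
;//                    | ((partH - 1) << 24);    /* [27:24] loop_y  */
;//     /* loop_x, bits [31:28], is reloaded from [23:20] every row */
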
;// HORIZONTAL PART

loop_y_hor
    LDR     x_3_1, [ref, #-8]
    ADD     count, count, tmp6, LSL #8  ;// partW-1 to upper part of top byte
    LDR     x_7_5, [ref, #-4]
    UXTB16  x_2_0, x_3_1
    UXTB16  x_3_1, x_3_1, ROR #8
    UXTB16  x_6_4, x_7_5

loop_x_hor
    UXTB16  x_7_5, x_7_5, ROR #8

    SMLAD   tmp4, x_2_0, mult_20_01, plus16
    SMLATB  tmp6, x_2_0, mult_20_01, plus16
    SMLATB  tmp5, x_2_0, mult_20_m5, plus16
    SMLATB  tmpa, x_3_1, mult_20_01, plus16

    SMLAD   tmp4, x_3_1, mult_20_m5, tmp4
    SMLATB  tmp6, x_3_1, mult_20_m5, tmp6
    SMLAD   tmp5, x_3_1, mult_20_01, tmp5
    LDR     x_3_1, [ref], #4
    SMLAD   tmpa, x_6_4, mult_20_m5, tmpa

    SMLABB  tmp4, x_6_4, mult_20_m5, tmp4
    SMLADX  tmp6, x_6_4, mult_20_m5, tmp6
    SMLADX  tmp5, x_6_4, mult_20_01, tmp5
    SMLADX  tmpa, x_7_5, mult_20_m5, tmpa

    SMLABB  tmp4, x_7_5, mult_20_01, tmp4
    UXTB16  x_2_0, x_3_1
    SMLABB  tmp5, x_7_5, mult_20_m5, tmp5
    SMLADX  tmp6, x_7_5, mult_20_01, tmp6
    SMLABB  tmpa, x_2_0, mult_20_01, tmpa

    MOV     tmp5, tmp5, ASR #5
    MOV     tmp4, tmp4, ASR #5
    PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
    PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
    USAT16  tmp5, #8, tmp5
    USAT16  tmp4, #8, tmp4

    SUBS    count, count, #4<<28
    ORR     tmp4, tmp4, tmp5, LSL #8
    STR     tmp4, [mb], #4
    BCC     next_y_hor

    UXTB16  x_3_1, x_3_1, ROR #8

    SMLAD   tmp4, x_6_4, mult_20_01, plus16
    SMLATB  tmp6, x_6_4, mult_20_01, plus16
    SMLATB  tmp5, x_6_4, mult_20_m5, plus16
    SMLATB  tmpa, x_7_5, mult_20_01, plus16

    SMLAD   tmp4, x_7_5, mult_20_m5, tmp4
    SMLATB  tmp6, x_7_5, mult_20_m5, tmp6
    SMLAD   tmp5, x_7_5, mult_20_01, tmp5
    LDR     x_7_5, [ref], #4
    SMLAD   tmpa, x_2_0, mult_20_m5, tmpa

    SMLABB  tmp4, x_2_0, mult_20_m5, tmp4
    SMLADX  tmp6, x_2_0, mult_20_m5, tmp6
    SMLADX  tmp5, x_2_0, mult_20_01, tmp5
    SMLADX  tmpa, x_3_1, mult_20_m5, tmpa

    SMLABB  tmp4, x_3_1, mult_20_01, tmp4
    UXTB16  x_6_4, x_7_5
    SMLABB  tmp5, x_3_1, mult_20_m5, tmp5
    SMLADX  tmp6, x_3_1, mult_20_01, tmp6
    SMLABB  tmpa, x_6_4, mult_20_01, tmpa

    MOV     tmp5, tmp5, ASR #5
    MOV     tmp4, tmp4, ASR #5
    PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
    PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
    USAT16  tmp5, #8, tmp5
    USAT16  tmp4, #8, tmp4

    SUBS    count, count, #4<<28
    ORR     tmp4, tmp4, tmp5, LSL #8
    STR     tmp4, [mb], #4
    BCS     loop_x_hor

next_y_hor
    AND     tmp6, count, #0x00F00000        ;// partWidth-1
    SMLABB  ref, count, mult_20_01, ref     ;// +width
    ADDS    mb, mb, #16                     ;// +16, Carry=0
    SBC     mb, mb, tmp6, LSR #20           ;// -(partWidth-1)-1
    SBC     ref, ref, tmp6, LSR #20         ;// -(partWidth-1)-1
    ADDS    count, count, #(1<<28)-(1<<24)  ;// decrement y loop counter
    BGE     loop_y_hor



;// VERTICAL PART
;//
;// Approach to vertical interpolation
;//
;// Interpolation is done by using 32-bit loads and stores
;// and by using 16-bit arithmetic. 4x4 block is processed
;// in each round.
;//
;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m1|a_m1|a_m1|...
;// |b_m1|b_m1|b_m1|b_m1|...
;// |c_m1|c_m1|c_m1|c_m1|...
;// |d_m1|d_m1|d_m1|d_m1|...

;// Approach to bilinear interpolation to quarter pel position.
;// 4 bytes are processed in parallel
;//
;// algorithm (a+b+1)/2. Rounding upwards +1 can be achieved by
;// negating second operand to get one's complement (instead of 2's)
;// and using subtraction, EOR is used to correct sign.
;//
;// MVN     b, b
;// UHSUB8  a, a, b
;// EOR     a, a, 0x80808080
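
;// A standalone C sketch (illustrative, not part of the build) checking
;// that identity for one byte lane; UHSUB8 yields the halved unsigned
;// difference, which the final EOR maps back to the rounded average:
;//
;//     #include <assert.h>
;//     void check_rounding_trick(void) {
;//         for (int a = 0; a < 256; a++) {
;//             for (int b = 0; b < 256; b++) {
;//                 int nb = 0xFF & ~b;              /* MVN: one's complement */
;//                 int h  = 0xFF & ((a - nb) >> 1); /* UHSUB8 lane           */
;//                                                  /* (arithmetic shift)    */
;//                 assert((h ^ 0x80) == ((a + b + 1) >> 1)); /* EOR 0x80     */
;//             }
;//         }
;//     }
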

    LDR     ref, [sp, #0x1e4]           ;// ref
    LDR     tmpa, [sp, #0x228]          ;// horVerOffset
    LDR     mb, [sp, #0x1e8]            ;// mb
    LDR     width, [sp, #0x218]         ;// width
    ADD     ref, ref, #2                ;// calculate correct position
    AND     tmpa, tmpa, #1
    ADD     ref, ref, tmpa
    LDR     plus16, = 0x00100010        ;// +16 to lower and upper halfwords
    AND     count, count, #0x00FFFFFF   ;// clear loop counters from top byte

    AND     tmpa, count, #0x000F0000    ;// partHeight-1
    ADD     count, count, tmpa, LSL #8

loop_y
    ADD     count, count, tmp6, LSL #8  ;// partWidth-1

loop_x
    LDR     tmp1, [ref], width          ;// |a4|a3|a2|a1|
    LDR     tmp2, [ref], width          ;// |c4|c3|c2|c1|
    LDR     tmp3, [ref], width          ;// |g4|g3|g2|g1|
    LDR     tmp4, [ref], width          ;// |m4|m3|m2|m1|
    LDR     tmp5, [ref], width          ;// |r4|r3|r2|r1|
    LDR     tmp6, [ref], width          ;// |t4|t3|t2|t1|

    ;// first four pixels
    UXTB16  tmpa, tmp3                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp2                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)

    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp1, [mb]
    LDR     tmpa, = 0xFF00FF00
    MVN     tmp1, tmp1
    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa

    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp1              ;// bilinear interpolation
    LDR     tmp1, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign

    STR     res, [mb], #16              ;// next row (mb)


    ;// tmp2 = |a4|a3|a2|a1|
    ;// tmp3 = |c4|c3|c2|c1|
    ;// tmp4 = |g4|g3|g2|g1|
    ;// tmp5 = |m4|m3|m2|m1|
    ;// tmp6 = |r4|r3|r2|r1|
    ;// tmp1 = |t4|t3|t2|t1|
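
    ;// How the four filtered columns get repacked into one word above: each
    ;// USAT16 leaves two 13-bit sums (0..8191) in the halfwords of tmpb and
    ;// >>5 yields the clipped 8-bit pixel. A C sketch of the merge, with
    ;// hypothetical names for the two saturated halfword pairs:
    ;//
    ;//     uint32_t lo  = (pair_odd  >> 5) & 0x00FF00FFu; /* cols 1,3 -> bytes 0,2 */
    ;//     uint32_t hi  = (pair_even << 3) & 0xFF00FF00u; /* cols 2,4 -> bytes 1,3 */
    ;//     uint32_t out = lo | hi;                        /* |p4|p3|p2|p1|         */
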
    ;// second four pixels
    UXTB16  tmpa, tmp4                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp3                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp2, [mb]
    LDR     tmpa, = 0xFF00FF00
    MVN     tmp2, tmp2

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp2              ;// bilinear interpolation
    LDR     tmp2, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp3 = |a4|a3|a2|a1|
    ;// tmp4 = |c4|c3|c2|c1|
    ;// tmp5 = |g4|g3|g2|g1|
    ;// tmp6 = |m4|m3|m2|m1|
    ;// tmp1 = |r4|r3|r2|r1|
    ;// tmp2 = |t4|t3|t2|t1|

    ;// third four pixels
    UXTB16  tmpa, tmp5                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp4                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T


    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp3, [mb]
    LDR     tmpa, = 0xFF00FF00
    MVN     tmp3, tmp3

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp3              ;// bilinear interpolation
    LDR     tmp3, [ref]                 ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp4 = |a4|a3|a2|a1|
    ;// tmp5 = |c4|c3|c2|c1|
    ;// tmp6 = |g4|g3|g2|g1|
    ;// tmp1 = |m4|m3|m2|m1|
    ;// tmp2 = |r4|r3|r2|r1|
    ;// tmp3 = |t4|t3|t2|t1|

    ;// fourth four pixels
    UXTB16  tmpa, tmp6                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp5                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp5, [mb]
    LDR     tmp4, = 0xFF00FF00
    MVN     tmp5, tmp5

    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp5              ;// bilinear interpolation

    ;// decrement loop_x counter
    SUBS    count, count, #4<<28        ;// decrement x loop counter

    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
    ADD     ref, ref, #4                ;// next column (4 pixels)

    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #-44

    BCS     loop_x
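
;// In C terms, the control flow of this vertical part is roughly (sketch
;// only; fir6_col() is a hypothetical helper for the clipped 6-tap column
;// filter, and mb already holds the horizontal half-pel result):
;//
;//     for (int y = 0; y < partH; y += 4)          /* loop_y           */
;//         for (int x = 0; x < partW; x += 4)      /* loop_x           */
;//             for (int row = 0; row < 4; row++)   /* unrolled above   */
;//                 for (int col = x; col < x + 4; col++)
;//                     mb[y+row][col] =
;//                         (fir6_col(ref, col, y+row) + mb[y+row][col] + 1) >> 1;
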
    ADDS    mb, mb, #64                 ;// set Carry=0
    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
    AND     tmp6, count, #0x00F00000    ;// partWidth-1
    SBC     ref, ref, tmp6, LSR #20     ;// -(partWidth-1)-1
    SBC     mb, mb, tmp6, LSR #20       ;// -(partWidth-1)-1

    ADDS    count, count, #0xC << 24    ;// decrement y loop counter
    BGE     loop_y

    ADD     sp, sp, #0x1f4
    LDMFD   sp!, {r4-r11, pc}

    END