/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


#include <machine/cpu-features.h>
#include <machine/asm.h>

/*
        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        sp = coeffs
        sp+4 = length / 2
*/

ENTRY(rsdIntrinsicConvolve3x3_K)
        push {r4-r8, r10, r11, lr}
        vpush {q4-q7}

        /* Get the coeffs pointer from the stack and load the
           coefficients in the q0, q1 NEON registers */
        ldr r4, [sp, #32+64]
        vld1.16 {q0, q1}, [r4]

        /* Get count from the stack */
        ldr r4, [sp, #36+64]

        /* Load the frequently used immediate in a register */
        mov r5, #8

1:
        /* Load and post-increment the address by r5=#8 */
        vld1.8 {q13}, [r1], r5
        vld1.8 {q14}, [r2], r5
        vld1.8 {q15}, [r3], r5

        /* Signal memory for data that will be used in the loop after the next */
        PLD (r1, r5)
        PLD (r2, r5)
        PLD (r3, r5)

        vmovl.u8 q2, d26
        vmovl.u8 q3, d27
        vmovl.u8 q4, d28
        vmovl.u8 q5, d29
        vmovl.u8 q6, d30
        vmovl.u8 q7, d31

        /*
                The two pixel source array is
                d4,  d5,  d6,  d7
                d8,  d9,  d10, d11
                d12, d13, d14, d15
        */

        vmull.s16 q8, d4, d0[0]
        vmlal.s16 q8, d5, d0[1]
        vmlal.s16 q8, d6, d0[2]
        vmlal.s16 q8, d8, d0[3]
        vmlal.s16 q8, d9, d1[0]
        vmlal.s16 q8, d10, d1[1]
        vmlal.s16 q8, d12, d1[2]
        vmlal.s16 q8, d13, d1[3]
        vmlal.s16 q8, d14, d2[0]

        vmull.s16 q9, d5, d0[0]
        vmlal.s16 q9, d6, d0[1]
        vmlal.s16 q9, d7, d0[2]
        vmlal.s16 q9, d9, d0[3]
        vmlal.s16 q9, d10, d1[0]
        vmlal.s16 q9, d11, d1[1]
        vmlal.s16 q9, d13, d1[2]
        vmlal.s16 q9, d14, d1[3]
        vmlal.s16 q9, d15, d2[0]

        vshrn.i32 d16, q8, #8
        vshrn.i32 d17, q9, #8

        vqmovun.s16 d16, q8
        vst1.8 d16, [r0]!

        /* Are we done yet? */
        subs r4, r4, #1
        bne 1b

        /* We're done, bye! */
        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr
END(rsdIntrinsicConvolve3x3_K)


/*
        static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
                          const float* gPtr, int iradius, int x1, int x2)

        r0 = out
        r1 = pin
        r2 = stride
        r3 = gptr
        r4 = sp, ct
        r5 = sp+4, x1
        r6 = sp+8, x2
*/
ENTRY(rsdIntrinsicBlurVFU4_K)
        push {r4-r8, r10, r11, lr}
        vpush {q4-q7}

        ldr r4, [sp, #32+64]
        ldr r5, [sp, #32+64 + 4]
        ldr r6, [sp, #32+64 + 8]

1:
        veor q10, q10, q10              /* float4 blurredPixel = 0; */
        veor q11, q11, q11              /* float4 blurredPixel = 0; */
        add r7, r1, r5, lsl #2          /* const uchar *pi = ptrIn + x1 * 4; */
        mov r10, r3

        mov r11, r4

2:
        vld1.32 {d2}, [r7]
        vmovl.u8 q1, d2
        vmovl.u16 q3, d2
        vmovl.u16 q4, d3
        vcvt.f32.s32 q3, q3
        vcvt.f32.s32 q4, q4
        vld1.32 {d0[0]}, [r10]!
        add r7, r7, r2
        vmla.f32 q10, q3, d0[0]
        vmla.f32 q11, q4, d0[0]
        subs r11, r11, #1
        bne 2b

        vst1.32 {q10}, [r0]!
        vst1.32 {q11}, [r0]!
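        /* For reference, the inner loop above corresponds roughly to this
           C-style pseudocode for the two pixels just stored (a sketch only;
           blurredPixel0/1 are illustrative names, not from the source):

               float4 blurredPixel0 = 0.f, blurredPixel1 = 0.f;
               const uchar4 *pi = (const uchar4 *)(ptrIn + x1 * 4);
               for (int r = 0; r < ct; r++) {
                   blurredPixel0 += convert_float4(pi[0]) * gPtr[r];
                   blurredPixel1 += convert_float4(pi[1]) * gPtr[r];
                   pi = (const uchar4 *)((const uchar *)pi + iStride);
               }
               out[0] = blurredPixel0;
               out[1] = blurredPixel1;
               out += 2;
        */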
        add r5, r5, #2
        cmp r5, r6
        bne 1b


        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr
END(rsdIntrinsicBlurVFU4_K)

/*
        static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
                          const float* gPtr, int iradius, int x1, int x2)

        r0 = out
        r1 = pin
        r2 = gptr
        r3 = ct
        r4 = sp, x1
        r5 = sp+4, x2
*/
ENTRY(rsdIntrinsicBlurHFU4_K)
        push {r4-r8, r10, r11, lr}
        vpush {q4-q7}

        ldr r4, [sp, #32+64]
        ldr r5, [sp, #32+64 + 4]

1:
        add r7, r1, r4, lsl #4          /* const float4 *pi = (const float4 *)ptrIn + x1; */
        mov r10, r2
        mov r11, r3

        vld1.32 {q1}, [r7]!
        vld1.32 {d6[0]}, [r10]!
        vmul.f32 q0, q1, d6[0]
        sub r11, r11, #1

2:
        vld1.32 {q1}, [r7]!
        vld1.32 {q2}, [r7]!
        vld1.32 {d6}, [r10]!
        vmla.f32 q0, q1, d6[0]
        vmla.f32 q0, q2, d6[1]
        subs r11, r11, #2
        bne 2b

        vcvt.s32.f32 q0, q0
        vmovn.u32 d0, q0
        vmovn.u16 d0, q0

        vst1.32 {d0[0]}, [r0]!
        add r4, r4, #1
        cmp r4, r5
        bne 1b

        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr
END(rsdIntrinsicBlurHFU4_K)

ENTRY(rsdIntrinsicBlurHFU1_K)
        push {r4-r8, r10, r11, lr}
        vpush {q4-q7}

        ldr r4, [sp, #32+64]
        ldr r5, [sp, #32+64 + 4]

1:
        add r7, r1, r4, lsl #2          /* const float *pi = (const float *)ptrIn + x1; */
        mov r10, r2
        mov r11, r3

        veor q0, q0

2:
        vld1.32 {q1}, [r7]
        add r7, r7, #4
        vld1.32 {d4[0]}, [r10]!
        vmla.f32 q0, q1, d4[0]
        subs r11, r11, #1
        bne 2b

        vcvt.s32.f32 q0, q0
        vmovn.u32 d0, q0
        vmovn.u16 d0, q0

        vst1.32 {d0[0]}, [r0]!
        add r4, r4, #4
        cmp r4, r5
        bne 1b

        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr
END(rsdIntrinsicBlurHFU1_K)

/*
        Function called with the following arguments: dst, Y, vu, len, YuvCoeff
        r0 = dst
        r1 = Y
        r2 = VU
        r3 = length (pixels / 8)
        ---- Args below will be in the stack ----
        sp = YuvCoeff

        This function converts 8 pixels per iteration
*/
ENTRY(rsdIntrinsicYuv_K)
        push {r4, r5, lr}               @ preserve clobbered int registers
        vpush {Q4-Q7}                   @ preserve Vregisters we clobber

        mov r5, #16                     @ Integer 16 in r5; used as an incrementing value

        ldr r4, [sp, #64+12]            @ load the coeffs address in memory in r4 (16*4 + 4*3)
        vld1.16 {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
        vld1.8 {d6[]}, [r4], r5         @ load y offset 16 from the coeffs matrix (r4) in d6
        vld1.8 {d8[]}, [r4], r5         @ load V and U offset of 128 from the coeffs matrix (r4) in d8

        mov r4, #8                      @ Integer 8 in r4; used as an incrementing value

        vdup.8 d3, d5[1]                @ d3 = 255 (alpha) from the multipliers line in
                                        @ the coeffs matrix (Q2)

1:
        vld1.8 {d10}, [r1]!             @ get Y (r1->Y)
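        /* Reference for the arithmetic below, as C-style pseudocode (a sketch
           based on the coefficient comments further down, not authoritative;
           U and V are the de-interleaved halves of the VU plane, so each
           chroma sample is shared by a pair of pixels):

               int y = Y[i] - 16, v = V[i >> 1] - 128, u = U[i >> 1] - 128;
               int r = (298 * y           + 409 * v + 128) >> 8;
               int g = (298 * y - 100 * u - 208 * v + 128) >> 8;
               int b = (298 * y + 516 * u           + 128) >> 8;
               dst[i] = RGBA(sat8(r), sat8(g), sat8(b), 255);   // sat8: clamp to 0..255
        */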
        vld2.8 {d12, d14}, [r2], r4     @ split V from U (r2 -> VU) and increase pointer by 8 (in r4)
        pld [r1, #64]                   @ preloading data from address y(r1) + 64 for subsequent loops
        pld [r2, #64]                   @ preloading data from address vu(r2) + 64 for subsequent loops

        vsubl.u8 Q5, d10, d6            @ Y to 16 bit - 16 (in 16bit) (n to n+7)
        vmull.s16 Q8, d10, d4[0]        @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
        vmull.s16 Q11, d11, d4[0]       @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)

        vsubl.u8 Q5, d12, d8            @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2, n+3)
        vsubl.u8 Q6, d14, d8            @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2, n+3)
        vmov.u16 d11, d10               @ Copying V to d11
        vmov.u16 d13, d12               @ Copying U to d13
        vzip.u16 d10, d11               @ Q5 = V (n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
        vzip.u16 d12, d13               @ Q6 = U (n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)


        vmov Q9, Q8                     @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
        vmov Q10, Q8                    @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
        vmov Q12, Q11                   @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
        vmov Q13, Q11                   @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13

        @                       R       G       B
        @ Pixel(0-3)    Q8,     Q9,     Q10
        @ Pixel(4-7)    Q11,    Q12,    Q13
        @

        @ Pixel(0-3)
        vmlal.s16 Q8, d10, d4[1]        @ R : Q8  = Q8(Y-16)  + (V-128) * 409
        vmlal.s16 Q9, d10, d5[0]        @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
        vmlal.s16 Q9, d12, d4[2]        @                     + (U-128) * (-100)
        vmlal.s16 Q10, d12, d4[3]       @ B : Q10 = Q10(Y-16) + (U-128) * 516

        @ Pixel(4-7)
        vmlal.s16 Q11, d11, d4[1]       @ R : Q11 = Q11(Y-16) + (V-128) * 409
        vmlal.s16 Q12, d11, d5[0]       @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
        vmlal.s16 Q12, d13, d4[2]       @                     + (U-128) * (-100)
        vmlal.s16 Q13, d13, d4[3]       @ B : Q13 = Q13(Y-16) + (U-128) * 516

        @ Pixel(0-3)
        vrshrn.i32 d16, Q8, #8          @ d16 : R shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d18, Q9, #8          @ d18 : G shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d20, Q10, #8         @ d20 : B shifted right by 8, rounded and narrowed to 16bit

        @ Pixel(4-7)
        vrshrn.i32 d17, Q11, #8         @ d17 : R shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d19, Q12, #8         @ d19 : G shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d21, Q13, #8         @ d21 : B shifted right by 8, rounded and narrowed to 16bit

        vqmovun.s16 d0, Q8              @ r = d0 (saturated, unsigned and narrowed to 8bit)
        vqmovun.s16 d1, Q9              @ g = d1 (saturated, unsigned and narrowed to 8bit)
        vqmovun.s16 d2, Q10             @ b = d2 (saturated, unsigned and narrowed to 8bit)

        subs r3, r3, #1                 @ Checking length (r3)
        vst4.8 {d0, d1, d2, d3}, [r0]!  @ Writing out 8 RGBA values to dst (r0)

        bne 1b                          @ if not done with length, loop

        vpop {Q4-Q7}                    @ Restore Vregisters
        pop {r4, r5, lr}                @ Restore int registers
        bx lr
END(rsdIntrinsicYuv_K)

/*
        Function called with the following arguments: dst, Y, vu, len, YuvCoeff
        r0 = dst
        r1 = Y
        r2 = UV
        r3 = length (pixels / 8)
        ---- Args below will be in the stack ----
        sp = YuvCoeff

        This function converts 8 pixels per iteration
*/
ENTRY(rsdIntrinsicYuvR_K)
        push {r4, r5, lr}               @ preserve clobbered int registers
        vpush {Q4-Q7}                   @ preserve Vregisters we clobber

        mov r5, #16                     @ Integer 16 in r5; used as an incrementing value

        ldr r4, [sp, #64+12]            @ load the coeffs address in memory in r4 (16*4 + 4*3)
        vld1.16 {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
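        @ Layout assumed for the YuvCoeff table, based on the loads here and on
        @ the coefficient comments in the loop below (the table itself is defined
        @ by the C caller): eight 16-bit multipliers first (298, 409, -100, 516,
        @ -208, 255, ...), then the Y offset (16) at byte offset 16 and the U/V
        @ offset (128) at byte offset 32.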
        vld1.8 {d6[]}, [r4], r5         @ load y offset 16 from the coeffs matrix (r4) in d6
        vld1.8 {d8[]}, [r4], r5         @ load V and U offset of 128 from the coeffs matrix (r4) in d8

        mov r4, #8                      @ Integer 8 in r4; used as an incrementing value

        vdup.8 d3, d5[1]                @ d3 = 255 (alpha) from the multipliers line in
                                        @ the coeffs matrix (Q2)

1:
        vld1.8 {d10}, [r1]!             @ get Y (r1->Y)
        vld2.8 {d12, d14}, [r2], r4     @ split U from V (r2 -> UV) and increase pointer by 8 (in r4)
        pld [r1, #64]                   @ preloading data from address y(r1) + 64 for subsequent loops
        pld [r2, #64]                   @ preloading data from address uv(r2) + 64 for subsequent loops

        vsubl.u8 Q5, d10, d6            @ Y to 16 bit - 16 (in 16bit) (n to n+7)
        vmull.s16 Q8, d10, d4[0]        @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
        vmull.s16 Q11, d11, d4[0]       @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)

        vsubl.u8 Q5, d14, d8            @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2, n+3)
        vsubl.u8 Q6, d12, d8            @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2, n+3)
        vmov.u16 d11, d10               @ Copying V to d11
        vmov.u16 d13, d12               @ Copying U to d13
        vzip.u16 d10, d11               @ Q5 = V (n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
        vzip.u16 d12, d13               @ Q6 = U (n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)


        vmov Q9, Q8                     @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
        vmov Q10, Q8                    @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
        vmov Q12, Q11                   @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
        vmov Q13, Q11                   @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13

        @                       R       G       B
        @ Pixel(0-3)    Q8,     Q9,     Q10
        @ Pixel(4-7)    Q11,    Q12,    Q13
        @

        @ Pixel(0-3)
        vmlal.s16 Q8, d10, d4[1]        @ R : Q8  = Q8(Y-16)  + (V-128) * 409
        vmlal.s16 Q9, d10, d5[0]        @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
        vmlal.s16 Q9, d12, d4[2]        @                     + (U-128) * (-100)
        vmlal.s16 Q10, d12, d4[3]       @ B : Q10 = Q10(Y-16) + (U-128) * 516

        @ Pixel(4-7)
        vmlal.s16 Q11, d11, d4[1]       @ R : Q11 = Q11(Y-16) + (V-128) * 409
        vmlal.s16 Q12, d11, d5[0]       @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
        vmlal.s16 Q12, d13, d4[2]       @                     + (U-128) * (-100)
        vmlal.s16 Q13, d13, d4[3]       @ B : Q13 = Q13(Y-16) + (U-128) * 516

        @ Pixel(0-3)
        vrshrn.i32 d16, Q8, #8          @ d16 : R shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d18, Q9, #8          @ d18 : G shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d20, Q10, #8         @ d20 : B shifted right by 8, rounded and narrowed to 16bit

        @ Pixel(4-7)
        vrshrn.i32 d17, Q11, #8         @ d17 : R shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d19, Q12, #8         @ d19 : G shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d21, Q13, #8         @ d21 : B shifted right by 8, rounded and narrowed to 16bit

        vqmovun.s16 d0, Q8              @ r = d0 (saturated, unsigned and narrowed to 8bit)
        vqmovun.s16 d1, Q9              @ g = d1 (saturated, unsigned and narrowed to 8bit)
        vqmovun.s16 d2, Q10             @ b = d2 (saturated, unsigned and narrowed to 8bit)

        subs r3, r3, #1                 @ Checking length (r3)
        vst4.8 {d0, d1, d2, d3}, [r0]!  @ Writing out 8 RGBA values to dst (r0)
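        @ Note: apart from reading the chroma plane in UV order instead of VU
        @ (the two vsubl.u8 sources are swapped), this loop is identical to the
        @ one in rsdIntrinsicYuv_K above.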

        bne 1b                          @ if not done with length, loop

        vpop {Q4-Q7}                    @ Restore Vregisters
        pop {r4, r5, lr}                @ Restore int registers
        bx lr
END(rsdIntrinsicYuvR_K)

/*
        Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
        r0 = dst
        r1 = Y
        r2 = V
        r3 = U
        ---- Args below will be in the stack ----
        sp = length (pixels / 8)
        sp+4 = YuvCoeff

        This function converts 8 pixels per iteration
*/
ENTRY(rsdIntrinsicYuv2_K)
        push {r4, r5, r6, lr}           @ preserve clobbered int registers
        vpush {Q4-Q7}                   @ preserve Vregisters we clobber

        mov r5, #16                     @ Integer 16 in r5; used as an incrementing value

        ldr r4, [sp, #64+16+4]          @ load the coeffs address in memory in r4 (16*4 + 4*4 + 4)
        ldr r6, [sp, #64+16]            @ load the length in r6 (16*4 + 4*4)
        vld1.16 {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
        vld1.8 {d6[]}, [r4], r5         @ load y offset 16 from the coeffs matrix (r4) in d6
        vld1.8 {d8[]}, [r4], r5         @ load V and U offset of 128 from the coeffs matrix (r4) in d8

        mov r4, #4                      @ Integer 4 in r4; used as an incrementing value

        vdup.8 d3, d5[1]                @ d3 = 255 (alpha) from the multipliers line in
                                        @ the coeffs matrix (Q2)

1:
        vld1.8 {d10}, [r1]!             @ get Y (r1->Y)
        vld1.8 {d12}, [r3], r4          @ load 8 chroma bytes from r3 and increase pointer by 4 (in r4)
        vld1.8 {d14}, [r2], r4          @ load 8 chroma bytes from r2 and increase pointer by 4 (in r4)
        pld [r1, #64]                   @ preloading data from address y(r1) + 64 for subsequent loops
        pld [r2, #64]                   @ preloading data from address r2 + 64 for subsequent loops

        vsubl.u8 Q5, d10, d6            @ Y to 16 bit - 16 (in 16bit) (n to n+7)
        vmull.s16 Q8, d10, d4[0]        @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
        vmull.s16 Q11, d11, d4[0]       @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)

        vsubl.u8 Q5, d12, d8            @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2, n+3)
        vsubl.u8 Q6, d14, d8            @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2, n+3)
        vmov.u16 d11, d10               @ Copying V to d11
        vmov.u16 d13, d12               @ Copying U to d13
        vzip.u16 d10, d11               @ Q5 = V (n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
        vzip.u16 d12, d13               @ Q6 = U (n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)


        vmov Q9, Q8                     @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
        vmov Q10, Q8                    @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
        vmov Q12, Q11                   @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
        vmov Q13, Q11                   @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13

        @                       R       G       B
        @ Pixel(0-3)    Q8,     Q9,     Q10
        @ Pixel(4-7)    Q11,    Q12,    Q13
        @

        @ Pixel(0-3)
        vmlal.s16 Q8, d10, d4[1]        @ R : Q8  = Q8(Y-16)  + (V-128) * 409
        vmlal.s16 Q9, d10, d5[0]        @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
        vmlal.s16 Q9, d12, d4[2]        @                     + (U-128) * (-100)
        vmlal.s16 Q10, d12, d4[3]       @ B : Q10 = Q10(Y-16) + (U-128) * 516

        @ Pixel(4-7)
        vmlal.s16 Q11, d11, d4[1]       @ R : Q11 = Q11(Y-16) + (V-128) * 409
        vmlal.s16 Q12, d11, d5[0]       @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
        vmlal.s16 Q12, d13, d4[2]       @                     + (U-128) * (-100)
        vmlal.s16 Q13, d13, d4[3]       @ B : Q13 = Q13(Y-16) + (U-128) * 516

        @ Pixel(0-3)
        vrshrn.i32 d16, Q8, #8          @ d16 : R shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d18, Q9, #8          @ d18 : G shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d20, Q10, #8         @ d20 : B shifted right by 8, rounded and narrowed to 16bit

        @ Pixel(4-7)
        vrshrn.i32 d17, Q11, #8         @ d17 : R shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d19, Q12, #8         @ d19 : G shifted right by 8, rounded and narrowed to 16bit
        vrshrn.i32 d21, Q13, #8         @ d21 : B shifted right by 8, rounded and narrowed to 16bit

        vqmovun.s16 d0, Q8              @ r = d0 (saturated, unsigned and narrowed to 8bit)
        vqmovun.s16 d1, Q9              @ g = d1 (saturated, unsigned and narrowed to 8bit)
        vqmovun.s16 d2, Q10             @ b = d2 (saturated, unsigned and narrowed to 8bit)

        subs r6, r6, #1                 @ Checking length (r6)
        vst4.8 {d0, d1, d2, d3}, [r0]!  @ Writing out 8 RGBA values to dst (r0)

        bne 1b                          @ if not done with length, loop

        vpop {Q4-Q7}                    @ Restore Vregisters
        pop {r4, r5, r6, lr}            @ Restore int registers
        bx lr
END(rsdIntrinsicYuv2_K)

/* Convolve 5x5 */

/*
        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        r4 = y3 base pointer
        r5 = y4 base pointer
        r6 = coeffs
        r7 = length
*/
ENTRY(rsdIntrinsicConvolve5x5_K)
        push {r4-r7, lr}
        vpush {q4-q7}

        /* load y3 in r4 */
        ldr r4, [sp, #20 + 64]

        /* load y4 in r5 */
        ldr r5, [sp, #24 + 64]

        /* Load the coefficients pointer */
        ldr r6, [sp, #28 + 64]

        /* Create the coefficients vector */
        vld1.16 {d0, d1, d2, d3}, [r6]!
        vld1.16 {d4, d5, d6}, [r6]

        vmov.u32 q15, #0x7f

        /* load the count */
        ldr r6, [sp, #32 + 64]

        /* Load the frequently used immediate in a register */
        mov r7, #8

1:
        /* Load the y base pointers in Qregs and post-increment the address by r7=#8 */
        vld1.8 {d24, d25, d26}, [r1], r7        @ y0 ( y - 2 )
        vld1.8 {d27, d28, d29}, [r2], r7        @ y0 ( y - 1 )

        /* Signal memory for data that will be used in the loop after the next */
        PLD (r1, r7)
        PLD (r2, r7)

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8 q9, d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26
        vmovl.u8 q12, d27
        vmovl.u8 q13, d28
        vmovl.u8 q14, d29

        /*
                d18, d19, d20, d21, d22, d23,
                d24, d25
        */
        vmull.s16 q4, d18, d0[0]
        vmlal.s16 q4, d19, d0[1]
        vmlal.s16 q4, d20, d0[2]
        vmlal.s16 q4, d21, d0[3]
        vmlal.s16 q4, d22, d1[0]

        vmlal.s16 q4, d24, d1[1]
        vmlal.s16 q4, d25, d1[2]
        vmlal.s16 q4, d26, d1[3]
        vmlal.s16 q4, d27, d2[0]
        vmlal.s16 q4, d28, d2[1]

        vmull.s16 q5, d19, d0[0]
        vmlal.s16 q5, d20, d0[1]
        vmlal.s16 q5, d21, d0[2]
        vmlal.s16 q5, d22, d0[3]
        vmlal.s16 q5, d23, d1[0]

        vmlal.s16 q5, d25, d1[1]
        vmlal.s16 q5, d26, d1[2]
        vmlal.s16 q5, d27, d1[3]
        vmlal.s16 q5, d28, d2[0]
        vmlal.s16 q5, d29, d2[1]


        /* Next 2 rows */
        /* Load the y base pointers in Qregs and post-increment the address by r7=#8 */
        vld1.8 {d24, d25, d26}, [r3], r7        @ y0 ( y )
        vld1.8 {d27, d28, d29}, [r4], r7        @ y0 ( y + 1 )

        /* Signal memory for data that will be used in the loop after the next */
        PLD (r3, r7)
        PLD (r4, r7)

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8 q9, d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26
        vmovl.u8 q12, d27
        vmovl.u8 q13, d28
        vmovl.u8 q14, d29

        /*
                d18, d19, d20, d21, d22, d23,
                d24, d25
        */
        vmlal.s16 q4, d18, d2[2]
        vmlal.s16 q4, d19, d2[3]
        vmlal.s16 q4, d20, d3[0]
        vmlal.s16 q4, d21, d3[1]
        vmlal.s16 q4, d22, d3[2]

        vmlal.s16 q4, d24, d3[3]
        vmlal.s16 q4, d25, d4[0]
        vmlal.s16 q4, d26, d4[1]
        vmlal.s16 q4, d27, d4[2]
        vmlal.s16 q4, d28, d4[3]

        vmlal.s16 q5, d19, d2[2]
        vmlal.s16 q5, d20, d2[3]
        vmlal.s16 q5, d21, d3[0]
        vmlal.s16 q5, d22, d3[1]
        vmlal.s16 q5, d23, d3[2]

        vmlal.s16 q5, d25, d3[3]
        vmlal.s16 q5, d26, d4[0]
        vmlal.s16 q5, d27, d4[1]
        vmlal.s16 q5, d28, d4[2]
        vmlal.s16 q5, d29, d4[3]

        /* Last row */
        /* Load the y base pointers in Qregs and post-increment the address by r7=#8 */
        vld1.8 {d24, d25, d26}, [r5], r7        @ y0 ( y + 2 )

        /* Signal memory for data that will be used in the loop after the next */
        PLD (r5, r7)

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8 q9, d24
        vmovl.u8 q10, d25
        vmovl.u8 q11, d26

        /*
                d18, d19, d20, d21, d22, d23,
                d24, d25
        */

        vmlal.s16 q4, d18, d5[0]
        vmlal.s16 q4, d19, d5[1]
        vmlal.s16 q4, d20, d5[2]
        vmlal.s16 q4, d21, d5[3]
        vmlal.s16 q4, d22, d6[0]

        vmlal.s16 q5, d19, d5[0]
        vmlal.s16 q5, d20, d5[1]
        vmlal.s16 q5, d21, d5[2]
        vmlal.s16 q5, d22, d5[3]
        vmlal.s16 q5, d23, d6[0]


        vadd.i32 q4, q4, q15
        vadd.i32 q5, q5, q15

        /* Narrow it to a d-reg 32 -> 16 bit */
        vrshrn.i32 d8, q4, #8
        vrshrn.i32 d9, q5, #8

        /* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        vqmovun.s16 d8, q4

        vst1.8 d8, [r0]!                @ return the output and increase the address of r0

        /* Are we done? */
        subs r6, r6, #1
        bne 1b

        /* Yup, bye */
        vpop {q4-q7}
        pop {r4-r7, lr}
        bx lr

END(rsdIntrinsicConvolve5x5_K)


/*
        dst = src + dst * (1.0 - src.a)

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendSrcOver_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vshll.u8 q12, d0, #8
        vshll.u8 q13, d1, #8
        vshll.u8 q14, d2, #8
        vmovl.u8 q6, d3
        vsub.i16 q6, q7, q6             // q6 = 1 - src.a
        vshll.u8 q15, d3, #8

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vmla.i16 q12, q8, q6
        vmla.i16 q13, q9, q6
        vmla.i16 q14, q10, q6
        vmla.i16 q15, q11, q6

        vshrn.i16 d0, q12, #8
        vshrn.i16 d1, q13, #8
        vshrn.i16 d2, q14, #8
        vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
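
        /* C-style pseudocode for one pixel of the SrcOver loop above (a sketch;
           note the NEON code works in 8.8 fixed point, so the blend divides by
           256 via the final >> 8 rather than by 255):

               uint16_t one_minus_sa = 255 - src.a;
               out.r = ((src.r << 8) + dst.r * one_minus_sa) >> 8;
               out.g = ((src.g << 8) + dst.g * one_minus_sa) >> 8;
               out.b = ((src.b << 8) + dst.b * one_minus_sa) >> 8;
               out.a = ((src.a << 8) + dst.a * one_minus_sa) >> 8;
        */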

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendSrcOver_K)

/*
        dst = dst + src * (1.0 - dst.a)

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendDstOver_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vshll.u8 q8, d0, #8
        vshll.u8 q9, d1, #8
        vshll.u8 q10, d2, #8
        vmovl.u8 q6, d3
        vsub.i16 q6, q7, q6             // q6 = 1 - dst.a
        vshll.u8 q11, d3, #8


        vmla.i16 q8, q12, q6
        vmla.i16 q9, q13, q6
        vmla.i16 q10, q14, q6
        vmla.i16 q11, q15, q6

        vshrn.i16 d0, q8, #8
        vshrn.i16 d1, q9, #8
        vshrn.i16 d2, q10, #8
        vshrn.i16 d3, q11, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendDstOver_K)

/*
        dst = src * dst.a

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendSrcIn_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        //vmovl.u8 q8, d0
        //vmovl.u8 q9, d1
        //vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vmul.i16 q12, q12, q11
        vmul.i16 q13, q13, q11
        vmul.i16 q14, q14, q11
        vmul.i16 q15, q15, q11

        vshrn.i16 d0, q12, #8
        vshrn.i16 d1, q13, #8
        vshrn.i16 d2, q14, #8
        vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendSrcIn_K)

/*
        dst = dst * src.a

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendDstIn_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        //vmovl.u8 q12, d0
        //vmovl.u8 q13, d1
        //vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3

        vmul.i16 q8, q8, q15
        vmul.i16 q9, q9, q15
        vmul.i16 q10, q10, q15
        vmul.i16 q11, q11, q15

        vshrn.i16 d0, q8, #8
        vshrn.i16 d1, q9, #8
        vshrn.i16 d2, q10, #8
        vshrn.i16 d3, q11, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendDstIn_K)



/*
        dst = src * (1.0 - dst.a)

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendSrcOut_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
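        @ Only dst.a is needed for the SrcOut math that follows (q11 below), which
        @ is why the widening of the dst RGB channels is left commented out.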
        //vmovl.u8 q8, d0
        //vmovl.u8 q9, d1
        //vmovl.u8 q10, d2
        vmovl.u8 q11, d3


        vsub.i16 q6, q7, q11            // q6 = 1 - dst.a
        vmul.i16 q12, q12, q6
        vmul.i16 q13, q13, q6
        vmul.i16 q14, q14, q6
        vmul.i16 q15, q15, q6

        vshrn.i16 d0, q12, #8
        vshrn.i16 d1, q13, #8
        vshrn.i16 d2, q14, #8
        vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendSrcOut_K)


/*
        dst = dst * (1.0 - src.a)

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendDstOut_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        //vmovl.u8 q12, d0
        //vmovl.u8 q13, d1
        //vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3


        vsub.i16 q6, q7, q15            // q6 = 1 - src.a
        vmul.i16 q12, q8, q6
        vmul.i16 q13, q9, q6
        vmul.i16 q14, q10, q6
        vmul.i16 q15, q11, q6

        vshrn.i16 d0, q12, #8
        vshrn.i16 d1, q13, #8
        vshrn.i16 d2, q14, #8
        vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendDstOut_K)


/*
        dst.rgb = src.rgb * dst.a + (1.0 - src.a) * dst.rgb
        dst.a = dst.a

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendSrcAtop_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3


        vsub.i16 q6, q7, q15            // q6 = 1 - src.a
        vmul.i16 q8, q8, q6
        vmul.i16 q9, q9, q6
        vmul.i16 q10, q10, q6

        vmla.i16 q8, q12, q11
        vmla.i16 q9, q13, q11
        vmla.i16 q10, q14, q11


        vshrn.i16 d0, q8, #8
        vshrn.i16 d1, q9, #8
        vshrn.i16 d2, q10, #8
        //vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendSrcAtop_K)

/*
        dst = dst.rgb * src.a + (1.0 - dst.a) * src.rgb
        dst.a = src.a

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendDstAtop_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3


        vsub.i16 q6, q7, q11            // q6 = 1 - dst.a
        vmul.i16 q12, q12, q6
        vmul.i16 q13, q13, q6
        vmul.i16 q14, q14, q6

        vmla.i16 q12, q8, q15
        vmla.i16 q13, q9, q15
        vmla.i16 q14, q10, q15


        vshrn.i16 d0, q12, #8
        vshrn.i16 d1, q13, #8
        vshrn.i16 d2, q14, #8
        //vshrn.i16 d3, q15, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
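
        /* Illustrative C for the DstAtop colour math above (a sketch; field names
           are assumed). Products of 8-bit values are scaled back with >> 8:

               out.r = (dst.r * src.a + src.r * (255 - dst.a)) >> 8;
               out.g = (dst.g * src.a + src.g * (255 - dst.a)) >> 8;
               out.b = (dst.b * src.a + src.b * (255 - dst.a)) >> 8;
               out.a = dst.a;   // the alpha narrowing is commented out above, so
                                // d3 still holds the alpha loaded from dst
        */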

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendDstAtop_K)

/*
        dst = dst ^ src

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendXor_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmov.u8 d4, d0
        vmov.u8 d5, d1
        vmov.u8 d6, d2
        vmov.u8 d7, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!

        veor d0, d0, d4
        veor d1, d1, d5
        veor d2, d2, d6
        veor d3, d3, d7

        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendXor_K)

/*
        dst = dst * src

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendMultiply_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3


        vmul.i16 q8, q8, q12
        vmul.i16 q9, q9, q13
        vmul.i16 q10, q10, q14
        vmul.i16 q11, q11, q15

        vshrn.i16 d0, q8, #8
        vshrn.i16 d1, q9, #8
        vshrn.i16 d2, q10, #8
        vshrn.i16 d3, q11, #8
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendMultiply_K)

/*
        dst = min(src + dst, 1.0)

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendAdd_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3


        vadd.i16 q8, q8, q12
        vadd.i16 q9, q9, q13
        vadd.i16 q10, q10, q14
        vadd.i16 q11, q11, q15

        vqmovun.s16 d0, q8
        vqmovun.s16 d1, q9
        vqmovun.s16 d2, q10
        vqmovun.s16 d3, q11
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendAdd_K)


/*
        dst = max(dst - src, 0.0)

        r0 = dst
        r1 = src
        r2 = length
*/
ENTRY(rsdIntrinsicBlendSub_K)
        .save {r4, lr}
        stmfd sp!, {r4, lr}
        vpush {q4-q7}

        mov r4, #255
        vdup.16 q7, r4

        mov r4, r0
1:

        /* src */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* dst */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
        vmovl.u8 q8, d0
        vmovl.u8 q9, d1
        vmovl.u8 q10, d2
        vmovl.u8 q11, d3


        vsub.i16 q8, q8, q12
        vsub.i16 q9, q9, q13
        vsub.i16 q10, q10, q14
        vsub.i16 q11, q11, q15

        vqmovun.s16 d0, q8
        vqmovun.s16 d1, q9
        vqmovun.s16 d2, q10
        vqmovun.s16 d3, q11
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!

        subs r2, r2, #1
        bne 1b

        vpop {q4-q7}
        ldmfd sp!, {r4, lr}
        bx lr
END(rsdIntrinsicBlendSub_K)


/* 3D LUT */

/*
        r0 = dst
        r1 = src
        r2 = cube base pointer
        r3 = cube Y stride
        r4 = cube Z stride
        r5 = count
        r10 = * constants

        d0  / q0  = weight 1 p1
        d1        = weight 2 p1

        d2  / q1  = weight 1 p2
        d3        = weight 2 p2

        d4  / q2  = src1
        d5        = src2

        d6  / q3  = baseCoord
        d7        = baseCoord

        d8  / q4  = coord1 p1
        d9        =

        d10 / q5  = coord1 p2
        d11       =

        d12 / q6  =
        d13       =

        d14 / q7  =
        d15       =


        d16 / q8  = x0 y0 z0
        d17       = x1 y0 z0
        d18 / q9  = x0 y1 z0
        d19       = x1 y1 z0
        d20 / q10 = x0 y0 z1
        d21       = x1 y0 z1
        d22 / q11 = x0 y1 z1
        d23       = x1 y1 z1

        d24 / q12 = alpha mask
        d25       = current pixel alpha
        d26 / q13 = 4, y stride
        d27       = z stride, 0
        d28 / q14 = 0x8000
        d29       = 0x7fff
        d30 / q15 = 0, 0, 0, 0xffff


        d31 = coordMult
*/

ENTRY(rsdIntrinsic3DLUT_K)
        push {r4-r8, r10, r11, lr}
        vpush {q4-q7}

        /* load Z stride in r4 */
        ldr r4, [sp, #32 + 64]

        /* Load count */
        ldr r5, [sp, #36 + 64]

        vmov.u16 d28, #0x8000
        vmov.u16 d29, #0x7fff
        vmov.u32 d24, #0xff000000

        /* load constants using r10 */
        ldr r10, [sp, #40 + 64]
        vld1.32 {d31}, [r10]!
        vld1.32 {d30}, [r10]!

        mov r6, #4
        vmov d26, r6, r3
        mov r6, #0
        vmov d27, r4, r6

        add r8, r3, r4


1:
        vld1.8 {d4}, [r1]!
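        /* Sketch of what the remainder of this loop does for each of the two
           pixels just loaded (illustrative C-style pseudocode, not the reference
           implementation; the exact fixed-point details are in the code below):

               fixed = (uint32_t)channel * coordMult;   // per r, g, b channel
               coord = fixed >> 15;                     // integer lattice coordinate
               frac  = fixed & 0x7fff;                  // weight toward the upper lattice point
               wlow  = 0x8000 - frac;                   // weight toward the lower lattice point
               base  = cube + coord.x*4 + coord.y*yStride + coord.z*zStride;
               // trilinear blend of the 8 lattice entries around 'base' using the
               // weights, then the source alpha (saved in d25) is merged back in.
        */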
        vand.u8 d25, d4, d24
        vmovl.u8 q2, d4


        vmull.u16 q3, d4, d31
        vshr.u32 q4, q3, #15            // coord1 p1
        vmovn.u32 d1, q3
        vand.u16 d1, d29                // weight 2
        vsub.u16 d0, d28, d1            // weight 1
        vmul.u32 q4, q4, q13            // q4 = x*4, y*ystride, z*zstride, 0

        vmull.u16 q3, d5, d31
        vshr.u32 q5, q3, #15            // coord1 p2
        vmovn.u32 d3, q3
        vand.u16 d3, d29                // weight 2
        vsub.u16 d2, d28, d3            // weight 1
        vmul.u32 q5, q5, q13            // q5 = x*4, y*ystride, z*zstride, 0

        vpadd.u32 d8, d8, d9
        vpadd.u32 d9, d10, d11
        vpadd.u32 d8, d8, d9
        vmov r6, r7, d8                 // base pointers

        add r6, r6, r2
        add r7, r7, r2

        vld1.8 {d16}, [r6]
        add r11, r6, r3
        vld1.8 {d18}, [r11]
        add r11, r6, r4
        vld1.8 {d20}, [r11]
        add r11, r6, r8
        vld1.8 {d22}, [r11]

        vmovl.u8 q8, d16
        vmovl.u8 q9, d18
        vmovl.u8 q10, d20
        vmovl.u8 q11, d22

        vmull.u16 q6, d16, d0[0]
        vmlal.u16 q6, d17, d1[0]
        vshrn.u32 d16, q6, #7
        vmull.u16 q6, d18, d0[0]
        vmlal.u16 q6, d19, d1[0]
        vshrn.u32 d18, q6, #7
        vmull.u16 q6, d20, d0[0]
        vmlal.u16 q6, d21, d1[0]
        vshrn.u32 d20, q6, #7
        vmull.u16 q6, d22, d0[0]
        vmlal.u16 q6, d23, d1[0]
        vshrn.u32 d22, q6, #7

        vmull.u16 q6, d16, d0[1]
        vmlal.u16 q6, d18, d1[1]
        vshrn.u32 d16, q6, #15
        vmull.u16 q6, d20, d0[1]
        vmlal.u16 q6, d22, d1[1]
        vshrn.u32 d18, q6, #15

        vmull.u16 q6, d16, d0[2]
        vmlal.u16 q6, d18, d1[2]
        vshrn.u32 d14, q6, #15


        vld1.8 {d16}, [r7]
        add r11, r7, r3
        vld1.8 {d18}, [r11]
        add r11, r7, r4
        vld1.8 {d20}, [r11]
        add r11, r7, r8
        vld1.8 {d22}, [r11]
        vmovl.u8 q8, d16
        vmovl.u8 q9, d18
        vmovl.u8 q10, d20
        vmovl.u8 q11, d22

        vmull.u16 q6, d16, d2[0]
        vmlal.u16 q6, d17, d3[0]
        vshrn.u32 d16, q6, #7
        vmull.u16 q6, d18, d2[0]
        vmlal.u16 q6, d19, d3[0]
        vshrn.u32 d18, q6, #7
        vmull.u16 q6, d20, d2[0]
        vmlal.u16 q6, d21, d3[0]
        vshrn.u32 d20, q6, #7
        vmull.u16 q6, d22, d2[0]
        vmlal.u16 q6, d23, d3[0]
        vshrn.u32 d22, q6, #7

        vmull.u16 q6, d16, d2[1]
        vmlal.u16 q6, d18, d3[1]
        vshrn.u32 d16, q6, #15
        vmull.u16 q6, d20, d2[1]
        vmlal.u16 q6, d22, d3[1]
        vshrn.u32 d18, q6, #15

        vmull.u16 q6, d16, d2[2]
        vmlal.u16 q6, d18, d3[2]
        vshrn.u32 d15, q6, #15

        vrshrn.u16 d14, q7, #8

        vbic.u8 d14, d14, d24           // mix in alpha
        vorr.u8 d14, d14, d25
        vst1.32 {d14}, [r0]!


        /* Are we done? */
        subs r5, r5, #1
        bne 1b

        /* Yup, bye */
        vpop {q4-q7}
        pop {r4-r8, r10, r11, lr}
        bx lr

END(rsdIntrinsic3DLUT_K)