  .global RestoreRegisters_NEON
  .global ReverseLine_NEON
  .global ReverseLineUV_NEON
  .global SaveRegisters_NEON
  .global TransposeWx8_NEON
  .global TransposeUVWx8_NEON
  .type RestoreRegisters_NEON, function
  .type ReverseLine_NEON, function
  .type ReverseLineUV_NEON, function
  .type SaveRegisters_NEON, function
  .type TransposeWx8_NEON, function
  .type TransposeUVWx8_NEON, function

@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
@ r0 const uint8* src
@ r1 uint8* dst
@ r2 width
ReverseLine_NEON:

  @ compute where to start writing destination
  add r1, r2            @ dst + width

  @ work on segments that are multiples of 16
  lsrs r3, r2, #4

  @ the output is written in two blocks: 8 bytes followed by
  @ another 8. reading is done sequentially, from left to
  @ right. writing is done from right to left in 16 byte
  @ blocks. r1, the destination pointer, is incremented after
  @ writing the first of the two blocks, so subtract that 8
  @ off along with 16 to get the next write location. the mov
  @ does not touch the flags, so the beq below still tests the
  @ lsrs result.
  mov r3, #-24

  beq Lline_residuals

  @ back off the destination pointer by the size of the
  @ register that is going to be reversed
  sub r1, #16

  @ the loop needs to run on blocks of 16. what is left over
  @ is either a negative number, the residuals that need to
  @ be done, or 0. if this isn't subtracted off here the loop
  @ will run one extra time.
  sub r2, #16

Lsegments_of_16:
  vld1.8 {q0}, [r0]!    @ src += 16

  @ reverse the bytes in the 64 bit segments. unable to
  @ reverse the bytes in the entire 128 bits in one go.
  vrev64.8 q0, q0

  @ because of the inability to reverse the entire 128 bits,
  @ reverse the writing out of the two 64 bit segments.
  vst1.8 {d1}, [r1]!
  vst1.8 {d0}, [r1], r3 @ dst -= 16

  subs r2, #16
  bge Lsegments_of_16

  @ add 16 back to the counter. if the result is 0 there are
  @ no residuals, so return
  adds r2, #16
  bxeq lr

  add r1, #16

Lline_residuals:

  mov r3, #-3

  sub r1, #2
  subs r2, #2
  @ check for 16*n+1 scenarios where segments_of_2 should not
  @ be run, but there is something left over.
  blt Lsegment_of_1

  @ do this in neon registers as per
  @ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
Lsegments_of_2:
  vld2.8 {d0[0], d1[0]}, [r0]!    @ src += 2

  vst1.8 {d1[0]}, [r1]!
  vst1.8 {d0[0]}, [r1], r3        @ dst -= 2

  subs r2, #2
  bge Lsegments_of_2

  adds r2, #2
  bxeq lr

Lsegment_of_1:
  add r1, #1
  vld1.8 {d0[0]}, [r0]
  vst1.8 {d0[0]}, [r1]

  bx lr

@ void TransposeWx8_NEON (const uint8* src, int src_stride,
@                         uint8* dst, int dst_stride,
@                         int w)
@ r0 const uint8* src
@ r1 int src_stride
@ r2 uint8* dst
@ r3 int dst_stride
@ stack int w
TransposeWx8_NEON:
  push {r4,r8,r9,lr}

  ldr r8, [sp, #16]     @ width

  @ loops are on blocks of 8. the loop will stop when the
  @ counter gets to or below 0. starting the counter at w-8
  @ allows for this.
  sub r8, #8

  @ handle 8x8 blocks. this should be the majority of the plane.
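  @ the loop body is the standard NEON in-register 8x8 byte
  @ transpose: vtrn.8 swaps bytes between row pairs, vtrn.16
  @ then swaps 16 bit units, and vtrn.32 swaps 32 bit units,
  @ after which every byte is in its transposed position.
  @ because the vtrn.8 operands are given in swapped order,
  @ each 16 bit unit comes out byte-swapped, which the
  @ vrev16.8 pass corrects, and the rows land pair-swapped
  @ (d1, d0, d3, d2, ...), which the store order accounts for.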
Lloop_8x8:
  mov r9, r0

  vld1.8 {d0}, [r9], r1
  vld1.8 {d1}, [r9], r1
  vld1.8 {d2}, [r9], r1
  vld1.8 {d3}, [r9], r1
  vld1.8 {d4}, [r9], r1
  vld1.8 {d5}, [r9], r1
  vld1.8 {d6}, [r9], r1
  vld1.8 {d7}, [r9]

  vtrn.8 d1, d0
  vtrn.8 d3, d2
  vtrn.8 d5, d4
  vtrn.8 d7, d6

  vtrn.16 d1, d3
  vtrn.16 d0, d2
  vtrn.16 d5, d7
  vtrn.16 d4, d6

  vtrn.32 d1, d5
  vtrn.32 d0, d4
  vtrn.32 d3, d7
  vtrn.32 d2, d6

  vrev16.8 q0, q0
  vrev16.8 q1, q1
  vrev16.8 q2, q2
  vrev16.8 q3, q3

  mov r9, r2

  vst1.8 {d1}, [r9], r3
  vst1.8 {d0}, [r9], r3
  vst1.8 {d3}, [r9], r3
  vst1.8 {d2}, [r9], r3
  vst1.8 {d5}, [r9], r3
  vst1.8 {d4}, [r9], r3
  vst1.8 {d7}, [r9], r3
  vst1.8 {d6}, [r9]

  add r0, #8            @ src += 8
  add r2, r3, lsl #3    @ dst += 8 * dst_stride
  subs r8, #8           @ w -= 8
  bge Lloop_8x8

  @ add 8 back to counter. if the result is 0 there are
  @ no residuals.
  adds r8, #8
  beq Ldone

  @ some residual, so between 1 and 7 lines left to transpose
  cmp r8, #2
  blt Lblock_1x8

  cmp r8, #4
  blt Lblock_2x8

Lblock_4x8:
  mov r9, r0
  vld1.32 {d0[0]}, [r9], r1
  vld1.32 {d0[1]}, [r9], r1
  vld1.32 {d1[0]}, [r9], r1
  vld1.32 {d1[1]}, [r9], r1
  vld1.32 {d2[0]}, [r9], r1
  vld1.32 {d2[1]}, [r9], r1
  vld1.32 {d3[0]}, [r9], r1
  vld1.32 {d3[1]}, [r9]

  mov r9, r2

  adr r12, vtbl_4x4_transpose
  vld1.8 {q3}, [r12]

  vtbl.8 d4, {d0, d1}, d6
  vtbl.8 d5, {d0, d1}, d7
  vtbl.8 d0, {d2, d3}, d6
  vtbl.8 d1, {d2, d3}, d7

  @ TODO: rework shuffle above to write
  @ out with 4 instead of 8 writes
  vst1.32 {d4[0]}, [r9], r3
  vst1.32 {d4[1]}, [r9], r3
  vst1.32 {d5[0]}, [r9], r3
  vst1.32 {d5[1]}, [r9]

  add r9, r2, #4
  vst1.32 {d0[0]}, [r9], r3
  vst1.32 {d0[1]}, [r9], r3
  vst1.32 {d1[0]}, [r9], r3
  vst1.32 {d1[1]}, [r9]

  add r0, #4            @ src += 4
  add r2, r3, lsl #2    @ dst += 4 * dst_stride
  subs r8, #4           @ w -= 4
  beq Ldone

  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp r8, #2
  blt Lblock_1x8

Lblock_2x8:
  mov r9, r0
  vld1.16 {d0[0]}, [r9], r1
  vld1.16 {d1[0]}, [r9], r1
  vld1.16 {d0[1]}, [r9], r1
  vld1.16 {d1[1]}, [r9], r1
  vld1.16 {d0[2]}, [r9], r1
  vld1.16 {d1[2]}, [r9], r1
  vld1.16 {d0[3]}, [r9], r1
  vld1.16 {d1[3]}, [r9]

  vtrn.8 d0, d1

  mov r9, r2

  vst1.64 {d0}, [r9], r3
  vst1.64 {d1}, [r9]

  add r0, #2            @ src += 2
  add r2, r3, lsl #1    @ dst += 2 * dst_stride
  subs r8, #2           @ w -= 2
  beq Ldone

Lblock_1x8:
  vld1.8 {d0[0]}, [r0], r1
  vld1.8 {d0[1]}, [r0], r1
  vld1.8 {d0[2]}, [r0], r1
  vld1.8 {d0[3]}, [r0], r1
  vld1.8 {d0[4]}, [r0], r1
  vld1.8 {d0[5]}, [r0], r1
  vld1.8 {d0[6]}, [r0], r1
  vld1.8 {d0[7]}, [r0]

  vst1.64 {d0}, [r2]

Ldone:

  pop {r4,r8,r9,pc}

vtbl_4x4_transpose:
  .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15

@ void SaveRegisters_NEON (unsigned long long store)
@ r0 unsigned long long store
SaveRegisters_NEON:
  vst1.i64 {d8, d9, d10, d11}, [r0]!
  vst1.i64 {d12, d13, d14, d15}, [r0]!
  bx lr

@ void RestoreRegisters_NEON (unsigned long long store)
@ r0 unsigned long long store
RestoreRegisters_NEON:
  vld1.i64 {d8, d9, d10, d11}, [r0]!
  vld1.i64 {d12, d13, d14, d15}, [r0]!
  bx lr
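
@ note: per the ARM AAPCS, d8-d15 are callee saved, and
@ TransposeUVWx8_NEON below clobbers them. this pair of helpers
@ lets a C caller spill and reload those eight registers around
@ such a call; the store argument carries the address of a
@ buffer of at least 8 * 8 = 64 bytes. a minimal usage sketch
@ (hypothetical caller, not part of this file):
@
@   unsigned long long store[8];
@   SaveRegisters_NEON((unsigned long long) store);
@   TransposeUVWx8_NEON(src, src_stride, dst_a, dst_stride_a,
@                       dst_b, dst_stride_b, width);
@   RestoreRegisters_NEON((unsigned long long) store);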

@ void ReverseLineUV_NEON (const uint8* src,
@                          uint8* dst_a,
@                          uint8* dst_b,
@                          int width)
@ r0 const uint8* src
@ r1 uint8* dst_a
@ r2 uint8* dst_b
@ r3 width
ReverseLineUV_NEON:

  @ compute where to start writing destination
  add r1, r1, r3        @ dst_a + width
  add r2, r2, r3        @ dst_b + width

  @ work on input segments that are multiples of 16, but the
  @ width that has been passed is in output segments, half
  @ the size of input.
  lsrs r12, r3, #3

  beq Lline_residuals_di

  @ the output is written as one 8 byte block per destination
  @ plane.
  mov r12, #-8

  @ back off the destination pointers by the size of the
  @ register that is going to be reversed
  sub r1, r1, #8
  sub r2, r2, #8

  @ the loop needs to run on blocks of 8. what is left over
  @ is either a negative number, the residuals that need to
  @ be done, or 0. if this isn't subtracted off here the loop
  @ will run one extra time.
  sub r3, r3, #8

Lsegments_of_8_di:
  vld2.8 {d0, d1}, [r0]!    @ src += 16

  @ reverse the bytes in the 64 bit segments
  vrev64.8 q0, q0

  vst1.8 {d0}, [r1], r12    @ dst_a -= 8
  vst1.8 {d1}, [r2], r12    @ dst_b -= 8

  subs r3, r3, #8
  bge Lsegments_of_8_di

  @ add 8 back to the counter. if the result is 0 there are
  @ no residuals, so return
  adds r3, r3, #8
  bxeq lr

  add r1, r1, #8
  add r2, r2, #8

Lline_residuals_di:

  mov r12, #-1

  sub r1, r1, #1
  sub r2, r2, #1

  @ do this in neon registers as per
  @ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
Lsegments_of_1:
  vld2.8 {d0[0], d1[0]}, [r0]!    @ src += 2

  vst1.8 {d0[0]}, [r1], r12       @ dst_a -= 1
  vst1.8 {d1[0]}, [r2], r12       @ dst_b -= 1

  subs r3, r3, #1
  bgt Lsegments_of_1

  bx lr

@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
@                           uint8* dst_a, int dst_stride_a,
@                           uint8* dst_b, int dst_stride_b,
@                           int width)
@ r0 const uint8* src
@ r1 int src_stride
@ r2 uint8* dst_a
@ r3 int dst_stride_a
@ stack uint8* dst_b
@ stack int dst_stride_b
@ stack int width
TransposeUVWx8_NEON:
  push {r4-r9,lr}

  ldr r4, [sp, #28]     @ dst_b
  ldr r5, [sp, #32]     @ dst_stride_b
  ldr r8, [sp, #36]     @ width

  @ loops are on blocks of 8. the loop will stop when the
  @ counter gets to or below 0. starting the counter at w-8
  @ allows for this.
  sub r8, #8

  @ handle 8x8 blocks. this should be the majority of the plane.
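  @ this is the same three stage vtrn.8 / vtrn.16 / vtrn.32
  @ transpose used in TransposeWx8_NEON, lifted from d to q
  @ registers: vld2.8 de-interleaves each source row so the U
  @ bytes land in the even d registers and the V bytes in the
  @ odd ones, and one set of q register vtrn passes transposes
  @ both 8x8 matrices at once. q4-q7 (d8-d15) are clobbered
  @ here, which is presumably why this file provides
  @ SaveRegisters_NEON / RestoreRegisters_NEON.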
Lloop_8x8_di:
  mov r9, r0

  vld2.8 {d0, d1}, [r9], r1
  vld2.8 {d2, d3}, [r9], r1
  vld2.8 {d4, d5}, [r9], r1
  vld2.8 {d6, d7}, [r9], r1
  vld2.8 {d8, d9}, [r9], r1
  vld2.8 {d10, d11}, [r9], r1
  vld2.8 {d12, d13}, [r9], r1
  vld2.8 {d14, d15}, [r9]

  vtrn.8 q1, q0
  vtrn.8 q3, q2
  vtrn.8 q5, q4
  vtrn.8 q7, q6

  vtrn.16 q1, q3
  vtrn.16 q0, q2
  vtrn.16 q5, q7
  vtrn.16 q4, q6

  vtrn.32 q1, q5
  vtrn.32 q0, q4
  vtrn.32 q3, q7
  vtrn.32 q2, q6

  vrev16.8 q0, q0
  vrev16.8 q1, q1
  vrev16.8 q2, q2
  vrev16.8 q3, q3
  vrev16.8 q4, q4
  vrev16.8 q5, q5
  vrev16.8 q6, q6
  vrev16.8 q7, q7

  mov r9, r2

  vst1.8 {d2}, [r9], r3
  vst1.8 {d0}, [r9], r3
  vst1.8 {d6}, [r9], r3
  vst1.8 {d4}, [r9], r3
  vst1.8 {d10}, [r9], r3
  vst1.8 {d8}, [r9], r3
  vst1.8 {d14}, [r9], r3
  vst1.8 {d12}, [r9]

  mov r9, r4

  vst1.8 {d3}, [r9], r5
  vst1.8 {d1}, [r9], r5
  vst1.8 {d7}, [r9], r5
  vst1.8 {d5}, [r9], r5
  vst1.8 {d11}, [r9], r5
  vst1.8 {d9}, [r9], r5
  vst1.8 {d15}, [r9], r5
  vst1.8 {d13}, [r9]

  add r0, #8*2          @ src += 8*2
  add r2, r3, lsl #3    @ dst_a += 8 * dst_stride_a
  add r4, r5, lsl #3    @ dst_b += 8 * dst_stride_b
  subs r8, #8           @ w -= 8
  bge Lloop_8x8_di

  @ add 8 back to counter. if the result is 0 there are
  @ no residuals.
  adds r8, #8
  beq Ldone_di

  @ some residual, so between 1 and 7 lines left to transpose
  cmp r8, #2
  blt Lblock_1x8_di

  cmp r8, #4
  blt Lblock_2x8_di

  @ TODO(frkoenig): clean this up
Lblock_4x8_di:
  mov r9, r0
  vld1.64 {d0}, [r9], r1
  vld1.64 {d1}, [r9], r1
  vld1.64 {d2}, [r9], r1
  vld1.64 {d3}, [r9], r1
  vld1.64 {d4}, [r9], r1
  vld1.64 {d5}, [r9], r1
  vld1.64 {d6}, [r9], r1
  vld1.64 {d7}, [r9]

  adr r12, vtbl_4x4_transpose_di
  vld1.8 {q7}, [r12]

  vtrn.8 q0, q1
  vtrn.8 q2, q3

  vtbl.8 d8, {d0, d1}, d14
  vtbl.8 d9, {d0, d1}, d15
  vtbl.8 d10, {d2, d3}, d14
  vtbl.8 d11, {d2, d3}, d15
  vtbl.8 d12, {d4, d5}, d14
  vtbl.8 d13, {d4, d5}, d15
  vtbl.8 d0, {d6, d7}, d14
  vtbl.8 d1, {d6, d7}, d15

  mov r9, r2

  vst1.32 {d8[0]}, [r9], r3
  vst1.32 {d8[1]}, [r9], r3
  vst1.32 {d9[0]}, [r9], r3
  vst1.32 {d9[1]}, [r9], r3

  add r9, r2, #4
  vst1.32 {d12[0]}, [r9], r3
  vst1.32 {d12[1]}, [r9], r3
  vst1.32 {d13[0]}, [r9], r3
  vst1.32 {d13[1]}, [r9]

  mov r9, r4

  vst1.32 {d10[0]}, [r9], r5
  vst1.32 {d10[1]}, [r9], r5
  vst1.32 {d11[0]}, [r9], r5
  vst1.32 {d11[1]}, [r9], r5

  add r9, r4, #4
  vst1.32 {d0[0]}, [r9], r5
  vst1.32 {d0[1]}, [r9], r5
  vst1.32 {d1[0]}, [r9], r5
  vst1.32 {d1[1]}, [r9]

  add r0, #4*2          @ src += 4 * 2
  add r2, r3, lsl #2    @ dst_a += 4 * dst_stride_a
  add r4, r5, lsl #2    @ dst_b += 4 * dst_stride_b
  subs r8, #4           @ w -= 4
  beq Ldone_di

  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp r8, #2
  blt Lblock_1x8_di
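
  @ the 2x8 tail below gathers the two UV pairs of each source
  @ row with lane loads, spreading the rows across d0-d3; a
  @ vtrn.8 on each register pair then separates the U bytes
  @ from the V bytes, leaving each d register holding one
  @ finished output row (d0/d2 for dst_a, d1/d3 for dst_b).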
Lblock_2x8_di:
  mov r9, r0
  vld2.16 {d0[0], d2[0]}, [r9], r1
  vld2.16 {d1[0], d3[0]}, [r9], r1
  vld2.16 {d0[1], d2[1]}, [r9], r1
  vld2.16 {d1[1], d3[1]}, [r9], r1
  vld2.16 {d0[2], d2[2]}, [r9], r1
  vld2.16 {d1[2], d3[2]}, [r9], r1
  vld2.16 {d0[3], d2[3]}, [r9], r1
  vld2.16 {d1[3], d3[3]}, [r9]

  vtrn.8 d0, d1
  vtrn.8 d2, d3

  mov r9, r2

  vst1.64 {d0}, [r9], r3
  vst1.64 {d2}, [r9]

  mov r9, r4

  vst1.64 {d1}, [r9], r5
  vst1.64 {d3}, [r9]

  add r0, #2*2          @ src += 2 * 2
  add r2, r3, lsl #1    @ dst_a += 2 * dst_stride_a
  add r4, r5, lsl #1    @ dst_b += 2 * dst_stride_b
  subs r8, #2           @ w -= 2
  beq Ldone_di

Lblock_1x8_di:
  vld2.8 {d0[0], d1[0]}, [r0], r1
  vld2.8 {d0[1], d1[1]}, [r0], r1
  vld2.8 {d0[2], d1[2]}, [r0], r1
  vld2.8 {d0[3], d1[3]}, [r0], r1
  vld2.8 {d0[4], d1[4]}, [r0], r1
  vld2.8 {d0[5], d1[5]}, [r0], r1
  vld2.8 {d0[6], d1[6]}, [r0], r1
  vld2.8 {d0[7], d1[7]}, [r0]

  vst1.64 {d0}, [r2]
  vst1.64 {d1}, [r4]

Ldone_di:
  pop {r4-r9, pc}

vtbl_4x4_transpose_di:
  .byte 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
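
@ reference C models of the public entry points above. these
@ are non-authoritative sketches of the intended behavior,
@ derived from the prototypes in the comments; the _C names
@ are hypothetical and exist only for documentation.
@
@   void ReverseLine_C(const uint8* src, uint8* dst, int width) {
@     int i;
@     for (i = 0; i < width; ++i)
@       dst[width - 1 - i] = src[i];
@   }
@
@   void ReverseLineUV_C(const uint8* src,
@                        uint8* dst_a, uint8* dst_b, int width) {
@     int i;
@     for (i = 0; i < width; ++i) {
@       dst_a[width - 1 - i] = src[2 * i + 0];
@       dst_b[width - 1 - i] = src[2 * i + 1];
@     }
@   }
@
@   void TransposeWx8_C(const uint8* src, int src_stride,
@                       uint8* dst, int dst_stride, int w) {
@     int i, j;
@     for (i = 0; i < w; ++i)
@       for (j = 0; j < 8; ++j)
@         dst[i * dst_stride + j] = src[j * src_stride + i];
@   }
@
@   void TransposeUVWx8_C(const uint8* src, int src_stride,
@                         uint8* dst_a, int dst_stride_a,
@                         uint8* dst_b, int dst_stride_b, int w) {
@     int i, j;
@     for (i = 0; i < w; ++i)
@       for (j = 0; j < 8; ++j) {
@         dst_a[i * dst_stride_a + j] = src[j * src_stride + 2 * i + 0];
@         dst_b[i * dst_stride_b + j] = src[j * src_stride + 2 * i + 1];
@       }
@   }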