/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// These kernels are 32-bit ARM NEON only (NEON enabled, not AArch64).
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// vtbl byte-index table used by the 4x8 residual path of TransposeWx8_NEON:
// gathers bytes column-major out of a pair of d registers (indices 0,4,8,12,
// 1,5,9,13, ... select column 0, then column 1, etc. of a 4x4 byte block).
static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
                                  2, 6, 10, 14, 3, 7, 11, 15};

// Transposes a width x 8 block of a single byte plane: reads `width` columns
// by 8 rows starting at `src` (rows `src_stride` apart) and writes the
// transposed 8 columns by `width` rows to `dst` (rows `dst_stride` apart).
// The main loop handles 8x8 tiles with vtrn.8/.16/.32 + vrev16 shuffles;
// residual widths are finished with 4x8 (vtbl), 2x8 (vtrn.8) and 1x8
// (lane-load) paths.
// NOTE(review): MEMACCESS(n) is a macro from row.h annotating the following
// memory operand — presumably a sandbox/no-op wrapper; confirm in row.h.
void TransposeWx8_NEON(const uint8* src,
                       int src_stride,
                       uint8* dst,
                       int dst_stride,
                       int width) {
  const uint8* src_temp;  // scratch row pointer (%0), clobbered per tile
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
    "sub %5, #8 \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1: \n"
      "mov %0, %1 \n"

      // Load 8 rows of 8 bytes each into d0..d7.
      MEMACCESS(0)
      "vld1.8 {d0}, [%0], %2 \n"
      MEMACCESS(0)
      "vld1.8 {d1}, [%0], %2 \n"
      MEMACCESS(0)
      "vld1.8 {d2}, [%0], %2 \n"
      MEMACCESS(0)
      "vld1.8 {d3}, [%0], %2 \n"
      MEMACCESS(0)
      "vld1.8 {d4}, [%0], %2 \n"
      MEMACCESS(0)
      "vld1.8 {d5}, [%0], %2 \n"
      MEMACCESS(0)
      "vld1.8 {d6}, [%0], %2 \n"
      MEMACCESS(0)
      "vld1.8 {d7}, [%0] \n"

      // 8x8 byte transpose: interleave at 8-, 16- then 32-bit granularity,
      // then vrev16 fixes the byte order within each 16-bit pair.
      "vtrn.8 d1, d0 \n"
      "vtrn.8 d3, d2 \n"
      "vtrn.8 d5, d4 \n"
      "vtrn.8 d7, d6 \n"

      "vtrn.16 d1, d3 \n"
      "vtrn.16 d0, d2 \n"
      "vtrn.16 d5, d7 \n"
      "vtrn.16 d4, d6 \n"

      "vtrn.32 d1, d5 \n"
      "vtrn.32 d0, d4 \n"
      "vtrn.32 d3, d7 \n"
      "vtrn.32 d2, d6 \n"

      "vrev16.8 q0, q0 \n"
      "vrev16.8 q1, q1 \n"
      "vrev16.8 q2, q2 \n"
      "vrev16.8 q3, q3 \n"

      "mov %0, %3 \n"

      // Store the 8 transposed rows (register order d1,d0,d3,d2,... matches
      // the swapped-operand vtrn pattern above).
      MEMACCESS(0)
      "vst1.8 {d1}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d0}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d3}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d2}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d5}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d4}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d7}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d6}, [%0] \n"

      "add %1, #8 \n"  // src += 8
      "add %3, %3, %4, lsl #3 \n"  // dst += 8 * dst_stride
      "subs %5, #8 \n"  // w -= 8
      "bge 1b \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds %5, #8 \n"
    "beq 4f \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp %5, #2 \n"
    "blt 3f \n"

    "cmp %5, #4 \n"
    "blt 2f \n"

    // 4x8 block
    "mov %0, %1 \n"
    // Load 8 rows of 4 bytes each, two rows per d register.
    MEMACCESS(0)
    "vld1.32 {d0[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32 {d0[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32 {d1[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32 {d1[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32 {d2[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32 {d2[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32 {d3[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32 {d3[1]}, [%0] \n"

    "mov %0, %3 \n"

    // Load the 4x4 transpose index table and shuffle with vtbl.
    MEMACCESS(6)
    "vld1.8 {q3}, [%6] \n"

    "vtbl.8 d4, {d0, d1}, d6 \n"
    "vtbl.8 d5, {d0, d1}, d7 \n"
    "vtbl.8 d0, {d2, d3}, d6 \n"
    "vtbl.8 d1, {d2, d3}, d7 \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    MEMACCESS(0)
    "vst1.32 {d4[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d4[1]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d5[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d5[1]}, [%0] \n"

    // Second half of each output row starts 4 bytes in.
    "add %0, %3, #4 \n"
    MEMACCESS(0)
    "vst1.32 {d0[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d0[1]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d1[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d1[1]}, [%0] \n"

    "add %1, #4 \n"  // src += 4
    "add %3, %3, %4, lsl #2 \n"  // dst += 4 * dst_stride
    "subs %5, #4 \n"  // w -= 4
    "beq 4f \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp %5, #2 \n"
    "blt 3f \n"

    // 2x8 block
    "2: \n"
    "mov %0, %1 \n"
    // Load 8 rows of 2 bytes, alternating between d0 and d1 lanes so a
    // single vtrn.8 produces the two transposed output rows.
    MEMACCESS(0)
    "vld1.16 {d0[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16 {d1[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16 {d0[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16 {d1[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16 {d0[2]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16 {d1[2]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16 {d0[3]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16 {d1[3]}, [%0] \n"

    "vtrn.8 d0, d1 \n"

    "mov %0, %3 \n"

    MEMACCESS(0)
    "vst1.64 {d0}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.64 {d1}, [%0] \n"

    "add %1, #2 \n"  // src += 2
    "add %3, %3, %4, lsl #1 \n"  // dst += 2 * dst_stride
    "subs %5, #2 \n"  // w -= 2
    "beq 4f \n"

    // 1x8 block: gather one byte from each of 8 rows, store as one row.
    "3: \n"
    MEMACCESS(1)
    "vld1.8 {d0[0]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8 {d0[1]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8 {d0[2]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8 {d0[3]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8 {d0[4]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8 {d0[5]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8 {d0[6]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8 {d0[7]}, [%1] \n"

    MEMACCESS(3)
    "vst1.64 {d0}, [%3] \n"

    "4: \n"

    : "=&r"(src_temp),   // %0
      "+r"(src),         // %1
      "+r"(src_stride),  // %2
      "+r"(dst),         // %3
      "+r"(dst_stride),  // %4
      "+r"(width)        // %5
    : "r"(&kVTbl4x4Transpose)  // %6
    : "memory", "cc", "q0", "q1", "q2", "q3"
  );
}

// vtbl byte-index table for the deinterleaving (Di) 4x8 residual path of
// TransposeUVWx8_NEON: interleaves corresponding bytes of the low and high
// halves of a q register (0,8, 1,9, 2,10, ...).
static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
                                    4, 12, 5, 13, 6, 14, 7, 15};

// Transposes a width x 8 block of an interleaved UV plane while splitting it:
// reads `width` UV pairs by 8 rows from `src`, writes the transposed U bytes
// to `dst_a` and the transposed V bytes to `dst_b` (vld2 deinterleaves on
// load; even d registers go to dst_a, odd d registers to dst_b). Same
// 8x8-tile main loop plus 4/2/1-column residual structure as
// TransposeWx8_NEON.
void TransposeUVWx8_NEON(const uint8* src,
                         int src_stride,
                         uint8* dst_a,
                         int dst_stride_a,
                         uint8* dst_b,
                         int dst_stride_b,
                         int width) {
  const uint8* src_temp;  // scratch row pointer (%0), clobbered per tile
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
    "sub %7, #8 \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1: \n"
      "mov %0, %1 \n"

      // vld2.8 deinterleaves each row: U bytes to the even d register,
      // V bytes to the odd one.
      MEMACCESS(0)
      "vld2.8 {d0, d1}, [%0], %2 \n"
      MEMACCESS(0)
      "vld2.8 {d2, d3}, [%0], %2 \n"
      MEMACCESS(0)
      "vld2.8 {d4, d5}, [%0], %2 \n"
      MEMACCESS(0)
      "vld2.8 {d6, d7}, [%0], %2 \n"
      MEMACCESS(0)
      "vld2.8 {d16, d17}, [%0], %2 \n"
      MEMACCESS(0)
      "vld2.8 {d18, d19}, [%0], %2 \n"
      MEMACCESS(0)
      "vld2.8 {d20, d21}, [%0], %2 \n"
      MEMACCESS(0)
      "vld2.8 {d22, d23}, [%0] \n"

      // Transpose both 8x8 planes at once using q-register vtrn.
      "vtrn.8 q1, q0 \n"
      "vtrn.8 q3, q2 \n"
      "vtrn.8 q9, q8 \n"
      "vtrn.8 q11, q10 \n"

      "vtrn.16 q1, q3 \n"
      "vtrn.16 q0, q2 \n"
      "vtrn.16 q9, q11 \n"
      "vtrn.16 q8, q10 \n"

      "vtrn.32 q1, q9 \n"
      "vtrn.32 q0, q8 \n"
      "vtrn.32 q3, q11 \n"
      "vtrn.32 q2, q10 \n"

      "vrev16.8 q0, q0 \n"
      "vrev16.8 q1, q1 \n"
      "vrev16.8 q2, q2 \n"
      "vrev16.8 q3, q3 \n"
      "vrev16.8 q8, q8 \n"
      "vrev16.8 q9, q9 \n"
      "vrev16.8 q10, q10 \n"
      "vrev16.8 q11, q11 \n"

      "mov %0, %3 \n"

      // Store the transposed U plane (even d registers) to dst_a.
      MEMACCESS(0)
      "vst1.8 {d2}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d0}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d6}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d4}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d18}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d16}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d22}, [%0], %4 \n"
      MEMACCESS(0)
      "vst1.8 {d20}, [%0] \n"

      "mov %0, %5 \n"

      // Store the transposed V plane (odd d registers) to dst_b.
      MEMACCESS(0)
      "vst1.8 {d3}, [%0], %6 \n"
      MEMACCESS(0)
      "vst1.8 {d1}, [%0], %6 \n"
      MEMACCESS(0)
      "vst1.8 {d7}, [%0], %6 \n"
      MEMACCESS(0)
      "vst1.8 {d5}, [%0], %6 \n"
      MEMACCESS(0)
      "vst1.8 {d19}, [%0], %6 \n"
      MEMACCESS(0)
      "vst1.8 {d17}, [%0], %6 \n"
      MEMACCESS(0)
      "vst1.8 {d23}, [%0], %6 \n"
      MEMACCESS(0)
      "vst1.8 {d21}, [%0] \n"

      "add %1, #8*2 \n"  // src += 8*2
      "add %3, %3, %4, lsl #3 \n"  // dst_a += 8 * dst_stride_a
      "add %5, %5, %6, lsl #3 \n"  // dst_b += 8 * dst_stride_b
      "subs %7, #8 \n"  // w -= 8
      "bge 1b \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds %7, #8 \n"
    "beq 4f \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp %7, #2 \n"
    "blt 3f \n"

    "cmp %7, #4 \n"
    "blt 2f \n"

    // TODO(frkoenig): Clean this up
    // 4x8 block
    "mov %0, %1 \n"
    MEMACCESS(0)
    "vld1.64 {d0}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64 {d1}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64 {d2}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64 {d3}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64 {d4}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64 {d5}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64 {d6}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64 {d7}, [%0] \n"

    // Load the interleave index table for the vtbl shuffles below.
    MEMACCESS(8)
    "vld1.8 {q15}, [%8] \n"

    "vtrn.8 q0, q1 \n"
    "vtrn.8 q2, q3 \n"

    "vtbl.8 d16, {d0, d1}, d30 \n"
    "vtbl.8 d17, {d0, d1}, d31 \n"
    "vtbl.8 d18, {d2, d3}, d30 \n"
    "vtbl.8 d19, {d2, d3}, d31 \n"
    "vtbl.8 d20, {d4, d5}, d30 \n"
    "vtbl.8 d21, {d4, d5}, d31 \n"
    "vtbl.8 d22, {d6, d7}, d30 \n"
    "vtbl.8 d23, {d6, d7}, d31 \n"

    "mov %0, %3 \n"

    MEMACCESS(0)
    "vst1.32 {d16[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d16[1]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d17[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d17[1]}, [%0], %4 \n"

    "add %0, %3, #4 \n"
    MEMACCESS(0)
    "vst1.32 {d20[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d20[1]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d21[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32 {d21[1]}, [%0] \n"

    "mov %0, %5 \n"

    MEMACCESS(0)
    "vst1.32 {d18[0]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32 {d18[1]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32 {d19[0]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32 {d19[1]}, [%0], %6 \n"

    "add %0, %5, #4 \n"
    MEMACCESS(0)
    "vst1.32 {d22[0]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32 {d22[1]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32 {d23[0]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32 {d23[1]}, [%0] \n"

    "add %1, #4*2 \n"  // src += 4 * 2
    "add %3, %3, %4, lsl #2 \n"  // dst_a += 4 * dst_stride_a
    "add %5, %5, %6, lsl #2 \n"  // dst_b += 4 * dst_stride_b
    "subs %7, #4 \n"  // w -= 4
    "beq 4f \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp %7, #2 \n"
    "blt 3f \n"

    // 2x8 block
    "2: \n"
    "mov %0, %1 \n"
    // vld2.16 deinterleaves each row's two UV pairs: U halfword to d0/d1,
    // V halfword to d2/d3, alternating lanes as in the single-plane path.
    MEMACCESS(0)
    "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16 {d1[3], d3[3]}, [%0] \n"

    "vtrn.8 d0, d1 \n"
    "vtrn.8 d2, d3 \n"

    "mov %0, %3 \n"

    MEMACCESS(0)
    "vst1.64 {d0}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.64 {d2}, [%0] \n"

    "mov %0, %5 \n"

    MEMACCESS(0)
    "vst1.64 {d1}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.64 {d3}, [%0] \n"

    "add %1, #2*2 \n"  // src += 2 * 2
    "add %3, %3, %4, lsl #1 \n"  // dst_a += 2 * dst_stride_a
    "add %5, %5, %6, lsl #1 \n"  // dst_b += 2 * dst_stride_b
    "subs %7, #2 \n"  // w -= 2
    "beq 4f \n"

    // 1x8 block: one UV pair per row; U bytes collect in d0, V in d1.
    "3: \n"
    MEMACCESS(1)
    "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8 {d0[7], d1[7]}, [%1] \n"

    MEMACCESS(3)
    "vst1.64 {d0}, [%3] \n"
    MEMACCESS(5)
    "vst1.64 {d1}, [%5] \n"

    "4: \n"

    : "=&r"(src_temp),      // %0
      "+r"(src),            // %1
      "+r"(src_stride),     // %2
      "+r"(dst_a),          // %3
      "+r"(dst_stride_a),   // %4
      "+r"(dst_b),          // %5
      "+r"(dst_stride_b),   // %6
      "+r"(width)           // %7
    : "r"(&kVTbl4x4TransposeDi)  // %8
    : "memory", "cc",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif