1 /* 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "libyuv/row.h" 12 #include "libyuv/rotate_row.h" 13 14 #include "libyuv/basic_types.h" 15 16 #ifdef __cplusplus 17 namespace libyuv { 18 extern "C" { 19 #endif 20 21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ 22 !defined(__aarch64__) 23 24 static uvec8 kVTbl4x4Transpose = 25 { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; 26 27 void TransposeWx8_NEON(const uint8* src, int src_stride, 28 uint8* dst, int dst_stride, 29 int width) { 30 const uint8* src_temp; 31 asm volatile ( 32 // loops are on blocks of 8. loop will stop when 33 // counter gets to or below 0. starting the counter 34 // at w-8 allow for this 35 "sub %5, #8 \n" 36 37 // handle 8x8 blocks. this should be the majority of the plane 38 "1: \n" 39 "mov %0, %1 \n" 40 41 MEMACCESS(0) 42 "vld1.8 {d0}, [%0], %2 \n" 43 MEMACCESS(0) 44 "vld1.8 {d1}, [%0], %2 \n" 45 MEMACCESS(0) 46 "vld1.8 {d2}, [%0], %2 \n" 47 MEMACCESS(0) 48 "vld1.8 {d3}, [%0], %2 \n" 49 MEMACCESS(0) 50 "vld1.8 {d4}, [%0], %2 \n" 51 MEMACCESS(0) 52 "vld1.8 {d5}, [%0], %2 \n" 53 MEMACCESS(0) 54 "vld1.8 {d6}, [%0], %2 \n" 55 MEMACCESS(0) 56 "vld1.8 {d7}, [%0] \n" 57 58 "vtrn.8 d1, d0 \n" 59 "vtrn.8 d3, d2 \n" 60 "vtrn.8 d5, d4 \n" 61 "vtrn.8 d7, d6 \n" 62 63 "vtrn.16 d1, d3 \n" 64 "vtrn.16 d0, d2 \n" 65 "vtrn.16 d5, d7 \n" 66 "vtrn.16 d4, d6 \n" 67 68 "vtrn.32 d1, d5 \n" 69 "vtrn.32 d0, d4 \n" 70 "vtrn.32 d3, d7 \n" 71 "vtrn.32 d2, d6 \n" 72 73 "vrev16.8 q0, q0 \n" 74 "vrev16.8 q1, q1 \n" 75 "vrev16.8 q2, q2 \n" 76 "vrev16.8 q3, q3 \n" 77 78 "mov %0, %3 \n" 79 80 MEMACCESS(0) 81 "vst1.8 {d1}, [%0], %4 \n" 82 MEMACCESS(0) 83 "vst1.8 {d0}, [%0], %4 \n" 84 MEMACCESS(0) 85 "vst1.8 {d3}, [%0], %4 \n" 86 MEMACCESS(0) 87 "vst1.8 {d2}, [%0], %4 \n" 88 MEMACCESS(0) 89 "vst1.8 {d5}, [%0], %4 \n" 90 MEMACCESS(0) 91 "vst1.8 {d4}, [%0], %4 \n" 92 MEMACCESS(0) 93 "vst1.8 {d7}, [%0], %4 \n" 94 MEMACCESS(0) 95 "vst1.8 {d6}, [%0] \n" 96 97 "add %1, #8 \n" // src += 8 98 "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride 99 "subs %5, #8 \n" // w -= 8 100 "bge 1b \n" 101 102 // add 8 back to counter. if the result is 0 there are 103 // no residuals. 104 "adds %5, #8 \n" 105 "beq 4f \n" 106 107 // some residual, so between 1 and 7 lines left to transpose 108 "cmp %5, #2 \n" 109 "blt 3f \n" 110 111 "cmp %5, #4 \n" 112 "blt 2f \n" 113 114 // 4x8 block 115 "mov %0, %1 \n" 116 MEMACCESS(0) 117 "vld1.32 {d0[0]}, [%0], %2 \n" 118 MEMACCESS(0) 119 "vld1.32 {d0[1]}, [%0], %2 \n" 120 MEMACCESS(0) 121 "vld1.32 {d1[0]}, [%0], %2 \n" 122 MEMACCESS(0) 123 "vld1.32 {d1[1]}, [%0], %2 \n" 124 MEMACCESS(0) 125 "vld1.32 {d2[0]}, [%0], %2 \n" 126 MEMACCESS(0) 127 "vld1.32 {d2[1]}, [%0], %2 \n" 128 MEMACCESS(0) 129 "vld1.32 {d3[0]}, [%0], %2 \n" 130 MEMACCESS(0) 131 "vld1.32 {d3[1]}, [%0] \n" 132 133 "mov %0, %3 \n" 134 135 MEMACCESS(6) 136 "vld1.8 {q3}, [%6] \n" 137 138 "vtbl.8 d4, {d0, d1}, d6 \n" 139 "vtbl.8 d5, {d0, d1}, d7 \n" 140 "vtbl.8 d0, {d2, d3}, d6 \n" 141 "vtbl.8 d1, {d2, d3}, d7 \n" 142 143 // TODO(frkoenig): Rework shuffle above to 144 // write out with 4 instead of 8 writes. 145 MEMACCESS(0) 146 "vst1.32 {d4[0]}, [%0], %4 \n" 147 MEMACCESS(0) 148 "vst1.32 {d4[1]}, [%0], %4 \n" 149 MEMACCESS(0) 150 "vst1.32 {d5[0]}, [%0], %4 \n" 151 MEMACCESS(0) 152 "vst1.32 {d5[1]}, [%0] \n" 153 154 "add %0, %3, #4 \n" 155 MEMACCESS(0) 156 "vst1.32 {d0[0]}, [%0], %4 \n" 157 MEMACCESS(0) 158 "vst1.32 {d0[1]}, [%0], %4 \n" 159 MEMACCESS(0) 160 "vst1.32 {d1[0]}, [%0], %4 \n" 161 MEMACCESS(0) 162 "vst1.32 {d1[1]}, [%0] \n" 163 164 "add %1, #4 \n" // src += 4 165 "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride 166 "subs %5, #4 \n" // w -= 4 167 "beq 4f \n" 168 169 // some residual, check to see if it includes a 2x8 block, 170 // or less 171 "cmp %5, #2 \n" 172 "blt 3f \n" 173 174 // 2x8 block 175 "2: \n" 176 "mov %0, %1 \n" 177 MEMACCESS(0) 178 "vld1.16 {d0[0]}, [%0], %2 \n" 179 MEMACCESS(0) 180 "vld1.16 {d1[0]}, [%0], %2 \n" 181 MEMACCESS(0) 182 "vld1.16 {d0[1]}, [%0], %2 \n" 183 MEMACCESS(0) 184 "vld1.16 {d1[1]}, [%0], %2 \n" 185 MEMACCESS(0) 186 "vld1.16 {d0[2]}, [%0], %2 \n" 187 MEMACCESS(0) 188 "vld1.16 {d1[2]}, [%0], %2 \n" 189 MEMACCESS(0) 190 "vld1.16 {d0[3]}, [%0], %2 \n" 191 MEMACCESS(0) 192 "vld1.16 {d1[3]}, [%0] \n" 193 194 "vtrn.8 d0, d1 \n" 195 196 "mov %0, %3 \n" 197 198 MEMACCESS(0) 199 "vst1.64 {d0}, [%0], %4 \n" 200 MEMACCESS(0) 201 "vst1.64 {d1}, [%0] \n" 202 203 "add %1, #2 \n" // src += 2 204 "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride 205 "subs %5, #2 \n" // w -= 2 206 "beq 4f \n" 207 208 // 1x8 block 209 "3: \n" 210 MEMACCESS(1) 211 "vld1.8 {d0[0]}, [%1], %2 \n" 212 MEMACCESS(1) 213 "vld1.8 {d0[1]}, [%1], %2 \n" 214 MEMACCESS(1) 215 "vld1.8 {d0[2]}, [%1], %2 \n" 216 MEMACCESS(1) 217 "vld1.8 {d0[3]}, [%1], %2 \n" 218 MEMACCESS(1) 219 "vld1.8 {d0[4]}, [%1], %2 \n" 220 MEMACCESS(1) 221 "vld1.8 {d0[5]}, [%1], %2 \n" 222 MEMACCESS(1) 223 "vld1.8 {d0[6]}, [%1], %2 \n" 224 MEMACCESS(1) 225 "vld1.8 {d0[7]}, [%1] \n" 226 227 MEMACCESS(3) 228 "vst1.64 {d0}, [%3] \n" 229 230 "4: \n" 231 232 : "=&r"(src_temp), // %0 233 "+r"(src), // %1 234 "+r"(src_stride), // %2 235 "+r"(dst), // %3 236 "+r"(dst_stride), // %4 237 "+r"(width) // %5 238 : "r"(&kVTbl4x4Transpose) // %6 239 : "memory", "cc", "q0", "q1", "q2", "q3" 240 ); 241 } 242 243 static uvec8 kVTbl4x4TransposeDi = 244 { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; 245 246 void TransposeUVWx8_NEON(const uint8* src, int src_stride, 247 uint8* dst_a, int dst_stride_a, 248 uint8* dst_b, int dst_stride_b, 249 int width) { 250 const uint8* src_temp; 251 asm volatile ( 252 // loops are on blocks of 8. loop will stop when 253 // counter gets to or below 0. starting the counter 254 // at w-8 allow for this 255 "sub %7, #8 \n" 256 257 // handle 8x8 blocks. this should be the majority of the plane 258 "1: \n" 259 "mov %0, %1 \n" 260 261 MEMACCESS(0) 262 "vld2.8 {d0, d1}, [%0], %2 \n" 263 MEMACCESS(0) 264 "vld2.8 {d2, d3}, [%0], %2 \n" 265 MEMACCESS(0) 266 "vld2.8 {d4, d5}, [%0], %2 \n" 267 MEMACCESS(0) 268 "vld2.8 {d6, d7}, [%0], %2 \n" 269 MEMACCESS(0) 270 "vld2.8 {d16, d17}, [%0], %2 \n" 271 MEMACCESS(0) 272 "vld2.8 {d18, d19}, [%0], %2 \n" 273 MEMACCESS(0) 274 "vld2.8 {d20, d21}, [%0], %2 \n" 275 MEMACCESS(0) 276 "vld2.8 {d22, d23}, [%0] \n" 277 278 "vtrn.8 q1, q0 \n" 279 "vtrn.8 q3, q2 \n" 280 "vtrn.8 q9, q8 \n" 281 "vtrn.8 q11, q10 \n" 282 283 "vtrn.16 q1, q3 \n" 284 "vtrn.16 q0, q2 \n" 285 "vtrn.16 q9, q11 \n" 286 "vtrn.16 q8, q10 \n" 287 288 "vtrn.32 q1, q9 \n" 289 "vtrn.32 q0, q8 \n" 290 "vtrn.32 q3, q11 \n" 291 "vtrn.32 q2, q10 \n" 292 293 "vrev16.8 q0, q0 \n" 294 "vrev16.8 q1, q1 \n" 295 "vrev16.8 q2, q2 \n" 296 "vrev16.8 q3, q3 \n" 297 "vrev16.8 q8, q8 \n" 298 "vrev16.8 q9, q9 \n" 299 "vrev16.8 q10, q10 \n" 300 "vrev16.8 q11, q11 \n" 301 302 "mov %0, %3 \n" 303 304 MEMACCESS(0) 305 "vst1.8 {d2}, [%0], %4 \n" 306 MEMACCESS(0) 307 "vst1.8 {d0}, [%0], %4 \n" 308 MEMACCESS(0) 309 "vst1.8 {d6}, [%0], %4 \n" 310 MEMACCESS(0) 311 "vst1.8 {d4}, [%0], %4 \n" 312 MEMACCESS(0) 313 "vst1.8 {d18}, [%0], %4 \n" 314 MEMACCESS(0) 315 "vst1.8 {d16}, [%0], %4 \n" 316 MEMACCESS(0) 317 "vst1.8 {d22}, [%0], %4 \n" 318 MEMACCESS(0) 319 "vst1.8 {d20}, [%0] \n" 320 321 "mov %0, %5 \n" 322 323 MEMACCESS(0) 324 "vst1.8 {d3}, [%0], %6 \n" 325 MEMACCESS(0) 326 "vst1.8 {d1}, [%0], %6 \n" 327 MEMACCESS(0) 328 "vst1.8 {d7}, [%0], %6 \n" 329 MEMACCESS(0) 330 "vst1.8 {d5}, [%0], %6 \n" 331 MEMACCESS(0) 332 "vst1.8 {d19}, [%0], %6 \n" 333 MEMACCESS(0) 334 "vst1.8 {d17}, [%0], %6 \n" 335 MEMACCESS(0) 336 "vst1.8 {d23}, [%0], %6 \n" 337 MEMACCESS(0) 338 "vst1.8 {d21}, [%0] \n" 339 340 "add %1, #8*2 \n" // src += 8*2 341 "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a 342 "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b 343 "subs %7, #8 \n" // w -= 8 344 "bge 1b \n" 345 346 // add 8 back to counter. if the result is 0 there are 347 // no residuals. 348 "adds %7, #8 \n" 349 "beq 4f \n" 350 351 // some residual, so between 1 and 7 lines left to transpose 352 "cmp %7, #2 \n" 353 "blt 3f \n" 354 355 "cmp %7, #4 \n" 356 "blt 2f \n" 357 358 // TODO(frkoenig): Clean this up 359 // 4x8 block 360 "mov %0, %1 \n" 361 MEMACCESS(0) 362 "vld1.64 {d0}, [%0], %2 \n" 363 MEMACCESS(0) 364 "vld1.64 {d1}, [%0], %2 \n" 365 MEMACCESS(0) 366 "vld1.64 {d2}, [%0], %2 \n" 367 MEMACCESS(0) 368 "vld1.64 {d3}, [%0], %2 \n" 369 MEMACCESS(0) 370 "vld1.64 {d4}, [%0], %2 \n" 371 MEMACCESS(0) 372 "vld1.64 {d5}, [%0], %2 \n" 373 MEMACCESS(0) 374 "vld1.64 {d6}, [%0], %2 \n" 375 MEMACCESS(0) 376 "vld1.64 {d7}, [%0] \n" 377 378 MEMACCESS(8) 379 "vld1.8 {q15}, [%8] \n" 380 381 "vtrn.8 q0, q1 \n" 382 "vtrn.8 q2, q3 \n" 383 384 "vtbl.8 d16, {d0, d1}, d30 \n" 385 "vtbl.8 d17, {d0, d1}, d31 \n" 386 "vtbl.8 d18, {d2, d3}, d30 \n" 387 "vtbl.8 d19, {d2, d3}, d31 \n" 388 "vtbl.8 d20, {d4, d5}, d30 \n" 389 "vtbl.8 d21, {d4, d5}, d31 \n" 390 "vtbl.8 d22, {d6, d7}, d30 \n" 391 "vtbl.8 d23, {d6, d7}, d31 \n" 392 393 "mov %0, %3 \n" 394 395 MEMACCESS(0) 396 "vst1.32 {d16[0]}, [%0], %4 \n" 397 MEMACCESS(0) 398 "vst1.32 {d16[1]}, [%0], %4 \n" 399 MEMACCESS(0) 400 "vst1.32 {d17[0]}, [%0], %4 \n" 401 MEMACCESS(0) 402 "vst1.32 {d17[1]}, [%0], %4 \n" 403 404 "add %0, %3, #4 \n" 405 MEMACCESS(0) 406 "vst1.32 {d20[0]}, [%0], %4 \n" 407 MEMACCESS(0) 408 "vst1.32 {d20[1]}, [%0], %4 \n" 409 MEMACCESS(0) 410 "vst1.32 {d21[0]}, [%0], %4 \n" 411 MEMACCESS(0) 412 "vst1.32 {d21[1]}, [%0] \n" 413 414 "mov %0, %5 \n" 415 416 MEMACCESS(0) 417 "vst1.32 {d18[0]}, [%0], %6 \n" 418 MEMACCESS(0) 419 "vst1.32 {d18[1]}, [%0], %6 \n" 420 MEMACCESS(0) 421 "vst1.32 {d19[0]}, [%0], %6 \n" 422 MEMACCESS(0) 423 "vst1.32 {d19[1]}, [%0], %6 \n" 424 425 "add %0, %5, #4 \n" 426 MEMACCESS(0) 427 "vst1.32 {d22[0]}, [%0], %6 \n" 428 MEMACCESS(0) 429 "vst1.32 {d22[1]}, [%0], %6 \n" 430 MEMACCESS(0) 431 "vst1.32 {d23[0]}, [%0], %6 \n" 432 MEMACCESS(0) 433 "vst1.32 {d23[1]}, [%0] \n" 434 435 "add %1, #4*2 \n" // src += 4 * 2 436 "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a 437 "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b 438 "subs %7, #4 \n" // w -= 4 439 "beq 4f \n" 440 441 // some residual, check to see if it includes a 2x8 block, 442 // or less 443 "cmp %7, #2 \n" 444 "blt 3f \n" 445 446 // 2x8 block 447 "2: \n" 448 "mov %0, %1 \n" 449 MEMACCESS(0) 450 "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" 451 MEMACCESS(0) 452 "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" 453 MEMACCESS(0) 454 "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" 455 MEMACCESS(0) 456 "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" 457 MEMACCESS(0) 458 "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" 459 MEMACCESS(0) 460 "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" 461 MEMACCESS(0) 462 "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" 463 MEMACCESS(0) 464 "vld2.16 {d1[3], d3[3]}, [%0] \n" 465 466 "vtrn.8 d0, d1 \n" 467 "vtrn.8 d2, d3 \n" 468 469 "mov %0, %3 \n" 470 471 MEMACCESS(0) 472 "vst1.64 {d0}, [%0], %4 \n" 473 MEMACCESS(0) 474 "vst1.64 {d2}, [%0] \n" 475 476 "mov %0, %5 \n" 477 478 MEMACCESS(0) 479 "vst1.64 {d1}, [%0], %6 \n" 480 MEMACCESS(0) 481 "vst1.64 {d3}, [%0] \n" 482 483 "add %1, #2*2 \n" // src += 2 * 2 484 "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a 485 "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b 486 "subs %7, #2 \n" // w -= 2 487 "beq 4f \n" 488 489 // 1x8 block 490 "3: \n" 491 MEMACCESS(1) 492 "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" 493 MEMACCESS(1) 494 "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" 495 MEMACCESS(1) 496 "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" 497 MEMACCESS(1) 498 "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" 499 MEMACCESS(1) 500 "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" 501 MEMACCESS(1) 502 "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" 503 MEMACCESS(1) 504 "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" 505 MEMACCESS(1) 506 "vld2.8 {d0[7], d1[7]}, [%1] \n" 507 508 MEMACCESS(3) 509 "vst1.64 {d0}, [%3] \n" 510 MEMACCESS(5) 511 "vst1.64 {d1}, [%5] \n" 512 513 "4: \n" 514 515 : "=&r"(src_temp), // %0 516 "+r"(src), // %1 517 "+r"(src_stride), // %2 518 "+r"(dst_a), // %3 519 "+r"(dst_stride_a), // %4 520 "+r"(dst_b), // %5 521 "+r"(dst_stride_b), // %6 522 "+r"(width) // %7 523 : "r"(&kVTbl4x4TransposeDi) // %8 524 : "memory", "cc", 525 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" 526 ); 527 } 528 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) 529 530 #ifdef __cplusplus 531 } // extern "C" 532 } // namespace libyuv 533 #endif 534