/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)

static uvec8 kVTbl4x4Transpose =
  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  const uint8* src_temp = NULL;
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this
    "sub         %5, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
    ".p2align  2                               \n"
    "1:                                        \n"
    "mov         %0, %1                        \n"

    MEMACCESS(0)
    "vld1.8      {d0}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d1}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d2}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d3}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d4}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d5}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d6}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d7}, [%0]                    \n"

    "vtrn.8      d1, d0                        \n"
    "vtrn.8      d3, d2                        \n"
    "vtrn.8      d5, d4                        \n"
    "vtrn.8      d7, d6                        \n"

    "vtrn.16     d1, d3                        \n"
    "vtrn.16     d0, d2                        \n"
    "vtrn.16     d5, d7                        \n"
    "vtrn.16     d4, d6                        \n"

    "vtrn.32     d1, d5                        \n"
    "vtrn.32     d0, d4                        \n"
    "vtrn.32     d3, d7                        \n"
    "vtrn.32     d2, d6                        \n"

    "vrev16.8    q0, q0                        \n"
    "vrev16.8    q1, q1                        \n"
    "vrev16.8    q2, q2                        \n"
    "vrev16.8    q3, q3                        \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.8      {d1}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d0}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d3}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d2}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d5}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d4}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d7}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d6}, [%0]                    \n"

    "add         %1, #8                        \n"  // src += 8
    "add         %3, %3, %4, lsl #3            \n"  // dst += 8 * dst_stride
    "subs        %5, #8                        \n"  // w   -= 8
    "bge         1b                            \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %5, #8                        \n"
    "beq         4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %5, #2                        \n"
    "blt         3f                            \n"

    "cmp         %5, #4                        \n"
    "blt         2f                            \n"

    // 4x8 block
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld1.32     {d0[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d0[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d1[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d1[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d2[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d2[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d3[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d3[1]}, [%0]                 \n"

    "mov         %0, %3                        \n"

    MEMACCESS(6)
    "vld1.8      {q3}, [%6]                    \n"

    "vtbl.8      d4, {d0, d1}, d6              \n"
    "vtbl.8      d5, {d0, d1}, d7              \n"
    "vtbl.8      d0, {d2, d3}, d6              \n"
    "vtbl.8      d1, {d2, d3}, d7              \n"

    // TODO(frkoenig): Rework shuffle above to
    //                 write out with 4 instead of 8 writes.
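    // With kVTbl4x4Transpose, each vtbl above gathers one transposed
    // 4-byte run per 32-bit lane: d4/d5 hold the left 4x4 of the
    // destination block (from source rows 0-3) and d0/d1 the right 4x4
    // (from source rows 4-7), written out below as lane stores.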
    MEMACCESS(0)
    "vst1.32     {d4[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d4[1]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d5[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d5[1]}, [%0]                 \n"

    "add         %0, %3, #4                    \n"
    MEMACCESS(0)
    "vst1.32     {d0[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d0[1]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d1[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d1[1]}, [%0]                 \n"

    "add         %1, #4                        \n"  // src += 4
    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
    "subs        %5, #4                        \n"  // w   -= 4
    "beq         4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %5, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld1.16     {d0[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[2]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[2]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[3]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[3]}, [%0]                 \n"

    "vtrn.8      d0, d1                        \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.64     {d0}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.64     {d1}, [%0]                    \n"

    "add         %1, #2                        \n"  // src += 2
    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
    "subs        %5, #2                        \n"  // w   -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
    MEMACCESS(1)
    "vld1.8      {d0[0]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[1]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[2]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[3]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[4]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[5]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[6]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[7]}, [%1]                 \n"

    MEMACCESS(3)
    "vst1.64     {d0}, [%3]                    \n"

    "4:                                        \n"

    : "+r"(src_temp),          // %0
      "+r"(src),               // %1
      "+r"(src_stride),        // %2
      "+r"(dst),               // %3
      "+r"(dst_stride),        // %4
      "+r"(width)              // %5
    : "r"(&kVTbl4x4Transpose)  // %6
    : "memory", "cc", "q0", "q1", "q2", "q3"
  );
}
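
// A minimal scalar sketch of the operation TransposeWx8_NEON performs:
// a block 'width' columns wide and exactly 8 rows high is transposed, so
// source column x becomes destination row x. TransposeWx8_Reference is a
// hypothetical name used for illustration only; it is not part of the
// libyuv API, and real callers dispatch to the assembly above.
static void TransposeWx8_Reference(const uint8* src, int src_stride,
                                   uint8* dst, int dst_stride,
                                   int width) {
  int x, y;
  for (x = 0; x < width; ++x) {    // each source column ...
    for (y = 0; y < 8; ++y) {      // ... becomes an 8-byte dst row.
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}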

static uvec8 kVTbl4x4TransposeDi =
  { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };

void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
  const uint8* src_temp = NULL;
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this
    "sub         %7, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
    ".p2align  2                               \n"
    "1:                                        \n"
    "mov         %0, %1                        \n"

    MEMACCESS(0)
    "vld2.8      {d0,  d1},  [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d2,  d3},  [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d4,  d5},  [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d6,  d7},  [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d16, d17}, [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d18, d19}, [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d20, d21}, [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d22, d23}, [%0]              \n"

    "vtrn.8      q1,  q0                       \n"
    "vtrn.8      q3,  q2                       \n"
    "vtrn.8      q9,  q8                       \n"
    "vtrn.8      q11, q10                      \n"

    "vtrn.16     q1,  q3                       \n"
    "vtrn.16     q0,  q2                       \n"
    "vtrn.16     q9,  q11                      \n"
    "vtrn.16     q8,  q10                      \n"

    "vtrn.32     q1,  q9                       \n"
    "vtrn.32     q0,  q8                       \n"
    "vtrn.32     q3,  q11                      \n"
    "vtrn.32     q2,  q10                      \n"

    "vrev16.8    q0,  q0                       \n"
    "vrev16.8    q1,  q1                       \n"
    "vrev16.8    q2,  q2                       \n"
    "vrev16.8    q3,  q3                       \n"
    "vrev16.8    q8,  q8                       \n"
    "vrev16.8    q9,  q9                       \n"
    "vrev16.8    q10, q10                      \n"
    "vrev16.8    q11, q11                      \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.8      {d2},  [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d0},  [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d6},  [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d4},  [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d18}, [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d16}, [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d22}, [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d20}, [%0]                   \n"

    "mov         %0, %5                        \n"

    MEMACCESS(0)
    "vst1.8      {d3},  [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d1},  [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d7},  [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d5},  [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d19}, [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d17}, [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d23}, [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d21}, [%0]                   \n"

    "add         %1, #8*2                      \n"  // src   += 8*2
    "add         %3, %3, %4, lsl #3            \n"  // dst_a += 8 * dst_stride_a
    "add         %5, %5, %6, lsl #3            \n"  // dst_b += 8 * dst_stride_b
    "subs        %7, #8                        \n"  // w     -= 8
    "bge         1b                            \n"
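
    // In the loop above, vld2.8 deinterleaves on load: U bytes land in
    // the low d half and V bytes in the high d half of each q register,
    // so a single transpose serves both channels. The low halves are
    // stored to dst_a and the high halves to dst_b.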

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %7, #8                        \n"
    "beq         4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %7, #2                        \n"
    "blt         3f                            \n"

    "cmp         %7, #4                        \n"
    "blt         2f                            \n"

    // TODO(frkoenig): Clean this up
    // 4x8 block
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld1.64     {d0}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d1}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d2}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d3}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d4}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d5}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d6}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d7}, [%0]                    \n"

    MEMACCESS(8)
    "vld1.8      {q15}, [%8]                   \n"

    "vtrn.8      q0, q1                        \n"
    "vtrn.8      q2, q3                        \n"

    "vtbl.8      d16, {d0, d1}, d30            \n"
    "vtbl.8      d17, {d0, d1}, d31            \n"
    "vtbl.8      d18, {d2, d3}, d30            \n"
    "vtbl.8      d19, {d2, d3}, d31            \n"
    "vtbl.8      d20, {d4, d5}, d30            \n"
    "vtbl.8      d21, {d4, d5}, d31            \n"
    "vtbl.8      d22, {d6, d7}, d30            \n"
    "vtbl.8      d23, {d6, d7}, d31            \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.32     {d16[0]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d16[1]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d17[0]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d17[1]}, [%0], %4            \n"

    "add         %0, %3, #4                    \n"
    MEMACCESS(0)
    "vst1.32     {d20[0]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d20[1]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d21[0]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d21[1]}, [%0]                \n"

    "mov         %0, %5                        \n"

    MEMACCESS(0)
    "vst1.32     {d18[0]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d18[1]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d19[0]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d19[1]}, [%0], %6            \n"

    "add         %0, %5, #4                    \n"
    MEMACCESS(0)
    "vst1.32     {d22[0]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d22[1]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d23[0]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d23[1]}, [%0]                \n"

    "add         %1, #4*2                      \n"  // src   += 4 * 2
    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    "subs        %7, #4                        \n"  // w     -= 4
    "beq         4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %7, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d1[3], d3[3]}, [%0]          \n"

    "vtrn.8      d0, d1                        \n"
    "vtrn.8      d2, d3                        \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.64     {d0}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.64     {d2}, [%0]                    \n"

    "mov         %0, %5                        \n"

    MEMACCESS(0)
    "vst1.64     {d1}, [%0], %6                \n"
    MEMACCESS(0)
    "vst1.64     {d3}, [%0]                    \n"

    "add         %1, #2*2                      \n"  // src   += 2 * 2
    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    "subs        %7, #2                        \n"  // w     -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
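    // Each vld2.8 lane load below reads one UV pair per row, gathering
    // the U bytes into d0 and the V bytes into d1, so that each channel
    // can be written out as a single transposed row to its plane.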
    MEMACCESS(1)
    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[7], d1[7]}, [%1]          \n"

    MEMACCESS(3)
    "vst1.64     {d0}, [%3]                    \n"
    MEMACCESS(5)
    "vst1.64     {d1}, [%5]                    \n"

    "4:                                        \n"

    : "+r"(src_temp),            // %0
      "+r"(src),                 // %1
      "+r"(src_stride),          // %2
      "+r"(dst_a),               // %3
      "+r"(dst_stride_a),        // %4
      "+r"(dst_b),               // %5
      "+r"(dst_stride_b),        // %6
      "+r"(width)                // %7
    : "r"(&kVTbl4x4TransposeDi)  // %8
    : "memory", "cc",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
#endif

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif