/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// tbl index vector that transposes a 4x4 byte matrix held in a single
// 128-bit register: output byte i takes input byte kVTbl4x4Transpose[i],
// i.e. column-major reordering of the 16 bytes. Used by the 4x8 residual
// path in TransposeWx8_NEON below.
static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
                                  2, 6, 10, 14, 3, 7, 11, 15};

// Transpose an 8-row by |width|-column block of 8-bit pixels.
// Reads 8 rows starting at |src| (rows |src_stride| bytes apart) and
// writes the transposed data starting at |dst| (rows |dst_stride| bytes
// apart). The main loop processes 8x8 tiles with a trn1/trn2 butterfly
// network at byte, halfword and word granularity; residual widths are
// handled by dedicated 4x8, 2x8 and 1x8 tail blocks.
// NOTE(review): MEMACCESS(n) comes from libyuv/row.h; presumably it
// annotates the following load/store for tooling — verify there.
void TransposeWx8_NEON(const uint8* src,
                       int src_stride,
                       uint8* dst,
                       int dst_stride,
                       int width) {
  const uint8* src_temp;  // Scratch row pointer used inside the asm (%0).
  int64 width64 = (int64)width;  // Work around clang 3.4 warning.
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
    "sub         %3, %3, #8                    \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:                                        \n"
    "mov         %0, %1                        \n"

    // Load 8 source rows of 8 bytes each into v0..v7.
    MEMACCESS(0)
    "ld1         {v0.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v1.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v2.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v3.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v4.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v5.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v6.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v7.8b}, [%0]                 \n"

    // Stage 1: interleave adjacent rows at byte granularity.
    "trn2        v16.8b, v0.8b, v1.8b          \n"
    "trn1        v17.8b, v0.8b, v1.8b          \n"
    "trn2        v18.8b, v2.8b, v3.8b          \n"
    "trn1        v19.8b, v2.8b, v3.8b          \n"
    "trn2        v20.8b, v4.8b, v5.8b          \n"
    "trn1        v21.8b, v4.8b, v5.8b          \n"
    "trn2        v22.8b, v6.8b, v7.8b          \n"
    "trn1        v23.8b, v6.8b, v7.8b          \n"

    // Stage 2: interleave row pairs at 16-bit granularity.
    "trn2        v3.4h, v17.4h, v19.4h         \n"
    "trn1        v1.4h, v17.4h, v19.4h         \n"
    "trn2        v2.4h, v16.4h, v18.4h         \n"
    "trn1        v0.4h, v16.4h, v18.4h         \n"
    "trn2        v7.4h, v21.4h, v23.4h         \n"
    "trn1        v5.4h, v21.4h, v23.4h         \n"
    "trn2        v6.4h, v20.4h, v22.4h         \n"
    "trn1        v4.4h, v20.4h, v22.4h         \n"

    // Stage 3: interleave quads at 32-bit granularity; v16..v23 now hold
    // the transposed 8x8 tile.
    "trn2        v21.2s, v1.2s, v5.2s          \n"
    "trn1        v17.2s, v1.2s, v5.2s          \n"
    "trn2        v20.2s, v0.2s, v4.2s          \n"
    "trn1        v16.2s, v0.2s, v4.2s          \n"
    "trn2        v23.2s, v3.2s, v7.2s          \n"
    "trn1        v19.2s, v3.2s, v7.2s          \n"
    "trn2        v22.2s, v2.2s, v6.2s          \n"
    "trn1        v18.2s, v2.2s, v6.2s          \n"

    "mov         %0, %2                        \n"

    // Store the 8 transposed rows.
    MEMACCESS(0)
    "st1         {v17.8b}, [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v16.8b}, [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v19.8b}, [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v18.8b}, [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v21.8b}, [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v20.8b}, [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v23.8b}, [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v22.8b}, [%0]                \n"

    "add         %1, %1, #8                    \n"  // src += 8
    "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
    "subs        %3, %3, #8                    \n"  // w   -= 8
    "b.ge        1b                            \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %3, %3, #8                    \n"
    "b.eq        4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %3, #2                        \n"
    "b.lt        3f                            \n"

    "cmp         %3, #4                        \n"
    "b.lt        2f                            \n"

    // 4x8 block
    // Load 8 rows of 4 bytes each into the word lanes of v0/v1, then use
    // the kVTbl4x4Transpose tbl vector (%4) to transpose each 4x4 half.
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "ld1         {v0.s}[0], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v0.s}[1], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v0.s}[2], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v0.s}[3], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v1.s}[0], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v1.s}[1], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v1.s}[2], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v1.s}[3], [%0]               \n"

    "mov         %0, %2                        \n"

    MEMACCESS(4)
    "ld1         {v2.16b}, [%4]                \n"

    "tbl         v3.16b, {v0.16b}, v2.16b      \n"
    "tbl         v0.16b, {v1.16b}, v2.16b      \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    MEMACCESS(0)
    "st1         {v3.s}[0], [%0], %6           \n"
    MEMACCESS(0)
    "st1         {v3.s}[1], [%0], %6           \n"
    MEMACCESS(0)
    "st1         {v3.s}[2], [%0], %6           \n"
    MEMACCESS(0)
    "st1         {v3.s}[3], [%0]               \n"

    // Second half of each output row lives 4 bytes to the right.
    "add         %0, %2, #4                    \n"
    MEMACCESS(0)
    "st1         {v0.s}[0], [%0], %6           \n"
    MEMACCESS(0)
    "st1         {v0.s}[1], [%0], %6           \n"
    MEMACCESS(0)
    "st1         {v0.s}[2], [%0], %6           \n"
    MEMACCESS(0)
    "st1         {v0.s}[3], [%0]               \n"

    "add         %1, %1, #4                    \n"  // src += 4
    "add         %2, %2, %6, lsl #2            \n"  // dst += 4 * dst_stride
    "subs        %3, %3, #4                    \n"  // w   -= 4
    "b.eq        4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %3, #2                        \n"
    "b.lt        3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "ld1         {v0.h}[0], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v1.h}[0], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v0.h}[1], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v1.h}[1], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v0.h}[2], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v1.h}[2], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v0.h}[3], [%0], %5           \n"
    MEMACCESS(0)
    "ld1         {v1.h}[3], [%0]               \n"

    "trn2        v2.8b, v0.8b, v1.8b           \n"
    "trn1        v3.8b, v0.8b, v1.8b           \n"

    "mov         %0, %2                        \n"

    MEMACCESS(0)
    "st1         {v3.8b}, [%0], %6             \n"
    MEMACCESS(0)
    "st1         {v2.8b}, [%0]                 \n"

    "add         %1, %1, #2                    \n"  // src += 2
    "add         %2, %2, %6, lsl #1            \n"  // dst += 2 * dst_stride
    "subs        %3, %3, #2                    \n"  // w   -= 2
    "b.eq        4f                            \n"

    // 1x8 block
    "3:                                        \n"
    MEMACCESS(1)
    "ld1         {v0.b}[0], [%1], %5           \n"
    MEMACCESS(1)
    "ld1         {v0.b}[1], [%1], %5           \n"
    MEMACCESS(1)
    "ld1         {v0.b}[2], [%1], %5           \n"
    MEMACCESS(1)
    "ld1         {v0.b}[3], [%1], %5           \n"
    MEMACCESS(1)
    "ld1         {v0.b}[4], [%1], %5           \n"
    MEMACCESS(1)
    "ld1         {v0.b}[5], [%1], %5           \n"
    MEMACCESS(1)
    "ld1         {v0.b}[6], [%1], %5           \n"
    MEMACCESS(1)
    "ld1         {v0.b}[7], [%1]               \n"

    MEMACCESS(2)
    "st1         {v0.8b}, [%2]                 \n"

    "4:                                        \n"

    : "=&r"(src_temp),                            // %0
      "+r"(src),                                  // %1
      "+r"(dst),                                  // %2
      "+r"(width64)                               // %3
    : "r"(&kVTbl4x4Transpose),                    // %4
      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
  );
}

// 32-byte tbl index table for the 4x8 residual path of
// TransposeUVWx8_NEON. It is consumed as two 16-byte index vectors
// (v30/v31) by 4-register tbl instructions over {v0..v3} / {v4..v7}:
// indices 0/16/32/48 etc. select byte 0 of each of the four source
// registers. Even indices gather the U bytes, odd indices (second row)
// gather the V bytes of the interleaved source rows.
static uint8 kVTbl4x4TransposeDi[32] = {
    0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
    1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};

// Transpose an 8-row by |width|-column block of interleaved 8-bit UV
// pixels. Reads 8 rows of |width| UV pairs (2 bytes each) starting at
// |src| and writes the transposed U channel to |dst_a| and the
// transposed V channel to |dst_b|, with their respective strides.
// Structure mirrors TransposeWx8_NEON: an 8x8(x2) main loop built from
// trn1/trn2 stages, then 4x8, 2x8 and 1x8 residual blocks.
void TransposeUVWx8_NEON(const uint8* src,
                         int src_stride,
                         uint8* dst_a,
                         int dst_stride_a,
                         uint8* dst_b,
                         int dst_stride_b,
                         int width) {
  const uint8* src_temp;  // Scratch row pointer used inside the asm (%0).
  int64 width64 = (int64)width;  // Work around clang 3.4 warning.
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
    "sub         %4, %4, #8                    \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:                                        \n"
    "mov         %0, %1                        \n"

    // Load 8 source rows of 16 bytes (8 UV pairs) each into v0..v7.
    MEMACCESS(0)
    "ld1         {v0.16b}, [%0], %5            \n"
    MEMACCESS(0)
    "ld1         {v1.16b}, [%0], %5            \n"
    MEMACCESS(0)
    "ld1         {v2.16b}, [%0], %5            \n"
    MEMACCESS(0)
    "ld1         {v3.16b}, [%0], %5            \n"
    MEMACCESS(0)
    "ld1         {v4.16b}, [%0], %5            \n"
    MEMACCESS(0)
    "ld1         {v5.16b}, [%0], %5            \n"
    MEMACCESS(0)
    "ld1         {v6.16b}, [%0], %5            \n"
    MEMACCESS(0)
    "ld1         {v7.16b}, [%0]                \n"

    // Stage 1: byte-granularity transpose of adjacent rows.
    "trn1        v16.16b, v0.16b, v1.16b       \n"
    "trn2        v17.16b, v0.16b, v1.16b       \n"
    "trn1        v18.16b, v2.16b, v3.16b       \n"
    "trn2        v19.16b, v2.16b, v3.16b       \n"
    "trn1        v20.16b, v4.16b, v5.16b       \n"
    "trn2        v21.16b, v4.16b, v5.16b       \n"
    "trn1        v22.16b, v6.16b, v7.16b       \n"
    "trn2        v23.16b, v6.16b, v7.16b       \n"

    // Stage 2: 16-bit granularity.
    "trn1        v0.8h, v16.8h, v18.8h         \n"
    "trn2        v1.8h, v16.8h, v18.8h         \n"
    "trn1        v2.8h, v20.8h, v22.8h         \n"
    "trn2        v3.8h, v20.8h, v22.8h         \n"
    "trn1        v4.8h, v17.8h, v19.8h         \n"
    "trn2        v5.8h, v17.8h, v19.8h         \n"
    "trn1        v6.8h, v21.8h, v23.8h         \n"
    "trn2        v7.8h, v21.8h, v23.8h         \n"

    // Stage 3: 32-bit granularity. v16..v19 end up holding the U-plane
    // rows, v20..v23 the V-plane rows (stored below as d-lane halves).
    "trn1        v16.4s, v0.4s, v2.4s          \n"
    "trn2        v17.4s, v0.4s, v2.4s          \n"
    "trn1        v18.4s, v1.4s, v3.4s          \n"
    "trn2        v19.4s, v1.4s, v3.4s          \n"
    "trn1        v20.4s, v4.4s, v6.4s          \n"
    "trn2        v21.4s, v4.4s, v6.4s          \n"
    "trn1        v22.4s, v5.4s, v7.4s          \n"
    "trn2        v23.4s, v5.4s, v7.4s          \n"

    "mov         %0, %2                        \n"

    // Store 8 rows of the first output plane.
    MEMACCESS(0)
    "st1         {v16.d}[0], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v18.d}[0], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v17.d}[0], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v19.d}[0], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v16.d}[1], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v18.d}[1], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v17.d}[1], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v19.d}[1], [%0]              \n"

    "mov         %0, %3                        \n"

    // Store 8 rows of the second output plane.
    MEMACCESS(0)
    "st1         {v20.d}[0], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v22.d}[0], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v21.d}[0], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v23.d}[0], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v20.d}[1], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v22.d}[1], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v21.d}[1], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v23.d}[1], [%0]              \n"

    "add         %1, %1, #16                   \n"  // src   += 8*2
    "add         %2, %2, %6, lsl #3            \n"  // dst_a += 8 * dst_stride_a
    "add         %3, %3, %7, lsl #3            \n"  // dst_b += 8 * dst_stride_b
    "subs        %4, %4, #8                    \n"  // w     -= 8
    "b.ge        1b                            \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %4, %4, #8                    \n"
    "b.eq        4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %4, #2                        \n"
    "b.lt        3f                            \n"

    "cmp         %4, #4                        \n"
    "b.lt        2f                            \n"

    // TODO(frkoenig): Clean this up
    // 4x8 block
    // Load 8 rows of 8 bytes (4 UV pairs) and gather/transpose them via
    // the two kVTbl4x4TransposeDi index vectors (%8).
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "ld1         {v0.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v1.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v2.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v3.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v4.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v5.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v6.8b}, [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v7.8b}, [%0]                 \n"

    MEMACCESS(8)
    "ld1         {v30.16b}, [%8], #16          \n"
    "ld1         {v31.16b}, [%8]               \n"

    "tbl         v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
    "tbl         v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
    "tbl         v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
    "tbl         v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"

    "mov         %0, %2                        \n"

    MEMACCESS(0)
    "st1         {v16.s}[0], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v16.s}[1], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v16.s}[2], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v16.s}[3], [%0], %6          \n"

    "add         %0, %2, #4                    \n"
    MEMACCESS(0)
    "st1         {v18.s}[0], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v18.s}[1], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v18.s}[2], [%0], %6          \n"
    MEMACCESS(0)
    "st1         {v18.s}[3], [%0]              \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "st1         {v17.s}[0], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v17.s}[1], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v17.s}[2], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v17.s}[3], [%0], %7          \n"

    "add         %0, %3, #4                    \n"
    MEMACCESS(0)
    "st1         {v19.s}[0], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v19.s}[1], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v19.s}[2], [%0], %7          \n"
    MEMACCESS(0)
    "st1         {v19.s}[3], [%0]              \n"

    "add         %1, %1, #8                    \n"  // src   += 4 * 2
    "add         %2, %2, %6, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    "add         %3, %3, %7, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    "subs        %4, %4, #4                    \n"  // w     -= 4
    "b.eq        4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %4, #2                        \n"
    "b.lt        3f                            \n"

    // 2x8 block
    // ld2 deinterleaves U into v0/v2 and V into v1/v3 as it loads.
    "2:                                        \n"
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "ld2         {v0.h, v1.h}[0], [%0], %5     \n"
    MEMACCESS(0)
    "ld2         {v2.h, v3.h}[0], [%0], %5     \n"
    MEMACCESS(0)
    "ld2         {v0.h, v1.h}[1], [%0], %5     \n"
    MEMACCESS(0)
    "ld2         {v2.h, v3.h}[1], [%0], %5     \n"
    MEMACCESS(0)
    "ld2         {v0.h, v1.h}[2], [%0], %5     \n"
    MEMACCESS(0)
    "ld2         {v2.h, v3.h}[2], [%0], %5     \n"
    MEMACCESS(0)
    "ld2         {v0.h, v1.h}[3], [%0], %5     \n"
    MEMACCESS(0)
    "ld2         {v2.h, v3.h}[3], [%0]         \n"

    "trn1        v4.8b, v0.8b, v2.8b           \n"
    "trn2        v5.8b, v0.8b, v2.8b           \n"
    "trn1        v6.8b, v1.8b, v3.8b           \n"
    "trn2        v7.8b, v1.8b, v3.8b           \n"

    "mov         %0, %2                        \n"

    MEMACCESS(0)
    "st1         {v4.d}[0], [%0], %6           \n"
    MEMACCESS(0)
    "st1         {v6.d}[0], [%0]               \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "st1         {v5.d}[0], [%0], %7           \n"
    MEMACCESS(0)
    "st1         {v7.d}[0], [%0]               \n"

    "add         %1, %1, #4                    \n"  // src   += 2 * 2
    "add         %2, %2, %6, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    "add         %3, %3, %7, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    "subs        %4, %4, #2                    \n"  // w     -= 2
    "b.eq        4f                            \n"

    // 1x8 block
    "3:                                        \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[0], [%1], %5     \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[1], [%1], %5     \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[2], [%1], %5     \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[3], [%1], %5     \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[4], [%1], %5     \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[5], [%1], %5     \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[6], [%1], %5     \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[7], [%1]         \n"

    MEMACCESS(2)
    "st1         {v0.d}[0], [%2]               \n"
    MEMACCESS(3)
    "st1         {v1.d}[0], [%3]               \n"

    "4:                                        \n"

    : "=&r"(src_temp),                              // %0
      "+r"(src),                                    // %1
      "+r"(dst_a),                                  // %2
      "+r"(dst_b),                                  // %3
      "+r"(width64)                                 // %4
    : "r"(static_cast<ptrdiff_t>(src_stride)),      // %5
      "r"(static_cast<ptrdiff_t>(dst_stride_a)),    // %6
      "r"(static_cast<ptrdiff_t>(dst_stride_b)),    // %7
      "r"(&kVTbl4x4TransposeDi)                     // %8
    : "memory", "cc",
      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
      "v30", "v31"
  );
}
#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif