/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)

static const uvec8 kVTbl4x4Transpose =
  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
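
// TransposeWx8_NEON transposes a |width| x 8 strip of bytes. The main loop
// pulls 8x8 blocks into d0-d7 and transposes them entirely in registers,
// using vtrn at 8-, 16- and 32-bit granularity followed by vrev16; narrower
// paths then handle a residual of 4, 2 or 1 columns. kVTbl4x4Transpose above
// is the byte-gather pattern that the 4-column residual path feeds to vtbl.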

void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  asm volatile (
    // Loops are on blocks of 8. The loop will stop when the counter gets to
    // or below 0; starting the counter at w-8 allows for this.
    "sub         %4, #8                        \n"

    // Handle 8x8 blocks. This should be the majority of the plane.
    ".p2align  4                               \n"
    "1:                                        \n"
    "mov         r9, %0                        \n"

    "vld1.8      {d0}, [r9], %1                \n"
    "vld1.8      {d1}, [r9], %1                \n"
    "vld1.8      {d2}, [r9], %1                \n"
    "vld1.8      {d3}, [r9], %1                \n"
    "vld1.8      {d4}, [r9], %1                \n"
    "vld1.8      {d5}, [r9], %1                \n"
    "vld1.8      {d6}, [r9], %1                \n"
    "vld1.8      {d7}, [r9]                    \n"

    "vtrn.8      d1, d0                        \n"
    "vtrn.8      d3, d2                        \n"
    "vtrn.8      d5, d4                        \n"
    "vtrn.8      d7, d6                        \n"

    "vtrn.16     d1, d3                        \n"
    "vtrn.16     d0, d2                        \n"
    "vtrn.16     d5, d7                        \n"
    "vtrn.16     d4, d6                        \n"

    "vtrn.32     d1, d5                        \n"
    "vtrn.32     d0, d4                        \n"
    "vtrn.32     d3, d7                        \n"
    "vtrn.32     d2, d6                        \n"

    "vrev16.8    q0, q0                        \n"
    "vrev16.8    q1, q1                        \n"
    "vrev16.8    q2, q2                        \n"
    "vrev16.8    q3, q3                        \n"

    "mov         r9, %2                        \n"

    "vst1.8      {d1}, [r9], %3                \n"
    "vst1.8      {d0}, [r9], %3                \n"
    "vst1.8      {d3}, [r9], %3                \n"
    "vst1.8      {d2}, [r9], %3                \n"
    "vst1.8      {d5}, [r9], %3                \n"
    "vst1.8      {d4}, [r9], %3                \n"
    "vst1.8      {d7}, [r9], %3                \n"
    "vst1.8      {d6}, [r9]                    \n"

    "add         %0, #8                        \n"  // src += 8
    "add         %2, %2, %3, lsl #3            \n"  // dst += 8 * dst_stride
    "subs        %4, #8                        \n"  // w   -= 8
    "bge         1b                            \n"

    // Add 8 back to the counter. If the result is 0 there are no residuals.
    "adds        %4, #8                        \n"
    "beq         4f                            \n"

    // Some residual, so between 1 and 7 lines are left to transpose.
    "cmp         %4, #2                        \n"
    "blt         3f                            \n"

    "cmp         %4, #4                        \n"
    "blt         2f                            \n"

    // 4x8 block
    "mov         r9, %0                        \n"
    "vld1.32     {d0[0]}, [r9], %1             \n"
    "vld1.32     {d0[1]}, [r9], %1             \n"
    "vld1.32     {d1[0]}, [r9], %1             \n"
    "vld1.32     {d1[1]}, [r9], %1             \n"
    "vld1.32     {d2[0]}, [r9], %1             \n"
    "vld1.32     {d2[1]}, [r9], %1             \n"
    "vld1.32     {d3[0]}, [r9], %1             \n"
    "vld1.32     {d3[1]}, [r9]                 \n"

    "mov         r9, %2                        \n"

    "vld1.8      {q3}, [%5]                    \n"

    "vtbl.8      d4, {d0, d1}, d6              \n"
    "vtbl.8      d5, {d0, d1}, d7              \n"
    "vtbl.8      d0, {d2, d3}, d6              \n"
    "vtbl.8      d1, {d2, d3}, d7              \n"

    // TODO: rework the shuffle above to write out with 4 instead of 8 writes.
    "vst1.32     {d4[0]}, [r9], %3             \n"
    "vst1.32     {d4[1]}, [r9], %3             \n"
    "vst1.32     {d5[0]}, [r9], %3             \n"
    "vst1.32     {d5[1]}, [r9]                 \n"

    "add         r9, %2, #4                    \n"
    "vst1.32     {d0[0]}, [r9], %3             \n"
    "vst1.32     {d0[1]}, [r9], %3             \n"
    "vst1.32     {d1[0]}, [r9], %3             \n"
    "vst1.32     {d1[1]}, [r9]                 \n"

    "add         %0, #4                        \n"  // src += 4
    "add         %2, %2, %3, lsl #2            \n"  // dst += 4 * dst_stride
    "subs        %4, #4                        \n"  // w   -= 4
    "beq         4f                            \n"

    // Some residual; check whether it includes a 2x8 block or less.
    "cmp         %4, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         r9, %0                        \n"
    "vld1.16     {d0[0]}, [r9], %1             \n"
    "vld1.16     {d1[0]}, [r9], %1             \n"
    "vld1.16     {d0[1]}, [r9], %1             \n"
    "vld1.16     {d1[1]}, [r9], %1             \n"
    "vld1.16     {d0[2]}, [r9], %1             \n"
    "vld1.16     {d1[2]}, [r9], %1             \n"
    "vld1.16     {d0[3]}, [r9], %1             \n"
    "vld1.16     {d1[3]}, [r9]                 \n"

    "vtrn.8      d0, d1                        \n"

    "mov         r9, %2                        \n"

    "vst1.64     {d0}, [r9], %3                \n"
    "vst1.64     {d1}, [r9]                    \n"

    "add         %0, #2                        \n"  // src += 2
    "add         %2, %2, %3, lsl #1            \n"  // dst += 2 * dst_stride
    "subs        %4, #2                        \n"  // w   -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
    "vld1.8      {d0[0]}, [%0], %1             \n"
    "vld1.8      {d0[1]}, [%0], %1             \n"
    "vld1.8      {d0[2]}, [%0], %1             \n"
    "vld1.8      {d0[3]}, [%0], %1             \n"
    "vld1.8      {d0[4]}, [%0], %1             \n"
    "vld1.8      {d0[5]}, [%0], %1             \n"
    "vld1.8      {d0[6]}, [%0], %1             \n"
    "vld1.8      {d0[7]}, [%0]                 \n"

    "vst1.64     {d0}, [%2]                    \n"

    "4:                                        \n"

    : "+r"(src),               // %0
      "+r"(src_stride),        // %1
      "+r"(dst),               // %2
      "+r"(dst_stride),        // %3
      "+r"(width)              // %4
    : "r"(&kVTbl4x4Transpose)  // %5
    : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
  );
}
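
// A minimal usage sketch, not part of the library: a full plane transpose can
// be built on TransposeWx8_NEON by walking the source in strips of 8 rows,
// each call writing 8 consecutive columns of the destination. The helper name
// and the guard macro are hypothetical, and the sub-8-row tail would need a
// scalar fallback that is omitted here.
#ifdef LIBYUV_TRANSPOSE_SKETCHES
static void TransposePlane8Sketch(const uint8* src, int src_stride,
                                  uint8* dst, int dst_stride,
                                  int width, int height) {
  int i;
  // Strip i covers source rows [i, i + 8), which become destination
  // columns [i, i + 8).
  for (i = 0; i + 8 <= height; i += 8) {
    TransposeWx8_NEON(src + i * src_stride, src_stride,
                      dst + i, dst_stride, width);
  }
}
#endif  // LIBYUV_TRANSPOSE_SKETCHES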

static const uvec8 kVTbl4x4TransposeDi =
  { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };

void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
  asm volatile (
    // Loops are on blocks of 8. The loop will stop when the counter gets to
    // or below 0; starting the counter at w-8 allows for this.
    "sub         %6, #8                        \n"

    // Handle 8x8 blocks. This should be the majority of the plane.
    ".p2align  4                               \n"
    "1:                                        \n"
    "mov         r9, %0                        \n"

    "vld2.8      {d0,  d1},  [r9], %1          \n"
    "vld2.8      {d2,  d3},  [r9], %1          \n"
    "vld2.8      {d4,  d5},  [r9], %1          \n"
    "vld2.8      {d6,  d7},  [r9], %1          \n"
    "vld2.8      {d16, d17}, [r9], %1          \n"
    "vld2.8      {d18, d19}, [r9], %1          \n"
    "vld2.8      {d20, d21}, [r9], %1          \n"
    "vld2.8      {d22, d23}, [r9]              \n"

    "vtrn.8      q1,  q0                       \n"
    "vtrn.8      q3,  q2                       \n"
    "vtrn.8      q9,  q8                       \n"
    "vtrn.8      q11, q10                      \n"

    "vtrn.16     q1,  q3                       \n"
    "vtrn.16     q0,  q2                       \n"
    "vtrn.16     q9,  q11                      \n"
    "vtrn.16     q8,  q10                      \n"

    "vtrn.32     q1,  q9                       \n"
    "vtrn.32     q0,  q8                       \n"
    "vtrn.32     q3,  q11                      \n"
    "vtrn.32     q2,  q10                      \n"

    "vrev16.8    q0,  q0                       \n"
    "vrev16.8    q1,  q1                       \n"
    "vrev16.8    q2,  q2                       \n"
    "vrev16.8    q3,  q3                       \n"
    "vrev16.8    q8,  q8                       \n"
    "vrev16.8    q9,  q9                       \n"
    "vrev16.8    q10, q10                      \n"
    "vrev16.8    q11, q11                      \n"

    "mov         r9, %2                        \n"

    "vst1.8      {d2},  [r9], %3               \n"
    "vst1.8      {d0},  [r9], %3               \n"
    "vst1.8      {d6},  [r9], %3               \n"
    "vst1.8      {d4},  [r9], %3               \n"
    "vst1.8      {d18}, [r9], %3               \n"
    "vst1.8      {d16}, [r9], %3               \n"
    "vst1.8      {d22}, [r9], %3               \n"
    "vst1.8      {d20}, [r9]                   \n"

    "mov         r9, %4                        \n"

    "vst1.8      {d3},  [r9], %5               \n"
    "vst1.8      {d1},  [r9], %5               \n"
    "vst1.8      {d7},  [r9], %5               \n"
    "vst1.8      {d5},  [r9], %5               \n"
    "vst1.8      {d19}, [r9], %5               \n"
    "vst1.8      {d17}, [r9], %5               \n"
    "vst1.8      {d23}, [r9], %5               \n"
    "vst1.8      {d21}, [r9]                   \n"

    "add         %0, #8*2                      \n"  // src   += 8*2
    "add         %2, %2, %3, lsl #3            \n"  // dst_a += 8 * dst_stride_a
    "add         %4, %4, %5, lsl #3            \n"  // dst_b += 8 * dst_stride_b
    "subs        %6, #8                        \n"  // w     -= 8
    "bge         1b                            \n"

    // Add 8 back to the counter. If the result is 0 there are no residuals.
    "adds        %6, #8                        \n"
    "beq         4f                            \n"

    // Some residual, so between 1 and 7 lines are left to transpose.
    "cmp         %6, #2                        \n"
    "blt         3f                            \n"

    "cmp         %6, #4                        \n"
    "blt         2f                            \n"

    // TODO(frkoenig): clean this up.
    // 4x8 block
    "mov         r9, %0                        \n"
    "vld1.64     {d0}, [r9], %1                \n"
    "vld1.64     {d1}, [r9], %1                \n"
    "vld1.64     {d2}, [r9], %1                \n"
    "vld1.64     {d3}, [r9], %1                \n"
    "vld1.64     {d4}, [r9], %1                \n"
    "vld1.64     {d5}, [r9], %1                \n"
    "vld1.64     {d6}, [r9], %1                \n"
    "vld1.64     {d7}, [r9]                    \n"

    "vld1.8      {q15}, [%7]                   \n"

    "vtrn.8      q0, q1                        \n"
    "vtrn.8      q2, q3                        \n"

    "vtbl.8      d16, {d0, d1}, d30            \n"
    "vtbl.8      d17, {d0, d1}, d31            \n"
    "vtbl.8      d18, {d2, d3}, d30            \n"
    "vtbl.8      d19, {d2, d3}, d31            \n"
    "vtbl.8      d20, {d4, d5}, d30            \n"
    "vtbl.8      d21, {d4, d5}, d31            \n"
    "vtbl.8      d22, {d6, d7}, d30            \n"
    "vtbl.8      d23, {d6, d7}, d31            \n"

    "mov         r9, %2                        \n"

    "vst1.32     {d16[0]}, [r9], %3            \n"
    "vst1.32     {d16[1]}, [r9], %3            \n"
    "vst1.32     {d17[0]}, [r9], %3            \n"
    "vst1.32     {d17[1]}, [r9], %3            \n"

    "add         r9, %2, #4                    \n"
    "vst1.32     {d20[0]}, [r9], %3            \n"
    "vst1.32     {d20[1]}, [r9], %3            \n"
    "vst1.32     {d21[0]}, [r9], %3            \n"
    "vst1.32     {d21[1]}, [r9]                \n"

    "mov         r9, %4                        \n"

    "vst1.32     {d18[0]}, [r9], %5            \n"
    "vst1.32     {d18[1]}, [r9], %5            \n"
    "vst1.32     {d19[0]}, [r9], %5            \n"
    "vst1.32     {d19[1]}, [r9], %5            \n"

    "add         r9, %4, #4                    \n"
    "vst1.32     {d22[0]}, [r9], %5            \n"
    "vst1.32     {d22[1]}, [r9], %5            \n"
    "vst1.32     {d23[0]}, [r9], %5            \n"
    "vst1.32     {d23[1]}, [r9]                \n"

    "add         %0, #4*2                      \n"  // src   += 4 * 2
    "add         %2, %2, %3, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    "add         %4, %4, %5, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    "subs        %6, #4                        \n"  // w     -= 4
    "beq         4f                            \n"

    // Some residual; check whether it includes a 2x8 block or less.
    "cmp         %6, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         r9, %0                        \n"
    "vld2.16     {d0[0], d2[0]}, [r9], %1      \n"
    "vld2.16     {d1[0], d3[0]}, [r9], %1      \n"
    "vld2.16     {d0[1], d2[1]}, [r9], %1      \n"
    "vld2.16     {d1[1], d3[1]}, [r9], %1      \n"
    "vld2.16     {d0[2], d2[2]}, [r9], %1      \n"
    "vld2.16     {d1[2], d3[2]}, [r9], %1      \n"
    "vld2.16     {d0[3], d2[3]}, [r9], %1      \n"
    "vld2.16     {d1[3], d3[3]}, [r9]          \n"

    "vtrn.8      d0, d1                        \n"
    "vtrn.8      d2, d3                        \n"

    "mov         r9, %2                        \n"

    "vst1.64     {d0}, [r9], %3                \n"
    "vst1.64     {d2}, [r9]                    \n"

    "mov         r9, %4                        \n"

    "vst1.64     {d1}, [r9], %5                \n"
    "vst1.64     {d3}, [r9]                    \n"

    "add         %0, #2*2                      \n"  // src   += 2 * 2
    "add         %2, %2, %3, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    "add         %4, %4, %5, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    "subs        %6, #2                        \n"  // w     -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
    "vld2.8      {d0[0], d1[0]}, [%0], %1      \n"
    "vld2.8      {d0[1], d1[1]}, [%0], %1      \n"
    "vld2.8      {d0[2], d1[2]}, [%0], %1      \n"
    "vld2.8      {d0[3], d1[3]}, [%0], %1      \n"
    "vld2.8      {d0[4], d1[4]}, [%0], %1      \n"
    "vld2.8      {d0[5], d1[5]}, [%0], %1      \n"
    "vld2.8      {d0[6], d1[6]}, [%0], %1      \n"
    "vld2.8      {d0[7], d1[7]}, [%0]          \n"

    "vst1.64     {d0}, [%2]                    \n"
    "vst1.64     {d1}, [%4]                    \n"

    "4:                                        \n"

    : "+r"(src),                 // %0
      "+r"(src_stride),          // %1
      "+r"(dst_a),               // %2
      "+r"(dst_stride_a),        // %3
      "+r"(dst_b),               // %4
      "+r"(dst_stride_b),        // %5
      "+r"(width)                // %6
    : "r"(&kVTbl4x4TransposeDi)  // %7
    : "memory", "cc", "r9",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q15"
  );
}
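
// As with the planar sketch above, a hypothetical full-plane variant: it
// splits an interleaved UV strip (e.g. an NV12 chroma plane) into separate,
// transposed U and V planes 8 rows at a time. Tail handling is again omitted.
#ifdef LIBYUV_TRANSPOSE_SKETCHES
static void TransposeUVPlane8Sketch(const uint8* src, int src_stride,
                                    uint8* dst_u, int dst_stride_u,
                                    uint8* dst_v, int dst_stride_v,
                                    int width, int height) {
  int i;
  // Each source row holds |width| interleaved UV pairs (2 * width bytes).
  for (i = 0; i + 8 <= height; i += 8) {
    TransposeUVWx8_NEON(src + i * src_stride, src_stride,
                        dst_u + i, dst_stride_u,
                        dst_v + i, dst_stride_v, width);
  }
}
#endif  // LIBYUV_TRANSPOSE_SKETCHES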

#endif  // !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif