/*
 *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
// Transposes a |width| x 8 tile of bytes: reads 8 source rows spaced
// |src_stride| apart and writes them as columns to |dst| rows spaced
// |dst_stride| apart, 8 source columns per loop iteration (the loop
// subtracts 8 from |width| each pass; callers are expected to pass a
// positive multiple of 8 -- TODO confirm against callers).
//
// The transpose is performed in three rounds of SSE interleaves:
// bytes (punpcklbw), then 16-bit words (punpcklwd), then 32-bit dwords
// (punpckldq).  "palignr $0x8,r,r" shifts the high 8 bytes of a register
// into its low half, splitting each interleave result into two rows.
// Operand map: %0 = src, %1 = dst, %2 = width, %3 = src_stride,
// %4 = dst_stride.
void TransposeWx8_SSSE3(const uint8* src,
                        int src_stride,
                        uint8* dst,
                        int dst_stride,
                        int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap: interleave bytes of row pairs
      // (0,1)->xmm0/xmm1, (2,3)->xmm2/xmm3, (4,5)->xmm4/xmm5,
      // (6,7)->xmm6/xmm7.
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "movq (%0,%3),%%xmm1 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "movq (%0),%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "palignr $0x8,%%xmm1,%%xmm1 \n"
      "movq (%0,%3),%%xmm3 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "movq (%0),%%xmm4 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "movq (%0,%3),%%xmm5 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "movq (%0),%%xmm6 \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq (%0,%3),%%xmm7 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      // Rewind src by the 8 rows just consumed (stride is negated around
      // the lea so (%0,%3,8) subtracts 8*stride) and step 8 bytes right to
      // the next 8-column tile of the same rows.
      "neg %3 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "lea 0x8(%0,%3,8),%0 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "neg %3 \n"
      // Second round of bit swap: interleave 16-bit words of the
      // first-round results.
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "palignr $0x8,%%xmm2,%%xmm2 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "movdqa %%xmm5,%%xmm7 \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      // Third round of bit swap: interleave 32-bit dwords, producing the
      // transposed rows 8 bytes at a time.
      // Write to the destination pointer.
      "punpckldq %%xmm4,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "palignr $0x8,%%xmm4,%%xmm4 \n"
      "movq %%xmm4,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "movq %%xmm2,(%1) \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movq %%xmm6,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "movq %%xmm1,(%1) \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq %%xmm5,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movq %%xmm3,(%1) \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      // sub is placed ahead of the final stores so its flags survive to jg.
      "sub $0x8,%2 \n"
      "movq %%xmm7,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "jg 1b \n"
      : "+r"(src),                      // %0
        "+r"(dst),                      // %1
        "+r"(width)                     // %2
      : "r"((intptr_t)(src_stride)),    // %3
        "r"((intptr_t)(dst_stride))     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)

// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
// Same three-round transpose as TransposeWx8_SSSE3, but loads full 16-byte
// rows (movdqu) and processes 16 source columns per iteration (the loop
// counter steps by 0x10).  The high 8 columns are carried in xmm8-xmm15,
// which is why this variant is x86-64 only.
// Operand map: %0 = src, %1 = dst, %2 = width, %3 = src_stride,
// %4 = dst_stride.
void TransposeWx8_Fast_SSSE3(const uint8* src,
                             int src_stride,
                             uint8* dst,
                             int dst_stride,
                             int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap: punpcklbw keeps the low 8 columns in
      // xmm0-xmm7, punpckhbw routes the high 8 columns into xmm8-xmm15.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu (%0,%3),%%xmm1 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm0,%%xmm8 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm8 \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm8,%%xmm9 \n"
      "palignr $0x8,%%xmm1,%%xmm1 \n"
      "palignr $0x8,%%xmm9,%%xmm9 \n"
      "movdqu (%0,%3),%%xmm3 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm2,%%xmm10 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "punpckhbw %%xmm3,%%xmm10 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "movdqa %%xmm10,%%xmm11 \n"
      "movdqu (%0),%%xmm4 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "palignr $0x8,%%xmm11,%%xmm11 \n"
      "movdqu (%0,%3),%%xmm5 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm4,%%xmm12 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "punpckhbw %%xmm5,%%xmm12 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "movdqa %%xmm12,%%xmm13 \n"
      "movdqu (%0),%%xmm6 \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "palignr $0x8,%%xmm13,%%xmm13 \n"
      "movdqu (%0,%3),%%xmm7 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm6,%%xmm14 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      "punpckhbw %%xmm7,%%xmm14 \n"
      // Rewind src 8 rows (stride negated around the lea) and advance 16
      // bytes to the next 16-column tile.
      "neg %3 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "movdqa %%xmm14,%%xmm15 \n"
      "lea 0x10(%0,%3,8),%0 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      "neg %3 \n"
      // Second round of bit swap: 16-bit word interleave, low and high
      // column groups in parallel.
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "palignr $0x8,%%xmm2,%%xmm2 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "movdqa %%xmm5,%%xmm7 \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "punpcklwd %%xmm10,%%xmm8 \n"
      "punpcklwd %%xmm11,%%xmm9 \n"
      "movdqa %%xmm8,%%xmm10 \n"
      "movdqa %%xmm9,%%xmm11 \n"
      "palignr $0x8,%%xmm10,%%xmm10 \n"
      "palignr $0x8,%%xmm11,%%xmm11 \n"
      "punpcklwd %%xmm14,%%xmm12 \n"
      "punpcklwd %%xmm15,%%xmm13 \n"
      "movdqa %%xmm12,%%xmm14 \n"
      "movdqa %%xmm13,%%xmm15 \n"
      "palignr $0x8,%%xmm14,%%xmm14 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      // Third round of bit swap: dword interleave, then store 16 transposed
      // rows of 8 bytes each (low columns first, then high columns).
      // Write to the destination pointer.
      "punpckldq %%xmm4,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "palignr $0x8,%%xmm4,%%xmm4 \n"
      "movq %%xmm4,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "movq %%xmm2,(%1) \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movq %%xmm6,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "movq %%xmm1,(%1) \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq %%xmm5,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movq %%xmm3,(%1) \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "movq %%xmm7,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm12,%%xmm8 \n"
      "movq %%xmm8,(%1) \n"
      "movdqa %%xmm8,%%xmm12 \n"
      "palignr $0x8,%%xmm12,%%xmm12 \n"
      "movq %%xmm12,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm14,%%xmm10 \n"
      "movdqa %%xmm10,%%xmm14 \n"
      "movq %%xmm10,(%1) \n"
      "palignr $0x8,%%xmm14,%%xmm14 \n"
      "punpckldq %%xmm13,%%xmm9 \n"
      "movq %%xmm14,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm9,%%xmm13 \n"
      "movq %%xmm9,(%1) \n"
      "palignr $0x8,%%xmm13,%%xmm13 \n"
      "movq %%xmm13,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm15,%%xmm11 \n"
      "movq %%xmm11,(%1) \n"
      "movdqa %%xmm11,%%xmm15 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      // sub placed before the final stores so its flags survive to jg.
      "sub $0x10,%2 \n"
      "movq %%xmm15,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "jg 1b \n"
      : "+r"(src),                      // %0
        "+r"(dst),                      // %1
        "+r"(width)                     // %2
      : "r"((intptr_t)(src_stride)),    // %3
        "r"((intptr_t)(dst_stride))     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
        "xmm14", "xmm15");
}
#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)

// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
// Transposes a |width| x 8 tile of interleaved UV pairs, splitting the
// result into separate U and V planes: each 16-byte source row holds 8
// UV pairs; after the three interleave rounds, movlpd writes the low
// 8 bytes of each result row to |dst_a| (U channel, per the original
// comments) and movhpd writes the high 8 bytes to |dst_b| (V channel).
// SSE2 only (no palignr), so the second round uses punpckhwd/punpcklwd
// with xmm8/xmm9 as scratch instead of palignr splits.
// Operand map: %0 = src, %1 = dst_a, %2 = dst_b, %3 = width,
// %4 = src_stride, %5 = dst_stride_a, %6 = dst_stride_b.
void TransposeUVWx8_SSE2(const uint8* src,
                         int src_stride,
                         uint8* dst_a,
                         int dst_stride_a,
                         uint8* dst_b,
                         int dst_stride_b,
                         int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap: byte-interleave row pairs; punpckhbw
      // results are staged through xmm8 into the odd-numbered registers.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu (%0,%4),%%xmm1 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm0,%%xmm8 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm1 \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqu (%0,%4),%%xmm3 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm2,%%xmm8 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "punpckhbw %%xmm3,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm3 \n"
      "movdqu (%0),%%xmm4 \n"
      "movdqu (%0,%4),%%xmm5 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm4,%%xmm8 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "punpckhbw %%xmm5,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm5 \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu (%0,%4),%%xmm7 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm6,%%xmm8 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      // Rewind src 8 rows (stride negated around the lea) and advance 16
      // bytes to the next tile of 8 UV pairs.
      "neg %4 \n"
      "lea 0x10(%0,%4,8),%0 \n"
      "punpckhbw %%xmm7,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm7 \n"
      "neg %4 \n"
      // Second round of bit swap: word interleave via punpckh/punpcklwd
      // with xmm8/xmm9 scratch (SSE2 has no palignr).
      "movdqa %%xmm0,%%xmm8 \n"
      "movdqa %%xmm1,%%xmm9 \n"
      "punpckhwd %%xmm2,%%xmm8 \n"
      "punpckhwd %%xmm3,%%xmm9 \n"
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm8,%%xmm2 \n"
      "movdqa %%xmm9,%%xmm3 \n"
      "movdqa %%xmm4,%%xmm8 \n"
      "movdqa %%xmm5,%%xmm9 \n"
      "punpckhwd %%xmm6,%%xmm8 \n"
      "punpckhwd %%xmm7,%%xmm9 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm8,%%xmm6 \n"
      "movdqa %%xmm9,%%xmm7 \n"
      // Third round of bit swap: dword interleave; each result register
      // carries one U row in its low half and one V row in its high half.
      // Write to the destination pointer.
      "movdqa %%xmm0,%%xmm8 \n"
      "punpckldq %%xmm4,%%xmm0 \n"
      "movlpd %%xmm0,(%1) \n"  // Write back U channel
      "movhpd %%xmm0,(%2) \n"  // Write back V channel
      "punpckhdq %%xmm4,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm2,%%xmm8 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movlpd %%xmm2,(%1) \n"
      "movhpd %%xmm2,(%2) \n"
      "punpckhdq %%xmm6,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm1,%%xmm8 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movlpd %%xmm1,(%1) \n"
      "movhpd %%xmm1,(%2) \n"
      "punpckhdq %%xmm5,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm3,%%xmm8 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movlpd %%xmm3,(%1) \n"
      "movhpd %%xmm3,(%2) \n"
      "punpckhdq %%xmm7,%%xmm8 \n"
      // sub placed before the final stores so its flags survive to jg.
      "sub $0x8,%3 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "jg 1b \n"
      : "+r"(src),                        // %0
        "+r"(dst_a),                      // %1
        "+r"(dst_b),                      // %2
        "+r"(width)                       // %3
      : "r"((intptr_t)(src_stride)),      // %4
        "r"((intptr_t)(dst_stride_a)),    // %5
        "r"((intptr_t)(dst_stride_b))     // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7", "xmm8", "xmm9");
}
#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif