;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%macro LF_ABS 2
        ; %1 value not preserved
        ; %2 value preserved
        ; output in %1
        movdqa      scratch1, %2            ; v2

        psubusb     scratch1, %1            ; v2 - v1
        psubusb     %1, %2                  ; v1 - v2
        por         %1, scratch1            ; abs(v2 - v1)
%endmacro

%macro LF_FILTER_HEV_MASK 8-9

        LF_ABS      %1, %2                  ; abs(p3 - p2)
        LF_ABS      %2, %3                  ; abs(p2 - p1)
        pmaxub      %1, %2                  ; accumulate mask
%if %0 == 8
        movdqa      scratch2, %3            ; save p1
        LF_ABS      scratch2, %4            ; abs(p1 - p0)
%endif
        LF_ABS      %4, %5                  ; abs(p0 - q0)
        LF_ABS      %5, %6                  ; abs(q0 - q1)
%if %0 == 8
        pmaxub      %5, scratch2            ; accumulate hev
%else
        pmaxub      %5, %9
%endif
        pmaxub      %1, %5                  ; accumulate mask

        LF_ABS      %3, %6                  ; abs(p1 - q1)
        LF_ABS      %6, %7                  ; abs(q1 - q2)
        pmaxub      %1, %6                  ; accumulate mask
        LF_ABS      %7, %8                  ; abs(q2 - q3)
        pmaxub      %1, %7                  ; accumulate mask

        paddusb     %4, %4                  ; 2 * abs(p0 - q0)
        pand        %3, [GLOBAL(tfe)]
        psrlw       %3, 1                   ; abs(p1 - q1) / 2
        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

        psubusb     %1, [limit]
        psubusb     %4, [blimit]
        por         %1, %4
        pcmpeqb     %1, zero                ; mask

        psubusb     %5, [thresh]
        pcmpeqb     %5, zero                ; ~hev
%endmacro

%macro LF_FILTER 6
        ; %1-%4: p1-q1
        ; %5: mask
        ; %6: hev

        movdqa      scratch2, %6            ; save hev

        pxor        %1, [GLOBAL(t80)]       ; ps1
        pxor        %4, [GLOBAL(t80)]       ; qs1
        movdqa      scratch1, %1
        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
        pandn       scratch2, scratch1      ; vp8_filter &= hev

        pxor        %2, [GLOBAL(t80)]       ; ps0
        pxor        %3, [GLOBAL(t80)]       ; qs0
        movdqa      scratch1, %3
        psubsb      scratch1, %2            ; qs0 - ps0
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        pand        %5, scratch2            ; &= mask

        movdqa      scratch2, %5
        paddsb      %5, [GLOBAL(t4)]        ; Filter1
        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2

        ; Filter1 >> 3
        movdqa      scratch1, zero
        pcmpgtb     scratch1, %5
        psrlw       %5, 3
        pand        scratch1, [GLOBAL(te0)]
        pand        %5, [GLOBAL(t1f)]
        por         %5, scratch1

        psubsb      %3, %5                  ; qs0 - Filter1
        pxor        %3, [GLOBAL(t80)]

        ; Filter2 >> 3
        movdqa      scratch1, zero
        pcmpgtb     scratch1, scratch2
        psrlw       scratch2, 3
        pand        scratch1, [GLOBAL(te0)]
        pand        scratch2, [GLOBAL(t1f)]
        por         scratch2, scratch1

        paddsb      %2, scratch2            ; ps0 + Filter2
        pxor        %2, [GLOBAL(t80)]

        ; outer tap adjustments
        paddsb      %5, [GLOBAL(t1)]
        movdqa      scratch1, zero
        pcmpgtb     scratch1, %5
        psrlw       %5, 1
        pand        scratch1, [GLOBAL(t80)]
        pand        %5, [GLOBAL(t7f)]
        por         %5, scratch1
        pand        %5, %6                  ; vp8_filter &= ~hev

        psubsb      %4, %5                  ; qs1 - vp8_filter
        pxor        %4, [GLOBAL(t80)]

        paddsb      %1, %5                  ; ps1 + vp8_filter
        pxor        %1, [GLOBAL(t80)]
%endmacro
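
; For reference: per byte lane, the two macros above compute roughly the
; scalar logic below. This is an illustrative C-style sketch in the spirit
; of VP8's reference loop filter, not a bit-exact restatement of this file.
;
;   mask = 0xff, cleared to 0x00 if any of
;          abs(p3-p2), abs(p2-p1), abs(p1-p0),
;          abs(q1-q0), abs(q2-q1), abs(q3-q2)   exceeds *limit, or
;          abs(p0-q0)*2 + abs(p1-q1)/2          exceeds *blimit
;   hev  = high-edge-variance flag, kept inverted (~hev): 0xff unless
;          abs(p1-p0) or abs(q1-q0) exceeds *thresh
;
;   then, after XORing pixels with 0x80 to move to signed range:
;   f       = clamp(ps1 - qs1) & hev
;   f       = clamp(f + 3 * (qs0 - ps0)) & mask
;   Filter1 = clamp(f + 4) >> 3;    qs0 -= Filter1
;   Filter2 = clamp(f + 3) >> 3;    ps0 += Filter2
;   f       = (Filter1 + 1) >> 1;   f &= ~hev
;   qs1 -= f;  ps1 += f
;
; The psrlw/pcmpgtb/te0/t1f sequences emulate an arithmetic right shift on
; signed bytes, which SSE2 does not provide directly.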
;void vp8_loop_filter_bh_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
sym(vp8_loop_filter_bh_y_sse2):

%if LIBVPX_YASM_WIN64
    %define src      rcx ; src_ptr
    %define stride   rdx ; src_pixel_step
    %define blimit   r8
    %define limit    r9
    %define thresh   r10

    %define spp      rax
    %define stride3  r11
    %define stride5  r12
    %define stride7  r13

        push        rbp
        mov         rbp, rsp
        SAVE_XMM 11
        push        r12
        push        r13
        mov         thresh, arg(4)
%else
    %define src      rdi ; src_ptr
    %define stride   rsi ; src_pixel_step
    %define blimit   rdx
    %define limit    rcx
    %define thresh   r8

    %define spp      rax
    %define stride3  r9
    %define stride5  r10
    %define stride7  r11
%endif

    %define scratch1 xmm5
    %define scratch2 xmm6
    %define zero     xmm7

    %define i0  [src]
    %define i1  [spp]
    %define i2  [src + 2 * stride]
    %define i3  [spp + 2 * stride]
    %define i4  [src + 4 * stride]
    %define i5  [spp + 4 * stride]
    %define i6  [src + 2 * stride3]
    %define i7  [spp + 2 * stride3]
    %define i8  [src + 8 * stride]
    %define i9  [spp + 8 * stride]
    %define i10 [src + 2 * stride5]
    %define i11 [spp + 2 * stride5]
    %define i12 [src + 4 * stride3]
    %define i13 [spp + 4 * stride3]
    %define i14 [src + 2 * stride7]
    %define i15 [spp + 2 * stride7]

        ; prep work
        lea         spp, [src + stride]
        lea         stride3, [stride + 2 * stride]
        lea         stride5, [stride3 + 2 * stride]
        lea         stride7, [stride3 + 4 * stride]
        pxor        zero, zero

        ; load the first set into registers
        movdqa      xmm0, i0
        movdqa      xmm1, i1
        movdqa      xmm2, i2
        movdqa      xmm3, i3
        movdqa      xmm4, i4
        movdqa      xmm8, i5
        movdqa      xmm9, i6            ; q2, will contain abs(p1-p0)
        movdqa      xmm10, i7
        LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10

        movdqa      xmm1, i2
        movdqa      xmm2, i3
        movdqa      xmm3, i4
        movdqa      xmm8, i5
        LF_FILTER   xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
        movdqa      i2, xmm1
        movdqa      i3, xmm2

; second set
        movdqa      i4, xmm3
        movdqa      i5, xmm8

        movdqa      xmm0, i6
        movdqa      xmm1, i7
        movdqa      xmm2, i8
        movdqa      xmm4, i9
        movdqa      xmm10, i10          ; q2, will contain abs(p1-p0)
        movdqa      xmm11, i11
        LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9

        movdqa      xmm0, i6
        movdqa      xmm1, i7
        movdqa      xmm4, i8
        movdqa      xmm8, i9
        LF_FILTER   xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
        movdqa      i6, xmm0
        movdqa      i7, xmm1

; last set
        movdqa      i8, xmm4
        movdqa      i9, xmm8

        movdqa      xmm0, i10
        movdqa      xmm1, i11
        movdqa      xmm2, i12
        movdqa      xmm3, i13
        movdqa      xmm9, i14           ; q2, will contain abs(p1-p0)
        movdqa      xmm11, i15
        LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10

        movdqa      xmm0, i10
        movdqa      xmm1, i11
        movdqa      xmm3, i12
        movdqa      xmm8, i13
        LF_FILTER   xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
        movdqa      i10, xmm0
        movdqa      i11, xmm1
        movdqa      i12, xmm3
        movdqa      i13, xmm8

%if LIBVPX_YASM_WIN64
        pop         r13
        pop         r12
        RESTORE_XMM
        pop         rbp
%endif

        ret
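
; The horizontal variant above filters edges that run along rows, so it can
; load whole rows directly. The vertical variant below filters edges that
; run down columns: it first transposes the 16x16 block into 256 bytes of
; stack storage (four interleave stages: punpck bytes, words, dwords,
; qwords), applies the same LF_FILTER_HEV_MASK / LF_FILTER macros to the
; transposed rows, then transposes back before writing out.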

;void vp8_loop_filter_bv_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)

global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
sym(vp8_loop_filter_bv_y_sse2):

%if LIBVPX_YASM_WIN64
    %define src      rcx ; src_ptr
    %define stride   rdx ; src_pixel_step
    %define blimit   r8
    %define limit    r9
    %define thresh   r10

    %define spp      rax
    %define stride3  r11
    %define stride5  r12
    %define stride7  r13

        push        rbp
        mov         rbp, rsp
        SAVE_XMM 15
        push        r12
        push        r13
        mov         thresh, arg(4)
%else
    %define src      rdi
    %define stride   rsi
    %define blimit   rdx
    %define limit    rcx
    %define thresh   r8

    %define spp      rax
    %define stride3  r9
    %define stride5  r10
    %define stride7  r11
%endif

    %define scratch1 xmm5
    %define scratch2 xmm6
    %define zero     xmm7

    %define s0  [src]
    %define s1  [spp]
    %define s2  [src + 2 * stride]
    %define s3  [spp + 2 * stride]
    %define s4  [src + 4 * stride]
    %define s5  [spp + 4 * stride]
    %define s6  [src + 2 * stride3]
    %define s7  [spp + 2 * stride3]
    %define s8  [src + 8 * stride]
    %define s9  [spp + 8 * stride]
    %define s10 [src + 2 * stride5]
    %define s11 [spp + 2 * stride5]
    %define s12 [src + 4 * stride3]
    %define s13 [spp + 4 * stride3]
    %define s14 [src + 2 * stride7]
    %define s15 [spp + 2 * stride7]

    %define i0  [rsp]
    %define i1  [rsp + 16]
    %define i2  [rsp + 32]
    %define i3  [rsp + 48]
    %define i4  [rsp + 64]
    %define i5  [rsp + 80]
    %define i6  [rsp + 96]
    %define i7  [rsp + 112]
    %define i8  [rsp + 128]
    %define i9  [rsp + 144]
    %define i10 [rsp + 160]
    %define i11 [rsp + 176]
    %define i12 [rsp + 192]
    %define i13 [rsp + 208]
    %define i14 [rsp + 224]
    %define i15 [rsp + 240]

        ALIGN_STACK 16, rax

        ; reserve stack space
    %define temp_storage 0 ; size is 256 (16*16)
    %define stack_size 256
        sub         rsp, stack_size

        ; prep work
        lea         spp, [src + stride]
        lea         stride3, [stride + 2 * stride]
        lea         stride5, [stride3 + 2 * stride]
        lea         stride7, [stride3 + 4 * stride]

; 8-f
        movdqa      xmm0, s8
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, s9            ; 80 90
        punpckhbw   xmm1, s9            ; 88 98

        movdqa      xmm2, s10
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, s11           ; a0 b0
        punpckhbw   xmm3, s11           ; a8 b8

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2          ; 80 90 a0 b0
        punpckhwd   xmm4, xmm2          ; 84 94 a4 b4

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3          ; 88 98 a8 b8
        punpckhwd   xmm2, xmm3          ; 8c 9c ac bc

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, s12
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, s13           ; c0 d0
        punpckhbw   xmm5, s13           ; c8 d8

        movdqa      xmm6, s14
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, s15           ; e0 f0
        punpckhbw   xmm7, s15           ; e8 f8

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6          ; c0 d0 e0 f0
        punpckhwd   xmm8, xmm6          ; c4 d4 e4 f4

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7          ; c8 d8 e8 f8
        punpckhwd   xmm6, xmm7          ; cc dc ec fc

        ; pull the third and fourth sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3          ; 80 90 a0 b0 c0 d0 e0 f0
        punpckhdq   xmm7, xmm3          ; 82 92 a2 b2 c2 d2 e2 f2

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8          ; 84 94 a4 b4 c4 d4 e4 f4
        punpckhdq   xmm3, xmm8          ; 86 96 a6 b6 c6 d6 e6 f6

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5          ; 88 98 a8 b8 c8 d8 e8 f8
        punpckhdq   xmm8, xmm5          ; 8a 9a aa ba ca da ea fa

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6          ; 8c 9c ac bc cc dc ec fc
        punpckhdq   xmm5, xmm6          ; 8e 9e ae be ce de ee fe

        ; save the calculations. we only have 15 registers ...
        movdqa      i0, xmm0
        movdqa      i1, xmm7
        movdqa      i2, xmm4
        movdqa      i3, xmm3
        movdqa      i4, xmm1
        movdqa      i5, xmm8
        movdqa      i6, xmm2
        movdqa      i7, xmm5

; 0-7
        movdqa      xmm0, s0
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, s1            ; 00 10
        punpckhbw   xmm1, s1            ; 08 18

        movdqa      xmm2, s2
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, s3            ; 20 30
        punpckhbw   xmm3, s3            ; 28 38

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2          ; 00 10 20 30
        punpckhwd   xmm4, xmm2          ; 04 14 24 34

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3          ; 08 18 28 38
        punpckhwd   xmm2, xmm3          ; 0c 1c 2c 3c

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, s4
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, s5            ; 40 50
        punpckhbw   xmm5, s5            ; 48 58

        movdqa      xmm6, s6
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, s7            ; 60 70
        punpckhbw   xmm7, s7            ; 68 78

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6          ; 40 50 60 70
        punpckhwd   xmm8, xmm6          ; 44 54 64 74

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7          ; 48 58 68 78
        punpckhwd   xmm6, xmm7          ; 4c 5c 6c 7c

        ; pull the first two sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3          ; 00 10 20 30 40 50 60 70
        punpckhdq   xmm7, xmm3          ; 02 12 22 32 42 52 62 72

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8          ; 04 14 24 34 44 54 64 74
        punpckhdq   xmm3, xmm8          ; 06 16 26 36 46 56 66 76

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5          ; 08 18 28 38 48 58 68 78
        punpckhdq   xmm8, xmm5          ; 0a 1a 2a 3a 4a 5a 6a 7a

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6          ; 0c 1c 2c 3c 4c 5c 6c 7c
        punpckhdq   xmm5, xmm6          ; 0e 1e 2e 3e 4e 5e 6e 7e

        ; final combination

        movdqa      xmm6, xmm0
        punpcklqdq  xmm0, i0
        punpckhqdq  xmm6, i0

        movdqa      xmm9, xmm7
        punpcklqdq  xmm7, i1
        punpckhqdq  xmm9, i1

        movdqa      xmm10, xmm4
        punpcklqdq  xmm4, i2
        punpckhqdq  xmm10, i2

        movdqa      xmm11, xmm3
        punpcklqdq  xmm3, i3
        punpckhqdq  xmm11, i3

        movdqa      xmm12, xmm1
        punpcklqdq  xmm1, i4
        punpckhqdq  xmm12, i4

        movdqa      xmm13, xmm8
        punpcklqdq  xmm8, i5
        punpckhqdq  xmm13, i5

        movdqa      xmm14, xmm2
        punpcklqdq  xmm2, i6
        punpckhqdq  xmm14, i6

        movdqa      xmm15, xmm5
        punpcklqdq  xmm5, i7
        punpckhqdq  xmm15, i7

        movdqa      i0, xmm0
        movdqa      i1, xmm6
        movdqa      i2, xmm7
        movdqa      i3, xmm9
        movdqa      i4, xmm4
        movdqa      i5, xmm10
        movdqa      i6, xmm3
        movdqa      i7, xmm11
        movdqa      i8, xmm1
        movdqa      i9, xmm12
        movdqa      i10, xmm8
        movdqa      i11, xmm13
        movdqa      i12, xmm2
        movdqa      i13, xmm14
        movdqa      i14, xmm5
        movdqa      i15, xmm15

; TRANSPOSED DATA AVAILABLE ON THE STACK

        movdqa      xmm12, xmm6
        movdqa      xmm13, xmm7

        pxor        zero, zero

        LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11

        movdqa      xmm1, i2
        movdqa      xmm2, i3
        movdqa      xmm8, i4
        movdqa      xmm9, i5
        LF_FILTER   xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
        movdqa      i2, xmm1
        movdqa      i3, xmm2

; second set
        movdqa      i4, xmm8
        movdqa      i5, xmm9

        movdqa      xmm0, i6
        movdqa      xmm1, i7
        movdqa      xmm2, i8
        movdqa      xmm4, i9
        movdqa      xmm10, i10          ; q2, will contain abs(p1-p0)
        movdqa      xmm11, i11
        LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3

        movdqa      xmm0, i6
        movdqa      xmm1, i7
        movdqa      xmm3, i8
        movdqa      xmm4, i9
        LF_FILTER   xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
        movdqa      i6, xmm0
        movdqa      i7, xmm1

; last set
        movdqa      i8, xmm3
        movdqa      i9, xmm4

        movdqa      xmm0, i10
        movdqa      xmm1, i11
        movdqa      xmm2, i12
        movdqa      xmm8, i13
        movdqa      xmm9, i14           ; q2, will contain abs(p1-p0)
        movdqa      xmm11, i15
        LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10

        movdqa      xmm0, i10
        movdqa      xmm1, i11
        movdqa      xmm4, i12
        movdqa      xmm8, i13
        LF_FILTER   xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
        movdqa      i10, xmm0
        movdqa      i11, xmm1
        movdqa      i12, xmm4
        movdqa      i13, xmm8
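
; All three internal edges have now been filtered in the transposed stack
; copy. The block below is the inverse of the gather transpose at the top
; of the function: it rebuilds row order from the stack slots i0-i15 and
; stores the result straight back to the frame through s0-s15.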

; RESHUFFLE AND WRITE OUT
; 8-f
        movdqa      xmm0, i8
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, i9            ; 80 90
        punpckhbw   xmm1, i9            ; 88 98

        movdqa      xmm2, i10
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, i11           ; a0 b0
        punpckhbw   xmm3, i11           ; a8 b8

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2          ; 80 90 a0 b0
        punpckhwd   xmm4, xmm2          ; 84 94 a4 b4

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3          ; 88 98 a8 b8
        punpckhwd   xmm2, xmm3          ; 8c 9c ac bc

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, i12
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, i13           ; c0 d0
        punpckhbw   xmm5, i13           ; c8 d8

        movdqa      xmm6, i14
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, i15           ; e0 f0
        punpckhbw   xmm7, i15           ; e8 f8

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6          ; c0 d0 e0 f0
        punpckhwd   xmm8, xmm6          ; c4 d4 e4 f4

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7          ; c8 d8 e8 f8
        punpckhwd   xmm6, xmm7          ; cc dc ec fc

        ; pull the third and fourth sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3          ; 80 90 a0 b0 c0 d0 e0 f0
        punpckhdq   xmm7, xmm3          ; 82 92 a2 b2 c2 d2 e2 f2

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8          ; 84 94 a4 b4 c4 d4 e4 f4
        punpckhdq   xmm3, xmm8          ; 86 96 a6 b6 c6 d6 e6 f6

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5          ; 88 98 a8 b8 c8 d8 e8 f8
        punpckhdq   xmm8, xmm5          ; 8a 9a aa ba ca da ea fa

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6          ; 8c 9c ac bc cc dc ec fc
        punpckhdq   xmm5, xmm6          ; 8e 9e ae be ce de ee fe

        ; save the calculations. we only have 15 registers ...
        movdqa      i8, xmm0
        movdqa      i9, xmm7
        movdqa      i10, xmm4
        movdqa      i11, xmm3
        movdqa      i12, xmm1
        movdqa      i13, xmm8
        movdqa      i14, xmm2
        movdqa      i15, xmm5

; 0-7
        movdqa      xmm0, i0
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, i1            ; 00 10
        punpckhbw   xmm1, i1            ; 08 18

        movdqa      xmm2, i2
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, i3            ; 20 30
        punpckhbw   xmm3, i3            ; 28 38

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2          ; 00 10 20 30
        punpckhwd   xmm4, xmm2          ; 04 14 24 34

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3          ; 08 18 28 38
        punpckhwd   xmm2, xmm3          ; 0c 1c 2c 3c

        ; using xmm[0124]
        ; work on next 4 rows

        movdqa      xmm3, i4
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, i5            ; 40 50
        punpckhbw   xmm5, i5            ; 48 58

        movdqa      xmm6, i6
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, i7            ; 60 70
        punpckhbw   xmm7, i7            ; 68 78

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6          ; 40 50 60 70
        punpckhwd   xmm8, xmm6          ; 44 54 64 74

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7          ; 48 58 68 78
        punpckhwd   xmm6, xmm7          ; 4c 5c 6c 7c

        ; pull the first two sets together

        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3          ; 00 10 20 30 40 50 60 70
        punpckhdq   xmm7, xmm3          ; 02 12 22 32 42 52 62 72

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8          ; 04 14 24 34 44 54 64 74
        punpckhdq   xmm3, xmm8          ; 06 16 26 36 46 56 66 76

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5          ; 08 18 28 38 48 58 68 78
        punpckhdq   xmm8, xmm5          ; 0a 1a 2a 3a 4a 5a 6a 7a

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6          ; 0c 1c 2c 3c 4c 5c 6c 7c
        punpckhdq   xmm5, xmm6          ; 0e 1e 2e 3e 4e 5e 6e 7e

        ; final combination

        movdqa      xmm6, xmm0
        punpcklqdq  xmm0, i8
        punpckhqdq  xmm6, i8

        movdqa      xmm9, xmm7
        punpcklqdq  xmm7, i9
        punpckhqdq  xmm9, i9

        movdqa      xmm10, xmm4
        punpcklqdq  xmm4, i10
        punpckhqdq  xmm10, i10

        movdqa      xmm11, xmm3
        punpcklqdq  xmm3, i11
        punpckhqdq  xmm11, i11

        movdqa      xmm12, xmm1
        punpcklqdq  xmm1, i12
        punpckhqdq  xmm12, i12

        movdqa      xmm13, xmm8
        punpcklqdq  xmm8, i13
        punpckhqdq  xmm13, i13

        movdqa      xmm14, xmm2
        punpcklqdq  xmm2, i14
        punpckhqdq  xmm14, i14

        movdqa      xmm15, xmm5
        punpcklqdq  xmm5, i15
        punpckhqdq  xmm15, i15

        movdqa      s0, xmm0
        movdqa      s1, xmm6
        movdqa      s2, xmm7
        movdqa      s3, xmm9
        movdqa      s4, xmm4
        movdqa      s5, xmm10
        movdqa      s6, xmm3
        movdqa      s7, xmm11
        movdqa      s8, xmm1
        movdqa      s9, xmm12
        movdqa      s10, xmm8
        movdqa      s11, xmm13
        movdqa      s12, xmm2
        movdqa      s13, xmm14
        movdqa      s14, xmm5
        movdqa      s15, xmm15

        ; free stack space
        add         rsp, stack_size

        ; un-ALIGN_STACK
        pop         rsp

%if LIBVPX_YASM_WIN64
        pop         r13
        pop         r12
        RESTORE_XMM
        pop         rbp
%endif

        ret

SECTION_RODATA
align 16
te0:                        ; sign bits for emulated arithmetic byte >> 3
    times 16 db 0xe0
align 16
t7f:                        ; low 7 bits kept after byte >> 1
    times 16 db 0x7f
align 16
tfe:                        ; clear LSB before psrlw for abs(p1 - q1) / 2
    times 16 db 0xfe
align 16
t1f:                        ; low 5 bits kept after byte >> 3
    times 16 db 0x1f
align 16
t80:                        ; sign bias; also sign bit for byte >> 1
    times 16 db 0x80
align 16
t1:                         ; rounding add for the outer tap
    times 16 db 0x01
align 16
t3:                         ; Filter2 offset
    times 16 db 0x03
align 16
t4:                         ; Filter1 offset
    times 16 db 0x04