1 ; 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 ; 4 ; Use of this source code is governed by a BSD-style license 5 ; that can be found in the LICENSE file in the root of the source 6 ; tree. An additional intellectual property rights grant can be found 7 ; in the file PATENTS. All contributing project authors may 8 ; be found in the AUTHORS file in the root of the source tree. 9 ; 10 11 12 %include "vpx_ports/x86_abi_support.asm" 13 14 15 %macro LFH_FILTER_MASK 1 16 %if %1 17 movdqa xmm2, [rdi+2*rax] ; q3 18 movdqa xmm1, [rsi+2*rax] ; q2 19 %else 20 movq xmm0, [rsi + rcx*2] ; q3 21 movq xmm2, [rdi + rcx*2] 22 pslldq xmm2, 8 23 por xmm2, xmm0 24 movq xmm1, [rsi + rcx] ; q2 25 movq xmm3, [rdi + rcx] 26 pslldq xmm3, 8 27 por xmm1, xmm3 28 movdqa XMMWORD PTR [rsp], xmm1 ; store q2 29 %endif 30 31 movdqa xmm6, xmm1 ; q2 32 psubusb xmm1, xmm2 ; q2-=q3 33 psubusb xmm2, xmm6 ; q3-=q2 34 por xmm1, xmm2 ; abs(q3-q2) 35 36 psubusb xmm1, xmm7 37 38 %if %1 39 movdqa xmm4, [rsi+rax] ; q1 40 %else 41 movq xmm0, [rsi] ; q1 42 movq xmm4, [rdi] 43 pslldq xmm4, 8 44 por xmm4, xmm0 45 movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1 46 %endif 47 48 movdqa xmm3, xmm4 ; q1 49 psubusb xmm4, xmm6 ; q1-=q2 50 psubusb xmm6, xmm3 ; q2-=q1 51 por xmm4, xmm6 ; abs(q2-q1) 52 psubusb xmm4, xmm7 53 54 por xmm1, xmm4 55 56 %if %1 57 movdqa xmm4, [rsi] ; q0 58 %else 59 movq xmm4, [rsi + rax] ; q0 60 movq xmm0, [rdi + rax] 61 pslldq xmm0, 8 62 por xmm4, xmm0 63 %endif 64 65 movdqa xmm0, xmm4 ; q0 66 psubusb xmm4, xmm3 ; q0-=q1 67 psubusb xmm3, xmm0 ; q1-=q0 68 por xmm4, xmm3 ; abs(q0-q1) 69 movdqa t0, xmm4 ; save to t0 70 71 psubusb xmm4, xmm7 72 por xmm1, xmm4 73 74 %if %1 75 neg rax ; negate pitch to deal with above border 76 77 movdqa xmm2, [rsi+4*rax] ; p3 78 movdqa xmm4, [rdi+4*rax] ; p2 79 %else 80 lea rsi, [rsi + rax*4] 81 lea rdi, [rdi + rax*4] 82 83 movq xmm2, [rsi + rax] ; p3 84 movq xmm3, [rdi + rax] 85 pslldq xmm3, 8 86 por xmm2, xmm3 87 movq xmm4, [rsi] ; p2 88 
movq xmm5, [rdi] 89 pslldq xmm5, 8 90 por xmm4, xmm5 91 movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2 92 %endif 93 94 movdqa xmm5, xmm4 ; p2 95 psubusb xmm4, xmm2 ; p2-=p3 96 psubusb xmm2, xmm5 ; p3-=p2 97 por xmm4, xmm2 ; abs(p3 - p2) 98 99 psubusb xmm4, xmm7 100 por xmm1, xmm4 101 102 %if %1 103 movdqa xmm4, [rsi+2*rax] ; p1 104 %else 105 movq xmm4, [rsi + rcx] ; p1 106 movq xmm3, [rdi + rcx] 107 pslldq xmm3, 8 108 por xmm4, xmm3 109 movdqa XMMWORD PTR [rsp + 48], xmm4 ; store p1 110 %endif 111 112 movdqa xmm3, xmm4 ; p1 113 psubusb xmm4, xmm5 ; p1-=p2 114 psubusb xmm5, xmm3 ; p2-=p1 115 por xmm4, xmm5 ; abs(p2 - p1) 116 psubusb xmm4, xmm7 117 118 por xmm1, xmm4 119 movdqa xmm2, xmm3 ; p1 120 121 %if %1 122 movdqa xmm4, [rsi+rax] ; p0 123 %else 124 movq xmm4, [rsi + rcx*2] ; p0 125 movq xmm5, [rdi + rcx*2] 126 pslldq xmm5, 8 127 por xmm4, xmm5 128 %endif 129 130 movdqa xmm5, xmm4 ; p0 131 psubusb xmm4, xmm3 ; p0-=p1 132 psubusb xmm3, xmm5 ; p1-=p0 133 por xmm4, xmm3 ; abs(p1 - p0) 134 movdqa t1, xmm4 ; save to t1 135 136 psubusb xmm4, xmm7 137 por xmm1, xmm4 138 139 %if %1 140 movdqa xmm3, [rdi] ; q1 141 %else 142 movdqa xmm3, q1 ; q1 143 %endif 144 145 movdqa xmm4, xmm3 ; q1 146 psubusb xmm3, xmm2 ; q1-=p1 147 psubusb xmm2, xmm4 ; p1-=q1 148 por xmm2, xmm3 ; abs(p1-q1) 149 pand xmm2, [tfe GLOBAL] ; set lsb of each byte to zero 150 psrlw xmm2, 1 ; abs(p1-q1)/2 151 152 movdqa xmm6, xmm5 ; p0 153 movdqa xmm3, xmm0 ; q0 154 psubusb xmm5, xmm3 ; p0-=q0 155 psubusb xmm3, xmm6 ; q0-=p0 156 por xmm5, xmm3 ; abs(p0 - q0) 157 paddusb xmm5, xmm5 ; abs(p0-q0)*2 158 paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 159 160 mov rdx, arg(2) ; get flimit 161 movdqa xmm2, XMMWORD PTR [rdx] 162 paddb xmm2, xmm2 ; flimit*2 (less than 255) 163 paddb xmm7, xmm2 ; flimit * 2 + limit (less than 255) 164 165 psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 166 por xmm1, xmm5 167 pxor xmm5, xmm5 168 pcmpeqb xmm1, xmm5 ; mask mm1 169 %endmacro 170 171 %macro 
LFH_HEV_MASK 0 172 mov rdx, arg(4) ; get thresh 173 movdqa xmm7, XMMWORD PTR [rdx] 174 175 movdqa xmm4, t0 ; get abs (q1 - q0) 176 psubusb xmm4, xmm7 177 movdqa xmm3, t1 ; get abs (p1 - p0) 178 psubusb xmm3, xmm7 179 paddb xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 180 pcmpeqb xmm4, xmm5 181 182 pcmpeqb xmm5, xmm5 183 pxor xmm4, xmm5 184 %endmacro 185 186 %macro BH_FILTER 1 187 %if %1 188 movdqa xmm2, [rsi+2*rax] ; p1 189 movdqa xmm7, [rdi] ; q1 190 %else 191 movdqa xmm2, p1 ; p1 192 movdqa xmm7, q1 ; q1 193 %endif 194 195 pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values 196 pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values 197 198 psubsb xmm2, xmm7 ; p1 - q1 199 pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values 200 201 pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) 202 pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values 203 204 movdqa xmm3, xmm0 ; q0 205 psubsb xmm0, xmm6 ; q0 - p0 206 paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) 207 paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) 208 paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) 209 pand xmm1, xmm2 ; mask filter values we don't care about 210 movdqa xmm2, xmm1 211 paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 212 paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 213 214 punpckhbw xmm5, xmm2 ; axbxcxdx 215 punpcklbw xmm2, xmm2 ; exfxgxhx 216 217 psraw xmm5, 11 ; sign extended shift right by 3 218 psraw xmm2, 11 ; sign extended shift right by 3 219 packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; 220 221 punpcklbw xmm0, xmm1 ; exfxgxhx 222 punpckhbw xmm1, xmm1 ; axbxcxdx 223 224 psraw xmm0, 11 ; sign extended shift right by 3 225 psraw xmm1, 11 ; sign extended shift right by 3 226 227 movdqa xmm5, xmm0 ; save results 228 packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 229 230 paddsw xmm5, [ones GLOBAL] 231 paddsw xmm1, [ones GLOBAL] 232 233 psraw xmm5, 1 ; partial shifted one more time for 2nd 
tap 234 psraw xmm1, 1 ; partial shifted one more time for 2nd tap 235 236 packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 237 pandn xmm4, xmm5 ; high edge variance additive 238 %endmacro 239 240 %macro BH_WRITEBACK 1 241 paddsb xmm6, xmm2 ; p0+= p0 add 242 pxor xmm6, [t80 GLOBAL] ; unoffset 243 %if %1 244 movdqa [rsi+rax], xmm6 ; write back 245 %else 246 lea rsi, [rsi + rcx*2] 247 lea rdi, [rdi + rcx*2] 248 movq MMWORD PTR [rsi], xmm6 ; p0 249 psrldq xmm6, 8 250 movq MMWORD PTR [rdi], xmm6 251 %endif 252 253 %if %1 254 movdqa xmm6, [rsi+2*rax] ; p1 255 %else 256 movdqa xmm6, p1 ; p1 257 %endif 258 pxor xmm6, [t80 GLOBAL] ; reoffset 259 paddsb xmm6, xmm4 ; p1+= p1 add 260 pxor xmm6, [t80 GLOBAL] ; unoffset 261 %if %1 262 movdqa [rsi+2*rax], xmm6 ; write back 263 %else 264 movq MMWORD PTR [rsi + rax], xmm6 ; p1 265 psrldq xmm6, 8 266 movq MMWORD PTR [rdi + rax], xmm6 267 %endif 268 269 psubsb xmm3, xmm0 ; q0-= q0 add 270 pxor xmm3, [t80 GLOBAL] ; unoffset 271 %if %1 272 movdqa [rsi], xmm3 ; write back 273 %else 274 movq MMWORD PTR [rsi + rcx], xmm3 ; q0 275 psrldq xmm3, 8 276 movq MMWORD PTR [rdi + rcx], xmm3 277 %endif 278 279 psubsb xmm7, xmm4 ; q1-= q1 add 280 pxor xmm7, [t80 GLOBAL] ; unoffset 281 %if %1 282 movdqa [rdi], xmm7 ; write back 283 %else 284 movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1 285 psrldq xmm7, 8 286 movq MMWORD PTR [rdi + rcx*2],xmm7 287 %endif 288 %endmacro 289 290 291 ;void vp8_loop_filter_horizontal_edge_sse2 292 ;( 293 ; unsigned char *src_ptr, 294 ; int src_pixel_step, 295 ; const char *flimit, 296 ; const char *limit, 297 ; const char *thresh, 298 ; int count 299 ;) 300 global sym(vp8_loop_filter_horizontal_edge_sse2) 301 sym(vp8_loop_filter_horizontal_edge_sse2): 302 push rbp 303 mov rbp, rsp 304 SHADOW_ARGS_TO_STACK 6 305 SAVE_XMM 306 GET_GOT rbx 307 push rsi 308 push rdi 309 ; end prolog 310 311 ALIGN_STACK 16, rax 312 sub rsp, 32 ; reserve 32 bytes 313 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 314 %define t1 
[rsp + 16] ;__declspec(align(16)) char t1[16]; 315 316 mov rsi, arg(0) ;src_ptr 317 movsxd rax, dword ptr arg(1) ;src_pixel_step 318 319 mov rdx, arg(3) ;limit 320 movdqa xmm7, XMMWORD PTR [rdx] 321 322 lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing 323 324 ; calculate breakout conditions 325 LFH_FILTER_MASK 1 326 327 ; calculate high edge variance 328 LFH_HEV_MASK 329 330 ; start work on filters 331 BH_FILTER 1 332 ; write back the result 333 BH_WRITEBACK 1 334 335 add rsp, 32 336 pop rsp 337 ; begin epilog 338 pop rdi 339 pop rsi 340 RESTORE_GOT 341 RESTORE_XMM 342 UNSHADOW_ARGS 343 pop rbp 344 ret 345 346 347 ;void vp8_loop_filter_horizontal_edge_uv_sse2 348 ;( 349 ; unsigned char *src_ptr, 350 ; int src_pixel_step, 351 ; const char *flimit, 352 ; const char *limit, 353 ; const char *thresh, 354 ; int count 355 ;) 356 global sym(vp8_loop_filter_horizontal_edge_uv_sse2) 357 sym(vp8_loop_filter_horizontal_edge_uv_sse2): 358 push rbp 359 mov rbp, rsp 360 SHADOW_ARGS_TO_STACK 6 361 SAVE_XMM 362 GET_GOT rbx 363 push rsi 364 push rdi 365 ; end prolog 366 367 ALIGN_STACK 16, rax 368 sub rsp, 96 ; reserve 96 bytes 369 %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16]; 370 %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16]; 371 %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16]; 372 %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16]; 373 %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16]; 374 %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16]; 375 376 mov rsi, arg(0) ; u 377 mov rdi, arg(5) ; v 378 movsxd rax, dword ptr arg(1) ; src_pixel_step 379 mov rcx, rax 380 neg rax ; negate pitch to deal with above border 381 382 mov rdx, arg(3) ;limit 383 movdqa xmm7, XMMWORD PTR [rdx] 384 385 lea rsi, [rsi + rcx] 386 lea rdi, [rdi + rcx] 387 388 ; calculate breakout conditions 389 LFH_FILTER_MASK 0 390 ; calculate high edge variance 391 LFH_HEV_MASK 392 393 ; start work on filters 394 BH_FILTER 0 395 ; write back the result 
396 BH_WRITEBACK 0 397 398 add rsp, 96 399 pop rsp 400 ; begin epilog 401 pop rdi 402 pop rsi 403 RESTORE_GOT 404 RESTORE_XMM 405 UNSHADOW_ARGS 406 pop rbp 407 ret 408 409 410 %macro MBH_FILTER 1 411 %if %1 412 movdqa xmm2, [rsi+2*rax] ; p1 413 movdqa xmm7, [rdi] ; q1 414 %else 415 movdqa xmm2, p1 ; p1 416 movdqa xmm7, q1 ; q1 417 %endif 418 pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values 419 pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values 420 421 psubsb xmm2, xmm7 ; p1 - q1 422 pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values 423 pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values 424 movdqa xmm3, xmm0 ; q0 425 psubsb xmm0, xmm6 ; q0 - p0 426 paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) 427 paddsb xmm2, xmm0 ; 2 * (q0 - p0) 428 paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) 429 430 pand xmm1, xmm2 ; mask filter values we don't care about 431 movdqa xmm2, xmm1 ; vp8_filter 432 pand xmm2, xmm4; ; Filter2 = vp8_filter & hev 433 434 movdqa xmm5, xmm2 435 paddsb xmm5, [t3 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 3) 436 437 punpckhbw xmm7, xmm5 ; axbxcxdx 438 punpcklbw xmm5, xmm5 ; exfxgxhx 439 440 psraw xmm7, 11 ; sign extended shift right by 3 441 psraw xmm5, 11 ; sign extended shift right by 3 442 443 packsswb xmm5, xmm7 ; Filter2 >>=3; 444 paddsb xmm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4) 445 446 punpckhbw xmm7, xmm2 ; axbxcxdx 447 punpcklbw xmm0, xmm2 ; exfxgxhx 448 449 psraw xmm7, 11 ; sign extended shift right by 3 450 psraw xmm0, 11 ; sign extended shift right by 3 451 452 packsswb xmm0, xmm7 ; Filter2 >>=3; 453 paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 454 455 psubsb xmm3, xmm0 ; qs0 =qs0 - filter1 456 pandn xmm4, xmm1 ; vp8_filter&=~hev 457 %endmacro 458 459 %macro MBH_WRITEBACK 1 460 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); 461 ; s = vp8_signed_char_clamp(qs0 - u); 462 ; *oq0 = s^0x80; 463 ; s = vp8_signed_char_clamp(ps0 + u); 464 ; *op0 = s^0x80; 465 pxor xmm1, xmm1 466 467 pxor xmm2, 
xmm2 468 punpcklbw xmm1, xmm4 469 470 punpckhbw xmm2, xmm4 471 pmulhw xmm1, [s27 GLOBAL] 472 473 pmulhw xmm2, [s27 GLOBAL] 474 paddw xmm1, [s63 GLOBAL] 475 476 paddw xmm2, [s63 GLOBAL] 477 psraw xmm1, 7 478 479 psraw xmm2, 7 480 packsswb xmm1, xmm2 481 482 psubsb xmm3, xmm1 483 paddsb xmm6, xmm1 484 485 pxor xmm3, [t80 GLOBAL] 486 pxor xmm6, [t80 GLOBAL] 487 488 %if %1 489 movdqa XMMWORD PTR [rsi+rax], xmm6 490 movdqa XMMWORD PTR [rsi], xmm3 491 %else 492 lea rsi, [rsi + rcx*2] 493 lea rdi, [rdi + rcx*2] 494 495 movq MMWORD PTR [rsi], xmm6 ; p0 496 psrldq xmm6, 8 497 movq MMWORD PTR [rdi], xmm6 498 movq MMWORD PTR [rsi + rcx], xmm3 ; q0 499 psrldq xmm3, 8 500 movq MMWORD PTR [rdi + rcx], xmm3 501 %endif 502 503 ; roughly 2/7th difference across boundary 504 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); 505 ; s = vp8_signed_char_clamp(qs1 - u); 506 ; *oq1 = s^0x80; 507 ; s = vp8_signed_char_clamp(ps1 + u); 508 ; *op1 = s^0x80; 509 pxor xmm1, xmm1 510 pxor xmm2, xmm2 511 512 punpcklbw xmm1, xmm4 513 punpckhbw xmm2, xmm4 514 515 pmulhw xmm1, [s18 GLOBAL] 516 pmulhw xmm2, [s18 GLOBAL] 517 518 paddw xmm1, [s63 GLOBAL] 519 paddw xmm2, [s63 GLOBAL] 520 521 psraw xmm1, 7 522 psraw xmm2, 7 523 524 packsswb xmm1, xmm2 525 526 %if %1 527 movdqa xmm3, XMMWORD PTR [rdi] 528 movdqa xmm6, XMMWORD PTR [rsi+rax*2] ; p1 529 %else 530 movdqa xmm3, q1 ; q1 531 movdqa xmm6, p1 ; p1 532 %endif 533 534 pxor xmm3, [t80 GLOBAL] 535 pxor xmm6, [t80 GLOBAL] 536 537 paddsb xmm6, xmm1 538 psubsb xmm3, xmm1 539 540 pxor xmm6, [t80 GLOBAL] 541 pxor xmm3, [t80 GLOBAL] 542 543 %if %1 544 movdqa XMMWORD PTR [rdi], xmm3 545 movdqa XMMWORD PTR [rsi+rax*2],xmm6 546 %else 547 movq MMWORD PTR [rsi + rcx*2],xmm3 ; q1 548 psrldq xmm3, 8 549 movq MMWORD PTR [rdi + rcx*2],xmm3 550 551 movq MMWORD PTR [rsi + rax], xmm6 ; p1 552 psrldq xmm6, 8 553 movq MMWORD PTR [rdi + rax], xmm6 554 %endif 555 ; roughly 1/7th difference across boundary 556 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); 557 ; s = 
vp8_signed_char_clamp(qs2 - u); 558 ; *oq2 = s^0x80; 559 ; s = vp8_signed_char_clamp(ps2 + u); 560 ; *op2 = s^0x80; 561 pxor xmm1, xmm1 562 pxor xmm2, xmm2 563 564 punpcklbw xmm1, xmm4 565 punpckhbw xmm2, xmm4 566 567 pmulhw xmm1, [s9 GLOBAL] 568 pmulhw xmm2, [s9 GLOBAL] 569 570 paddw xmm1, [s63 GLOBAL] 571 paddw xmm2, [s63 GLOBAL] 572 573 psraw xmm1, 7 574 psraw xmm2, 7 575 576 packsswb xmm1, xmm2 577 578 %if %1 579 movdqa xmm6, XMMWORD PTR [rdi+rax*4] 580 neg rax 581 582 movdqa xmm3, XMMWORD PTR [rdi+rax] 583 %else 584 movdqa xmm6, p2 ; p2 585 movdqa xmm3, q2 ; q2 586 %endif 587 588 pxor xmm6, [t80 GLOBAL] 589 pxor xmm3, [t80 GLOBAL] 590 591 paddsb xmm6, xmm1 592 psubsb xmm3, xmm1 593 594 pxor xmm6, [t80 GLOBAL] 595 pxor xmm3, [t80 GLOBAL] 596 %if %1 597 movdqa XMMWORD PTR [rdi+rax ],xmm3 598 neg rax 599 600 movdqa XMMWORD PTR [rdi+rax*4],xmm6 601 %else 602 movq MMWORD PTR [rsi+rax*2], xmm6 ; p2 603 psrldq xmm6, 8 604 movq MMWORD PTR [rdi+rax*2], xmm6 605 606 lea rsi, [rsi + rcx] 607 lea rdi, [rdi + rcx] 608 movq MMWORD PTR [rsi+rcx*2 ],xmm3 ; q2 609 psrldq xmm3, 8 610 movq MMWORD PTR [rdi+rcx*2 ],xmm3 611 %endif 612 %endmacro 613 614 615 ;void vp8_mbloop_filter_horizontal_edge_sse2 616 ;( 617 ; unsigned char *src_ptr, 618 ; int src_pixel_step, 619 ; const char *flimit, 620 ; const char *limit, 621 ; const char *thresh, 622 ; int count 623 ;) 624 global sym(vp8_mbloop_filter_horizontal_edge_sse2) 625 sym(vp8_mbloop_filter_horizontal_edge_sse2): 626 push rbp 627 mov rbp, rsp 628 SHADOW_ARGS_TO_STACK 6 629 SAVE_XMM 630 GET_GOT rbx 631 push rsi 632 push rdi 633 ; end prolog 634 635 ALIGN_STACK 16, rax 636 sub rsp, 32 ; reserve 32 bytes 637 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 638 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; 639 640 mov rsi, arg(0) ;src_ptr 641 movsxd rax, dword ptr arg(1) ;src_pixel_step 642 643 mov rdx, arg(3) ;limit 644 movdqa xmm7, XMMWORD PTR [rdx] 645 646 lea rdi, [rsi+rax] ; rdi points to row +1 for indirect 
addressing 647 648 ; calculate breakout conditions 649 LFH_FILTER_MASK 1 650 651 ; calculate high edge variance 652 LFH_HEV_MASK 653 654 ; start work on filters 655 MBH_FILTER 1 656 ; write back the result 657 MBH_WRITEBACK 1 658 659 add rsp, 32 660 pop rsp 661 ; begin epilog 662 pop rdi 663 pop rsi 664 RESTORE_GOT 665 RESTORE_XMM 666 UNSHADOW_ARGS 667 pop rbp 668 ret 669 670 671 ;void vp8_mbloop_filter_horizontal_edge_uv_sse2 672 ;( 673 ; unsigned char *u, 674 ; int src_pixel_step, 675 ; const char *flimit, 676 ; const char *limit, 677 ; const char *thresh, 678 ; unsigned char *v 679 ;) 680 global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) 681 sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): 682 push rbp 683 mov rbp, rsp 684 SHADOW_ARGS_TO_STACK 6 685 SAVE_XMM 686 GET_GOT rbx 687 push rsi 688 push rdi 689 ; end prolog 690 691 ALIGN_STACK 16, rax 692 sub rsp, 96 ; reserve 96 bytes 693 %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16]; 694 %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16]; 695 %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16]; 696 %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16]; 697 %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16]; 698 %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16]; 699 700 mov rsi, arg(0) ; u 701 mov rdi, arg(5) ; v 702 movsxd rax, dword ptr arg(1) ; src_pixel_step 703 mov rcx, rax 704 neg rax ; negate pitch to deal with above border 705 706 mov rdx, arg(3) ;limit 707 movdqa xmm7, XMMWORD PTR [rdx] 708 709 lea rsi, [rsi + rcx] 710 lea rdi, [rdi + rcx] 711 712 ; calculate breakout conditions 713 LFH_FILTER_MASK 0 714 715 ; calculate high edge variance 716 LFH_HEV_MASK 717 718 ; start work on filters 719 MBH_FILTER 0 720 ; write back the result 721 MBH_WRITEBACK 0 722 723 add rsp, 96 724 pop rsp 725 ; begin epilog 726 pop rdi 727 pop rsi 728 RESTORE_GOT 729 RESTORE_XMM 730 UNSHADOW_ARGS 731 pop rbp 732 ret 733 734 735 %macro TRANSPOSE_16X8_1 0 736 movq xmm4, QWORD PTR [rsi] ; xx xx xx 
xx xx xx xx xx 07 06 05 04 03 02 01 00 737 movq xmm7, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 738 739 punpcklbw xmm4, xmm7 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 740 movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 741 742 movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 743 744 movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 745 punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 746 747 movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 748 movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 749 750 punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 751 movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 752 753 movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 754 movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 755 756 punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 757 punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 758 759 punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 760 761 punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 762 punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 763 764 movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 765 movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 766 767 punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 768 punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 769 770 punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 771 punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 772 773 movdqa t0, xmm2 ; save to free XMM2 774 %endmacro 775 776 %macro TRANSPOSE_16X8_2 1 777 movq xmm2, QWORD PTR 
[rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 778 movq xmm5, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 779 780 punpcklbw xmm2, xmm5 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 781 movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 782 783 movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 784 punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 785 786 movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 787 movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 788 789 punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 790 movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 791 792 movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 793 punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 794 795 movdqa xmm6, xmm1 ; 796 punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 797 798 punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 799 movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 800 801 punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 802 punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 803 804 movdqa xmm0, xmm5 805 punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 806 807 808 punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 809 movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 810 811 punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 812 punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 813 814 movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 815 punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 816 817 punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 
97 87 77 67 57 47 37 27 17 07 818 %if %1 819 movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 820 821 punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 822 823 punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 824 movdqa [rdx], xmm2 ; save 2 825 826 movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 827 punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 828 829 movdqa [rdx+16], xmm3 ; save 3 830 punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 831 832 movdqa [rdx+32], xmm4 ; save 4 833 movdqa [rdx+48], xmm5 ; save 5 834 835 movdqa xmm1, t0 ; get 836 movdqa xmm2, xmm1 ; 837 838 punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 839 punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 840 %else 841 movdqa [rdx+112], xmm7 ; save 7 842 movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 843 844 movdqa [rdx+96], xmm6 ; save 6 845 punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 846 847 punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 848 movdqa [rdx+32], xmm2 ; save 2 849 850 movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 851 punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 852 853 movdqa [rdx+48], xmm3 ; save 3 854 punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 855 856 movdqa [rdx+64], xmm4 ; save 4 857 movdqa [rdx+80], xmm5 ; save 5 858 859 movdqa xmm1, t0 ; get 860 movdqa xmm2, xmm1 861 862 punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 863 punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 864 865 movdqa [rdx+16], xmm1 866 movdqa [rdx], xmm2 867 %endif 868 %endmacro 869 870 %macro LFV_FILTER_MASK 1 871 movdqa xmm0, xmm6 ; q2 872 psubusb xmm0, xmm7 ; q2-q3 873 874 psubusb xmm7, xmm6 ; q3-q2 875 por xmm7, xmm0 ; abs (q3-q2) 876 877 
movdqa xmm4, xmm5 ; q1 878 psubusb xmm4, xmm6 ; q1-q2 879 880 psubusb xmm6, xmm5 ; q2-q1 881 por xmm6, xmm4 ; abs (q2-q1) 882 883 movdqa xmm0, xmm1 884 885 psubusb xmm0, xmm2 ; p2 - p3; 886 psubusb xmm2, xmm1 ; p3 - p2; 887 888 por xmm0, xmm2 ; abs(p2-p3) 889 %if %1 890 movdqa xmm2, [rdx] ; p1 891 %else 892 movdqa xmm2, [rdx+32] ; p1 893 %endif 894 movdqa xmm5, xmm2 ; p1 895 896 psubusb xmm5, xmm1 ; p1-p2 897 psubusb xmm1, xmm2 ; p2-p1 898 899 por xmm1, xmm5 ; abs(p2-p1) 900 901 mov rdx, arg(3) ; limit 902 movdqa xmm4, [rdx] ; limit 903 904 psubusb xmm7, xmm4 905 906 psubusb xmm0, xmm4 ; abs(p3-p2) > limit 907 psubusb xmm1, xmm4 ; abs(p2-p1) > limit 908 909 psubusb xmm6, xmm4 ; abs(q2-q1) > limit 910 por xmm7, xmm6 ; or 911 912 por xmm0, xmm1 913 por xmm0, xmm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit 914 915 movdqa xmm1, xmm2 ; p1 916 917 movdqa xmm7, xmm3 ; p0 918 psubusb xmm7, xmm2 ; p0-p1 919 920 psubusb xmm2, xmm3 ; p1-p0 921 por xmm2, xmm7 ; abs(p1-p0) 922 923 movdqa t0, xmm2 ; save abs(p1-p0) 924 lea rdx, srct 925 926 psubusb xmm2, xmm4 ; abs(p1-p0)>limit 927 por xmm0, xmm2 ; mask 928 %if %1 929 movdqa xmm5, [rdx+32] ; q0 930 movdqa xmm7, [rdx+48] ; q1 931 %else 932 movdqa xmm5, [rdx+64] ; q0 933 movdqa xmm7, [rdx+80] ; q1 934 %endif 935 movdqa xmm6, xmm5 ; q0 936 movdqa xmm2, xmm7 ; q1 937 psubusb xmm5, xmm7 ; q0-q1 938 939 psubusb xmm7, xmm6 ; q1-q0 940 por xmm7, xmm5 ; abs(q1-q0) 941 942 movdqa t1, xmm7 ; save abs(q1-q0) 943 psubusb xmm7, xmm4 ; abs(q1-q0)> limit 944 945 por xmm0, xmm7 ; mask 946 947 movdqa xmm5, xmm2 ; q1 948 psubusb xmm5, xmm1 ; q1-=p1 949 psubusb xmm1, xmm2 ; p1-=q1 950 por xmm5, xmm1 ; abs(p1-q1) 951 pand xmm5, [tfe GLOBAL] ; set lsb of each byte to zero 952 psrlw xmm5, 1 ; abs(p1-q1)/2 953 954 mov rdx, arg(2) ; flimit 955 movdqa xmm2, [rdx] ; flimit 956 957 movdqa xmm1, xmm3 ; p0 958 movdqa xmm7, xmm6 ; q0 959 psubusb xmm1, xmm7 ; p0-q0 960 psubusb xmm7, xmm3 ; q0-p0 961 por xmm1, xmm7 ; 
abs(q0-p0) 962 paddusb xmm1, xmm1 ; abs(q0-p0)*2 963 paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 964 965 paddb xmm2, xmm2 ; flimit*2 (less than 255) 966 paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255) 967 968 psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 969 por xmm1, xmm0; ; mask 970 pxor xmm0, xmm0 971 pcmpeqb xmm1, xmm0 972 %endmacro 973 974 %macro LFV_HEV_MASK 0 975 mov rdx, arg(4) ; get thresh 976 movdqa xmm7, XMMWORD PTR [rdx] 977 978 movdqa xmm4, t0 ; get abs (q1 - q0) 979 psubusb xmm4, xmm7 ; abs(q1 - q0) > thresh 980 981 movdqa xmm3, t1 ; get abs (p1 - p0) 982 psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh 983 984 por xmm4, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 985 pcmpeqb xmm4, xmm0 986 987 pcmpeqb xmm0, xmm0 988 pxor xmm4, xmm0 989 %endmacro 990 991 %macro BV_FILTER 0 992 lea rdx, srct 993 994 movdqa xmm2, [rdx] ; p1 lea rsi, [rsi+rcx*8] 995 movdqa xmm7, [rdx+48] ; q1 996 movdqa xmm6, [rdx+16] ; p0 997 movdqa xmm0, [rdx+32] ; q0 998 999 pxor xmm2, [t80 GLOBAL] ; p1 offset to convert to signed values 1000 pxor xmm7, [t80 GLOBAL] ; q1 offset to convert to signed values 1001 1002 psubsb xmm2, xmm7 ; p1 - q1 1003 pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) 1004 1005 pxor xmm6, [t80 GLOBAL] ; offset to convert to signed values 1006 pxor xmm0, [t80 GLOBAL] ; offset to convert to signed values 1007 1008 movdqa xmm3, xmm0 ; q0 1009 psubsb xmm0, xmm6 ; q0 - p0 1010 1011 paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) 1012 paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) 1013 1014 paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) 1015 pand xmm1, xmm2 ; mask filter values we don't care about 1016 1017 movdqa xmm2, xmm1 1018 paddsb xmm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 1019 1020 paddsb xmm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 1021 1022 punpckhbw xmm5, xmm2 1023 punpcklbw xmm2, xmm2 1024 1025 psraw xmm5, 11 1026 psraw xmm2, 11 1027 1028 packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) 
+ 3) >> 3; 1029 punpcklbw xmm0, xmm1 ; exfxgxhx 1030 1031 punpckhbw xmm1, xmm1 ; axbxcxdx 1032 psraw xmm0, 11 ; sign extended shift right by 3 1033 1034 psraw xmm1, 11 ; sign extended shift right by 3 1035 movdqa xmm5, xmm0 ; save results 1036 1037 packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 1038 paddsw xmm5, [ones GLOBAL] 1039 1040 paddsw xmm1, [ones GLOBAL] 1041 psraw xmm5, 1 ; partial shifted one more time for 2nd tap 1042 1043 psraw xmm1, 1 ; partial shifted one more time for 2nd tap 1044 packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 1045 1046 pandn xmm4, xmm5 ; high edge variance additive 1047 1048 paddsb xmm6, xmm2 ; p0+= p0 add 1049 pxor xmm6, [t80 GLOBAL] ; unoffset 1050 1051 movdqa xmm1, [rdx] ; p1 1052 pxor xmm1, [t80 GLOBAL] ; reoffset 1053 1054 paddsb xmm1, xmm4 ; p1+= p1 add 1055 pxor xmm1, [t80 GLOBAL] ; unoffset 1056 1057 psubsb xmm3, xmm0 ; q0-= q0 add 1058 pxor xmm3, [t80 GLOBAL] ; unoffset 1059 1060 psubsb xmm7, xmm4 ; q1-= q1 add 1061 pxor xmm7, [t80 GLOBAL] ; unoffset 1062 %endmacro 1063 1064 %macro BV_TRANSPOSE 0 1065 ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 1066 ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 1067 ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 1068 ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 1069 movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 1070 punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 1071 1072 movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 1073 punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 1074 1075 punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 1076 punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 1077 1078 movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 1079 punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 1080 1081 punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 
55 54 53 52 45 44 43 42 1082 movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 1083 1084 punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 1085 punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 1086 ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 1087 ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 1088 ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 1089 ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 1090 %endmacro 1091 1092 %macro BV_WRITEBACK 2 1093 movd [rsi+2], %1 1094 psrldq %1, 4 1095 1096 movd [rdi+2], %1 1097 psrldq %1, 4 1098 1099 movd [rsi+2*rax+2], %1 1100 psrldq %1, 4 1101 1102 movd [rdi+2*rax+2], %1 1103 1104 movd [rsi+4*rax+2], %2 1105 psrldq %2, 4 1106 1107 movd [rdi+4*rax+2], %2 1108 psrldq %2, 4 1109 1110 movd [rsi+2*rcx+2], %2 1111 psrldq %2, 4 1112 1113 movd [rdi+2*rcx+2], %2 1114 %endmacro 1115 1116 1117 ;void vp8_loop_filter_vertical_edge_sse2 1118 ;( 1119 ; unsigned char *src_ptr, 1120 ; int src_pixel_step, 1121 ; const char *flimit, 1122 ; const char *limit, 1123 ; const char *thresh, 1124 ; int count 1125 ;) 1126 global sym(vp8_loop_filter_vertical_edge_sse2) 1127 sym(vp8_loop_filter_vertical_edge_sse2): 1128 push rbp 1129 mov rbp, rsp 1130 SHADOW_ARGS_TO_STACK 6 1131 SAVE_XMM 1132 GET_GOT rbx 1133 push rsi 1134 push rdi 1135 ; end prolog 1136 1137 ALIGN_STACK 16, rax 1138 sub rsp, 96 ; reserve 96 bytes 1139 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 1140 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; 1141 %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; 1142 1143 mov rsi, arg(0) ; src_ptr 1144 movsxd rax, dword ptr arg(1) ; src_pixel_step 1145 1146 lea rsi, [rsi - 4] 1147 lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 1148 lea rcx, [rax*2+rax] 1149 1150 ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
1151 TRANSPOSE_16X8_1 1152 1153 lea rsi, [rsi+rax*8] 1154 lea rdi, [rdi+rax*8] 1155 lea rdx, srct 1156 TRANSPOSE_16X8_2 1 1157 1158 ; calculate filter mask 1159 LFV_FILTER_MASK 1 1160 ; calculate high edge variance 1161 LFV_HEV_MASK 1162 1163 ; start work on filters 1164 BV_FILTER 1165 1166 ; tranpose and write back - only work on q1, q0, p0, p1 1167 BV_TRANSPOSE 1168 ; store 16-line result 1169 1170 lea rdx, [rax] 1171 neg rdx 1172 1173 BV_WRITEBACK xmm1, xmm5 1174 1175 lea rsi, [rsi+rdx*8] 1176 lea rdi, [rdi+rdx*8] 1177 BV_WRITEBACK xmm2, xmm6 1178 1179 add rsp, 96 1180 pop rsp 1181 ; begin epilog 1182 pop rdi 1183 pop rsi 1184 RESTORE_GOT 1185 RESTORE_XMM 1186 UNSHADOW_ARGS 1187 pop rbp 1188 ret 1189 1190 1191 ;void vp8_loop_filter_vertical_edge_uv_sse2 1192 ;( 1193 ; unsigned char *u, 1194 ; int src_pixel_step, 1195 ; const char *flimit, 1196 ; const char *limit, 1197 ; const char *thresh, 1198 ; unsigned char *v 1199 ;) 1200 global sym(vp8_loop_filter_vertical_edge_uv_sse2) 1201 sym(vp8_loop_filter_vertical_edge_uv_sse2): 1202 push rbp 1203 mov rbp, rsp 1204 SHADOW_ARGS_TO_STACK 6 1205 SAVE_XMM 1206 GET_GOT rbx 1207 push rsi 1208 push rdi 1209 ; end prolog 1210 1211 ALIGN_STACK 16, rax 1212 sub rsp, 96 ; reserve 96 bytes 1213 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 1214 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; 1215 %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; 1216 1217 mov rsi, arg(0) ; u_ptr 1218 movsxd rax, dword ptr arg(1) ; src_pixel_step 1219 1220 lea rsi, [rsi - 4] 1221 lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 1222 lea rcx, [rax+2*rax] 1223 1224 ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
; transpose the 8 U rows, then the 8 V rows, filter both planes together
    TRANSPOSE_16X8_1

    mov         rsi, arg(5)                     ; v_ptr
    lea         rsi, [rsi - 4]                  ; back up 4 columns so each row load spans p3..q3
    lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing

    lea         rdx, srct                       ; scratch area holding the transposed block
    TRANSPOSE_16X8_2 1

    ; calculate filter mask
    LFV_FILTER_MASK 1
    ; calculate high edge variance
    LFV_HEV_MASK

    ; start work on filters
    BV_FILTER

    ; transpose and write back - only work on q1, q0, p0, p1
    BV_TRANSPOSE

    lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing

    ; store 16-line result
    BV_WRITEBACK xmm1, xmm5                     ; V plane (rsi still points at v_ptr block)

    mov         rsi, arg(0)                     ; u_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
    BV_WRITEBACK xmm2, xmm6                     ; U plane

    add rsp, 96
    pop rsp                                     ; restore the pre-ALIGN_STACK stack pointer
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret


; MBV_FILTER
; Macroblock (strong) vertical loop filter applied to the transposed block
; held in srct (addressed through rdx).
; In:  xmm1 = filter mask, xmm4 = high-edge-variance mask
;      (both produced by LFV_FILTER_MASK / LFV_HEV_MASK)
; Out: filtered p1/p0/q0/q1 written back into srct; xmm6/xmm3 leave the
;      macro holding the filtered p2/q2 columns for MBV_TRANSPOSE.
%macro MBV_FILTER 0
        lea         rdx, srct

        movdqa      xmm2, [rdx+32]              ; p1
        movdqa      xmm7, [rdx+80]              ; q1
        movdqa      xmm6, [rdx+48]              ; p0
        movdqa      xmm0, [rdx+64]              ; q0

        pxor        xmm2, [t80 GLOBAL]          ; p1 offset to convert to signed values
        pxor        xmm7, [t80 GLOBAL]          ; q1 offset to convert to signed values
        pxor        xmm6, [t80 GLOBAL]          ; p0 offset to convert to signed values
        pxor        xmm0, [t80 GLOBAL]          ; q0 offset to convert to signed values

        psubsb      xmm2, xmm7                  ; p1 - q1

        movdqa      xmm3, xmm0                  ; q0

        psubsb      xmm0, xmm6                  ; q0 - p0
        paddsb      xmm2, xmm0                  ; 1 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2, xmm0                  ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2, xmm0                  ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1, xmm2                  ; mask filter values we don't care about

        movdqa      xmm2, xmm1                  ; vp8_filter
        pand        xmm2, xmm4                  ; Filter2 = vp8_filter & hev

        movdqa      xmm5, xmm2
        paddsb      xmm5, [t3 GLOBAL]           ; vp8_signed_char_clamp(Filter2 + 3)

        ; per-byte arithmetic >>3: place each byte in the high half of a
        ; word (the low half is shifted out), then psraw by 11
        punpckhbw   xmm7, xmm5                  ; axbxcxdx
        punpcklbw   xmm5, xmm5                  ; exfxgxhx

        psraw       xmm7, 11                    ; sign extended shift right by 3
        psraw       xmm5, 11                    ; sign extended shift right by 3
        ; recombine the two 8-lane halves
        packsswb    xmm5, xmm7                  ; Filter2 = clamp(Filter2 + 3) >> 3

        paddsb      xmm2, [t4 GLOBAL]           ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm0, xmm2                  ; exfxgxhx
        punpckhbw   xmm7, xmm2                  ; axbxcxdx

        psraw       xmm0, 11                    ; sign extended shift right by 3
        psraw       xmm7, 11                    ; sign extended shift right by 3

        packsswb    xmm0, xmm7                  ; Filter1 = clamp(Filter2 + 4) >> 3

        psubsb      xmm3, xmm0                  ; qs0 = qs0 - Filter1
        paddsb      xmm6, xmm5                  ; ps0 = ps0 + Filter2

        ; vp8_filter &= ~hev;
        ; Filter2 = vp8_filter;
        pandn       xmm4, xmm1                  ; vp8_filter &= ~hev

        ; roughly 3/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
        ; s = vp8_signed_char_clamp(qs0 - u);
        ; *oq0 = s^0x80;
        ; s = vp8_signed_char_clamp(ps0 + u);
        ; *op0 = s^0x80;
        pxor        xmm1, xmm1

        pxor        xmm2, xmm2
        punpcklbw   xmm1, xmm4                  ; bytes into the high halves of words

        punpckhbw   xmm2, xmm4
        pmulhw      xmm1, [s27 GLOBAL]          ; * 27 (multiplier sits in the high byte)

        pmulhw      xmm2, [s27 GLOBAL]
        paddw       xmm1, [s63 GLOBAL]          ; + 63 rounding term

        paddw       xmm2, [s63 GLOBAL]
        psraw       xmm1, 7

        psraw       xmm2, 7
        packsswb    xmm1, xmm2                  ; u

        psubsb      xmm3, xmm1                  ; qs0 - u
        paddsb      xmm6, xmm1                  ; ps0 + u

        pxor        xmm3, [t80 GLOBAL]          ; unoffset back to unsigned
        pxor        xmm6, [t80 GLOBAL]

        movdqa      [rdx+48], xmm6              ; store op0
        movdqa      [rdx+64], xmm3              ; store oq0

        ; roughly 2/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
        ; s = vp8_signed_char_clamp(qs1 - u);
        ; *oq1 = s^0x80;
        ; s = vp8_signed_char_clamp(ps1 + u);
        ; *op1 = s^0x80;
        pxor        xmm1, xmm1
        pxor        xmm2, xmm2

        punpcklbw   xmm1, xmm4
        punpckhbw   xmm2, xmm4

        pmulhw      xmm1, [s18 GLOBAL]
        pmulhw      xmm2, [s18 GLOBAL]

        paddw       xmm1, [s63 GLOBAL]
        paddw       xmm2, [s63 GLOBAL]

        psraw       xmm1, 7
        psraw       xmm2, 7

        packsswb    xmm1, xmm2                  ; u

        movdqa      xmm3, [rdx + 80]            ; q1
        movdqa      xmm6, [rdx + 32]            ; p1

        pxor        xmm3, [t80 GLOBAL]          ; offset to signed
        pxor        xmm6, [t80 GLOBAL]

        paddsb      xmm6, xmm1                  ; ps1 + u
        psubsb      xmm3, xmm1                  ; qs1 - u

        pxor        xmm6, [t80 GLOBAL]          ; unoffset
        pxor        xmm3, [t80 GLOBAL]

        movdqa      [rdx + 80], xmm3            ; store oq1
        movdqa      [rdx + 32], xmm6            ; store op1

        ; roughly 1/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
        ; s = vp8_signed_char_clamp(qs2 - u);
        ; *oq2 = s^0x80;
        ; s = vp8_signed_char_clamp(ps2 + u);
        ; *op2 = s^0x80;
        pxor        xmm1, xmm1
        pxor        xmm2, xmm2

        punpcklbw   xmm1, xmm4
        punpckhbw   xmm2, xmm4

        pmulhw      xmm1, [s9 GLOBAL]
        pmulhw      xmm2, [s9 GLOBAL]

        paddw       xmm1, [s63 GLOBAL]
        paddw       xmm2, [s63 GLOBAL]

        psraw       xmm1, 7
        psraw       xmm2, 7

        packsswb    xmm1, xmm2                  ; u

        movdqa      xmm6, [rdx+16]              ; p2
        movdqa      xmm3, [rdx+96]              ; q2

        pxor        xmm6, [t80 GLOBAL]          ; offset to signed
        pxor        xmm3, [t80 GLOBAL]

        paddsb      xmm6, xmm1                  ; ps2 + u
        psubsb      xmm3, xmm1                  ; qs2 - u

        pxor        xmm6, [t80 GLOBAL]          ; xmm6 = op2 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        pxor        xmm3, [t80 GLOBAL]          ; xmm3 = oq2 = f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
%endmacro

; MBV_TRANSPOSE
; Transpose the filtered columns in srct (via rdx) back towards raster
; order.  xmm6/xmm3 carry the filtered p2/q2 produced by MBV_FILTER.
%macro MBV_TRANSPOSE 0
        movdqa      xmm0, [rdx]                 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1, xmm0                  ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0, xmm6                  ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm6                  ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm2, [rdx+32]              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6, xmm2                  ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm2, [rdx+48]              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6, [rdx+48]              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm5, xmm0                  ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0, xmm2                  ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm5, xmm2                  ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4, xmm1                  ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1, xmm6                  ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4, xmm6                  ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm2, [rdx+64]              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm2, [rdx+80]              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6, xmm3                  ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6, [rdx+112]             ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm7, xmm2                  ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm2, xmm6                  ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm7, xmm6                  ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6, xmm0                  ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0, xmm2                  ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6, xmm2                  ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro

; MBV_WRITEBACK_1
; Store the first eight reassembled rows (0..7) and prepare xmm1/xmm3
; with rows 8..b for MBV_WRITEBACK_2.
%macro MBV_WRITEBACK_1 0
        movq        QWORD PTR [rsi], xmm0       ; row 0
        psrldq      xmm0, 8

        movq        QWORD PTR [rdi], xmm0       ; row 1

        movq        QWORD PTR [rsi+2*rax], xmm6 ; row 2
        psrldq      xmm6, 8

        movq        QWORD PTR [rdi+2*rax], xmm6 ; row 3

        movdqa      xmm0, xmm5                  ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0, xmm7                  ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40

        punpckhdq   xmm5, xmm7                  ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        QWORD PTR [rsi+4*rax], xmm0 ; row 4
        psrldq      xmm0, 8

        movq        QWORD PTR [rdi+4*rax], xmm0 ; row 5

        movq        QWORD PTR [rsi+2*rcx], xmm5 ; row 6
        psrldq      xmm5, 8

        movq        QWORD PTR [rdi+2*rcx], xmm5 ; row 7

        movdqa      xmm2, [rdx+64]              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm2, [rdx+80]              ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        punpckhbw   xmm3, [rdx+112]             ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
        movdqa      xmm0, xmm2

        punpcklwd   xmm0, xmm3                  ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm2, xmm3                  ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
        ; continuation of MBV_WRITEBACK_1: assemble rows 8..b into xmm1/xmm3
        movdqa      xmm3, xmm1                  ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1, xmm0                  ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80

        punpckhdq   xmm3, xmm0                  ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro

; MBV_WRITEBACK_2
; Store the second eight reassembled rows (8..f); consumes xmm1/xmm3
; prepared by MBV_WRITEBACK_1 and xmm4/xmm2 from the transpose.
%macro MBV_WRITEBACK_2 0
        movq        QWORD PTR [rsi], xmm1       ; row 8
        psrldq      xmm1, 8

        movq        QWORD PTR [rdi], xmm1       ; row 9

        movq        QWORD PTR [rsi+2*rax], xmm3 ; row a
        psrldq      xmm3, 8

        movq        QWORD PTR [rdi+2*rax], xmm3 ; row b

        movdqa      xmm1, xmm4                  ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1, xmm2                  ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0

        punpckhdq   xmm4, xmm2                  ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
        movq        QWORD PTR [rsi+4*rax], xmm1 ; row c

        psrldq      xmm1, 8

        movq        QWORD PTR [rdi+4*rax], xmm1 ; row d

        movq        QWORD PTR [rsi+2*rcx], xmm4 ; row e
        psrldq      xmm4, 8

        movq        QWORD PTR [rdi+2*rcx], xmm4 ; row f
%endmacro


;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2)
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 160                        ; reserve 160 bytes
    %define t0  [rsp + 0]                       ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]                      ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]                     ;__declspec(align(16)) char srct[128];

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixel_step

    lea         rsi, [rsi - 4]                  ; back up 4 columns so each row load spans p3..q3
    lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax*2+rax]                ; rcx = 3 * pitch

    ; Transpose
    TRANSPOSE_16X8_1

    lea         rsi, [rsi+rax*8]                ; advance to rows 8..15
    lea         rdi, [rdi+rax*8]
    lea         rdx, srct
    TRANSPOSE_16X8_2 0

    ; calculate filter mask
    LFV_FILTER_MASK 0
    ; calculate high edge variance
    LFV_HEV_MASK

    neg         rax
    ; start work on filters
    MBV_FILTER

    lea         rsi, [rsi+rax*8]                ; rax is negative here: step back to rows 0..7
    lea         rdi, [rdi+rax*8]

    ; transpose and write back
    MBV_TRANSPOSE

    neg         rax                             ; make the pitch positive again for the writeback

    MBV_WRITEBACK_1

    lea         rsi, [rsi+rax*8]                ; forward to rows 8..15
    lea         rdi, [rdi+rax*8]
    MBV_WRITEBACK_2

    add rsp, 160
    pop rsp                                     ; restore the pre-ALIGN_STACK stack pointer
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 160                        ; reserve 160 bytes
    %define t0  [rsp + 0]                       ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]                      ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]                     ;__declspec(align(16)) char srct[128];

    mov         rsi, arg(0)                     ; u_ptr
    movsxd      rax, dword ptr arg(1)           ; src_pixel_step

    lea         rsi, [rsi - 4]                  ; back up 4 columns so each row load spans p3..q3
    lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax+2*rax]                ; rcx = 3 * pitch

    ; Transpose
    TRANSPOSE_16X8_1

    ; XMM3 XMM4 XMM7 in use
    mov         rsi, arg(5)                     ; v_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]
    lea         rdx, srct
    TRANSPOSE_16X8_2 0

    ; calculate filter mask
    LFV_FILTER_MASK 0
    ; calculate high edge variance
    LFV_HEV_MASK

    ; start work on filters
    MBV_FILTER

    ; transpose and write back
; reassemble the filtered columns, then write U rows followed by V rows
    MBV_TRANSPOSE

    mov         rsi, arg(0)                     ;u_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]
    MBV_WRITEBACK_1
    mov         rsi, arg(5)                     ;v_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]
    MBV_WRITEBACK_2

    add rsp, 160
    pop rsp                                     ; restore the pre-ALIGN_STACK stack pointer
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step ; destination pitch?
    mov         rdx, arg(2)                     ;flimit ; get flimit
    movdqa      xmm3, XMMWORD PTR [rdx]
    mov         rdx, arg(3)                     ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    paddb       xmm3, xmm3                      ; flimit*2 (less than 255)
    paddb       xmm3, xmm7                      ; flimit * 2 + limit (less than 255)

    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing
    add         rdi, rax
    neg         rax                             ; rax = -pitch; p rows lie above src_ptr

    ; calculate mask
    movdqu      xmm1, [rsi+2*rax]               ; p1
    movdqu      xmm0, [rdi]                     ; q1
    movdqa      xmm2, xmm1                      ; keep a copy of p1 for the filter stage
    movdqa      xmm7, xmm0                      ; keep a copy of q1 for the filter stage
    movdqa      xmm4, xmm0
    psubusb     xmm0, xmm1                      ; q1-=p1
    psubusb     xmm1, xmm4                      ; p1-=q1
    por         xmm1, xmm0                      ; abs(p1-q1)
    pand        xmm1, [tfe GLOBAL]              ; set lsb of each byte to zero
    psrlw       xmm1, 1                         ; abs(p1-q1)/2

    movdqu      xmm5, [rsi+rax]                 ; p0
    movdqu      xmm4, [rsi]                     ; q0
    movdqa      xmm0, xmm4                      ; q0
    movdqa      xmm6, xmm5                      ; p0
    psubusb     xmm5, xmm4                      ; p0-=q0
    psubusb     xmm4, xmm6                      ; q0-=p0
    por         xmm5, xmm4                      ; abs(p0 - q0)
    paddusb     xmm5, xmm5                      ; abs(p0-q0)*2
    paddusb     xmm5, xmm1                      ; abs (p0 - q0) *2 + abs(p1-q1)/2
; mask = 0xff per byte where |p0-q0|*2 + |p1-q1|/2 <= flimit*2 + limit
    psubusb     xmm5, xmm3                      ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
    pxor        xmm3, xmm3
    pcmpeqb     xmm5, xmm3

    ; start work on filters
    pxor        xmm2, [t80 GLOBAL]              ; p1 offset to convert to signed values
    pxor        xmm7, [t80 GLOBAL]              ; q1 offset to convert to signed values
    psubsb      xmm2, xmm7                      ; p1 - q1

    pxor        xmm6, [t80 GLOBAL]              ; p0 offset to convert to signed values
    pxor        xmm0, [t80 GLOBAL]              ; q0 offset to convert to signed values
    movdqa      xmm3, xmm0                      ; q0
    psubsb      xmm0, xmm6                      ; q0 - p0
    paddsb      xmm2, xmm0                      ; p1 - q1 + 1 * (q0 - p0)
    paddsb      xmm2, xmm0                      ; p1 - q1 + 2 * (q0 - p0)
    paddsb      xmm2, xmm0                      ; p1 - q1 + 3 * (q0 - p0)
    pand        xmm5, xmm2                      ; mask filter values we don't care about

    ; do + 4 side
    paddsb      xmm5, [t4 GLOBAL]               ; 3* (q0 - p0) + (p1 - q1) + 4

    ; per-byte arithmetic >>3: low bytes via shift-up/shift-down,
    ; high bytes via psraw 11 (net >>3 within each byte lane)
    movdqa      xmm0, xmm5                      ; get a copy of filters
    psllw       xmm0, 8                         ; shift left 8
    psraw       xmm0, 3                         ; arithmetic shift right 3
    psrlw       xmm0, 8
    movdqa      xmm1, xmm5                      ; get a copy of filters
    psraw       xmm1, 11                        ; arithmetic shift right 11
    psllw       xmm1, 8                         ; shift left 8 to put it back

    por         xmm0, xmm1                      ; put the two together to get result

    psubsb      xmm3, xmm0                      ; q0 -= Filter1 (the +4 side)
    pxor        xmm3, [t80 GLOBAL]              ; unoffset
    movdqu      [rsi], xmm3                     ; write back q0

    ; now do +3 side
    psubsb      xmm5, [t1s GLOBAL]              ; +3 instead of +4

    movdqa      xmm0, xmm5                      ; get a copy of filters
    psllw       xmm0, 8                         ; shift left 8
    psraw       xmm0, 3                         ; arithmetic shift right 3
    psrlw       xmm0, 8
    psraw       xmm5, 11                        ; arithmetic shift right 11
    psllw       xmm5, 8                         ; shift left 8 to put it back
    por         xmm0, xmm5                      ; put the two together to get result

    paddsb      xmm6, xmm0                      ; p0 += Filter2 (the +3 side)
    pxor        xmm6, [t80 GLOBAL]              ; unoffset
    movdqu      [rsi+rax], xmm6                 ; write back p0

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2)
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp                             ; save old base pointer value.
    mov         rbp, rsp                        ; set new base pointer value.
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx                             ; save callee-saved reg
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]                       ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]                      ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step ; destination pitch?

    lea         rsi, [rsi - 2]                  ; back up 2 columns: each row load spans p1 p0 q0 q1
    lea         rdi, [rsi + rax]                ; row 1
    lea         rdx, [rsi + rax*4]              ; row 4
    lea         rcx, [rdx + rax]                ; row 5

    ; gather 4 pixels from each of rows 0..7 and transpose to columns
    movdqu      xmm0, [rsi]                     ; (high 96 bits unused) 03 02 01 00
    movdqu      xmm1, [rdx]                     ; (high 96 bits unused) 43 42 41 40
    movdqu      xmm2, [rdi]                     ; 13 12 11 10
    movdqu      xmm3, [rcx]                     ; 53 52 51 50
    punpckldq   xmm0, xmm1                      ; (high 64 bits unused) 43 42 41 40 03 02 01 00
    punpckldq   xmm2, xmm3                      ; 53 52 51 50 13 12 11 10

    movdqu      xmm4, [rsi + rax*2]             ; 23 22 21 20
    movdqu      xmm5, [rdx + rax*2]             ; 63 62 61 60
    movdqu      xmm6, [rdi + rax*2]             ; 33 32 31 30
    movdqu      xmm7, [rcx + rax*2]             ; 73 72 71 70
    punpckldq   xmm4, xmm5                      ; 63 62 61 60 23 22 21 20
    punpckldq   xmm6, xmm7                      ; 73 72 71 70 33 32 31 30

    punpcklbw   xmm0, xmm2                      ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
    punpcklbw   xmm4, xmm6                      ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

    movdqa      xmm1, xmm0
    punpcklwd   xmm0, xmm4                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
    punpckhwd   xmm1, xmm4                      ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    ; finish the rows-0..7 transpose and stash the two halves in t0/t1
    punpckhdq   xmm2, xmm1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

    movdqa      t0, xmm0                        ; save to t0
    movdqa      t1, xmm2                        ; save to t1

    lea         rsi, [rsi + rax*8]              ; advance to rows 8..15
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movdqu      xmm4, [rsi]                     ; 83 82 81 80
    movdqu      xmm1, [rdx]                     ; c3 c2 c1 c0
    movdqu      xmm6, [rdi]                     ; 93 92 91 90
    movdqu      xmm3, [rcx]                     ; d3 d2 d1 d0
    punpckldq   xmm4, xmm1                      ; c3 c2 c1 c0 83 82 81 80
    punpckldq   xmm6, xmm3                      ; d3 d2 d1 d0 93 92 91 90

    movdqu      xmm0, [rsi + rax*2]             ; a3 a2 a1 a0
    movdqu      xmm5, [rdx + rax*2]             ; e3 e2 e1 e0
    movdqu      xmm2, [rdi + rax*2]             ; b3 b2 b1 b0
    movdqu      xmm7, [rcx + rax*2]             ; f3 f2 f1 f0
    punpckldq   xmm0, xmm5                      ; e3 e2 e1 e0 a3 a2 a1 a0
    punpckldq   xmm2, xmm7                      ; f3 f2 f1 f0 b3 b2 b1 b0

    punpcklbw   xmm4, xmm6                      ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
    punpcklbw   xmm0, xmm2                      ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

    movdqa      xmm1, xmm4
    punpcklwd   xmm4, xmm0                      ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
    punpckhwd   xmm1, xmm0                      ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

    movdqa      xmm6, xmm4
    punpckldq   xmm4, xmm1                      ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
    punpckhdq   xmm6, xmm1                      ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

    movdqa      xmm0, t0                        ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    movdqa      xmm2, t1                        ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
    movdqa      xmm1, xmm0
    movdqa      xmm3, xmm2

    ; combine halves: one register now holds a full 16-pixel column
    punpcklqdq  xmm0, xmm4                      ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    punpckhqdq  xmm1, xmm4                      ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    punpcklqdq  xmm2, xmm6                      ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    punpckhqdq  xmm3, xmm6                      ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

    ; calculate mask
    movdqa      xmm6, xmm0                      ; p1
    movdqa      xmm7, xmm3                      ; q1
    psubusb     xmm7, xmm0                      ; q1-=p1
    psubusb     xmm6, xmm3                      ; p1-=q1
    por         xmm6, xmm7                      ; abs(p1-q1)
    pand        xmm6, [tfe GLOBAL]              ; set lsb of each byte to zero
    psrlw       xmm6, 1                         ; abs(p1-q1)/2

    movdqa      xmm5, xmm1                      ; p0
    movdqa      xmm4, xmm2                      ; q0
    psubusb     xmm5, xmm2                      ; p0-=q0
    psubusb     xmm4, xmm1                      ; q0-=p0
    por         xmm5, xmm4                      ; abs(p0 - q0)
    paddusb     xmm5, xmm5                      ; abs(p0-q0)*2
    paddusb     xmm5, xmm6                      ; abs (p0 - q0) *2 + abs(p1-q1)/2

    mov         rdx, arg(2)                     ;flimit
    movdqa      xmm7, XMMWORD PTR [rdx]
    mov         rdx, arg(3)                     ; get limit
    movdqa      xmm6, XMMWORD PTR [rdx]
    paddb       xmm7, xmm7                      ; flimit*2 (less than 255)
    paddb       xmm7, xmm6                      ; flimit * 2 + limit (less than 255)

    psubusb     xmm5, xmm7                      ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
    pxor        xmm7, xmm7
    pcmpeqb     xmm5, xmm7                      ; xmm5 = mask

    ; start work on filters
    movdqa      t0, xmm0                        ; spill unsigned p1/q1 for the final transpose
    movdqa      t1, xmm3

    pxor        xmm0, [t80 GLOBAL]              ; p1 offset to convert to signed values
    pxor        xmm3, [t80 GLOBAL]              ; q1 offset to convert to signed values

    psubsb      xmm0, xmm3                      ; p1 - q1
    movdqa      xmm6, xmm1                      ; p0

    movdqa      xmm7, xmm2                      ; q0
    pxor        xmm6, [t80 GLOBAL]              ; offset to convert to signed values

    pxor        xmm7, [t80 GLOBAL]              ; offset to convert to signed values
    movdqa      xmm3, xmm7                      ; offseted ; q0

    psubsb      xmm7, xmm6                      ; q0 - p0
    paddsb      xmm0, xmm7                      ; p1 - q1 + 1 * (q0 - p0)

    paddsb      xmm0, xmm7                      ; p1 - q1 + 2 * (q0 - p0)
    paddsb      xmm0, xmm7                      ; p1 - q1 + 3 * (q0 - p0)

    pand        xmm5, xmm0                      ; mask filter values we don't care about

    paddsb      xmm5, [t4 GLOBAL]               ; 3* (q0 - p0) + (p1 - q1) + 4

    ; per-byte arithmetic >>3 (same split-shift trick as the horizontal filter)
    movdqa      xmm0, xmm5                      ; get a copy of filters
    psllw       xmm0, 8                         ; shift left 8

    psraw       xmm0, 3                         ; arithmetic shift right 3
    psrlw       xmm0, 8

    movdqa      xmm7, xmm5                      ; get a copy of filters
    psraw       xmm7, 11                        ; arithmetic shift right 11

    psllw       xmm7, 8                         ; shift left 8 to put it back
    por         xmm0, xmm7                      ; put the two together to get result
    ; apply Filter1 to q0 and Filter2 to p0, then transpose back for storing
    psubsb      xmm3, xmm0                      ; q0 -= Filter1 (the +4 side)
    pxor        xmm3, [t80 GLOBAL]              ; unoffset q0

    ; now do +3 side
    psubsb      xmm5, [t1s GLOBAL]              ; +3 instead of +4
    movdqa      xmm0, xmm5                      ; get a copy of filters

    psllw       xmm0, 8                         ; shift left 8
    psraw       xmm0, 3                         ; arithmetic shift right 3

    psrlw       xmm0, 8
    psraw       xmm5, 11                        ; arithmetic shift right 11

    psllw       xmm5, 8                         ; shift left 8 to put it back
    por         xmm0, xmm5                      ; put the two together to get result

    paddsb      xmm6, xmm0                      ; p0 += Filter2 (the +3 side)
    pxor        xmm6, [t80 GLOBAL]              ; unoffset p0

    movdqa      xmm0, t0                        ; p1 (unfiltered, unsigned)
    movdqa      xmm4, t1                        ; q1 (unfiltered, unsigned)

    ; transpose back to write out
    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm6                      ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
    punpckhbw   xmm1, xmm6                      ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

    movdqa      xmm5, xmm3
    punpcklbw   xmm3, xmm4                      ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
    punpckhbw   xmm5, xmm4                      ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

    movdqa      xmm3, xmm1
    punpcklwd   xmm1, xmm5                      ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
    punpckhwd   xmm3, xmm5                      ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

    ; write out order: xmm0 xmm2 xmm1 xmm3
    ; rsi/rdi/rcx still address rows 8..15 at this point
    lea         rdx, [rsi + rax*4]

    movd        [rsi], xmm1                     ; write the second 8-line result
    psrldq      xmm1, 4
    movd        [rdi], xmm1
    psrldq      xmm1, 4
    movd        [rsi + rax*2], xmm1
    psrldq      xmm1, 4
    movd        [rdi + rax*2], xmm1

    movd        [rdx], xmm3
    psrldq      xmm3, 4
    movd        [rcx], xmm3
    psrldq      xmm3, 4
    movd        [rdx + rax*2], xmm3
    ; last dword of the second 8-line result, then the first 8 lines
    psrldq      xmm3, 4
    movd        [rcx + rax*2], xmm3

    neg         rax
    lea         rsi, [rsi + rax*8]              ; back up 8 rows to the first block
    neg         rax
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movd        [rsi], xmm0                     ; write the first 8-line result
    psrldq      xmm0, 4
    movd        [rdi], xmm0
    psrldq      xmm0, 4
    movd        [rsi + rax*2], xmm0
    psrldq      xmm0, 4
    movd        [rdi + rax*2], xmm0

    movd        [rdx], xmm2
    psrldq      xmm2, 4
    movd        [rcx], xmm2
    psrldq      xmm2, 4
    movd        [rdx + rax*2], xmm2
    psrldq      xmm2, 4
    movd        [rcx + rax*2], xmm2

    add rsp, 32
    pop rsp                                     ; restore the pre-ALIGN_STACK stack pointer
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret

SECTION_RODATA
align 16
tfe:                                            ; clears each byte's lsb before the /2 shift
    times 16 db 0xfe
align 16
t80:                                            ; sign-bit toggle: unsigned <-> signed bytes
    times 16 db 0x80
align 16
t1s:                                            ; constant 1 per byte (+3 side adjustment)
    times 16 db 0x01
align 16
t3:                                             ; Filter2 rounding constant
    times 16 db 0x03
align 16
t4:                                             ; Filter1 rounding constant
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s27:                                            ; 27 in the high byte, for pmulhw scaling
    times 8 dw 0x1b00
align 16
s18:                                            ; 18 in the high byte
    times 8 dw 0x1200
align 16
s9:                                             ; 9 in the high byte
    times 8 dw 0x0900
align 16
s63:                                            ; rounding constant 63 per word
    times 8 dw 0x003f